Unicode 7.0.0 update; added generator scripts.
for localedata/ChangeLog
[BZ #17588]
[BZ #13064]
[BZ #14094]
[BZ #17998]
* unicode-gen/Makefile: New.
* unicode-gen/unicode-license.txt: New, from Unicode.
* unicode-gen/UnicodeData.txt: New, from Unicode.
* unicode-gen/DerivedCoreProperties.txt: New, from Unicode.
* unicode-gen/EastAsianWidth.txt: New, from Unicode.
* unicode-gen/gen_unicode_ctype.py: New generator, from Mike
FABIAN <mfabian@redhat.com>.
* unicode-gen/ctype_compatibility.py: New verifier, from
Pravin Satpute <psatpute@redhat.com> and Mike FABIAN.
* unicode-gen/ctype_compatibility_test_cases.py: New verifier
module, from Mike FABIAN.
* unicode-gen/utf8_gen.py: New generator, from Pravin Satpute
and Mike FABIAN.
* unicode-gen/utf8_compatibility.py: New verifier, from Pravin
Satpute and Mike FABIAN.
* charmaps/UTF-8: Update.
* locales/i18n: Update.
* gen-unicode-ctype.c: Remove.
* tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns
true for ordinal indicators.
2015-02-20 23:14:59 +01:00
|
|
|
#!/usr/bin/python3
|
|
|
|
# -*- coding: utf-8 -*-
|
2015-02-23 15:22:37 +01:00
|
|
|
# Copyright (C) 2014-2015 Free Software Foundation, Inc.
|
Unicode 7.0.0 update; added generator scripts.
for localedata/ChangeLog
[BZ #17588]
[BZ #13064]
[BZ #14094]
[BZ #17998]
* unicode-gen/Makefile: New.
* unicode-gen/unicode-license.txt: New, from Unicode.
* unicode-gen/UnicodeData.txt: New, from Unicode.
* unicode-gen/DerivedCoreProperties.txt: New, from Unicode.
* unicode-gen/EastAsianWidth.txt: New, from Unicode.
* unicode-gen/gen_unicode_ctype.py: New generator, from Mike
FABIAN <mfabian@redhat.com>.
* unicode-gen/ctype_compatibility.py: New verifier, from
Pravin Satpute <psatpute@redhat.com> and Mike FABIAN.
* unicode-gen/ctype_compatibility_test_cases.py: New verifier
module, from Mike FABIAN.
* unicode-gen/utf8_gen.py: New generator, from Pravin Satpute
and Mike FABIAN.
* unicode-gen/utf8_compatibility.py: New verifier, from Pravin
Satpute and Mike FABIAN.
* charmaps/UTF-8: Update.
* locales/i18n: Update.
* gen-unicode-ctype.c: Remove.
* tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns
true for ordinal indicators.
2015-02-20 23:14:59 +01:00
|
|
|
# This file is part of the GNU C Library.
|
|
|
|
#
|
|
|
|
# The GNU C Library is free software; you can redistribute it and/or
|
|
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
|
|
# License as published by the Free Software Foundation; either
|
|
|
|
# version 2.1 of the License, or (at your option) any later version.
|
|
|
|
#
|
|
|
|
# The GNU C Library is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
# Lesser General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
|
|
# License along with the GNU C Library; if not, see
|
|
|
|
# <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
'''
|
|
|
|
This script is useful for checking the differences between
|
|
|
|
an old LC_CTYPE file /usr/share/i18n/locales/i18n and a
|
|
|
|
new one generated by gen_unicode_ctype.py
|
|
|
|
|
|
|
|
To see how it is used, call it with the “-h” option:
|
|
|
|
|
|
|
|
$ ./ctype_compatibility.py -h
|
|
|
|
… prints usage message …
|
|
|
|
'''
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import re
|
|
|
|
import unicodedata
|
|
|
|
import argparse
|
|
|
|
|
|
|
|
from ctype_compatibility_test_cases import TEST_CASES
|
|
|
|
|
|
|
|
def get_lines_from_file(filename):
    '''Yield the logical lines of an i18n locale source file

    For every physical line, the comment (everything from the first “%”
    to the end of the line) is removed and surrounding white space is
    stripped.  A physical line ending in “/” is a continuation line:
    the “/” is dropped and the text is joined with the following
    line(s).  Any line which is not a continuation line, including
    blank and comment-only lines, terminates the current logical line,
    so empty strings may be yielded as well.

    filename: path of the file to read

    The file is decoded as UTF-8 regardless of the current locale;
    undecodable bytes are replaced instead of raising an error.
    '''
    pending = ''
    # Do not depend on the locale of the calling environment for decoding.
    with open(filename, encoding='utf-8', errors='replace') as i18n_file:
        for raw_line in i18n_file:
            text = raw_line.strip('\n')
            if '%' in text:
                # A “/” at the very end of the physical line keeps its
                # meaning as continuation marker even when it is located
                # behind the comment character.
                is_continued = text.endswith('/')
                text = text[:text.find('%')]
                if is_continued:
                    text += '/'
            text = text.strip()
            if text.endswith('/'):
                pending += text[:-1]
            else:
                yield pending + text
                pending = ''
    if pending: # file ends with a continuation line
        yield pending
|
|
|
|
|
|
|
|
def extract_character_classes(filename):
    '''Get all Unicode code points for each character class from a file

    Store these code points in a dictionary using the character classes
    as keys and the list of code points in this character class as values.

    In case of the character classes “toupper”, “tolower”, and “totitle”,
    the values are actually pairs of code points.

    A logical line belongs to a character class when it starts either
    with the bare class name followed by white space or with
    “class "name";” / “map "name";” followed by white space.  The rest
    of such a line is handed to process_chars().

    filename: path of the file containing the LC_CTYPE definitions

    Returns the dictionary described above; classes which do not occur
    in the file have no key.
    '''
    char_classes = (
        'upper',
        'lower',
        'alpha',
        'digit',
        'outdigit',
        'space',
        'cntrl',
        'punct',
        'graph',
        'print',
        'xdigit',
        'blank',
        'combining',
        'combining_level3',
        'toupper',
        'tolower',
        'totitle',
    )
    # Compile each pattern once instead of rebuilding it for every line.
    class_patterns = [
        (char_class,
         re.compile(
             r'^((?:(?:class|map)\s+"){name}(?:";)\s+|{name}\s+)'.format(
                 name=re.escape(char_class))))
        for char_class in char_classes
    ]
    ctype_dict = {}
    for line in get_lines_from_file(filename):
        for char_class, pattern in class_patterns:
            match = pattern.match(line)
            if match:
                process_chars(
                    ctype_dict.setdefault(char_class, []),
                    line[match.end():])
                # The patterns are anchored at the start of the line and
                # the class names are distinct, no other one can match.
                break
    return ctype_dict
|
|
|
|
|
|
|
|
# Notations for code points used in LC_CTYPE sections, compiled once.
_CODE_POINT_SINGLE_RE = re.compile(
    r'^<U(?P<codepoint>[0-9A-F]{4,8})>$')
_CODE_POINT_RANGE_RE = re.compile(
    r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
    r'\.\.(?P<stride>\(2\)\.\.)?'
    r'<U(?P<codepoint2>[0-9A-F]{4,8})>$')
_CODE_POINT_PAIR_RE = re.compile(
    r'^\('
    r'<U(?P<codepoint1>[0-9A-F]{4,8})>'
    r','
    r'<U(?P<codepoint2>[0-9A-F]{4,8})>'
    r'\)$')

def process_chars(char_class_list, code_point_line):
    '''
    Extract Unicode values from code_point_line
    and add to the list of code points in a character class

    char_class_list: list which is extended in place
    code_point_line: “;” separated items, each of which is one of

        <Uxxxx>                 a single code point
        <Uxxxx>..<Uxxxx>        an inclusive range of code points
        <Uxxxx>..(2)..<Uxxxx>   an inclusive range, every second code point
        (<Uxxxx>,<Uxxxx>)       a pair of code points, appended as a tuple

    If an item matches none of these notations, a message is written
    to stderr and the program is terminated with exit status 1.
    '''
    for code_points in code_point_line.split(';'):
        code_points = code_points.strip()
        match = _CODE_POINT_SINGLE_RE.match(code_points)
        if match: # <Uxxxx>
            char_class_list.append(int(match.group('codepoint'), 16))
            continue
        match = _CODE_POINT_RANGE_RE.match(code_points)
        if match: # <Uxxxx>..<Uxxxx> or <Uxxxx>..(2)..<Uxxxx>
            step = 2 if match.group('stride') else 1
            char_class_list.extend(
                range(int(match.group('codepoint1'), 16),
                      int(match.group('codepoint2'), 16) + 1,
                      step))
            continue
        match = _CODE_POINT_PAIR_RE.match(code_points)
        if match: # (<Uxxxx>,<Uxxxx>)
            char_class_list.append((
                int(match.group('codepoint1'), 16),
                int(match.group('codepoint2'), 16)))
            continue
        sys.stderr.write(
            ('None of the regexps matched '
             + 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
                 'cp': code_points,
                 'cpl': code_point_line
             })
        # sys.exit() instead of exit(): the latter is only provided by
        # the site module and is meant for interactive use.
        sys.exit(1)
|
|
|
|
|
|
|
|
def compare_lists(old_ctype_dict, new_ctype_dict):
    '''Compare character classes in the old and the new LC_CTYPE

    First print the names of the character classes which exist in only
    one of the two dictionaries, then print for each character class of
    the old dictionary the number of members on both sides and call
    report() for the details.

    old_ctype_dict, new_ctype_dict: dictionaries mapping character
    class names to lists of code points (or pairs of code points).
    '''
    print('****************************************************')
    print('Character classes which are only in the new '
          + 'or only in the old file:')
    for char_class in sorted(old_ctype_dict):
        if char_class not in new_ctype_dict:
            print('Character class %s is in old ctype but not in new ctype'
                  %char_class)
    for char_class in sorted(new_ctype_dict):
        if char_class not in old_ctype_dict:
            print('Character class %s is in new ctype but not in old ctype'
                  %char_class)
    for char_class in sorted(old_ctype_dict):
        old_chars = old_ctype_dict[char_class]
        # A class which was dropped from the new file has already been
        # reported above; compare against an empty list instead of
        # failing with a KeyError.
        new_chars = new_ctype_dict.get(char_class, [])
        print("****************************************************")
        print("%s: %d chars in old ctype and %d chars in new ctype" %(
            char_class,
            len(old_chars),
            len(new_chars)))
        print("----------------------------------------------------")
        report(char_class, old_chars, new_chars)
|
|
|
|
|
|
|
|
def report_code_points(char_class, code_point_list, text=''):
    '''Report all code points which have been added to or removed from a
    character class.

    One line is printed per entry of code_point_list, in sorted order.

    char_class: name of the character class, used as prefix of each line
    code_point_list: list of integers (code points) or of pairs of
        integers (mappings from one code point to another)
    text: short label printed after the class name, for example
        “Added” or “Missing”
    '''
    def name_of(code_point):
        '''Unicode name of a code point or a placeholder'''
        return unicodedata.name(chr(code_point), 'name unknown')

    for code_point in sorted(code_point_list):
        if isinstance(code_point, int):
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  %{'text': text,
                    'char': chr(code_point),
                    'char_class': char_class,
                    'code_point': hex(code_point),
                    'name': name_of(code_point)})
        else:
            # pair of code points from “toupper”, “tolower”, or “totitle”
            print(('%(char_class)s: %(text)s: '
                   + '%(char0)s → %(char1)s '
                   + '%(code_point0)s → %(code_point1)s '
                   + '%(name0)s → %(name1)s') %{
                       'text': text,
                       'char_class': char_class,
                       'char0': chr(code_point[0]),
                       'code_point0': hex(code_point[0]),
                       'name0': name_of(code_point[0]),
                       'char1': chr(code_point[1]),
                       'code_point1': hex(code_point[1]),
                       'name1': name_of(code_point[1])
                   })
|
|
|
|
|
|
|
|
def report(char_class, old_list, new_list):
    '''Print what changed in one LC_CTYPE character class

    Prints the number of entries of old_list which are absent from
    new_list and the number of entries of new_list which are absent
    from old_list.  The entries themselves are listed with
    report_code_points() when the corresponding command line option
    (ARGS.show_missing_characters, ARGS.show_added_characters) is set.
    '''
    old_members = set(old_list)
    new_members = set(new_list)

    removed = list(old_members.difference(new_members))
    removed_summary = {'char_class': char_class, 'number': len(removed)}
    print(('%(char_class)s: Missing %(number)d characters '
           + 'of old ctype in new ctype ')
          % removed_summary)
    if ARGS.show_missing_characters:
        report_code_points(char_class, removed, 'Missing')

    gained = list(new_members.difference(old_members))
    gained_summary = {'char_class': char_class, 'number': len(gained)}
    print(('%(char_class)s: Added %(number)d characters '
           + 'in new ctype which were not in old ctype')
          % gained_summary)
    if ARGS.show_added_characters:
        report_code_points(char_class, gained, 'Added')
|
|
|
|
|
|
|
|
|
|
|
|
def cperror(error_message, errorcounter=0):
    '''Print an error message and return the error count raised by one'''
    print(error_message)
    errorcounter += 1
    return errorcounter
|
|
|
|
|
|
|
|
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
            errorcounter=0):
    '''Check the character class membership of a list of code points

    The parameter “code_point_list_with_ranges” is a list of
    integers or pairs of integers, for example:

    [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]

    where the pairs of integers stand for all the code points in the range
    of the two integers given, including the two integers of the pair.

    ctype_dict: maps character class names to containers of code points
    char_classes: sequence of (class name, expected membership) items;
        every code point is expected to be in the class when the second
        item is True and not to be in it when it is False
    reason: explanation which is appended to each error message
    errorcounter: number of errors found so far

    Returns errorcounter increased by the number of violated
    expectations; each violation is reported through cperror().
    '''
    for code_point_range in code_point_list_with_ranges:
        if isinstance(code_point_range, int):
            code_points = [code_point_range]
        else:
            code_points = range(code_point_range[0],
                                code_point_range[1]+1)
        for code_point in code_points:
            for char_class_tuple in char_classes:
                char_class = char_class_tuple[0]
                in_char_class = char_class_tuple[1]
                if (code_point in ctype_dict[char_class]) != in_char_class:
                    errorcounter = cperror(
                        ('error: %(code_point)s %(char)s '
                         + '%(char_class)s %(in)s: %(reason)s') %{
                             'code_point': hex(code_point),
                             'char': chr(code_point),
                             'char_class': char_class,
                             # the membership which was actually found
                             'in': not in_char_class,
                             'reason': reason},
                        errorcounter)
    return errorcounter
|
|
|
|
|
|
|
|
def tests(ctype_dict, errorcounter=0):
    '''Test a LC_CTYPE character class dictionary for known errors

    First the expectations listed in TEST_CASES are checked with
    cpcheck(), then the relations between the character classes are
    checked for every code point from 0 up to 0x10FFFF.  Each violation
    is reported through cperror().

    ctype_dict: maps character class names to lists of code points (or
        lists of pairs of code points for the mapping classes)
    errorcounter: number of errors found so far

    Returns errorcounter increased by the number of errors found.
    '''
    # copy the information from ctype_dict (which contains lists) in
    # a new dictionary ctype_dict2 (which contains dictionaries).
    # The checks below are easier with that type of data structure.
    ctype_dict2 = {}
    for key, values in ctype_dict.items():
        if values and not isinstance(values[0], int):
            # key is 'toupper', 'tolower', or 'totitle'
            ctype_dict2[key] = {value[0]: value[1] for value in values}
        else:
            ctype_dict2[key] = dict.fromkeys(values, 1)

    for test_case in TEST_CASES:
        errorcounter = cpcheck(ctype_dict2,
                               test_case[0],
                               test_case[1],
                               test_case[2],
                               errorcounter=errorcounter)

    # Pairs of character classes which must not share a code point,
    # in the order in which they are checked and reported:
    #
    # alpha restriction: "No character specified for the keywords cntrl,
    # digit, punct or space shall be specified."
    #
    # space restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, graph or xdigit shall be specified."
    # upper, lower, alpha are covered by the alpha checks.
    #
    # cntrl restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, punct, graph, print or xdigit shall be
    # specified."  upper, lower, alpha are covered by the alpha checks.
    #
    # punct restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
    # be specified."  upper, lower, alpha, cntrl are covered by the
    # checks before, the <space> character is checked separately below.
    #
    # graph restriction and print restriction: "No character specified
    # for the keyword cntrl shall be specified."  Covered by the cntrl
    # checks.
    exclusive_classes = (
        ('alpha', 'cntrl'),
        ('alpha', 'digit'),
        ('alpha', 'punct'),
        ('alpha', 'space'),
        ('space', 'digit'),
        ('space', 'graph'),
        ('space', 'xdigit'),
        ('cntrl', 'digit'),
        ('cntrl', 'punct'),
        ('cntrl', 'graph'),
        ('cntrl', 'print'),
        ('cntrl', 'xdigit'),
        ('punct', 'digit'),
        ('punct', 'xdigit'),
    )

    for code_point in range(0, 0x110000):
        # toupper and tolower restriction: "Only characters specified
        # for the keywords lower and upper shall be specified."
        for mapping in ('toupper', 'tolower'):
            if (code_point in ctype_dict2[mapping]
                    and code_point != ctype_dict2[mapping][code_point]
                    and not (code_point in ctype_dict2['lower']
                             or code_point in ctype_dict2['upper'])):
                errorcounter = cperror(
                    ('error: %(char1)s is not upper|lower '
                     + 'but %(mapping)s(%(cp1)s)=%(cp2)s (%(char2)s)') %{
                         'mapping': mapping,
                         'char1': chr(code_point),
                         'cp1': hex(code_point),
                         'cp2': hex(ctype_dict2[mapping][code_point]),
                         'char2': chr(ctype_dict2[mapping][code_point])
                     },
                    errorcounter)
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class."
        if ((code_point in ctype_dict2['lower']
             or code_point in ctype_dict2['upper'])
                and code_point not in ctype_dict2['alpha']):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is upper|lower but not alpha' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        for first_class, second_class in exclusive_classes:
            if (code_point in ctype_dict2[first_class]
                    and code_point in ctype_dict2[second_class]):
                errorcounter = cperror(
                    'error: %(char)s %(cp)s is %(first)s and %(second)s' %{
                        'char': chr(code_point),
                        'cp': hex(code_point),
                        'first': first_class,
                        'second': second_class
                    },
                    errorcounter)
        # punct restriction: the <space> character shall not be specified.
        if (code_point in ctype_dict2['punct']
                and code_point == 0x0020):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is punct.' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of "space characters",
        # not "space character".
        if (code_point in ctype_dict2['print']
                and not (code_point in ctype_dict2['graph']
                         or code_point in ctype_dict2['space'])):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is print but not graph|space' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        if (code_point not in ctype_dict2['print']
                and (code_point in ctype_dict2['graph']
                     or code_point == 0x0020)):
            errorcounter = cperror(
                'error: %(char)s %(cp)s graph|space but not print' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
    return errorcounter
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command line entry point: compare the character classes of the old
    # and the new ctype file, then check both files for known errors.
    # The exit status is 1 if errors were found in the new file, else 0.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    # “const” repeats the default for both file options: without it an
    # option given without a file name would yield None and the script
    # would fail when trying to open that.
    PARSER.add_argument(
        '-o', '--old_ctype_file',
        nargs='?',
        type=str,
        const='i18n',
        default='i18n',
        help='The old ctype file, default: %(default)s')
    PARSER.add_argument(
        '-n', '--new_ctype_file',
        nargs='?',
        type=str,
        const='unicode-ctype',
        default='unicode-ctype',
        help='The new ctype file, default: %(default)s')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help=('Show characters which were added to each '
              + 'character class in detail.'))
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help=('Show characters which were removed from each '
              + 'character class in detail.'))
    # ARGS has to stay a module level name, report() reads it.
    ARGS = PARSER.parse_args()

    OLD_CTYPE_DICT = extract_character_classes(
        ARGS.old_ctype_file)
    NEW_CTYPE_DICT = extract_character_classes(
        ARGS.new_ctype_file)
    compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
    print('============================================================')
    print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter=0)
    print('------------------------------------------------------------')
    print('Old file = %s' %ARGS.old_ctype_file)
    print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
    print('------------------------------------------------------------')
    print('============================================================')
    print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter=0)
    print('------------------------------------------------------------')
    print('New file = %s' %ARGS.new_ctype_file)
    print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
    print('------------------------------------------------------------')
    # Only errors in the new file decide about the exit status.
    # sys.exit() instead of exit(): the latter is only provided by the
    # site module and is meant for interactive use.
    if NUMBER_OF_ERRORS_IN_NEW_FILE > 0:
        sys.exit(1)
    else:
        sys.exit(0)
|