Improve utf8_gen.py to set the width for characters with Prepended_Concatenation_Mark property to 1

[BZ #22070]
	* localedata/unicode-gen/utf8_gen.py: Set the width for
	characters with the Prepended_Concatenation_Mark property to 1.
	* localedata/charmaps/UTF-8: Updated using the improved script.
This commit is contained in:
Mike FABIAN 2017-09-06 11:19:33 +02:00
parent af83ed5c46
commit 2ae5be041d
5 changed files with 1659 additions and 13 deletions

View File

@ -1,3 +1,10 @@
2017-09-06 Mike FABIAN <mfabian@redhat.com>
[BZ #22070]
* localedata/unicode-gen/utf8_gen.py: Set the width for
characters with the Prepended_Concatenation_Mark property to 1.
* localedata/charmaps/UTF-8: Updated using the improved script.
2017-09-06 Mike FABIAN <mfabian@redhat.com>
[BZ #21750]

View File

@ -46395,7 +46395,7 @@ CHARMAP
<U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
END CHARMAP
% Character width according to Unicode 7.0.0.
% Character width according to Unicode 10.0.0.
% - Default width is 1.
% - Double-width characters have width 2; generated from
% "grep '^[^;]*;[WF]' EastAsianWidth.txt"
@ -46411,16 +46411,14 @@ WIDTH
<U05C1>...<U05C2> 0
<U05C4>...<U05C5> 0
<U05C7> 0
<U0600>...<U0605> 0
<U0610>...<U061A> 0
<U061C> 0
<U064B>...<U065F> 0
<U0670> 0
<U06D6>...<U06DD> 0
<U06D6>...<U06DC> 0
<U06DF>...<U06E4> 0
<U06E7>...<U06E8> 0
<U06EA>...<U06ED> 0
<U070F> 0
<U0711> 0
<U0730>...<U074A> 0
<U07A6>...<U07B0> 0
@ -46430,7 +46428,8 @@ WIDTH
<U0825>...<U0827> 0
<U0829>...<U082D> 0
<U0859>...<U085B> 0
<U08D4>...<U0902> 0
<U08D4>...<U08E1> 0
<U08E3>...<U0902> 0
<U093A> 0
<U093C> 0
<U0941>...<U0948> 0
@ -46692,7 +46691,6 @@ WIDTH
<U0001107F>...<U00011081> 0
<U000110B3>...<U000110B6> 0
<U000110B9>...<U000110BA> 0
<U000110BD> 0
<U00011100>...<U00011102> 0
<U00011127>...<U0001112B> 0
<U0001112D>...<U00011134> 0

View File

@ -40,7 +40,7 @@ UNICODE_VERSION = 10.0.0
PYTHON3 = python3
WGET = wget
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt PropList.txt
GENERATED = i18n tr_TR UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction
REPORTS = i18n-report UTF-8-report
@ -92,7 +92,7 @@ tr_TR: gen_unicode_ctype.py
UTF-8: UnicodeData.txt EastAsianWidth.txt
UTF-8: utf8_gen.py
$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt
$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt
UTF-8-report: UTF-8 ../charmaps/UTF-8
UTF-8-report: utf8_compatibility.py

File diff suppressed because it is too large Load Diff

View File

@ -215,9 +215,11 @@ def write_header_width(outfile):
# outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
outfile.write("WIDTH\n")
def process_width(outfile, ulines, elines):
def process_width(outfile, ulines, elines, plines):
'''ulines are lines from UnicodeData.txt, elines are lines from
EastAsianWidth.txt
EastAsianWidth.txt containing characters with width W or F,
plines are lines from PropList.txt which contain characters
with the property Prepended_Concatenation_Mark.
'''
width_dict = {}
@ -230,16 +232,29 @@ def process_width(outfile, ulines, elines):
for key in range(int(code_points[0], 16),
int(code_points[1], 16)+1):
width_dict[key] = 2
for line in ulines:
fields = line.split(";")
if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
width_dict[int(fields[0], 16)] = 0
for line in plines:
# Characters with the property “Prepended_Concatenation_Mark”
# should have the width 1:
fields = line.split(";")
if not '..' in fields[0]:
code_points = (fields[0], fields[0])
else:
code_points = fields[0].split("..")
for key in range(int(code_points[0], 16),
int(code_points[1], 16)+1):
del width_dict[key] # default width is 1
# handle special cases for compatibility
for key in list((0x00AD,)):
# https://www.cs.tut.fi/~jkorpela/shy.html
if key in width_dict:
del width_dict[key]
del width_dict[key] # default width is 1
for key in list(range(0x1160, 0x1200)):
width_dict[key] = 0
for key in list(range(0x3248, 0x3250)):
@ -278,7 +293,7 @@ def process_width(outfile, ulines, elines):
if __name__ == "__main__":
if len(sys.argv) < 3:
print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt")
else:
with open(sys.argv[1], mode='r') as UNIDATA_FILE:
UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
@ -298,6 +313,11 @@ if __name__ == "__main__":
continue
if re.match(r'^[^;]*;[WF]', LINE):
EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
with open(sys.argv[3], mode='r') as PROP_LIST_FILE:
PROP_LIST_LINES = []
for LINE in PROP_LIST_FILE:
if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
PROP_LIST_LINES.append(LINE.strip())
with open('UTF-8', mode='w') as OUTFILE:
# Processing UnicodeData.txt and write CHARMAP to UTF-8 file
write_header_charmap(OUTFILE)
@ -305,5 +325,8 @@ if __name__ == "__main__":
OUTFILE.write("END CHARMAP\n\n")
# Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
write_header_width(OUTFILE)
process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
process_width(OUTFILE,
UNICODE_DATA_LINES,
EAST_ASIAN_WIDTH_LINES,
PROP_LIST_LINES)
OUTFILE.write("END WIDTH\n")