Improve utf8_gen.py to set the width for characters with Prepended_Concatenation_Mark property to 1

[BZ #22070]
* localedata/unicode-gen/utf8_gen.py: Set the width for characters with Prepended_Concatenation_Mark property to 1
* localedata/charmaps/UTF-8: Updated using the improved script.
2017-09-06 11:19:33 +02:00 · 2017-09-06 11:19:33 +02:00 · 2ae5be041d
parent af83ed5c46
commit 2ae5be041d
5 changed files with 1659 additions and 13 deletions
--- a/7
+++ b/7
@@ -1,3 +1,10 @@
+2017-09-06  Mike FABIAN  <mfabian@redhat.com>
+
+	[BZ #22070]
+	* localedata/unicode-gen/utf8_gen.py: Set the width for
+	characters with Prepended_Concatenation_Mark property to 1
+	* localedata/charmaps/UTF-8: Updated using the improved script.
+
 2017-09-06  Mike FABIAN  <mfabian@redhat.com>

 	[BZ #21750]
--- a/localedata/charmaps/UTF-8
+++ b/localedata/charmaps/UTF-8
@@ -46395,7 +46395,7 @@ CHARMAP
 <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
 END CHARMAP

-% Character width according to Unicode 7.0.0.
+% Character width according to Unicode 10.0.0.
 % - Default width is 1.
 % - Double-width characters have width 2; generated from
 %        "grep '^[^;]*;[WF]' EastAsianWidth.txt"
@@ -46411,16 +46411,14 @@ WIDTH
 <U05C1>...<U05C2>	0
 <U05C4>...<U05C5>	0
 <U05C7>	0
-<U0600>...<U0605>	0
 <U0610>...<U061A>	0
 <U061C>	0
 <U064B>...<U065F>	0
 <U0670>	0
-<U06D6>...<U06DD>	0
+<U06D6>...<U06DC>	0
 <U06DF>...<U06E4>	0
 <U06E7>...<U06E8>	0
 <U06EA>...<U06ED>	0
-<U070F>	0
 <U0711>	0
 <U0730>...<U074A>	0
 <U07A6>...<U07B0>	0
@@ -46430,7 +46428,8 @@ WIDTH
 <U0825>...<U0827>	0
 <U0829>...<U082D>	0
 <U0859>...<U085B>	0
-<U08D4>...<U0902>	0
+<U08D4>...<U08E1>	0
+<U08E3>...<U0902>	0
 <U093A>	0
 <U093C>	0
 <U0941>...<U0948>	0
@@ -46692,7 +46691,6 @@ WIDTH
 <U0001107F>...<U00011081>	0
 <U000110B3>...<U000110B6>	0
 <U000110B9>...<U000110BA>	0
-<U000110BD>	0
 <U00011100>...<U00011102>	0
 <U00011127>...<U0001112B>	0
 <U0001112D>...<U00011134>	0
--- a/localedata/unicode-gen/Makefile
+++ b/localedata/unicode-gen/Makefile
@@ -40,7 +40,7 @@ UNICODE_VERSION = 10.0.0
 PYTHON3 = python3
 WGET = wget

-DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
+DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt PropList.txt
 GENERATED = i18n tr_TR UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction
 REPORTS = i18n-report UTF-8-report

@@ -92,7 +92,7 @@ tr_TR: gen_unicode_ctype.py

 UTF-8: UnicodeData.txt EastAsianWidth.txt
 UTF-8: utf8_gen.py
-	$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt
+	$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt

 UTF-8-report: UTF-8 ../charmaps/UTF-8
 UTF-8-report: utf8_compatibility.py
--- a/localedata/unicode-gen/PropList.txt
+++ b/localedata/unicode-gen/PropList.txt
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -215,9 +215,11 @@ def write_header_width(outfile):
 #    outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")

-def process_width(outfile, ulines, elines):
+def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
-    EastAsianWidth.txt
+    EastAsianWidth.txt containing characters with width “W” or “F”,
+    plines are lines from PropList.txt which contain characters
+    with the property “Prepended_Concatenation_Mark”.

    '''
    width_dict = {}
@@ -230,16 +232,29 @@ def process_width(outfile, ulines, elines):
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2
+
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

+    for line in plines:
+        # Characters with the property “Prepended_Concatenation_Mark”
+        # should have the width 1:
+        fields = line.split(";")
+        if not '..' in fields[0]:
+            code_points = (fields[0], fields[0])
+        else:
+            code_points = fields[0].split("..")
+        for key in range(int(code_points[0], 16),
+                         int(code_points[1], 16)+1):
+            del width_dict[key] # default width is 1
+
    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
-            del width_dict[key]
+            del width_dict[key] # default width is 1
    for key in list(range(0x1160, 0x1200)):
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
@@ -278,7 +293,7 @@ def process_width(outfile, ulines, elines):

 if __name__ == "__main__":
    if len(sys.argv) < 3:
-        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
+        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt")
    else:
        with open(sys.argv[1], mode='r') as UNIDATA_FILE:
            UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
@@ -298,6 +313,11 @@ if __name__ == "__main__":
                    continue
                if re.match(r'^[^;]*;[WF]', LINE):
                    EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
+        with open(sys.argv[3], mode='r') as PROP_LIST_FILE:
+            PROP_LIST_LINES = []
+            for LINE in PROP_LIST_FILE:
+                if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
+                    PROP_LIST_LINES.append(LINE.strip())
        with open('UTF-8', mode='w') as OUTFILE:
            # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
            write_header_charmap(OUTFILE)
@@ -305,5 +325,8 @@ if __name__ == "__main__":
            OUTFILE.write("END CHARMAP\n\n")
            # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
            write_header_width(OUTFILE)
-            process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
+            process_width(OUTFILE,
+                          UNICODE_DATA_LINES,
+                          EAST_ASIAN_WIDTH_LINES,
+                          PROP_LIST_LINES)
            OUTFILE.write("END WIDTH\n")