107 lines
3.2 KiB
Python
107 lines
3.2 KiB
Python
|
#!/usr/bin/env python3
|
||
|
#
|
||
|
# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
|
||
|
#
|
||
|
# This file is part of GCC.
|
||
|
#
|
||
|
# GCC is free software; you can redistribute it and/or modify it under
|
||
|
# the terms of the GNU General Public License as published by the Free
|
||
|
# Software Foundation; either version 3, or (at your option) any later
|
||
|
# version.
|
||
|
#
|
||
|
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||
|
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
|
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||
|
# for more details.
|
||
|
#
|
||
|
# You should have received a copy of the GNU General Public License
|
||
|
# along with GCC; see the file COPYING3. If not see
|
||
|
# <http://www.gnu.org/licenses/>.
|
||
|
|
||
|
import sys
|
||
|
import os
|
||
|
|
||
|
# Exactly one argument is required: the Unicode version, which is handed to
# utf8_gen.py and echoed in the generated header comment.
if len(sys.argv) != 2:
    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
    sys.exit(1)
unicode_version = sys.argv[1]
|
||
|
|
||
|
# Parse a codepoint in the format output by glibc tools.
def parse_ucn(s):
    """Return the integer codepoint encoded by a "<Uxxxx>" token.

    s is a string such as "<UA8FF>": the hexadecimal digits between the
    leading "<U" and the trailing ">" are the codepoint value.

    Raises ValueError if s lacks that wrapper or the digits are not valid
    hexadecimal.
    """
    if not (s.startswith("<U") and s.endswith(">")):
        # Name the offending token: a bare ValueError prints as an empty
        # string when a caller reports it.
        raise ValueError("malformed codepoint %r (expected <Uxxxx>)" % s)
    return int(s[2:-1], base=16)
|
||
|
|
||
|
# Process a line of width output from utf8_gen.py and update global array.
# widths[c] holds the width recorded for codepoint c; every entry starts at 1.
widths = [1] * (1 + 0x10FFFF)


def process_width(line):
    """Record the width given by one line of width data in widths.

    line holds a single codepoint or an inclusive codepoint range, followed
    by its width.  Example lines:
        <UA8FF> 0
        <UA926>...<UA92D> 0

    Raises ValueError if the codepoint field or the width is malformed, or
    if the range does not lie inside the table, and IndexError if the line
    has fewer than two whitespace-separated fields.
    """
    s = line.split()
    width = int(s[1])
    r = s[0].split("...")
    if len(r) == 1:
        begin = parse_ucn(r[0])
        end = begin + 1
    elif len(r) == 2:
        begin = parse_ucn(r[0])
        end = parse_ucn(r[1]) + 1
    else:
        raise ValueError("malformed codepoint range %r" % s[0])
    # Slice assignment would silently grow the table for a range reaching
    # past its end, so reject anything that is not a range inside it.
    if not 0 <= begin < end <= len(widths):
        raise ValueError("codepoint range %r out of bounds" % s[0])
    widths[begin:end] = [width] * (end - begin)
|
||
|
|
||
|
# To keep things simple, we use glibc utf8_gen.py as-is. It only outputs to a
# file named UTF-8, which is not configurable. Then we parse this into the form
# we want it.
status = os.system("from_glibc/utf8_gen.py --unicode_version %s"
                   % unicode_version)
if status != 0:
    # Do not go on to parse a missing or stale UTF-8 file.
    print("error: from_glibc/utf8_gen.py failed (status %d)" % status,
          file=sys.stderr)
    sys.exit(1)

# Only the lines between the "WIDTH" and "END WIDTH" markers are width data.
processing = False
with open("UTF-8", "r") as f:
    for line in f:
        if processing:
            if line == "END WIDTH\n":
                processing = False
            else:
                try:
                    process_width(line)
                except (ValueError, IndexError) as e:
                    # Bad lines are reported and skipped, not fatal.
                    print(e, "warning: ignored unexpected line: %s" % line,
                          file=sys.stderr, end="")
        elif line == "WIDTH\n":
            processing = True
|
||
|
|
||
|
# All bytes < 256 we treat as width 1. The slice end is exclusive, so it
# has to be 256 for codepoint 255 to be included.
widths[0:256] = [1] * 256
|
||
|
|
||
|
# Condense the list to contiguous ranges. Each entry of all_ranges is
# [last codepoint of the run, width shared by the run].
all_ranges = []
run_end, run_width = -1, 1
for codepoint, codepoint_width in enumerate(widths):
    if codepoint_width != run_width:
        # The width changed: close the previous run and start a new one.
        all_ranges.append([run_end, run_width])
        run_width = codepoint_width
    run_end = codepoint
# The run still open when the loop finishes is not appended.
|
||
|
|
||
|
# Output the arrays for generated_cpp_wcwidth.h
def _print_array(declaration, entries, per_line):
    """Print declaration, then entries laid out per_line to a row."""
    print(declaration, end="")
    for position, entry in enumerate(entries):
        # The first entry of a row starts a new line; the others follow a
        # single space.
        lead = " " if position % per_line else "\n "
        print(lead + entry, end="")


print("/* Generated by contrib/unicode/gen_wcwidth.py,",
      "with the help of glibc's")
print(" utf8_gen.py, using version %s" % unicode_version,
      "of the Unicode standard. */")
_print_array("\nstatic const cppchar_t wcwidth_range_ends[] = {",
             ["0x%x," % r[0] for r in all_ranges], 8)
print("\n};\n")
_print_array("static const unsigned char wcwidth_widths[] = {",
             ["%d," % r[1] for r in all_ranges], 24)
print("\n};")
|