gcc/contrib/unicode/gen_wcwidth.py

#!/usr/bin/env python3
#
# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.  */

import os
import subprocess
import sys

# The Unicode version is the single required command-line argument.  It is
# forwarded to glibc's utf8_gen.py and echoed in the generated header comment.
if len(sys.argv) != 2:
    # Interpolate the program name; the format string was previously printed
    # verbatim with a literal "%s".
    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
    sys.exit(1)
unicode_version = sys.argv[1]

# Parse a codepoint in the format output by glibc tools.
def parse_ucn(s):
    """Return the integer codepoint encoded by a token such as "<UA8FF>".

    The text between the leading "<U" and the trailing ">" is read as a
    hexadecimal number.

    Raises ValueError if S lacks the "<U" prefix or the ">" suffix, or if the
    enclosed text is not a valid hexadecimal number.
    """
    if not (s.startswith("<U") and s.endswith(">")):
        # Name the offending token so that a caller reporting the exception
        # has something meaningful to print.
        raise ValueError("malformed codepoint %r" % (s,))
    return int(s[2:-1], base=16)

# Process a line of width output from utf8_gen.py and update global array.
# The array is indexed by codepoint (0 through 0x10FFFF) and every entry
# starts out as width 1.
widths = [1] * (0x10FFFF + 1)
def process_width(line):
    """Record the width given on LINE for the codepoint(s) it names.

    LINE holds a codepoint, or a "first...last" inclusive pair of codepoints,
    followed by whitespace and an integer width, for example:

        <UA8FF>	0
        <UA926>...<UA92D>	0

    Raises IndexError if LINE has fewer than two whitespace-separated fields,
    and ValueError if the width or a codepoint cannot be parsed or if the
    first field contains more than one "...".
    """
    fields = line.split()
    # The width is converted before the codepoints are looked at, so a short
    # or non-numeric line is rejected first.
    new_width = int(fields[1])
    endpoints = fields[0].split("...")
    if len(endpoints) > 2:
        raise ValueError
    # A lone codepoint is simply a range whose first and last element agree.
    first = parse_ucn(endpoints[0])
    last = parse_ucn(endpoints[-1])
    stop = last + 1
    widths[first:stop] = [new_width] * (stop - first)

# To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs to a
# file named UTF-8, which is not configurable.  Then we parse this into the form
# we want it.
# The arguments are passed as a list so that the version string is never
# interpreted by a shell.
gen_status = subprocess.run(
    ["from_glibc/utf8_gen.py", "--unicode_version", unicode_version]
).returncode
if gen_status != 0:
    # Stop here rather than parse a missing or stale UTF-8 file.
    print("error: from_glibc/utf8_gen.py exited with status %d" % gen_status,
          file=sys.stderr)
    sys.exit(1)

# Only the lines between the "WIDTH" and "END WIDTH" markers are width data.
processing = False
with open("UTF-8", "r") as charmap:
    for line in charmap:
        if processing:
            if line == "END WIDTH\n":
                processing = False
            else:
                try:
                    process_width(line)
                except (ValueError, IndexError) as e:
                    # A malformed line is reported and skipped; it does not
                    # abort the run.
                    print(e, "warning: ignored unexpected line: %s" % line,
                          file=sys.stderr, end="")
        elif line == "WIDTH\n":
            processing = True

# All codepoints < 256 we treat as width 1.
widths[0:256] = [1] * 256

# Condense the list to contiguous ranges.  Each entry of all_ranges is
# [last codepoint of the range, width shared by the whole range].
cur_range = [-1, 1]
all_ranges = []
for i, width in enumerate(widths):
    if width == cur_range[1]:
        cur_range[0] = i
    else:
        all_ranges.append(cur_range)
        cur_range = [i, width]
# The loop records a range only when the next one begins, so the range that
# runs up to the last codepoint has to be recorded here; otherwise it would
# be missing from the generated tables.
all_ranges.append(cur_range)

# Output the arrays for generated_cpp_wcwidth.h
print("/*  Generated by contrib/unicode/gen_wcwidth.py,",
      "with the help of glibc's")
print("    utf8_gen.py, using version %s" % unicode_version,
      "of the Unicode standard.  */")
# Range ends are printed eight to a row.
print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
for i, r in enumerate(all_ranges):
    if i % 8:
        print(" ", end="")
    else:
        print("\n  ", end="")
    print("0x%x," % (r[0]), end="")
print("\n};\n")
# The widths are printed in the same order as the range ends, 24 to a row.
print("static const unsigned char wcwidth_widths[] = {", end="")
for i, r in enumerate(all_ranges):
    if i % 24:
        print(" ", end="")
    else:
        print("\n  ", end="")
    print("%d," % r[1], end="")
print("\n};")
Byte vs column awareness for diagnostic-show-locus.c (PR 49973) contrib/ChangeLog 2019-12-09 Lewis Hyatt <lhyatt@gmail.com> PR preprocessor/49973 * unicode/from_glibc/unicode_utils.py: Support script from glibc (commit 464cd3) to extract character widths from Unicode data files. * unicode/from_glibc/utf8_gen.py: Likewise. * unicode/UnicodeData.txt: Unicode v. 12.1.0 data file. * unicode/EastAsianWidth.txt: Likewise. * unicode/PropList.txt: Likewise. * unicode/gen_wcwidth.py: New utility to generate libcpp/generated_cpp_wcwidth.h with help from the glibc support scripts and the Unicode data files. * unicode/unicode-license.txt: Added. * unicode/README: New explanatory file. libcpp/ChangeLog 2019-12-09 Lewis Hyatt <lhyatt@gmail.com> PR preprocessor/49973 * generated_cpp_wcwidth.h: New file generated by ../contrib/unicode/gen_wcwidth.py, supports new cpp_wcwidth function. * charset.c (compute_next_display_width): New function to help implement display columns. (cpp_byte_column_to_display_column): Likewise. (cpp_display_column_to_byte_column): Likewise. (cpp_wcwidth): Likewise. * include/cpplib.h (cpp_byte_column_to_display_column): Declare. (cpp_display_column_to_byte_column): Declare. (cpp_wcwidth): Declare. (cpp_display_width): New function. gcc/ChangeLog 2019-12-09 Lewis Hyatt <lhyatt@gmail.com> PR preprocessor/49973 * input.c (location_compute_display_column): New function to help with multibyte awareness in diagnostics. (test_cpp_utf8): New self-test. (input_c_tests): Call the new test. * input.h (location_compute_display_column): Declare. * diagnostic-show-locus.c: Pervasive changes to add multibyte awareness to all classes and functions. (enum column_unit): New enum. (class exploc_with_display_col): New class. (class layout_point): Convert m_column member to array m_columns[2]. (layout_range::contains_point): Add col_unit argument. (test_layout_range_for_single_point): Pass new argument. (test_layout_range_for_single_line): Likewise. 
(test_layout_range_for_multiple_lines): Likewise. (line_bounds::convert_to_display_cols): New function. (layout::get_state_at_point): Add col_unit argument. (make_range): Use empty filename rather than dummy filename. (get_line_width_without_trailing_whitespace): Rename to... (get_line_bytes_without_trailing_whitespace): ...this. (test_get_line_width_without_trailing_whitespace): Rename to... (test_get_line_bytes_without_trailing_whitespace): ...this. (class layout): m_exploc changed to exploc_with_display_col from plain expanded_location. (layout::get_linenum_width): New accessor member function. (layout::get_x_offset_display): Likewise. (layout::calculate_linenum_width): New subroutine for the constructor. (layout::calculate_x_offset_display): Likewise. (layout::layout): Use the new subroutines. Add multibyte awareness. (layout::print_source_line): Add multibyte awareness. (layout::print_line): Likewise. (layout::print_annotation_line): Likewise. (line_label::line_label): Likewise. (layout::print_any_labels): Likewise. (layout::annotation_line_showed_range_p): Likewise. (get_printed_columns): Likewise. (class line_label): Rename m_length to m_display_width. (get_affected_columns): Rename to... (get_affected_range): ...this; add col_unit argument and multibyte awareness. (class correction): Add m_affected_bytes and m_display_cols members. Rename m_len to m_byte_length for clarity. Add multibyte awareness throughout. (correction::insertion_p): Add multibyte awareness. (correction::compute_display_cols): New function. (correction::ensure_terminated): Use new member name m_byte_length. (line_corrections::add_hint): Add multibyte awareness. (layout::print_trailing_fixits): Likewise. (layout::get_x_bound_for_row): Likewise. (test_one_liner_simple_caret_utf8): New self-test analogous to the one with _utf8 suffix removed, testing multibyte awareness. (test_one_liner_caret_and_range_utf8): Likewise. (test_one_liner_multiple_carets_and_ranges_utf8): Likewise. 
(test_one_liner_fixit_insert_before_utf8): Likewise. (test_one_liner_fixit_insert_after_utf8): Likewise. (test_one_liner_fixit_remove_utf8): Likewise. (test_one_liner_fixit_replace_utf8): Likewise. (test_one_liner_fixit_replace_non_equal_range_utf8): Likewise. (test_one_liner_fixit_replace_equal_secondary_range_utf8): Likewise. (test_one_liner_fixit_validation_adhoc_locations_utf8): Likewise. (test_one_liner_many_fixits_1_utf8): Likewise. (test_one_liner_many_fixits_2_utf8): Likewise. (test_one_liner_labels_utf8): Likewise. (test_diagnostic_show_locus_one_liner_utf8): Likewise. (test_overlapped_fixit_printing_utf8): Likewise. (test_overlapped_fixit_printing): Adapt for changes to get_affected_columns, get_printed_columns and class corrections. (test_overlapped_fixit_printing_2): Likewise. (test_linenum_sep): New constant. (test_left_margin): Likewise. (test_offset_impl): Helper function for new test. (test_layout_x_offset_display_utf8): New test. (diagnostic_show_locus_c_tests): Call new tests. gcc/testsuite/ChangeLog: 2019-12-09 Lewis Hyatt <lhyatt@gmail.com> PR preprocessor/49973 * gcc.dg/plugin/diagnostic_plugin_test_show_locus.c (test_show_locus): Tweak so that expected output is the same as before the diagnostic-show-locus.c changes. * gcc.dg/cpp/pr66415-1.c: Likewise. From-SVN: r279137 2019-12-09 21:03:47 +01:00			`#!/usr/bin/env python3`
			`#`
			`# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.`
			`#`
			`# This file is part of GCC.`
			`#`
			`# GCC is free software; you can redistribute it and/or modify it under`
			`# the terms of the GNU General Public License as published by the Free`
			`# Software Foundation; either version 3, or (at your option) any later`
			`# version.`
			`#`
			`# GCC is distributed in the hope that it will be useful, but WITHOUT ANY`
			`# WARRANTY; without even the implied warranty of MERCHANTABILITY or`
			`# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
			`# for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with GCC; see the file COPYING3. If not see`
			`# <http://www.gnu.org/licenses/>. */`

			`import sys`
			`import os`

			`if len(sys.argv) != 2:`
			`print("usage: %s <unicode version>", file=sys.stderr)`
			`sys.exit(1)`
			`unicode_version = sys.argv[1]`

			`# Parse a codepoint in the format output by glibc tools.`
			`def parse_ucn(s):`
			`if not (s.startswith("<U") and s.endswith(">")):`
			`raise ValueError`
			`return int(s[2:-1], base=16)`

			`# Process a line of width output from utf_gen.py and update global array.`
			`widths = [1] * (1 + 0x10FFFF)`
			`def process_width(line):`
			`# Example lines:`
			`# <UA8FF> 0`
			`# <UA926>...<UA92D> 0`

			`s = line.split()`
			`width = int(s[1])`
			`r = s[0].split("...")`
			`if len(r) == 1:`
			`begin = parse_ucn(r[0])`
			`end = begin + 1`
			`elif len(r) == 2:`
			`begin = parse_ucn(r[0])`
			`end = parse_ucn(r[1]) + 1`
			`else:`
			`raise ValueError`
			`widths[begin:end] = [width] * (end - begin)`

			`# To keep things simple, we use glibc utf8_gen.py as-is. It only outputs to a`
			`# file named UTF-8, which is not configurable. Then we parse this into the form`
			`# we want it.`
			`os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)`
			`processing = False`
			`for line in open("UTF-8", "r"):`
			`if processing:`
			`if line == "END WIDTH\n":`
			`processing = False`
			`else:`
			`try:`
			`process_width(line)`
			`except (ValueError, IndexError):`
			`print(e, "warning: ignored unexpected line: %s" % line,`
			`file=sys.stderr, end="")`
			`elif line == "WIDTH\n":`
			`processing = True`

			`# All bytes < 256 we treat as width 1.`
			`widths[0:255] = [1] * 255`

			`# Condense the list to contiguous ranges.`
			`cur_range = [-1, 1]`
			`all_ranges = []`
			`for i, width in enumerate(widths):`
			`if width == cur_range[1]:`
			`cur_range[0] = i`
			`else:`
			`all_ranges.append(cur_range)`
			`cur_range = [i, width]`

			`# Output the arrays for generated_cpp_wcwidth.h`
			`print("/* Generated by contrib/unicode/gen_wcwidth.py,",`
			`"with the help of glibc's")`
			`print(" utf8_gen.py, using version %s" % unicode_version,`
			`"of the Unicode standard. */")`
			`print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")`
			`for i, r in enumerate(all_ranges):`
			`if i % 8:`
			`print(" ", end="")`
			`else:`
			`print("\n ", end="")`
			`print("0x%x," % (r[0]), end="")`
			`print("\n};\n")`
			`print("static const unsigned char wcwidth_widths[] = {", end="")`
			`for i, r in enumerate(all_ranges):`
			`if i % 24:`
			`print(" ", end="")`
			`else:`
			`print("\n ", end="")`
			`print("%d," % r[1], end="")`
			`print("\n};")`