Unicode 7.0.0 update; added generator scripts.

for localedata/ChangeLog

	[BZ #17588]
	[BZ #13064]
	[BZ #14094]
	[BZ #17998]
	* unicode-gen/Makefile: New.
	* unicode-gen/unicode-license.txt: New, from Unicode.
	* unicode-gen/UnicodeData.txt: New, from Unicode.
	* unicode-gen/DerivedCoreProperties.txt: New, from Unicode.
	* unicode-gen/EastAsianWidth.txt: New, from Unicode.
	* unicode-gen/gen_unicode_ctype.py: New generator, from Mike
	FABIAN <mfabian@redhat.com>.
	* unicode-gen/ctype_compatibility.py: New verifier, from
	Pravin Satpute <psatpute@redhat.com> and Mike FABIAN.
	* unicode-gen/ctype_compatibility_test_cases.py: New verifier
	module, from Mike FABIAN.
	* unicode-gen/utf8_gen.py: New generator, from Pravin Satpute
	and Mike FABIAN.
	* unicode-gen/utf8_compatibility.py: New verifier, from Pravin
	Satpute and Mike FABIAN.
	* charmaps/UTF-8: Update.
	* locales/i18n: Update.
	* gen-unicode-ctype.c: Remove.
	* tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns
	true for ordinal indicators.
This commit is contained in:
Alexandre Oliva 2015-02-20 20:14:59 -02:00
parent e4a399dc3d
commit 4a4839c94a
16 changed files with 53305 additions and 5382 deletions

11
NEWS
View File

@ -9,8 +9,15 @@ Version 2.22
* The following bugs are resolved with this release:
4719, 15319, 15467, 15790, 16560, 17569, 17792, 17912, 17932, 17944,
17949, 17964, 17965, 17967, 17969, 17978, 17987, 17991, 17996, 17999.
4719, 13064, 14094, 15319, 15467, 15790, 16560, 17569, 17588, 17792,
17912, 17932, 17944, 17949, 17964, 17965, 17967, 17969, 17978, 17987,
17991, 17996, 17998, 17999.
* Character encoding and ctype tables were updated to Unicode 7.0.0, using
new generator scripts contributed by Pravin Satpute and Mike FABIAN (Red
Hat). These updates cause user visible changes, such as the fix for bug
17998.
Version 2.21

View File

@ -1,3 +1,30 @@
2015-02-20 Alexandre Oliva <aoliva@redhat.com>
[BZ #17588]
[BZ #13064]
[BZ #14094]
[BZ #17998]
* unicode-gen/Makefile: New.
* unicode-gen/unicode-license.txt: New, from Unicode.
* unicode-gen/UnicodeData.txt: New, from Unicode.
* unicode-gen/DerivedCoreProperties.txt: New, from Unicode.
* unicode-gen/EastAsianWidth.txt: New, from Unicode.
* unicode-gen/gen_unicode_ctype.py: New generator, from Mike
FABIAN <mfabian@redhat.com>.
* unicode-gen/ctype_compatibility.py: New verifier, from
Pravin Satpute <psatpute@redhat.com> and Mike FABIAN.
* unicode-gen/ctype_compatibility_test_cases.py: New verifier
module, from Mike FABIAN.
* unicode-gen/utf8_gen.py: New generator, from Pravin Satpute
and Mike FABIAN.
* unicode-gen/utf8_compatibility.py: New verifier, from Pravin
Satpute and Mike FABIAN.
* charmaps/UTF-8: Update.
* locales/i18n: Update.
* gen-unicode-ctype.c: Remove.
* tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns
true for ordinal indicators.
2015-01-21 Marek Polacek <polacek@redhat.com>
* tests-mbwc/tst_wcscpy.c (tst_wcscpy): Fix condition.

File diff suppressed because it is too large Load Diff

View File

@ -1,784 +0,0 @@
/* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
Copyright (C) 2000-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
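/* Usage example:
$ gen-unicode-ctype /usr/local/share/Unidata/UnicodeData.txt 3.1
The first argument is the UnicodeData.txt file to read, the second is the
Unicode version string written into the generated file.
*/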
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <time.h>
/* This structure represents one line in the UnicodeData.txt file. */
struct unicode_attribute
{
const char *name; /* Character name */
const char *category; /* General category */
const char *combining; /* Canonical combining classes */
const char *bidi; /* Bidirectional category */
const char *decomposition; /* Character decomposition mapping */
const char *decdigit; /* Decimal digit value */
const char *digit; /* Digit value */
const char *numeric; /* Numeric value */
int mirrored; /* mirrored */
const char *oldname; /* Old Unicode 1.0 name */
const char *comment; /* Comment */
unsigned int upper; /* Uppercase mapping */
unsigned int lower; /* Lowercase mapping */
unsigned int title; /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
characters. */
#define NONE (~(unsigned int)0)
/* The entire contents of the UnicodeData.txt file. */
struct unicode_attribute unicode_attributes [0x110000];
/* Returns a heap-allocated copy of S.  Exits with a diagnostic if the copy
   cannot be allocated: a NULL name would otherwise be mistaken for an
   unassigned code point and the character silently dropped from the
   generated tables.  */
static char *
dup_field_or_die (const char *s)
{
  char *copy = strdup (s);

  if (copy == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  return copy;
}

/* Returns the shared constant "" for an empty field, otherwise a heap copy
   of S.  */
static const char *
copy_optional_field (const char *s)
{
  return s[0] == '\0' ? "" : dup_field_or_die (s);
}

/* Parses a hexadecimal case-mapping field; an empty field yields NONE.  */
static unsigned int
parse_mapping_field (const char *s)
{
  return s[0] == '\0' ? NONE : strtoul (s, NULL, 16);
}

/* Stores in unicode_attributes[i] the values from the given fields.

   I is the code point; FIELD1 .. FIELD14 are the fields of a UnicodeData.txt
   line that follow the code point, in file order.  Exits if I is outside
   the table or if memory runs out.  Lines whose general category (FIELD2)
   is "Cs" are ignored and leave the table entry untouched.  */
static void
fill_attribute (unsigned int i,
                const char *field1, const char *field2,
                const char *field3, const char *field4,
                const char *field5, const char *field6,
                const char *field7, const char *field8,
                const char *field9, const char *field10,
                const char *field11, const char *field12,
                const char *field13, const char *field14)
{
  struct unicode_attribute *uni;

  if (i >= 0x110000)
    {
      fprintf (stderr, "index too large\n");
      exit (1);
    }
  if (strcmp (field2, "Cs") == 0)
    /* Surrogates are UTF-16 artefacts, not real characters.  Ignore them.  */
    return;

  uni = &unicode_attributes[i];

  /* The name is always copied, even when empty, because a non-NULL name is
     what marks the entry as present.  */
  uni->name = dup_field_or_die (field1);
  uni->category = copy_optional_field (field2);
  uni->combining = copy_optional_field (field3);
  uni->bidi = copy_optional_field (field4);
  uni->decomposition = copy_optional_field (field5);
  uni->decdigit = copy_optional_field (field6);
  uni->digit = copy_optional_field (field7);
  uni->numeric = copy_optional_field (field8);
  uni->mirrored = (field9[0] == 'Y');
  uni->oldname = copy_optional_field (field10);
  uni->comment = copy_optional_field (field11);
  uni->upper = parse_mapping_field (field12);
  uni->lower = parse_mapping_field (field13);
  uni->title = parse_mapping_field (field14);
}
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM into BUFFER, which must have room for
   FIELDLEN bytes.  Characters are consumed up to and including DELIM; DELIM
   itself is not stored and carriage returns are dropped.
   Returns 1 when a field was read and NUL-terminated, 0 when end of file
   was reached before DELIM (BUFFER is then left unterminated).
   Exits if the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;
  int ch;

  while ((ch = getc (stream)) != EOF && ch != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (ch == '\r')
        continue;

      stored++;
      if (stored >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *out++ = (char) ch;
    }

  if (ch == EOF)
    return 0;

  *out = '\0';
  return 1;
}
/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
file. */
static void
fill_attributes (const char *unicodedata_filename)
{
unsigned int i, j;
FILE *stream;
char field0[FIELDLEN];
char field1[FIELDLEN];
char field2[FIELDLEN];
char field3[FIELDLEN];
char field4[FIELDLEN];
char field5[FIELDLEN];
char field6[FIELDLEN];
char field7[FIELDLEN];
char field8[FIELDLEN];
char field9[FIELDLEN];
char field10[FIELDLEN];
char field11[FIELDLEN];
char field12[FIELDLEN];
char field13[FIELDLEN];
char field14[FIELDLEN];
int lineno = 0;
for (i = 0; i < 0x110000; i++)
unicode_attributes[i].name = NULL;
stream = fopen (unicodedata_filename, "r");
if (stream == NULL)
{
fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
exit (1);
}
for (;;)
{
int n;
lineno++;
n = getfield (stream, field0, ';');
n += getfield (stream, field1, ';');
n += getfield (stream, field2, ';');
n += getfield (stream, field3, ';');
n += getfield (stream, field4, ';');
n += getfield (stream, field5, ';');
n += getfield (stream, field6, ';');
n += getfield (stream, field7, ';');
n += getfield (stream, field8, ';');
n += getfield (stream, field9, ';');
n += getfield (stream, field10, ';');
n += getfield (stream, field11, ';');
n += getfield (stream, field12, ';');
n += getfield (stream, field13, ';');
n += getfield (stream, field14, '\n');
if (n == 0)
break;
if (n != 15)
{
fprintf (stderr, "short line in'%s':%d\n",
unicodedata_filename, lineno);
exit (1);
}
i = strtoul (field0, NULL, 16);
if (field1[0] == '<'
&& strlen (field1) >= 9
&& !strcmp (field1 + strlen(field1) - 8, ", First>"))
{
/* Deal with a range. */
lineno++;
n = getfield (stream, field0, ';');
n += getfield (stream, field1, ';');
n += getfield (stream, field2, ';');
n += getfield (stream, field3, ';');
n += getfield (stream, field4, ';');
n += getfield (stream, field5, ';');
n += getfield (stream, field6, ';');
n += getfield (stream, field7, ';');
n += getfield (stream, field8, ';');
n += getfield (stream, field9, ';');
n += getfield (stream, field10, ';');
n += getfield (stream, field11, ';');
n += getfield (stream, field12, ';');
n += getfield (stream, field13, ';');
n += getfield (stream, field14, '\n');
if (n != 15)
{
fprintf (stderr, "missing end range in '%s':%d\n",
unicodedata_filename, lineno);
exit (1);
}
if (!(field1[0] == '<'
&& strlen (field1) >= 8
&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
{
fprintf (stderr, "missing end range in '%s':%d\n",
unicodedata_filename, lineno);
exit (1);
}
field1[strlen (field1) - 7] = '\0';
j = strtoul (field0, NULL, 16);
for (; i <= j; i++)
fill_attribute (i, field1+1, field2, field3, field4, field5,
field6, field7, field8, field9, field10,
field11, field12, field13, field14);
}
else
{
/* Single character line */
fill_attribute (i, field1, field2, field3, field4, field5,
field6, field7, field8, field9, field10,
field11, field12, field13, field14);
}
}
if (ferror (stream) || fclose (stream))
{
fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
exit (1);
}
}
/* Character mappings. */
static unsigned int
to_upper (unsigned int ch)
{
if (unicode_attributes[ch].name != NULL
&& unicode_attributes[ch].upper != NONE)
return unicode_attributes[ch].upper;
else
return ch;
}
static unsigned int
to_lower (unsigned int ch)
{
if (unicode_attributes[ch].name != NULL
&& unicode_attributes[ch].lower != NONE)
return unicode_attributes[ch].lower;
else
return ch;
}
static unsigned int
to_title (unsigned int ch)
{
if (unicode_attributes[ch].name != NULL
&& unicode_attributes[ch].title != NONE)
return unicode_attributes[ch].title;
else
return ch;
}
/* Character class properties. */
/* A character is uppercase when lowercasing it changes it.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
/* A character is lowercase when uppercasing it changes it.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
static bool
is_alpha (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
&& ((unicode_attributes[ch].category[0] == 'L'
/* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
<U0E2F>, <U0E46> should belong to is_punct. */
&& (ch != 0x0E2F) && (ch != 0x0E46))
/* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
<U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
|| (ch == 0x0E31)
|| (ch >= 0x0E34 && ch <= 0x0E3A)
|| (ch >= 0x0E47 && ch <= 0x0E4E)
/* Avoid warning for <U0345>. */
|| (ch == 0x0345)
/* Avoid warnings for <U2160>..<U217F>. */
|| (unicode_attributes[ch].category[0] == 'N'
&& unicode_attributes[ch].category[1] == 'l')
/* Avoid warnings for <U24B6>..<U24E9>. */
|| (unicode_attributes[ch].category[0] == 'S'
&& unicode_attributes[ch].category[1] == 'o'
&& strstr (unicode_attributes[ch].name, " LETTER ")
!= NULL)
/* Consider all the non-ASCII digits as alphabetic.
ISO C 99 forbids us to have them in category "digit",
but we want iswalnum to return true on them. */
|| (unicode_attributes[ch].category[0] == 'N'
&& unicode_attributes[ch].category[1] == 'd'
&& !(ch >= 0x0030 && ch <= 0x0039))));
}
/* Tells whether CH belongs to the "digit" class: exactly the ten ASCII
   decimal digits.

   SUSV2 gives us some freedom for the "digit" category, but ISO C 99
   takes it away:
   7.25.2.1.5:
      The iswdigit function tests for any wide character that corresponds
      to a decimal-digit character (as defined in 5.2.1).
   5.2.1:
      the 10 decimal digits 0 1 2 3 4 5 6 7 8 9

   The Unicode category "Nd" is therefore deliberately not consulted.  */
static bool
is_digit (unsigned int ch)
{
  return !(ch < 0x0030 || ch > 0x0039);
}
/* Tells whether CH belongs to the "outdigit" class: the ASCII digits
   <U0030>..<U0039>.  */
static bool
is_outdigit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
static bool
is_blank (unsigned int ch)
{
return (ch == 0x0009 /* '\t' */
/* Category Zs without mention of "<noBreak>" */
|| (unicode_attributes[ch].name != NULL
&& unicode_attributes[ch].category[0] == 'Z'
&& unicode_attributes[ch].category[1] == 's'
&& !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
}
static bool
is_space (unsigned int ch)
{
/* Don't make U+00A0 a space. Non-breaking space means that all programs
should treat it like a punctuation character, not like a space. */
return (ch == 0x0020 /* ' ' */
|| ch == 0x000C /* '\f' */
|| ch == 0x000A /* '\n' */
|| ch == 0x000D /* '\r' */
|| ch == 0x0009 /* '\t' */
|| ch == 0x000B /* '\v' */
/* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
|| (unicode_attributes[ch].name != NULL
&& unicode_attributes[ch].category[0] == 'Z'
&& (unicode_attributes[ch].category[1] == 'l'
|| unicode_attributes[ch].category[1] == 'p'
|| (unicode_attributes[ch].category[1] == 's'
&& !strstr (unicode_attributes[ch].decomposition,
"<noBreak>")))));
}
static bool
is_cntrl (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
&& (!strcmp (unicode_attributes[ch].name, "<control>")
/* Categories Zl and Zp */
|| (unicode_attributes[ch].category[0] == 'Z'
&& (unicode_attributes[ch].category[1] == 'l'
|| unicode_attributes[ch].category[1] == 'p'))));
}
/* Tells whether CH belongs to the "xdigit" class: exactly the ASCII
   hexadecimal digits.

   SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
   takes it away:
   7.25.2.1.12:
      The iswxdigit function tests for any wide character that corresponds
      to a hexadecimal-digit character (as defined in 6.4.4.1).
   6.4.4.1:
      hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F  */
static bool
is_xdigit (unsigned int ch)
{
  if (ch >= 0x0030 && ch <= 0x0039) /* 0..9 */
    return true;
  if (ch >= 0x0041 && ch <= 0x0046) /* A..F */
    return true;
  if (ch >= 0x0061 && ch <= 0x0066) /* a..f */
    return true;
  return false;
}
/* Tells whether CH belongs to the "graph" class: it has an entry, is not
   named "<control>", and is not a space.  */
static bool
is_graph (unsigned int ch)
{
  const char *name = unicode_attributes[ch].name;

  if (name == NULL || strcmp (name, "<control>") == 0)
    return false;
  return !is_space (ch);
}
static bool
is_print (unsigned int ch)
{
return (unicode_attributes[ch].name != NULL
&& strcmp (unicode_attributes[ch].name, "<control>")
/* Categories Zl and Zp */
&& !(unicode_attributes[ch].name != NULL
&& unicode_attributes[ch].category[0] == 'Z'
&& (unicode_attributes[ch].category[1] == 'l'
|| unicode_attributes[ch].category[1] == 'p')));
}
/* Tells whether CH belongs to the "punct" class.

   The traditional POSIX definition of punctuation is every graphic,
   non-alphanumeric character; the Unicode "P" categories are deliberately
   not used for this.  */
static bool
is_punct (unsigned int ch)
{
  if (!is_graph (ch))
    return false;
  return !(is_alpha (ch) || is_digit (ch));
}
static bool
is_combining (unsigned int ch)
{
/* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
file. In 3.0.1 it was identical to the union of the general categories
"Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
PropList.txt file, so we take the latter definition. */
return (unicode_attributes[ch].name != NULL
&& unicode_attributes[ch].category[0] == 'M'
&& (unicode_attributes[ch].category[1] == 'n'
|| unicode_attributes[ch].category[1] == 'c'
|| unicode_attributes[ch].category[1] == 'e'));
}
/* Tells whether CH is a combining character whose combining class is
   empty, starts with '0', or is numerically below 200.  */
static bool
is_combining_level3 (unsigned int ch)
{
  const char *cclass;

  if (!is_combining (ch))
    return false;

  cclass = unicode_attributes[ch].combining;
  if (cclass[0] == '\0' || cclass[0] == '0')
    return true;
  return strtoul (cclass, NULL, 10) < 200;
}
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with four
   hex digits below 0x10000, "<Uxxxxxxxx>" with eight otherwise.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  if (i < 0x10000)
    sprintf (buf, "<U%04X>", i);
  else
    sprintf (buf, "<U%08X>", i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval,
   i.e. the symbols of LOW and HIGH joined by "..".
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  size_t used;

  /* ucs_symbol reuses one static buffer, so each symbol must be copied
     out before the next one is requested.  */
  strcpy (buf, ucs_symbol (low));
  used = strlen (buf);
  buf[used++] = '.';
  buf[used++] = '.';
  strcpy (buf + used, ucs_symbol (high));
  return buf;
}
/* Output a character class (= property) table.

   Writes CLASSNAME followed by the ';'-separated members of the class to
   STREAM.  FUNC decides membership for every code point below 0x110000;
   runs of consecutive members are written as a single range.  Output lines
   are continued with "/" once they would exceed 75 columns.  */
static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  char member[0x110000];
  const int max_column = 75;
  bool first_item = true;
  /* Start beyond the limit so that the first item begins a new line.  */
  int column = 1000;
  unsigned int ch;

  for (ch = 0; ch < 0x110000; ch++)
    member[ch] = (int) func (ch);

  fprintf (stream, "%s ", classname);

  ch = 0;
  while (ch < 0x110000)
    {
      unsigned int start, end;
      char item[25];
      size_t item_len;

      if (!member[ch])
        {
          ch++;
          continue;
        }

      /* Find the end of the run of members starting here.  */
      start = ch;
      for (ch++; ch < 0x110000 && member[ch]; ch++)
        ;
      end = ch - 1;

      strcpy (item, (start == end
                     ? ucs_symbol (start)
                     : ucs_symbol_range (start, end)));
      item_len = strlen (item);

      if (!first_item)
        {
          fprintf (stream, ";");
          column++;
        }
      if (column + item_len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }
      fprintf (stream, "%s", item);
      column += item_len;
      first_item = false;
    }

  fprintf (stream, "\n");
}
/* Output a character mapping table.

   Writes MAPNAME followed by ';'-separated "(<from>,<to>)" pairs to STREAM,
   one pair for every code point below 0x110000 that FUNC maps to something
   other than itself.  Output lines are continued with "/" once they would
   exceed 75 columns.  */
static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  char mapped[0x110000];
  const int max_column = 75;
  bool first_item = true;
  /* Start beyond the limit so that the first item begins a new line.  */
  int column = 1000;
  unsigned int ch;

  for (ch = 0; ch < 0x110000; ch++)
    mapped[ch] = (func (ch) != ch);

  fprintf (stream, "%s ", mapname);

  for (ch = 0; ch < 0x110000; ch++)
    {
      char pair[25+1];
      size_t pair_len;

      if (!mapped[ch])
        continue;

      /* ucs_symbol reuses one static buffer, so the source symbol is
         copied into PAIR before the target symbol is requested.  */
      sprintf (pair, "(%s,", ucs_symbol (ch));
      strcat (pair, ucs_symbol (func (ch)));
      strcat (pair, ")");
      pair_len = strlen (pair);

      if (!first_item)
        {
          fprintf (stream, ";");
          column++;
        }
      if (column + pair_len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }
      fprintf (stream, "%s", pair);
      column += pair_len;
      first_item = false;
    }

  fprintf (stream, "\n");
}
/* Output the width table.
   Currently a placeholder: nothing is written to STREAM.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
/* Output the tables to the given file.

   Creates FILENAME and writes a locale definition to it: a header, an
   LC_IDENTIFICATION section that mentions VERSION and today's date (UTC),
   and an LC_CTYPE section with all character classes and case mappings.
   Before the LC_CTYPE section is written, the classification functions are
   cross-checked against each other for every code point; violations are
   reported on stderr but do not stop the generation.
   Exits if FILENAME cannot be opened or written.  */
static void
output_tables (const char *filename, const char *version)
{
FILE *stream;
unsigned int ch;
stream = fopen (filename, "w");
if (stream == NULL)
{
fprintf (stderr, "cannot open '%s' for writing\n", filename);
exit (1);
}
/* File header: escape and comment characters used by the rest of the
   generated file.  */
fprintf (stream, "escape_char /\n");
fprintf (stream, "comment_char %%\n");
fprintf (stream, "\n");
fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
version);
fprintf (stream, "\n");
/* LC_IDENTIFICATION section.  */
fprintf (stream, "LC_IDENTIFICATION\n");
fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
fprintf (stream, "address \"\"\n");
fprintf (stream, "contact \"\"\n");
fprintf (stream, "email \"bug-glibc-locales@gnu.org\"\n");
fprintf (stream, "tel \"\"\n");
fprintf (stream, "fax \"\"\n");
fprintf (stream, "language \"\"\n");
fprintf (stream, "territory \"Earth\"\n");
fprintf (stream, "revision \"%s\"\n", version);
{
/* The date field is the current UTC date as YYYY-MM-DD (10 characters
   plus the terminating NUL).  */
time_t now;
char date[11];
now = time (NULL);
strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
fprintf (stream, "date \"%s\"\n", date);
}
fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
fprintf (stream, "END LC_IDENTIFICATION\n");
fprintf (stream, "\n");
/* Verifications.  Each failed check only prints a diagnostic to stderr.  */
for (ch = 0; ch < 0x110000; ch++)
{
/* toupper restriction: "Only characters specified for the keywords
lower and upper shall be specified." */
if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
fprintf (stderr,
"%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
ucs_symbol (ch), ch, to_upper (ch));
/* tolower restriction: "Only characters specified for the keywords
lower and upper shall be specified." */
if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
fprintf (stderr,
"%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
ucs_symbol (ch), ch, to_lower (ch));
/* alpha restriction: "Characters classified as either upper or lower
shall automatically belong to this class." */
if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
/* alpha restriction: "No character specified for the keywords cntrl,
digit, punct or space shall be specified." */
if (is_alpha (ch) && is_cntrl (ch))
fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
if (is_alpha (ch) && is_digit (ch))
fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
if (is_alpha (ch) && is_punct (ch))
fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
if (is_alpha (ch) && is_space (ch))
fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
/* space restriction: "No character specified for the keywords upper,
lower, alpha, digit, graph or xdigit shall be specified."
upper, lower, alpha already checked above. */
if (is_space (ch) && is_digit (ch))
fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
if (is_space (ch) && is_graph (ch))
fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
if (is_space (ch) && is_xdigit (ch))
fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
/* cntrl restriction: "No character specified for the keywords upper,
lower, alpha, digit, punct, graph, print or xdigit shall be
specified." upper, lower, alpha already checked above. */
if (is_cntrl (ch) && is_digit (ch))
fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
if (is_cntrl (ch) && is_punct (ch))
fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
if (is_cntrl (ch) && is_graph (ch))
fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
if (is_cntrl (ch) && is_print (ch))
fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
if (is_cntrl (ch) && is_xdigit (ch))
fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
/* punct restriction: "No character specified for the keywords upper,
lower, alpha, digit, cntrl, xdigit or as the <space> character shall
be specified." upper, lower, alpha, cntrl already checked above. */
if (is_punct (ch) && is_digit (ch))
fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
if (is_punct (ch) && is_xdigit (ch))
fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
if (is_punct (ch) && (ch == 0x0020))
fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
/* graph restriction: "No character specified for the keyword cntrl
shall be specified." Already checked above. */
/* print restriction: "No character specified for the keyword cntrl
shall be specified." Already checked above. */
/* graph - print relation: differ only in the <space> character.
How is this possible if there are more than one space character?!
I think susv2/xbd/locale.html should speak of "space characters",
not "space character". */
if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
fprintf (stderr,
"%s is print but not graph|<space>\n", ucs_symbol (ch));
if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
fprintf (stderr,
"%s is graph|<space> but not print\n", ucs_symbol (ch));
}
/* LC_CTYPE section: one table per character class, then the case
   mappings, then the width table (output_widthmap currently writes
   nothing).  */
fprintf (stream, "LC_CTYPE\n");
output_charclass (stream, "upper", is_upper);
output_charclass (stream, "lower", is_lower);
output_charclass (stream, "alpha", is_alpha);
output_charclass (stream, "digit", is_digit);
output_charclass (stream, "outdigit", is_outdigit);
output_charclass (stream, "blank", is_blank);
output_charclass (stream, "space", is_space);
output_charclass (stream, "cntrl", is_cntrl);
output_charclass (stream, "punct", is_punct);
output_charclass (stream, "xdigit", is_xdigit);
output_charclass (stream, "graph", is_graph);
output_charclass (stream, "print", is_print);
output_charclass (stream, "class \"combining\";", is_combining);
output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
output_charmap (stream, "toupper", to_upper);
output_charmap (stream, "tolower", to_lower);
output_charmap (stream, "map \"totitle\";", to_title);
output_widthmap (stream);
fprintf (stream, "END LC_CTYPE\n");
/* A write error at any earlier point is caught here, as is a failure to
   flush on close.  */
if (ferror (stream) || fclose (stream))
{
fprintf (stderr, "error writing to '%s'\n", filename);
exit (1);
}
}
/* Entry point.  Expects exactly two arguments: the UnicodeData.txt file to
   read and the Unicode version string.  The tables are written to a file
   named "unicode" in the current directory.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *version;

  if (argc != 3)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
      exit (1);
    }

  unicodedata_filename = argv[1];
  version = argv[2];

  fill_attributes (unicodedata_filename);
  output_tables ("unicode", version);

  return 0;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
lower 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
000000000000000000000100000000000000000000000000
000000000010000000000100001000000000000000000000
lower 倳眑婭笫崷窙嗲睧颬睼麧緗鴇膹擨闀貘覷鏷禴矙𡜍𦶠<F0A19C8D>
000000000000000111111111111111111111111011111111
upper 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,99 @@
# Copyright (C) 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
# Makefile for generating and updating Unicode-extracted files.
# This Makefile is NOT used as part of the GNU libc build. It needs
# to be run manually, within the source tree, at Unicode upgrades
# (change UNICODE_VERSION below), to update ../locales/i18n ctype
# information (part of the file is preserved, so don't wipe it all
# out), and ../charmaps/UTF-8.
# Use make all to generate the files used in the glibc build out of
# the original Unicode files; make check to verify that they are what
# we expect; make install to copy them to the location expected by the
# glibc build; and make clean to remove all generated files.
# We keep a local copy of the downloaded Unicode files, to avoid
# running afoul of the LGPL corresponding sources requirements, even
# though it's not clear that they are preferred over the generated
# files for making modifications.
UNICODE_VERSION = 7.0.0

PYTHON3 = python3
WGET = wget

# Files taken verbatim from unicode.org, files generated from them, and
# the comparison reports produced by "make check".
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
GENERATED = i18n UTF-8
REPORTS = i18n-report UTF-8-report

all: $(GENERATED)

check: check-i18n check-UTF-8

install:
	cp -p i18n ../locales/i18n
	cp -p UTF-8 ../charmaps/UTF-8

clean: mostlyclean
	-rm -rf __pycache__
mostlyclean:
	-rm -f $(REPORTS) $(GENERATED)

# check-i18n and check-UTF-8 never create a file of that name, so they are
# declared phony too; otherwise a stray file with either name would
# silently disable the check.
.PHONY: all check check-i18n check-UTF-8 clean mostlyclean install

i18n: UnicodeData.txt DerivedCoreProperties.txt
i18n: ../locales/i18n # Preserve non-ctype information.
i18n: gen_unicode_ctype.py
	$(PYTHON3) gen_unicode_ctype.py -u UnicodeData.txt \
	  -d DerivedCoreProperties.txt -i ../locales/i18n -o $@ \
	  --unicode_version $(UNICODE_VERSION)

i18n-report: i18n ../locales/i18n
i18n-report: ctype_compatibility.py ctype_compatibility_test_cases.py
	$(PYTHON3) ./ctype_compatibility.py -o ../locales/i18n \
	  -n i18n -a -m > $@

# Fail when the report lists a non-zero number of missing or added
# characters, or a non-zero number of errors.
check-i18n: i18n-report
	@if grep '\(Missing\|Added\) [^0]\|^Number of errors[^=]* = [^0]' \
	  i18n-report; \
	then echo manual verification required; false; else true; fi

UTF-8: UnicodeData.txt EastAsianWidth.txt
UTF-8: utf8_gen.py
	$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt

UTF-8-report: UTF-8 ../charmaps/UTF-8
UTF-8-report: utf8_compatibility.py
	$(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \
	  -n UTF-8 -a -m > $@

# Fail when any "Total" line of the report has a non-zero count.
check-UTF-8: UTF-8-report
	@if grep '^Total.*: [^0]' UTF-8-report; \
	then echo manual verification required; false; else true; fi


.PHONY: downloads clean-downloads
downloads: $(DOWNLOADS)
clean-downloads:
	-rm -f $(DOWNLOADS)

$(DOWNLOADS):
	$(WGET) http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$@

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,546 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
This script is useful for checking the differences between
an old LC_CTYPE file /usr/share/i18n/locale/i18n and a
new one generated by gen_unicode_ctype.py
To see how it is used, call it with the -h option:
$ ./ctype_compatibility.py -h
prints usage message
'''
import sys
import re
import unicodedata
import argparse
from ctype_compatibility_test_cases import TEST_CASES
def get_lines_from_file(filename):
    '''Yield the logical, comment-free lines of an i18n file

    Everything from the first “%” on a physical line is dropped as a
    comment.  A physical line ending in “/” is a continuation: it is
    joined with the following lines (without the “/”) and the result
    is yielded as one logical line.  Blank lines and comment-only
    lines are yielded as empty strings.
    '''
    pending = ''
    with open(filename) as i18n_file:
        for raw_line in i18n_file:
            text = raw_line.strip('\n')
            head, percent, _ = text.partition('%')
            if percent:
                # Keep the continuation marker even when it sits
                # behind the comment.
                text = head + '/' if text.endswith('/') else head
            text = text.strip()
            if text.endswith('/'):
                pending += text[:-1]
                continue
            yield pending + text
            pending = ''
    if pending: # file ends with a continuation line
        yield pending
def extract_character_classes(filename):
    '''Get all Unicode code points for each character class from a file

    Store these code points in a dictionary using the character classes
    as keys and the list of code points in this character class as values.

    In case of the character classes “toupper”, “tolower”, and “totitle”,
    these are actually pairs of code points.

    A class appears in the result only if the file has a line for it.
    A line introduces a class either as a bare keyword (“upper ...”) or
    in the quoted form (“class "combining"; ...”, “map "totitle"; ...”).
    '''
    char_classes = [
        'upper',
        'lower',
        'alpha',
        'digit',
        'outdigit',
        'space',
        'cntrl',
        'punct',
        'graph',
        'print',
        'xdigit',
        'blank',
        'combining',
        'combining_level3',
        'toupper',
        'tolower',
        'totitle']
    # The patterns do not depend on the line, so build them once.  Raw
    # strings keep “\s” a regexp escape instead of an invalid string
    # escape sequence.
    patterns = [
        (char_class,
         re.compile(
             r'^('
             + r'(?:(?:class|map)\s+")'
             + re.escape(char_class)
             + r'(?:";)\s+'
             + r'|'
             + re.escape(char_class) + r'\s+'
             + r')'))
        for char_class in char_classes]
    ctype_dict = {}
    for line in get_lines_from_file(filename):
        for char_class, pattern in patterns:
            match = pattern.match(line)
            if match:
                process_chars(
                    ctype_dict.setdefault(char_class, []),
                    line[match.end():])
    return ctype_dict
# Patterns for the four notations accepted in a code point list.  Raw
# strings are used so that "\." and "\(" are regexp escapes and not
# (invalid) string escapes.
_CODE_POINT_PATTERN = r'<U([0-9A-F]{4,8})>'
_SINGLE_CODE_POINT_RE = re.compile(
    r'^' + _CODE_POINT_PATTERN + r'$')
_CODE_POINT_RANGE_RE = re.compile(
    r'^' + _CODE_POINT_PATTERN + r'\.\.' + _CODE_POINT_PATTERN + r'$')
_CODE_POINT_STEP2_RANGE_RE = re.compile(
    r'^' + _CODE_POINT_PATTERN + r'\.\.\(2\)\.\.' + _CODE_POINT_PATTERN + r'$')
_CODE_POINT_PAIR_RE = re.compile(
    r'^\(' + _CODE_POINT_PATTERN + r',' + _CODE_POINT_PATTERN + r'\)$')

def process_chars(char_class_list, code_point_line):
    '''Parse a “;” separated code point list and append it to a class.

    char_class_list -- list which is extended in place.
    code_point_line -- text after the class keyword; each item is one of
        <Uxxxx>                  a single code point
        <Uxxxx>..<Uyyyy>         every code point of the inclusive range
        <Uxxxx>..(2)..<Uyyyy>    every second code point of the range
        (<Uxxxx>,<Uyyyy>)        a mapping, appended as a tuple

    An item in none of these notations is reported on stderr and the
    script is terminated with exit status 1 (SystemExit).
    '''
    for code_points in code_point_line.split(';'):
        code_points = code_points.strip()
        match = _SINGLE_CODE_POINT_RE.match(code_points)
        if match: # <Uxxxx>
            char_class_list.append(int(match.group(1), 16))
            continue
        match = _CODE_POINT_RANGE_RE.match(code_points)
        if match: # <Uxxxx>..<Uxxxx>
            char_class_list.extend(range(
                int(match.group(1), 16),
                int(match.group(2), 16) + 1))
            continue
        match = _CODE_POINT_STEP2_RANGE_RE.match(code_points)
        if match: # <Uxxxx>..(2)..<Uxxxx>
            char_class_list.extend(range(
                int(match.group(1), 16),
                int(match.group(2), 16) + 1,
                2))
            continue
        match = _CODE_POINT_PAIR_RE.match(code_points)
        if match: # (<Uxxxx>,<Uxxxx>)
            char_class_list.append((
                int(match.group(1), 16),
                int(match.group(2), 16)))
            continue
        sys.stderr.write(
            ('None of the regexps matched '
             + 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
                 'cp': code_points,
                 'cpl': code_point_line
             })
        # sys.exit() instead of the interactive helper exit(), which
        # only exists when the site module has been loaded.
        sys.exit(1)
def compare_lists(old_ctype_dict, new_ctype_dict):
    '''Compare character classes in the old and the new LC_CTYPE

    Both arguments are dictionaries mapping a character class name to
    the list of its code points.  First the classes which exist in only
    one of the two dictionaries are listed, then the sizes and the
    differences of every class of the old dictionary are printed.
    '''
    print('****************************************************')
    print('Character classes which are only in the new '
          + 'or only in the old file:')
    for char_class in sorted(old_ctype_dict):
        if char_class not in new_ctype_dict:
            print('Character class %s is in old ctype but not in new ctype'
                  %char_class)
    for char_class in sorted(new_ctype_dict):
        if char_class not in old_ctype_dict:
            print('Character class %s is in new ctype but not in old ctype'
                  %char_class)
    for char_class in sorted(old_ctype_dict):
        old_list = old_ctype_dict[char_class]
        # A class which only exists in the old file has already been
        # reported above; compare it against an empty class instead of
        # failing with a KeyError.
        new_list = new_ctype_dict.get(char_class, [])
        print("****************************************************")
        print("%s: %d chars in old ctype and %d chars in new ctype" %(
            char_class,
            len(old_list),
            len(new_list)))
        print("----------------------------------------------------")
        report(char_class, old_list, new_list)
def report_code_points(char_class, code_point_list, text=''):
    '''Report all code points which have been added to or removed from a
    character class.

    char_class      -- name of the character class, printed as prefix.
    code_point_list -- iterable of code points (integers) or, for the
                       mapping classes, of (source, target) pairs.
    text            -- label printed after the class name, for example
                       'Added' or 'Missing'.

    One line is printed per entry, in sorted order.  For a pair the
    source and target values are separated by “ → ” so that the two
    characters, code points, and names do not run into each other.
    '''
    for code_point in sorted(code_point_list):
        if isinstance(code_point, int):
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  %{'text': text,
                    'char': chr(code_point),
                    'char_class': char_class,
                    'code_point': hex(code_point),
                    'name': unicodedata.name(chr(code_point), 'name unknown')})
        else:
            source = code_point[0]
            target = code_point[1]
            print(('%(char_class)s: %(text)s: '
                   + '%(char0)s → %(char1)s '
                   + '%(code_point0)s → %(code_point1)s '
                   + '%(name0)s → %(name1)s') %{
                       'text': text,
                       'char_class': char_class,
                       'char0': chr(source),
                       'code_point0': hex(source),
                       'name0': unicodedata.name(chr(source), 'name unknown'),
                       'char1': chr(target),
                       'code_point1': hex(target),
                       'name1': unicodedata.name(chr(target), 'name unknown')
                   })
def report(char_class, old_list, new_list):
    '''Print how one LC_CTYPE character class changed.

    The number of entries of old_list which are absent from new_list
    and the number of entries of new_list which are absent from
    old_list are always printed.  The entries themselves are listed
    only when the command line options stored in the module level
    ARGS (show_missing_characters, show_added_characters) ask for it.
    '''
    old_members = set(old_list)
    new_members = set(new_list)

    only_in_old = list(old_members - new_members)
    summary = ('%(char_class)s: Missing %(number)d characters '
               + 'of old ctype in new ctype ')
    print(summary %{'char_class': char_class, 'number': len(only_in_old)})
    if ARGS.show_missing_characters:
        report_code_points(char_class, only_in_old, 'Missing')

    only_in_new = list(new_members - old_members)
    summary = ('%(char_class)s: Added %(number)d characters '
               + 'in new ctype which were not in old ctype')
    print(summary %{'char_class': char_class, 'number': len(only_in_new)})
    if ARGS.show_added_characters:
        report_code_points(char_class, only_in_new, 'Added')
def cperror(error_message, errorcounter=0):
    '''Print error_message and return errorcounter plus one.

    The caller is expected to store the returned value as its new
    error count.
    '''
    print(error_message)
    updated_count = errorcounter + 1
    return updated_count
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
            errorcounter=0):
    '''Check the class membership of some code points.

    ctype_dict -- dictionary mapping a character class name to a
        container of the code points in that class.  A class which is
        missing from the dictionary is treated as empty.
    code_point_list_with_ranges -- a list of integers or pairs of
        integers, for example:

            [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]

        where the pairs of integers stand for all the code points in
        the range of the two integers given, including the two
        integers of the pair.
    char_classes -- list of (class name, expected membership) tuples
        which every code point is checked against.
    reason -- explanation which is appended to each error message.
    errorcounter -- number of errors found so far.

    Returns errorcounter increased by the number of failed checks.
    '''
    for code_point_range in code_point_list_with_ranges:
        if isinstance(code_point_range, int):
            code_points = (code_point_range,)
        else:
            code_points = range(code_point_range[0],
                                code_point_range[1] + 1)
        for code_point in code_points:
            for char_class_tuple in char_classes:
                char_class = char_class_tuple[0]
                in_char_class = char_class_tuple[1]
                # .get(): a class the file does not define has no
                # members; it must not abort the check with KeyError.
                members = ctype_dict.get(char_class, ())
                if (code_point in members) != in_char_class:
                    # 'in' shows the actual membership, which is the
                    # opposite of the expected one here.
                    errorcounter = cperror(
                        ('error: %(code_point)s %(char)s '
                         + '%(char_class)s %(in)s: %(reason)s') %{
                             'code_point': hex(code_point),
                             'char': chr(code_point),
                             'char_class': char_class,
                             'in': not in_char_class,
                             'reason': reason},
                        errorcounter)
    return errorcounter
# All character classes which tests() and the test cases look at.
_CTYPE_CLASS_NAMES = (
    'upper', 'lower', 'alpha', 'digit', 'outdigit', 'space', 'cntrl',
    'punct', 'graph', 'print', 'xdigit', 'blank', 'combining',
    'combining_level3', 'toupper', 'tolower', 'totitle')

# Pairs of classes which must not share a character, in the order in
# which they are checked and reported.
#
# alpha restriction: "No character specified for the keywords cntrl,
# digit, punct or space shall be specified."
# space restriction: "No character specified for the keywords upper,
# lower, alpha, digit, graph or xdigit shall be specified."
# cntrl restriction: "No character specified for the keywords upper,
# lower, alpha, digit, punct, graph, print or xdigit shall be
# specified."
# punct restriction: "No character specified for the keywords upper,
# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
# be specified."
# (upper and lower are covered by the "upper|lower implies alpha" check;
# the graph and print restrictions about cntrl are covered by the
# cntrl pairs.)
_MUTUALLY_EXCLUSIVE_CLASSES = (
    ('alpha', 'cntrl'),
    ('alpha', 'digit'),
    ('alpha', 'punct'),
    ('alpha', 'space'),
    ('space', 'digit'),
    ('space', 'graph'),
    ('space', 'xdigit'),
    ('cntrl', 'digit'),
    ('cntrl', 'punct'),
    ('cntrl', 'graph'),
    ('cntrl', 'print'),
    ('cntrl', 'xdigit'),
    ('punct', 'digit'),
    ('punct', 'xdigit'),
)

def tests(ctype_dict, errorcounter = 0):
    '''Test a LC_CTYPE character class dictionary for known errors

    ctype_dict   -- dictionary mapping a character class name to the
                    list of its code points (or, for toupper, tolower
                    and totitle, to a list of code point pairs).
    errorcounter -- number of errors found so far.

    First all entries of TEST_CASES are checked with cpcheck(), then
    every code point is checked against the consistency rules between
    the character classes.  Each violation is printed with cperror().
    Returns errorcounter increased by the number of violations.
    '''
    # copy the information from ctype_dict (which contains lists) in
    # a new dictionary ctype_dict2 (which contains dictionaries).
    # The checks below are easier with that type of data structure.
    ctype_dict2 = {}
    for key, members in ctype_dict.items():
        if members and not isinstance(members[0], int):
            # key is 'toupper', 'tolower', or 'totitle'
            ctype_dict2[key] = {pair[0]: pair[1] for pair in members}
        else:
            ctype_dict2[key] = {value: 1 for value in members}
    # A class which the file does not define is an empty class; without
    # this the lookups below would fail with a KeyError.
    for class_name in _CTYPE_CLASS_NAMES:
        ctype_dict2.setdefault(class_name, {})

    for test_case in TEST_CASES:
        errorcounter = cpcheck(ctype_dict2,
                               test_case[0],
                               test_case[1],
                               test_case[2],
                               errorcounter = errorcounter)

    for code_point in range(0, 0x110000):
        is_cased = (code_point in ctype_dict2['lower']
                    or code_point in ctype_dict2['upper'])
        # toupper and tolower restriction: "Only characters specified
        # for the keywords lower and upper shall be specified."
        for mapping_name in ('toupper', 'tolower'):
            mapping = ctype_dict2[mapping_name]
            if (code_point in mapping
                    and code_point != mapping[code_point]
                    and not is_cased):
                errorcounter = cperror(
                    ('error: %(char1)s is not upper|lower '
                     + 'but %(mapping)s(%(cp1)s)=%(cp2)s (%(char2)s)') %{
                         'char1': chr(code_point),
                         'mapping': mapping_name,
                         'cp1': hex(code_point),
                         'cp2': hex(mapping[code_point]),
                         'char2': chr(mapping[code_point])
                     },
                    errorcounter)
        # alpha restriction: "Characters classified as either upper or
        # lower shall automatically belong to this class."
        if is_cased and code_point not in ctype_dict2['alpha']:
            errorcounter = cperror(
                'error: %(char)s %(cp)s is upper|lower but not alpha' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        for first_class, second_class in _MUTUALLY_EXCLUSIVE_CLASSES:
            if (code_point in ctype_dict2[first_class]
                    and code_point in ctype_dict2[second_class]):
                errorcounter = cperror(
                    'error: %(char)s %(cp)s is %(first)s and %(second)s' %{
                        'char': chr(code_point),
                        'cp': hex(code_point),
                        'first': first_class,
                        'second': second_class
                    },
                    errorcounter)
        # punct restriction, last part: the <space> character itself
        # must not be in punct.
        if (code_point in ctype_dict2['punct']
                and code_point == 0x0020):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is punct.' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of "space characters",
        # not "space character".
        if (code_point in ctype_dict2['print']
                and not (code_point in ctype_dict2['graph']
                         or code_point in ctype_dict2['space'])):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is print but not graph|space' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        if (code_point not in ctype_dict2['print']
                and (code_point in ctype_dict2['graph']
                     or code_point == 0x0020)):
            errorcounter = cperror(
                'error: %(char)s %(cp)s graph|space but not print' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
    return errorcounter
# Command line entry point: compare the LC_CTYPE sections of an old
# and a new file, then run the consistency tests on both of them.
# The exit status is 1 if the new file contains errors, 0 otherwise;
# errors in the old file are only reported.
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    # The file options require a value: with nargs='?' a bare “-o” or
    # “-n” would silently store None, which fails later in open().
    PARSER.add_argument(
        '-o', '--old_ctype_file',
        type=str,
        default='i18n',
        help='The old ctype file, default: %(default)s')
    PARSER.add_argument(
        '-n', '--new_ctype_file',
        type=str,
        default='unicode-ctype',
        help='The new ctype file, default: %(default)s')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help=('Show characters which were added to each '
              + 'character class in detail.'))
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help=('Show characters which were removed from each '
              + 'character class in detail.'))
    # ARGS stays a module level name because report() reads it.
    ARGS = PARSER.parse_args()

    OLD_CTYPE_DICT = extract_character_classes(
        ARGS.old_ctype_file)
    NEW_CTYPE_DICT = extract_character_classes(
        ARGS.new_ctype_file)
    compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
    print('============================================================')
    print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('Old file = %s' %ARGS.old_ctype_file)
    print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
    print('------------------------------------------------------------')
    print('============================================================')
    print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('New file = %s' %ARGS.new_ctype_file)
    print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
    print('------------------------------------------------------------')
    # sys.exit() instead of the interactive helper exit(), which only
    # exists when the site module has been loaded.
    sys.exit(1 if NUMBER_OF_ERRORS_IN_NEW_FILE > 0 else 0)

View File

@ -0,0 +1,951 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
This file contains a list of test cases used by
the ctype_compatibility.py script.
'''
TEST_CASES = [
[[0x0E2F, 0x0E46], [('alpha', True), ('punct', False)],
'''Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
<U0E2F>, <U0E46> should belong to punct. DerivedCoreProperties.txt
says it is alpha. We trust DerivedCoreProperties.txt.'''
],
[[0x0E31, (0x0E34, 0x0E3A)], [('alpha', True)],
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
<thep@links.nectec.or.th> says <U0E31>, <U0E34>..<U0E3A>
are alpha. DerivedCoreProperties.txt agrees.'''
],
[[(0x0E47, 0x0E4C), 0x0E4E], [('alpha', False)],
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are
is_alpha. DerivedCoreProperties does says *only* <U0E4D>
in that range is alphabetic, the others are *not*. We
trust DerivedCoreProperties.txt.'''
],
[[0x0E4D], [('alpha', True)],
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are
is_alpha. DerivedCoreProperties does says *only* <U0E4D>
in that range is alphabetic, the others are *not*. We
trust DerivedCoreProperties.txt.
'''
],
[[0x0345], [('alpha', True), ('lower', True)],
'''COMBINING GREEK YPOGEGRAMMENI
According to DerivedCoreProperties.txt, this is Alphabetic
and Lowercase.'''
],
[[(0x2160, 0x2188)], [('alpha', True)],
'''Roman Numerals are “Alphabetic” according to
DerivedCoreProperties.txt'''
],
[[(0x24B6, 0x24E9)], [('alpha', True)],
'''Circled Latin letters are “Alphabetic” according to
DerivedCoreProperties.txt'''
],
[[0x661], [('alpha', True), ('digit', False)],
'''gen-unicode-ctype.c: All non-ASCII digits should be alphabetic.
ISO C 99 forbids us to have them in category "digit", but we
want iswalnum to return true on them. Dont forget to
have a look at all the other digits, 0x661 is just one
example tested here.'''
],
[[(0x0030, 0x0039)], [('digit', True)],
'''gen-unicode-ctype.c: All ASCII digits should be digits.'''
],
[[0x0009], [('blank', True)],
'''gen-unicode-ctype.c: CHARACTER TABULATION'''
],
[[0x2007], [('blank', False), ('space', False)],
'''gen-unicode-ctype.c: FIGURE SPACE, because it has <noBreak>
in the description.'''
],
[[0x0009, 0x000A, 0x000B, 0x000C, 0x000D], [('space', True)],
'''gen-unicode-ctype.c: CHARACTER TABULATION, LINE FEED (LF), LINE
TABULATION, ;FORM FEED (FF), CARRIAGE RETURN (CR)'''
],
[[0x2028, 0x2029], [('cntrl', True)],
'''gen-unicode-ctype.c: LINE SEPARATOR and PARAGRAPH SEPARATOR
should be cntrl.'''
],
[[(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)],
[('xdigit', True)],
'''gen-unicode-ctype.c: ISO C 99 says (6.4.4.1): hexadecimal-digit:
one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F (nothing else
should be considered as a hexadecimal-digit)'''
],
[[0x0330], [('combining', True), ('combining_level3', False)],
'''gen-unicode-ctype.c: COMBINING TILDE BELOW, canonical combining
class value >= 200, should be in combining but not in
combining_level3'''
],
[[0x0250, 0x0251, 0x0271], [('lower', True)],
'''Should be lower in Unicode 7.0.0 (was not lower in
Unicode 5.0.0).
'''
],
[[0x2184], [('lower', True)],
'''Should be lower both in Unicode 5.0.0 and 7.0.0'''
],
[[0xA67F], [('punct', False), ('alpha', True)],
'''0xa67f CYRILLIC PAYEROK. Not in Unicode 5.0.0. In Unicode
7.0.0. General category Lm (Letter
modifier). DerivedCoreProperties.txt says it is
Alphabetic. Apparently added manually to punct by mistake in
glibcs old LC_CTYPE.'''
],
[[0xA60C], [('punct', False), ('alpha', True)],
'''0xa60c VAI SYLLABLE LENGTHENER. Not in Unicode 5.0.0.
In Unicode 7.0.0. General category Lm (Letter
modifier). DerivedCoreProperties.txt says it is
Alphabetic. Apparently added manually to punct by mistake in
glibcs old LC_CTYPE.'''
],
[[0x2E2F], [('punct', False), ('alpha', True)],
'''0x2E2F VERTICAL TILDE. Not in Unicode 5.0.0. In Unicode
7.0.0. General category Lm (Letter
modifier). DerivedCoreProperties.txt says it is
Alphabetic. Apparently added manually to punct by mistake in
glibcs old LC_CTYPE.'''
],
[[(0x1090, 0x1099)], [('punct', False), ('alpha', True)],
'''MYANMAR SHAN DIGIT ZERO - MYANMAR SHAN DIGIT NINE.
These are digits, but because ISO C 99 forbids to
put them into digit they should go into alpha.'''
],
[[0x103F], [('punct', False), ('alpha', True)],
'''0x103F MYANMAR LETTER GREAT SA. Not in Unicode 5.0.0.
In Unicode 7.0.0. General category Lo
(Other_Letter). DerivedCoreProperties.txt says it is
Alphabetic. Apparently added manually to punct by
mistake in glibcs old LC_CTYPE.'''
],
[[0x0374], [('punct', False), ('alpha', True)],
'''0x0374 GREEK NUMERAL SIGN. Unicode 5.0.0: general category
Sk. Unicode 7.0.0: General category Lm
(Modifier_Letter). DerivedCoreProperties.txt says it is
Alphabetic.'''
],
[[0x02EC], [('punct', False), ('alpha', True)],
'''0x02EC MODIFIER LETTER VOICING. Unicode 5.0.0: general category
Sk. Unicode 7.0.0: General category Lm
(Modifier_Letter). DerivedCoreProperties.txt says it is
Alphabetic.'''
],
[[0x180E], [('space', False), ('blank', False)],
'''0x180e MONGOLIAN VOWEL SEPARATOR. Unicode 5.0.0: General
category Zs (Space_Separator) Unicode 7.0.0: General category Cf
(Format).'''
],
[[0x1E9C, 0x1E9D, 0x1E9F],
[('lower', True), ('upper', False), ('tolower', False),
('toupper', False), ('totitle', False)],
'''ẜ 0x1e9c LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE,
0x1e9d LATIN SMALL LETTER LONG S WITH HIGH STROKE,
0x1e9f LATIN SMALL LETTER DELTA. These are Lowercase
according to DerivedCoreProperties.txt but no upper case versions
exist.'''
],
[[0x1E9E],
[('lower', False), ('upper', True), ('tolower', True),
('toupper', False), ('totitle', False)],
'''0x1E9E ẞ LATIN CAPITAL LETTER SHARP S This is “Uppercase”
according to DerivedCoreProperties.txt and the lower case
version is 0x00DF ß LATIN SMALL LETTER SHARP S.'''
],
[[0x2188],
[('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''0x2188 ROMAN NUMERAL ONE HUNDRED THOUSAND. This is “Alphabetic”
according to DerivedCoreProperties.txt. In glibcs old
LC_CTYPE, it was in lower, which seems to be a
mistake. It is not Lowercase in
DerivedCoreProperties.txt and does not have case mappings
in UnicodeData.txt either.'''
],
[[0x2C71, 0x2C74, (0x2C77, 0x2C7A)],
[('alpha', True), ('lower', True), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''These are Latin small letters which were not in Unicode 5.0.0
but are in Unicode 7.0.0. According to
DerivedCoreProperties.txt they are Lowercase. But no
uppercase versions exist. They have apparently been added
manually to glibcs old LC_CTYPE.'''
],
[[0xA730, 0xA731],
[('alpha', True), ('lower', True), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''These are Latin small “capital” letters which were not in
Unicode 5.0.0 but are in Unicode 7.0.0. According to
DerivedCoreProperties.txt they are Lowercase. But no
uppercase versions exist. They have apparently been added
manually to glibcs old LC_CTYPE.'''
],
[[(0xA771, 0xA778)],
[('alpha', True), ('lower', True), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''These are Latin small (or small “capital”) letters which
were not in Unicodee 5.0.0 but are in Unicode 7.0.0. According to
DerivedCoreProperties.txt they are Lowercase. But no
uppercase versions exist. They have apparently been added
manually to glibcs old LC_CTYPE.'''
],
[[0x0375],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''“0375;GREEK LOWER NUMERAL SIGN;Sk;0;ON;;;;;N;;;;;”. Has
apparently been added manually to glibcs old LC_CTYPE as
combining_level3. That seems wrong, it is no combining
character because it does not have one of the general
categories Mn, Mc, or Me. According to
DerivedCoreProperties.txt it is not Alphabetic.'''
],
[[0x108D],
[('combining', True), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''“108D;MYANMAR SIGN SHAN COUNCIL EMPHATIC
TONE;Mn;220;NSM;;;;;N;;;;;. Has apparently been added
manually to glibcs old LC_CTYPE as
combining_level3. That seems wrong, although it is a
combining character because it has the general category
Mn, it is not combining_level3 because the canonical
combining class value is 220 which is >= 200. According to
gen-unicode-ctype.c, combining_level3 needs a
canonical combining class value < 200. According to
DerivedCoreProperties.txt it is not Alphabetic.'''
],
[[0x06DE],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
''' UnicodeData.txt 5.0.0: “06DE;ARABIC START OF RUB EL
HIZB;Me;0;NSM;;;;;N;;;;;; UnicodeData.txt 7.0.0:
06DE;ARABIC START OF RUB EL
HIZB;So;0;ON;;;;;N;;;;;. I.e. this used to be a
combining character in Unicode 5.0.0 but not anymore in
7.0.0. According to DerivedCoreProperties.txt it is not
Alphabetic.'''
],
[[0x0BD0],
[('combining', False), ('combining_level3', False),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0:
0BD0;TAMIL OM;Lo;0;L;;;;;N;;;;;. Apparently manually added to
combining and combining_level3 in glibcs old
LC_CTYPE. That seems wrong. According to
DerivedCoreProperties.txt it is Alphabetic.'''
],
[[0x103F],
[('combining', False), ('combining_level3', False),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0:
103F;MYANMAR LETTER GREAT SA;Lo;0;L;;;;;N;;;;;.
Apparently manually added to combining and
combining_level3 in glibcs old LC_CTYPE. That seems
wrong. According to DerivedCoreProperties.txt it is
Alphabetic.'''
],
[[(0x0901, 0x0903)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''These have general category “Mn” i.e. these are combining
characters (both in UnicodeData.txt 5.0.0 and 7.0.0):
0901;DEVANAGARI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;,
0902;DEVANAGARI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;,
0903;DEVANAGARI SIGN VISARGA;Mc;0;L;;;;;N;;;;;.
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x093C],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''UnicodeData.txt (5.0.0 and 7.0.0): “093C;DEVANAGARI SIGN
NUKTA;Mn;7;NSM;;;;;N;;;;; According to
DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic. glibcs old LC_TYPE has this in alpha.'''
],
[[(0x093E, 0x093F)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''These have general category “Mc” i.e. these are combining
characters (both in UnicodeData.txt 5.0.0 and 7.0.0):
093E;DEVANAGARI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
093F;DEVANAGARI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[(0x0940, 0x094C)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''These are all combining
characters (Mc or Mn both in UnicodeData.txt 5.0.0 and 7.0.0).
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x094D],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
094D;DEVANAGARI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) it is *not*
Alphabetic.'''
],
[[(0x0951, 0x0954)],
[('combining', True), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0x0962, 0x0963), (0x0981, 0x0983)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x09BC],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
09BC;BENGALI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
According to DerivedCoreProperties.txt (7.0.0) it is *not*
Alphabetic.'''
],
[[(0x09BE, 0x09BF), (0x09C0, 0x09C4), (0x09C7, 0x09C8),
(0x09CB, 0x09CC)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
09BE;BENGALI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
09BF;BENGALI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
09C0;BENGALI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
09C1;BENGALI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
09C2;BENGALI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
09C3;BENGALI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
09C4;BENGALI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;
09C7;BENGALI VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
09C8;BENGALI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;
09CB;BENGALI VOWEL SIGN O;Mc;0;L;09C7 09BE;;;;N;;;;;
09CC;BENGALI VOWEL SIGN AU;Mc;0;L;09C7 09D7;;;;N;;;;;
Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x09CD],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
09CD;BENGALI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
According to DerivedCoreProperties.txt (7.0.0) it is *not*
Alphabetic.'''
],
[[0x09D7, (0x09E2, 0x09E3)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x09F2, 0x09F3],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
09F2;BENGALI RUPEE MARK;Sc;0;ET;;;;;N;;;;;
09F3;BENGALI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0x09F4, 0x09FA)],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
09F4;BENGALI CURRENCY NUMERATOR ONE;No;0;L;;;;1/16;N;;;;;
09F5;BENGALI CURRENCY NUMERATOR TWO;No;0;L;;;;1/8;N;;;;;
09F6;BENGALI CURRENCY NUMERATOR THREE;No;0;L;;;;3/16;N;;;;;
09F7;BENGALI CURRENCY NUMERATOR FOUR;No;0;L;;;;1/4;N;;;;;
09F8;BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR;
No;0;L;;;;3/4;N;;;;;
09F9;BENGALI CURRENCY DENOMINATOR SIXTEEN;No;0;L;;;;16;N;;;;;
09FA;BENGALI ISSHAR;So;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0x0A01, 0x0A03)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0A01;GURMUKHI SIGN ADAK BINDI;Mn;0;NSM;;;;;N;;;;;
0A02;GURMUKHI SIGN BINDI;Mn;0;NSM;;;;;N;;;;;
0A03;GURMUKHI SIGN VISARGA;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0A3C],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0A3C;GURMUKHI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[(0x0A3E, 0x0A40), (0x0A41, 0x0A42), (0x0A47, 0x0A48),
(0x0A4B, 0x0A4C)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0A3E;GURMUKHI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
0A3F;GURMUKHI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
0A40;GURMUKHI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
0A41;GURMUKHI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
0A42;GURMUKHI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
0A47;GURMUKHI VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;
0A48;GURMUKHI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;
0A4B;GURMUKHI VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;
0A4C;GURMUKHI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0A4D],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0A4D;GURMUKHI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[0x0A51, (0x0A70, 0x0A71), 0x0A75, (0x0A81, 0x0A83)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0A4D;GURMUKHI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
0A70;GURMUKHI TIPPI;Mn;0;NSM;;;;;N;;;;;
0A71;GURMUKHI ADDAK;Mn;0;NSM;;;;;N;;;;;
0A75;GURMUKHI SIGN YAKASH;Mn;0;NSM;;;;;N;;;;;
0A81;GUJARATI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
0A82;GUJARATI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
0A83;GUJARATI SIGN VISARGA;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0ABC],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0ABC;GUJARATI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[(0x0ABE, 0x0AC5), (0x0AC7, 0x0AC9), (0x0ACB, 0x0ACC)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0ABE;GUJARATI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
0ABF;GUJARATI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
0AC0;GUJARATI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
0AC1;GUJARATI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
0AC2;GUJARATI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
0AC3;GUJARATI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
0AC4;GUJARATI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;
0AC5;GUJARATI VOWEL SIGN CANDRA E;Mn;0;NSM;;;;;N;;;;;
0AC7;GUJARATI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;
0AC8;GUJARATI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;
0AC9;GUJARATI VOWEL SIGN CANDRA O;Mc;0;L;;;;;N;;;;;
0ACB;GUJARATI VOWEL SIGN O;Mc;0;L;;;;;N;;;;;
0ACC;GUJARATI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0ACD],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0ACD;GUJARATI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[(0x0AE2, 0x0AE3)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0AE2;GUJARATI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
0AE3;GUJARATI VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0AF1],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0AF1;GUJARATI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[(0x0B01, 0x0B03)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0B01;ORIYA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
0B02;ORIYA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
0B03;ORIYA SIGN VISARGA;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0B3C],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0B3C;ORIYA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[(0x0B3E, 0x0B44), (0x0B47, 0x0B48), (0x0B4B, 0x0B4C)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0B3E;ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
0B3F;ORIYA VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;
0B40;ORIYA VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
0B41;ORIYA VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
0B42;ORIYA VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
0B43;ORIYA VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
0B44;ORIYA VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;
0B47;ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
0B48;ORIYA VOWEL SIGN AI;Mc;0;L;0B47 0B56;;;;N;;;;;
0B4B;ORIYA VOWEL SIGN O;Mc;0;L;0B47 0B3E;;;;N;;;;;
0B4C;ORIYA VOWEL SIGN AU;Mc;0;L;0B47 0B57;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0B4D],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0B4D;ORIYA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[(0x0B56, 0x0B57), (0x0B62, 0x0B63)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0B56;ORIYA AI LENGTH MARK;Mn;0;NSM;;;;;N;;;;;
0B57;ORIYA AU LENGTH MARK;Mc;0;L;;;;;N;;;;;
0B62;ORIYA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
0B63;ORIYA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0B70],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0B70;ORIYA ISSHAR;So;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[0x0B82],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0B82;TAMIL SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[(0x0BBE, 0x0BC2), (0x0BC6, 0x0BC8), (0x0BCA, 0x0BCC)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0BBE;TAMIL VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
0BBF;TAMIL VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
0BC0;TAMIL VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;
0BC1;TAMIL VOWEL SIGN U;Mc;0;L;;;;;N;;;;;
0BC2;TAMIL VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;
0BC6;TAMIL VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
0BC7;TAMIL VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;
0BC8;TAMIL VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;
0BCA;TAMIL VOWEL SIGN O;Mc;0;L;0BC6 0BBE;;;;N;;;;;
0BCB;TAMIL VOWEL SIGN OO;Mc;0;L;0BC7 0BBE;;;;N;;;;;
0BCC;TAMIL VOWEL SIGN AU;Mc;0;L;0BC6 0BD7;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0BCD],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0BCD;TAMIL SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[0x0BD7],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0BD7;TAMIL AU LENGTH MARK;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[(0x0BF0, 0x0BFA)],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0BF0;TAMIL NUMBER TEN;No;0;L;;;;10;N;;;;;
0BF1;TAMIL NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;
0BF2;TAMIL NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;
0BF3;TAMIL DAY SIGN;So;0;ON;;;;;N;;;;;
0BF4;TAMIL MONTH SIGN;So;0;ON;;;;;N;;;;;
0BF5;TAMIL YEAR SIGN;So;0;ON;;;;;N;;;;;
0BF6;TAMIL DEBIT SIGN;So;0;ON;;;;;N;;;;;
0BF7;TAMIL CREDIT SIGN;So;0;ON;;;;;N;;;;;
0BF8;TAMIL AS ABOVE SIGN;So;0;ON;;;;;N;;;;;
0BF9;TAMIL RUPEE SIGN;Sc;0;ET;;;;;N;;;;;
0BFA;TAMIL NUMBER SIGN;So;0;ON;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) this is *not*
Alphabetic.'''
],
[[(0x0C01, 0x0C03)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0C01;TELUGU SIGN CANDRABINDU;Mc;0;L;;;;;N;;;;;
0C02;TELUGU SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
0C03;TELUGU SIGN VISARGA;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[(0x0C3E, 0x0C44), (0x0C46, 0x0C48), (0x0C4A, 0x0C4C)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0C3E;TELUGU VOWEL SIGN AA;Mn;0;NSM;;;;;N;;;;;
0C3F;TELUGU VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;
0C40;TELUGU VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;
0C41;TELUGU VOWEL SIGN U;Mc;0;L;;;;;N;;;;;
0C42;TELUGU VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;
0C43;TELUGU VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;
0C44;TELUGU VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;
0C46;TELUGU VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;
0C47;TELUGU VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;
0C48;TELUGU VOWEL SIGN AI;Mn;0;NSM;0C46 0C56;;;;N;;;;;
0C4A;TELUGU VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;;
0C4B;TELUGU VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;
0C4C;TELUGU VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0C4D],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0C4D;TELUGU SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0x0C55, 0x0C56), (0x0C62, 0x0C63)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0C55;TELUGU LENGTH MARK;Mn;84;NSM;;;;;N;;;;;
0C56;TELUGU AI LENGTH MARK;Mn;91;NSM;;;;;N;;;;;
0C62;TELUGU VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
0C63;TELUGU VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[(0x0C78, 0x0C7F)],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0C78;TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR;
No;0;ON;;;;0;N;;;;;
0C79;TELUGU FRACTION DIGIT ONE FOR ODD POWERS OF FOUR;
No;0;ON;;;;1;N;;;;;
0C7A;TELUGU FRACTION DIGIT TWO FOR ODD POWERS OF FOUR;
No;0;ON;;;;2;N;;;;;
0C7B;TELUGU FRACTION DIGIT THREE FOR ODD POWERS OF FOUR;
No;0;ON;;;;3;N;;;;;
0C7C;TELUGU FRACTION DIGIT ONE FOR EVEN POWERS OF FOUR;
No;0;ON;;;;1;N;;;;;
0C7D;TELUGU FRACTION DIGIT TWO FOR EVEN POWERS OF FOUR;
No;0;ON;;;;2;N;;;;;
0C7E;TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR;
No;0;ON;;;;3;N;;;;;
0C7F;TELUGU SIGN TUUMU;So;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0x0C82, 0x0C83)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0C81;KANNADA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
0C82;KANNADA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
0C83;KANNADA SIGN VISARGA;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0CBC],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0CBC;KANNADA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[(0x0CBE, 0x0CC4), (0x0CC6, 0x0CC8), (0x0CCA, 0x0CCC)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0CBE;KANNADA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
0CBF;KANNADA VOWEL SIGN I;Mn;0;L;;;;;N;;;;;
0CC0;KANNADA VOWEL SIGN II;Mc;0;L;0CBF 0CD5;;;;N;;;;;
0CC1;KANNADA VOWEL SIGN U;Mc;0;L;;;;;N;;;;;
0CC2;KANNADA VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;
0CC3;KANNADA VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;
0CC4;KANNADA VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;
0CC6;KANNADA VOWEL SIGN E;Mn;0;L;;;;;N;;;;;
0CC7;KANNADA VOWEL SIGN EE;Mc;0;L;0CC6 0CD5;;;;N;;;;;
0CC8;KANNADA VOWEL SIGN AI;Mc;0;L;0CC6 0CD6;;;;N;;;;;
0CCA;KANNADA VOWEL SIGN O;Mc;0;L;0CC6 0CC2;;;;N;;;;;
0CCB;KANNADA VOWEL SIGN OO;Mc;0;L;0CCA 0CD5;;;;N;;;;;
0CCC;KANNADA VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0CCD],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0CCD;KANNADA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0x0CD5, 0x0CD6), (0x0CE2, 0x0CE3)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0CD5;KANNADA LENGTH MARK;Mc;0;L;;;;;N;;;;;
0CD6;KANNADA AI LENGTH MARK;Mc;0;L;;;;;N;;;;;
0CE2;KANNADA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
0CE3;KANNADA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[(0x0D02, 0x0D03), (0x0D3E, 0x0D44), (0x0D46, 0x0D48),
(0x0D4A, 0x0D4C)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
0D03;MALAYALAM SIGN VISARGA;Mc;0;L;;;;;N;;;;;
0D3E;MALAYALAM VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
0D3F;MALAYALAM VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
0D40;MALAYALAM VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
0D41;MALAYALAM VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
0D42;MALAYALAM VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
0D43;MALAYALAM VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
0D44;MALAYALAM VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;
0D46;MALAYALAM VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
0D47;MALAYALAM VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;
0D48;MALAYALAM VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;
0D4A;MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
0D4B;MALAYALAM VOWEL SIGN OO;Mc;0;L;0D47 0D3E;;;;N;;;;;
0D4C;MALAYALAM VOWEL SIGN AU;Mc;0;L;0D46 0D57;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0D4D],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0D4D;MALAYALAM SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0D57, (0x0D62, 0x0D63)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0D57;MALAYALAM AU LENGTH MARK;Mc;0;L;;;;;N;;;;;
0D62;MALAYALAM VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
0D63;MALAYALAM VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[(0x0D70, 0x0D79)],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0D70;MALAYALAM NUMBER TEN;No;0;L;;;;10;N;;;;;
0D71;MALAYALAM NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;
0D72;MALAYALAM NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;
0D73;MALAYALAM FRACTION ONE QUARTER;No;0;L;;;;1/4;N;;;;;
0D74;MALAYALAM FRACTION ONE HALF;No;0;L;;;;1/2;N;;;;;
0D75;MALAYALAM FRACTION THREE QUARTERS;No;0;L;;;;3/4;N;;;;;
0D79;MALAYALAM DATE MARK;So;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0x0D82, 0x0D83)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0D82;SINHALA SIGN ANUSVARAYA;Mc;0;L;;;;;N;;;;;
0D83;SINHALA SIGN VISARGAYA;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0DCA],
[('combining', True), ('combining_level3', True),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0DCA;SINHALA SIGN AL-LAKUNA;Mn;9;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0x0DCF, 0x0DD4), 0x0DD6, (0x0DD8, 0x0DDF), (0x0DF2, 0x0DF3)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0DCF;SINHALA VOWEL SIGN AELA-PILLA;Mc;0;L;;;;;N;;;;;
0DD0;SINHALA VOWEL SIGN KETTI AEDA-PILLA;Mc;0;L;;;;;N;;;;;
0DD1;SINHALA VOWEL SIGN DIGA AEDA-PILLA;Mc;0;L;;;;;N;;;;;
0DD2;SINHALA VOWEL SIGN KETTI IS-PILLA;Mn;0;NSM;;;;;N;;;;;
0DD3;SINHALA VOWEL SIGN DIGA IS-PILLA;Mn;0;NSM;;;;;N;;;;;
0DD4;SINHALA VOWEL SIGN KETTI PAA-PILLA;Mn;0;NSM;;;;;N;;;;;
0DD6;SINHALA VOWEL SIGN DIGA PAA-PILLA;Mn;0;NSM;;;;;N;;;;;
0DD8;SINHALA VOWEL SIGN GAETTA-PILLA;Mc;0;L;;;;;N;;;;;
0DD9;SINHALA VOWEL SIGN KOMBUVA;Mc;0;L;;;;;N;;;;;
0DDA;SINHALA VOWEL SIGN DIGA KOMBUVA;Mc;0;L;0DD9 0DCA;;;;N;;;;;
0DDB;SINHALA VOWEL SIGN KOMBU DEKA;Mc;0;L;;;;;N;;;;;
0DDC;SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA;
Mc;0;L;0DD9 0DCF;;;;N;;;;;
0DDD;SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA;
Mc;0;L;0DDC 0DCA;;;;N;;;;;
0DDE;SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA;
Mc;0;L;0DD9 0DDF;;;;N;;;;;
0DDF;SINHALA VOWEL SIGN GAYANUKITTA;Mc;0;L;;;;;N;;;;;
0DF2;SINHALA VOWEL SIGN DIGA GAETTA-PILLA;Mc;0;L;;;;;N;;;;;
0DF3;SINHALA VOWEL SIGN DIGA GAYANUKITTA;Mc;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[0x0DF4],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
0DF4;SINHALA PUNCTUATION KUNDDALIYA;Po;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0xA789, 0xA78A)],
[('combining', False), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
A789;MODIFIER LETTER COLON;Sk;0;L;;;;;N;;;;;
A78A;MODIFIER LETTER SHORT EQUALS SIGN;Sk;0;L;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
],
[[(0xA926, 0xA92A)],
[('combining', True), ('combining_level3', True),
('alpha', True), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
A926;KAYAH LI VOWEL UE;Mn;0;NSM;;;;;N;;;;;
A927;KAYAH LI VOWEL E;Mn;0;NSM;;;;;N;;;;;
A928;KAYAH LI VOWEL U;Mn;0;NSM;;;;;N;;;;;
A929;KAYAH LI VOWEL EE;Mn;0;NSM;;;;;N;;;;;
A92A;KAYAH LI VOWEL O;Mn;0;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are
Alphabetic.'''
],
[[(0xA92B, 0xA92D)],
[('combining', True), ('combining_level3', False),
('alpha', False), ('lower', False), ('upper', False),
('tolower', False), ('toupper', False), ('totitle', False)],
'''
A92B;KAYAH LI TONE PLOPHU;Mn;220;NSM;;;;;N;;;;;
A92C;KAYAH LI TONE CALYA;Mn;220;NSM;;;;;N;;;;;
A92D;KAYAH LI TONE CALYA PLOPHU;Mn;220;NSM;;;;;N;;;;;
According to DerivedCoreProperties.txt (7.0.0) these are *not*
Alphabetic.'''
]
]

View File

@ -0,0 +1,751 @@
#!/usr/bin/python3
#
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
DerivedCoreProperties.txt files.
To see how this script is used, call it with the -h option:
$ ./gen_unicode_ctype.py -h
prints usage message
'''
import argparse
import sys
import time
import re
# Dictionary holding the entire contents of the UnicodeData.txt file.
#
# Keys are integer code points; each value is a dictionary with the
# attributes of that code point.  The case mapping attributes ('upper',
# 'lower', 'title') are integer code points or None, all other attributes
# are kept as the strings found in the file.  The dictionary is filled by
# fill_attributes().
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file.
#
# Keys are integer code points; each value is the list of property names
# (strings) given for that code point.  The dictionary is filled by
# fill_derived_core_properties().
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
def fill_attribute(code_point, fields):
    '''Records one UnicodeData.txt entry in UNICODE_ATTRIBUTES.

    code_point is the integer key under which the entry is stored and
    fields is the list of semicolon separated columns of the line.
    fields[0] (the code point column) is not looked at here.  The three
    case mapping columns are converted from hexadecimal strings to
    integers, or to None when the column is empty; every other column is
    stored unchanged as a string.
    '''
    # Attribute names of columns 1..11, in file order:
    # character name, general category, canonical combining class,
    # bidirectional category, decomposition mapping, decimal digit value,
    # digit value, numeric value, mirrored, Unicode 1.0 name, comment.
    text_columns = (
        'name', 'category', 'combining', 'bidi', 'decomposition',
        'decdigit', 'digit', 'numeric', 'mirrored', 'oldname', 'comment')
    # Attribute names of columns 12..14: simple uppercase, lowercase and
    # titlecase mappings.
    mapping_columns = ('upper', 'lower', 'title')

    entry = {}
    for index, key in enumerate(text_columns, start=1):
        entry[key] = fields[index]
    for index, key in enumerate(mapping_columns, start=12):
        column = fields[index]
        entry[key] = int(column, 16) if column else None
    UNICODE_ATTRIBUTES[code_point] = entry
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range gets the attributes of the range,
    with the name reduced to the part before the comma (for the example
    above: 'CJK Ideograph').  Lines of general category 'Cs' (surrogates)
    are skipped.

    If a line does not have exactly 15 fields, or if a “Last” line does
    not match the preceding “First” line, a message is written to stderr
    and the program terminates with exit status 1.
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Fields of the pending '<…, First>' line; empty when no range
        # is open.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                # sys.exit() rather than the exit() builtin: the latter is
                # only installed by the site module and is meant for
                # interactive use.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # Remember the start of the range; strip the leading '<'
                # and everything from the comma on from the name.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both lines of a range
                # must agree.  This also catches a “Last” line without a
                # preceding “First” line, because fields_start is then
                # empty.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
def fill_derived_core_properties(filename):
    '''Reads DerivedCoreProperties.txt into DERIVED_CORE_PROPERTIES.

    A line of that file names either a range of code points:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    For every code point covered by a line, the property name is appended
    to the list stored for that code point.  Lines which do not match
    this format are ignored.
    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            found = line_pattern.match(line)
            if found is None:
                continue
            first = found.group('codepoint1')
            # A single code point is a range which ends where it starts.
            last = found.group('codepoint2') or first
            prop = found.group('property')
            for code_point in range(int(first, 16), int(last, 16)+1):
                DERIVED_CORE_PROPERTIES.setdefault(code_point, []).append(prop)
def to_upper(code_point):
    '''Returns the code point of the uppercase version of code_point.

    If the character has no name or no uppercase mapping, code_point
    itself is returned.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['upper']:
        return attributes['upper']
    return code_point
def to_lower(code_point):
    '''Returns the code point of the lowercase version of code_point.

    If the character has no name or no lowercase mapping, code_point
    itself is returned.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['lower']:
        return attributes['lower']
    return code_point
def to_title(code_point):
    '''Returns the code point of the titlecase version of code_point.

    If the character has no name or no titlecase mapping, code_point
    itself is returned.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['title']:
        return attributes['title']
    return code_point
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase.

    That is the case if it has a lowercase mapping to a different code
    point, or if DerivedCoreProperties.txt lists it as “Uppercase”.
    '''
    if to_lower(code_point) != code_point:
        return True
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, [])
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase.

    That is the case if it has an uppercase mapping to a different code
    point, if it is U+00DF, or if DerivedCoreProperties.txt lists it as
    “Lowercase”.
    '''
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A730 “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, [])
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic.

    Besides the characters listed as “Alphabetic” in
    DerivedCoreProperties.txt, all decimal digits (general category “Nd”)
    other than the ASCII digits count as alphabetic.
    '''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, []):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
            and not 0x0030 <= code_point <= 0x0039)
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit.

    Only the ten ASCII digits U+0030..U+0039 are digits.
    '''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # Treating every named character of general category “Nd” as a digit
    # is therefore not possible.  Should that ever be done, note that
    # U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without a zero;
    # a <0> must be added in front of them by hand.
    return 0x0030 <= code_point <= 0x0039
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit.

    Only the ten ASCII digits U+0030..U+0039 are.
    '''
    return 0x0030 <= code_point <= 0x0039
def is_blank(code_point):
    '''Checks whether the character with this code point is blank.

    Blank are the horizontal tab and all named characters of general
    category “Zs” whose decomposition does not mention “<noBreak>”.
    '''
    if code_point == 0x0009: # '\t'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Category Zs without mention of '<noBreak>'
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
def is_space(code_point):
    '''Checks whether the character with this code point is a space.

    Spaces are the six ASCII white space characters and all named
    characters of the general categories “Zl” and “Zp”, plus those of
    category “Zs” whose decomposition does not mention “<noBreak>”.
    '''
    # Don't make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    ascii_spaces = (
        0x0020, # ' '
        0x000C, # '\f'
        0x000A, # '\n'
        0x000D, # '\r'
        0x0009, # '\t'
        0x000B, # '\v'
    )
    if code_point in ascii_spaces:
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Categories Zl, Zp, and Zs without mention of "<noBreak>"
    return (attributes['name']
            and (attributes['category'] in ['Zl', 'Zp']
                 or (attributes['category'] in ['Zs']
                     and '<noBreak>' not in attributes['decomposition'])))
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character.

    Control characters are those named “<control>” and the named
    characters of the general categories “Zl” and “Zp”.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    return (name
            and (name == '<control>'
                 or attributes['category'] in ['Zl', 'Zp']))
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit.

    Only the ASCII characters 0-9, A-F and a-f are hexadecimal digits.
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    #
    # The digit part is therefore spelled out here and does not depend on
    # the definition of the “digit” class.
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character.

    Graphical are all named characters which are neither named
    “<control>” nor spaces.
    '''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    return (name
            and name != '<control>'
            and not is_space(code_point))
def is_print(code_point):
    '''Checks whether the character with this code point is printable.

    Printable are all named characters which are neither named
    “<control>” nor of the general categories “Zl” or “Zp”.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    return (name
            and name != '<control>'
            and attributes['category'] not in ['Zl', 'Zp'])
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation.

    Punctuation is every graphical character which is neither alphabetic
    nor a digit.
    '''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.  This is deliberately wider than the
    # Unicode general categories starting with “P”.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character.

    Combining are all named characters of the general categories
    “Mn”, “Mc” and “Me”.
    '''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['category'] in ['Mn', 'Mc', 'Me'])
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character.

    These are the combining characters whose canonical combining class
    is less than 200.
    '''
    return (is_combining(code_point)
            and 0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with four hexadecimal digits,
    all others with eight.

    Examples:

    <U0041>
    <U0001D790>
    '''
    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
    return template.format(code_point)
def ucs_symbol_range(code_point_low, code_point_high):
    '''Return the UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>
    '''
    return '..'.join(
        [ucs_symbol(code_point_low), ucs_symbol(code_point_high)])
def code_point_ranges(is_class_function):
    '''Return the sorted list of code point ranges for which
    is_class_function returns a true value.

    Only the code points present in UNICODE_ATTRIBUTES are considered.
    A run of consecutive code points is stored as [first, last], an
    isolated code point as a one element list.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], ]
    '''
    ranges = []
    for code_point in sorted(UNICODE_ATTRIBUTES):
        if not is_class_function(code_point):
            continue
        current = ranges[-1] if ranges else None
        if current is None or current[-1] != code_point - 1:
            # Not adjacent to the previous hit: open a new range.
            ranges.append([code_point])
        elif len(current) == 1:
            # Second member of a run: turn [first] into [first, last].
            current.append(code_point)
        else:
            # Further member of a run: move the upper bound.
            current[-1] = code_point
    return ranges
def output_charclass(i18n_file, class_name, is_class_function):
    '''Write one LC_CTYPE character class section to i18n_file.

    Nothing at all is written when no code point belongs to the class.
    Otherwise the class name is followed by the ';' separated ranges,
    wrapped with a trailing '/' once a line would exceed 75 columns,
    and by an empty line.

    Example:

    upper /
    <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
    <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    ranges = code_point_ranges(is_class_function)
    if not ranges:
        return
    max_column = 75
    indent = ' '
    i18n_file.write('%s /\n' %class_name)
    pending = indent
    for code_point_range in ranges:
        if pending.strip():
            # Something is already on this line: separate the next item.
            pending += ';'
        if len(code_point_range) == 1:
            item = ucs_symbol(code_point_range[0])
        else:
            item = ucs_symbol_range(
                code_point_range[0], code_point_range[-1])
        if len(pending) + len(item) > max_column:
            i18n_file.write(pending + '/\n')
            pending = indent
        pending += item
    if pending.strip():
        i18n_file.write(pending + '\n')
    i18n_file.write('\n')
def output_charmap(i18n_file, map_name, map_function):
    '''Write one LC_CTYPE character map section to i18n_file.

    Every code point in UNICODE_ATTRIBUTES which map_function maps to
    a different code point is written as a (from,to) pair.  Pairs are
    separated by ';' and lines are wrapped with a trailing '/' once
    they would exceed 75 columns.

    Example:

    toupper /
    (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
    (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75
    indent = ' '
    i18n_file.write('%s /\n' %map_name)
    pending = indent
    for code_point in sorted(UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        if mapped == code_point:
            continue
        if pending.strip():
            pending += ';'
        pair = ''.join(
            ['(', ucs_symbol(code_point), ',', ucs_symbol(mapped), ')'])
        if len(pending) + len(pair) > max_column:
            i18n_file.write(pending + '/\n')
            pending = indent
        pending += pair
    if pending.strip():
        i18n_file.write(pending + '\n')
    i18n_file.write('\n')
def verifications():
    '''Check that the is_* and to_* functions observe the known
    restrictions on the character classes.

    Every violation found for a code point of UNICODE_ATTRIBUTES is
    reported as one line on stderr; nothing is returned or raised.
    '''
    # Pairs of classes which must not have a character in common, with
    # the message written when they do.  The list order is the order in
    # which the messages are reported for one code point.
    disjoint_classes = [
        # alpha restriction: "No character specified for the keywords
        # cntrl, digit, punct or space shall be specified."
        (is_alpha, is_cntrl, '%(sym)s is alpha and cntrl\n'),
        (is_alpha, is_digit, '%(sym)s is alpha and digit\n'),
        (is_alpha, is_punct, '%(sym)s is alpha and punct\n'),
        (is_alpha, is_space, '%(sym)s is alpha and space\n'),
        # space restriction: "No character specified for the keywords
        # upper, lower, alpha, digit, graph or xdigit shall be
        # specified."  upper, lower, alpha are covered by the alpha
        # checks.
        (is_space, is_digit, '%(sym)s is space and digit\n'),
        (is_space, is_graph, '%(sym)s is space and graph\n'),
        (is_space, is_xdigit, '%(sym)s is space and xdigit\n'),
        # cntrl restriction: "No character specified for the keywords
        # upper, lower, alpha, digit, punct, graph, print or xdigit
        # shall be specified."  upper, lower, alpha are covered by the
        # alpha checks.  This also covers the graph and print
        # restrictions ("No character specified for the keyword cntrl
        # shall be specified.").
        (is_cntrl, is_digit, '%(sym)s is cntrl and digit\n'),
        (is_cntrl, is_punct, '%(sym)s is cntrl and punct\n'),
        (is_cntrl, is_graph, '%(sym)s is cntrl and graph\n'),
        (is_cntrl, is_print, '%(sym)s is cntrl and print\n'),
        (is_cntrl, is_xdigit, '%(sym)s is cntrl and xdigit\n'),
        # punct restriction: "No character specified for the keywords
        # upper, lower, alpha, digit, cntrl, xdigit or as the <space>
        # character shall be specified."  upper, lower, alpha, cntrl
        # are covered above, <space> is checked separately below.
        (is_punct, is_digit, '%(sym)s is punct and digit\n'),
        (is_punct, is_xdigit, '%(sym)s is punct and xdigit\n'),
    ]
    report = sys.stderr.write
    for code_point in sorted(UNICODE_ATTRIBUTES):
        cased = is_lower(code_point) or is_upper(code_point)
        # toupper restriction: "Only characters specified for the
        # keywords lower and upper shall be specified."
        if to_upper(code_point) != code_point and not cased:
            report(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_upper(code_point)})
        # tolower restriction: "Only characters specified for the
        # keywords lower and upper shall be specified."
        if to_lower(code_point) != code_point and not cased:
            report(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_lower(code_point)})
        # alpha restriction: "Characters classified as either upper or
        # lower shall automatically belong to this class."
        if cased and not is_alpha(code_point):
            report('%(sym)s is upper|lower but not alpha\n' %{
                'sym': ucs_symbol(code_point)})
        for is_first, is_second, message in disjoint_classes:
            if is_first(code_point) and is_second(code_point):
                report(message %{'sym': ucs_symbol(code_point)})
        # Last part of the punct restriction: <space> is not punct.
        if is_punct(code_point) and code_point == 0x0020:
            report('%(sym)s is punct\n' %{
                'sym': ucs_symbol(code_point)})
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there is more than one space
        # character?!  susv2/xbd/locale.html should probably speak of
        # "space characters", not "space character".
        if (is_print(code_point)
                and not (is_graph(code_point) or is_space(code_point))):
            report('%(sym)s is print but not graph|<space>\n' %{
                'sym': ucs_symbol(code_point)})
        if (not is_print(code_point)
                and (is_graph(code_point) or code_point == 0x0020)):
            report('%(sym)s is graph|<space> but not print\n' %{
                'sym': ucs_symbol(code_point)})
def read_input_file(filename):
    '''Read the original glibc i18n file and return its (head, tail).

    head is everything up to and including the line starting with
    'LC_CTYPE'; a date line of the form  date "YYYY-MM-DD"  found in
    that part gets today's date.  tail starts at the first following
    line which starts with 'translit_start' and runs to the end of the
    file.  The lines between the two, i.e. the old character classes,
    are dropped.  Either part is '' if its marker line is missing.

    Having both parts allows writing a complete result file instead
    of pasting the generated classes into the original by hand.
    '''
    date_line = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    head_lines = []
    tail_lines = []
    with open(filename, mode='r') as i18n_file:
        # All three steps consume the same iterator, so each one
        # continues where the previous one stopped.
        for line in i18n_file:
            found = date_line.match(line)
            if found:
                line = found.group('key') \
                    + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head_lines.append(line)
            if line.startswith('LC_CTYPE'):
                break
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail_lines.append(line)
                break
        tail_lines.extend(i18n_file)
    return (''.join(head_lines), ''.join(tail_lines))
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. everything up to and
    including the LC_CTYPE line.

    If an input file was given on the command line and a non-empty
    head was read from it, that head is copied verbatim.  Otherwise a
    minimal header with an LC_IDENTIFICATION section for
    unicode_version and today's date is generated.
    '''
    emit = i18n_file.write
    if ARGS.input_file and head:
        emit(head)
        return
    emit('escape_char /\n')
    emit('comment_char %\n')
    emit('\n')
    emit('% Generated automatically by gen_unicode_ctype.py '
         'for Unicode {:s}.\n'.format(unicode_version))
    emit('\n')
    emit('LC_IDENTIFICATION\n')
    emit('title "Unicode {:s} FDCC-set"\n'.format(unicode_version))
    emit('source "UnicodeData.txt, DerivedCoreProperties.txt"\n')
    emit('address ""\n')
    emit('contact ""\n')
    emit('email "bug-glibc-locales@gnu.org"\n')
    emit('tel ""\n')
    emit('fax ""\n')
    emit('language ""\n')
    emit('territory "Earth"\n')
    emit('revision "{:s}"\n'.format(unicode_version))
    emit('date "{:s}"\n'.format(time.strftime('%Y-%m-%d')))
    emit('category "unicode:2014";LC_CTYPE\n')
    emit('END LC_IDENTIFICATION\n')
    emit('\n')
    emit('LC_CTYPE\n')
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. everything after the
    last LC_CTYPE character class.

    The tail read from the input file is copied verbatim when an input
    file was given and the tail is not empty; otherwise only the line
    closing the LC_CTYPE section is written.
    '''
    copy_original = ARGS.input_file and tail
    i18n_file.write(tail if copy_original else 'END LC_CTYPE\n')
def output_tables(i18n_file, unicode_version):
    '''Write the new LC_CTYPE character classes and mapping tables,
    together with their explanatory comments, to the output file.'''
    emit = i18n_file.write
    emit('% The following is the 14652 i18n fdcc-set '
         'LC_CTYPE category.\n')
    emit('% It covers Unicode version {:s}.\n'.format(unicode_version))
    emit('% The character classes and mapping tables were '
         'automatically\n')
    emit('% generated using the gen_unicode_ctype.py program.\n\n')

    emit('% The "upper" class reflects the uppercase '
         'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', is_upper)

    emit('% The "lower" class reflects the lowercase '
         'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', is_lower)

    emit('% The "alpha" class of the "i18n" FDCC-set is reflecting\n')
    emit('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', is_alpha)

    emit('% The "digit" class must only contain the '
         'BASIC LATIN digits, says ISO C 99\n')
    emit('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', is_digit)

    # "outdigit" is only mentioned in a comment of the generated file,
    # it is deliberately not written as a class.
    emit('% The "outdigit" information is by default '
         '"0" to "9". We don\'t have to\n')
    emit('% provide it here since localedef will fill '
         'in the bits and it would\n')
    emit('% prevent locales copying this file define '
         'their own values.\n')
    emit('% outdigit /\n')
    emit('% <U0030>..<U0039>\n\n')

    for class_name, is_class_function in (('space', is_space),
                                          ('cntrl', is_cntrl),
                                          ('punct', is_punct),
                                          ('graph', is_graph),
                                          ('print', is_print)):
        output_charclass(i18n_file, class_name, is_class_function)

    emit('% The "xdigit" class must only contain the '
         'BASIC LATIN digits and A-F, a-f,\n')
    emit('% says ISO C 99 (sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', is_xdigit)
    output_charclass(i18n_file, 'blank', is_blank)

    for map_name, map_function in (('toupper', to_upper),
                                   ('tolower', to_lower),
                                   ('map "totitle";', to_title)):
        output_charmap(i18n_file, map_name, map_function)

    emit('% The "combining" class reflects ISO/IEC 10646-1 '
         'annex B.1\n')
    emit('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";', is_combining)

    emit('% The "combining_level3" class reflects '
         'ISO/IEC 10646-1 annex B.2\n')
    emit('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file,
                     'class "combining_level3";', is_combining_level3)
if __name__ == "__main__":
    # Command line interface.
    PARSER = argparse.ArgumentParser(
        description=(
            '\n'
            'Generate a Unicode conforming LC_CTYPE category from\n'
            'UnicodeData.txt and DerivedCoreProperties.txt files.\n'))
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?', type=str, default='UnicodeData.txt',
        help='The UnicodeData.txt file to read, default: %(default)s')
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?', type=str, default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?', type=str,
        help='The original glibc/localedata/locales/i18n file.')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?', type=str, default='i18n.new',
        help=('The file which shall contain the generated '
              'LC_CTYPE category,\n'
              'default: %(default)s. If the original\n'
              'glibc/localedata/locales/i18n has been given\n'
              'as an option, all data from the original file\n'
              'except the newly generated LC_CTYPE character\n'
              'classes and the date stamp in\n'
              'LC_IDENTIFICATION will be copied unchanged\n'
              'into the output file. '))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?', required=True, type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # Load the Unicode data and report violated class restrictions
    # before anything is written.
    fill_attributes(ARGS.unicode_data_file)
    fill_derived_core_properties(ARGS.derived_core_properties_file)
    verifications()

    # With an input file, everything around the character classes is
    # taken over from it.
    HEAD, TAIL = '', ''
    if ARGS.input_file:
        HEAD, TAIL = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version)
        output_tail(I18N_FILE, tail=TAIL)

View File

@ -0,0 +1,50 @@
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
Unicode Data Files include all data files under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/, and
http://www.unicode.org/cldr/data/. Unicode Data Files do not include PDF
online code charts under the directory http://www.unicode.org/Public/.
Software includes any source code published in the Unicode Standard or under
the directories http://www.unicode.org/Public/,
http://www.unicode.org/reports/, and http://www.unicode.org/cldr/data/.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES
("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND
AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF
YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA
FILES OR SOFTWARE.
COPYRIGHT AND PERMISSION NOTICE
Copyright © 1991-2013 Unicode, Inc. All rights reserved. Distributed under
the Terms of Use in http://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining a
copy of the Unicode data files and any associated documentation (the "Data
Files") or Unicode software and any associated documentation (the "Software")
to deal in the Data Files or Software without restriction, including without
limitation the rights to use, copy, modify, merge, publish, distribute, and/or
sell copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that (a) the above
copyright notice(s) and this permission notice appear with all copies of the
Data Files or Software, (b) both the above copyright notice(s) and this
permission notice appear in associated documentation, and (c) there is clear
notice in each modified Data File or in the Software as well as in the
documentation associated with the Data File(s) or Software that the data or
software has been modified.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD
PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE
DATA FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written authorization
of the copyright holder.

View File

@ -0,0 +1,399 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
This script is useful for checking backward compatibility of newly
generated UTF-8 file from utf8_gen.py script
To see how this script is used, call it with the -h option:
$ ./utf8_compatibility.py -h
prints usage message
'''
import sys
import re
import argparse
# Dictionary holding the entire contents of the UnicodeData.txt file,
# keyed by code point (int).  It is filled by fill_attributes() and
# stays empty if no UnicodeData.txt file is given on the command line.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}
# Dictionary holding the entire contents of the EastAsianWidth.txt file,
# mapping each code point (int) to its East Asian width property.  It is
# filled by fill_east_asian_widths() and stays empty if no
# EastAsianWidth.txt file is given on the command line.
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
def fill_attribute(code_point, fields):
    '''Store the values from fields in UNICODE_ATTRIBUTES[code_point].

    fields is one line of the UnicodeData.txt file split at ';', so
    one entry of UNICODE_ATTRIBUTES represents one line of that file.
    The three case mappings are stored as integers, or as None where
    the field is empty; all other fields are stored as strings.
    '''
    def code_point_or_none(hex_string):
        '''Convert a hexadecimal field, mapping an empty one to None'''
        return int(hex_string, 16) if hex_string else None

    # Keys for fields[1] … fields[11], in file order.
    string_keys = (
        'name',           # Character name
        'category',       # General category
        'combining',      # Canonical combining classes
        'bidi',           # Bidirectional category
        'decomposition',  # Character decomposition mapping
        'decdigit',       # Decimal digit value
        'digit',          # Digit value
        'numeric',        # Numeric value
        'mirrored',       # mirrored
        'oldname',        # Old Unicode 1.0 name
        'comment',        # comment
    )
    attributes = dict(zip(string_keys, fields[1:12]))
    attributes['upper'] = code_point_or_none(fields[12])  # Uppercase mapping
    attributes['lower'] = code_point_or_none(fields[13])  # Lowercase mapping
    attributes['title'] = code_point_or_none(fields[14])  # Titlecase mapping
    UNICODE_ATTRIBUTES[code_point] = attributes
def fill_attributes(filename):
    '''Store the entire contents of the UnicodeData.txt file in the
    UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Each code point of such a range gets an entry of its own, named
    like the range ('CJK Ideograph').  Surrogates are skipped.  A line
    which does not have 15 fields, or a 'Last' line which does not
    match the preceding 'First' line, is reported on stderr and ends
    the program with exit status 1.
    '''
    with open(filename, mode='r') as unicode_data_file:
        range_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                exit(1)
            name = fields[1]
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                range_start = []
            elif name.endswith(', First>'):
                # Remember the start of the range under its bare name,
                # '<CJK Ideograph, First>' becomes 'CJK Ideograph'.
                fields[1] = name.split(',')[0][1:]
                range_start = fields
            elif name.endswith(', Last>'):
                fields[1] = name.split(',')[0][1:]
                if fields[1:] != range_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    exit(1)
                for code_point in range(
                        int(range_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                range_start = []
            else:
                fill_attribute(int(fields[0], 16), fields)
                range_start = []
def fill_east_asian_widths(filename):
    '''Store the entire contents of the EastAsianWidth.txt file in the
    EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidth.txt are either a code point range like
    this:

    9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W # Lm YI SYLLABLE WU

    Lines of any other form are ignored.
    '''
    width_line = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        +r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            found = width_line.match(line)
            if not found:
                continue
            first = found.group('codepoint1')
            # A single code point is a range which ends where it starts.
            last = found.group('codepoint2') or first
            for code_point in range(int(first, 16), int(last, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = found.group('property')
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with four hexadecimal
    digits (<U0041>), all others with eight (<U0001F130>).
    '''
    digits = '{:04X}' if code_point < 0x10000 else '{:08X}'
    return ('<U' + digits + '>').format(code_point)
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    The dictionary maps each code point (int) to the UTF-8 byte
    sequence given for its line, e.g. '/xe3/x90/x80'.  For a range
    line like

    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>

    every code point of the range gets the byte sequence of the line.
    Comment lines (starting with '%') and lines which do not have the
    expected form are skipped.

    If the file has no "CHARMAP" or no "END CHARMAP" line, a message
    is written to stderr and the program exits with status 1.
    '''
    # The optional second group is the upper end of a range.
    charmap_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        +r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        +r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to the start of the section …
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        # … and continue with the same iterator inside the section.
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            match = charmap_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2') or codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     %file_name)
    sys.exit(1)
def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file

    The numbers of removed, changed and added code points are always
    printed.  The individual code points are listed only if the
    corresponding --show_… option was given.  The character name is
    printed as 'None' for code points which are not in
    UNICODE_ATTRIBUTES, e.g. when no UnicodeData.txt file was given.
    '''
    def character_name(code_point):
        '''Name of the character, 'None' if the character is unknown'''
        # The fallback has to be a string: formatting None with
        # '{:s}' raises a TypeError.
        if code_point in UNICODE_ATTRIBUTES:
            return UNICODE_ATTRIBUTES[code_point]['name']
        return 'None'

    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    removed = sorted(set(ocharmap)-set(ncharmap))
    added = sorted(set(ncharmap)-set(ocharmap))
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(removed))
    if ARGS.show_missing_characters:
        for key in removed:
            print('removed: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ocharmap[key],
                character_name(key)))
    print('------------------------------------------------------------')
    changed_charmap = {}
    for key in set(ocharmap).intersection(set(ncharmap)):
        if ocharmap[key] != ncharmap[key]:
            changed_charmap[key] = (ocharmap[key], ncharmap[key])
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                character_name(key)))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated CHARMAP: %d'
          %len(added))
    if ARGS.show_added_characters:
        for key in added:
            print('added: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ncharmap[key],
                character_name(key)))
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    The dictionary maps each code point (int) to the width (0 or 2)
    given for its line.  For a range line like

    <U1100>...<U115F> 2

    every code point of the range gets the width of the line.  Lines
    which do not have the expected form are skipped.

    If the file has no "WIDTH" or no "END WIDTH" line, a message is
    written to stderr and the program exits with status 1, like
    create_charmap_dictionary() does for the CHARMAP section.
    '''
    # The optional second group is the upper end of a range; ranges
    # are written with three dots in the WIDTH section.
    width_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        +r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        +r'\s+(?P<width>[02])')
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything up to the start of the section …
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        # … and continue with the same iterator inside the section.
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = width_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2') or codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     %file_name)
    sys.exit(1)
def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file

    The numbers of removed, changed and added code points are always
    printed.  The individual code points are listed only if the
    corresponding --show_… option was given.  Properties of code
    points which are not in EAST_ASIAN_WIDTHS or UNICODE_ATTRIBUTES,
    e.g. because the data file was not given, are printed as 'None'.
    '''
    def properties(code_point):
        '''East Asian width, category, bidi class and name of a
        character, formatted for the report'''
        # The fallbacks have to be strings: formatting None with
        # '{:s}' raises a TypeError.
        attributes = UNICODE_ATTRIBUTES.get(code_point)
        if attributes is None:
            attributes = {'category': 'None', 'bidi': 'None', 'name': 'None'}
        return ('eaw={:s} '.format(
                    EAST_ASIAN_WIDTHS.get(code_point, 'None'))
                + 'category={:2s} '.format(attributes['category'])
                + 'bidi={:3s} '.format(attributes['bidi'])
                + 'name={:s}'.format(attributes['name']))

    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    removed = sorted(set(owidth)-set(nwidth))
    added = sorted(set(nwidth)-set(owidth))
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d'
          %len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in removed:
            print('removed: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + properties(key))
    print('------------------------------------------------------------')
    changed_width = {}
    for key in set(owidth).intersection(set(nwidth)):
        if owidth[key] != nwidth[key]:
            changed_width[key] = (owidth[key], nwidth[key])
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + properties(key))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d'
          %len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in added:
            print('added: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + properties(key))
if __name__ == "__main__":
    # Command line interface.  The description names what this script
    # really compares: the CHARMAP and WIDTH sections of UTF-8 charmap
    # files (LC_CTYPE is checked by ctype_compatibility.py).
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 charmap
        files and report the differences.
        ''')
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help=('Show characters whose encoding or width was changed '
              + 'in detail.'))
    ARGS = PARSER.parse_args()

    # Both data files are optional; they are only used to print more
    # details about the characters in the reports.
    if ARGS.unicode_data_file:
        fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)

View File

@ -0,0 +1,286 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''glibc/localedata/charmaps/UTF-8 file generator script
This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.
Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
It will output UTF-8 file
'''
import sys
import re
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.
#
# A syllable name is 'HANGUL SYLLABLE ' followed by one short name from
# each table: 19 initial consonants, 21 medial vowels and 28 finals.
# The empty strings stand for the silent initial consonant and for
# "no final consonant".
jamo_initial_short_name = [
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
]
jamo_medial_short_name = [
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
]
# The sixth final is 'NJ' (U+AC05 is HANGUL SYLLABLE GANJ), not 'NI'.
jamo_final_short_name = [
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
]
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with four hexadecimal
    digits (<U0041>), all others with eight (<U0001F130>).
    '''
    if code_point >= 0x10000:
        return '<U{:08X}>'.format(code_point)
    return '<U{:04X}>'.format(code_point)
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    start and end are the first and the last code point of the range
    as hexadecimal strings (the first field of the UnicodeData.txt
    lines), both inclusive.  name is the name of the range, which is
    written as the last column of each output line.
    '''
    first = int(start, 16)
    last = int(end, 16)
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24 Bruno Haible <haible@clisp.cons.org>
        # * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        # so they become printable and carry a width. Comment out surrogate
        # ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here, one line per
        # syllable with its algorithmically derived name:
        for i in range(first, last + 1):
            # 28 finals per medial, 21 medials per initial.
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                + jamo_initial_short_name[index1] \
                + jamo_medial_short_name[index2] \
                + jamo_final_short_name[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    #
    # The stop value of range() is exclusive, so it has to be last + 1:
    # otherwise the last code point is lost whenever last - first is a
    # multiple of 64.
    for chunk_first in range(first, last + 1, 64):
        chunk_last = min(chunk_first + 63, last)
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            ucs_symbol(chunk_first),
            ucs_symbol(chunk_last),
            convert_to_hex(chunk_first),
            name))
def process_charmap(flines, outfile):
    '''Write the body of the CHARMAP section of the UTF-8 charmap file.

    flines is a list holding *all* lines of UnicodeData.txt; the lines
    that belong between

        CHARMAP
        ...
        END CHARMAP

    in glibc/localedata/charmaps/UTF-8 are written to outfile.

    Samples for input lines:

        0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
        3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
        DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
        100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
        10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

        <U0010>     /x10 DATA LINK ESCAPE
        <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
        %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
        %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
        <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>
    '''
    range_first = []
    for line in flines:
        fields = line.split(";")
        # Some characters are simply named "<control>".  Prefer their
        # "Unicode 1.0 Name" (field index 10 of UnicodeData.txt) when it
        # is present.
        #
        # U+0080, U+0081, U+0084 and U+0099 are named "<control>" but do
        # not have a "Unicode 1.0 Name" either.  Their alternate names
        # could be taken from NameAliases.txt, which is not done here.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        name = fields[1]
        is_surrogate = 'Surrogate,' in name
        # A non-surrogate range is given as a pair of lines such as:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        #
        # Remember the "First" line and emit the range on "Last".
        if not is_surrogate and name.endswith(', First>'):
            range_first = fields
            continue
        if not is_surrogate and name.endswith(', Last>'):
            # Strip the trailing ', Last>' and close the bracket again.
            process_range(range_first[0], fields[0],
                          outfile, name[:-7] + '>')
            range_first = []
            continue
        range_first = []
        if is_surrogate:
            # Surrogates are written as comment lines.  They could be
            # skipped entirely, but the original UTF-8 file in glibc
            # carried them as comments, so they are kept that way.
            outfile.write('%')
        code_point = int(fields[0], 16)
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            ucs_symbol(code_point),
            convert_to_hex(code_point),
            name))
def convert_to_hex(code_point):
    '''Convert a code point to its hexadecimal UTF-8 representation,
    written like /x**/x**/x**.

    code_point -- integer Unicode code point

    Returns one lower-case '/xNN' group per byte of the UTF-8 encoding.
    '''
    # A plain .encode('UTF-8') raises UnicodeEncodeError for surrogate
    # code points (U+D800..U+DFFF).  The 'surrogatepass' error handler
    # encodes them like any other three-byte code point, so every
    # surrogate is supported rather than only a few range boundaries.
    encoded = chr(code_point).encode('UTF-8', 'surrogatepass')
    return ''.join('/x{:02x}'.format(byte) for byte in encoded)
def write_header_charmap(outfile):
    '''Emit the lines that precede the CHARMAP entries, up to and
    including the "CHARMAP" keyword itself, to outfile.'''
    header_lines = (
        "<code_set_name> UTF-8\n",
        "<comment_char> %\n",
        "<escape_char> /\n",
        "<mb_cur_min> 1\n",
        "<mb_cur_max> 6\n\n",
        "% CHARMAP generated using utf8_gen.py\n",
        "% alias ISO-10646/UTF-8\n",
        "CHARMAP\n",
    )
    for header_line in header_lines:
        outfile.write(header_line)
def write_header_width(outfile):
    '''Emit the explanatory comment lines that open the WIDTH section,
    followed by the "WIDTH" keyword, to outfile.'''
    # No separate note is written for the "ZERO WIDTH ..." characters:
    # they are already covered by the format control (Cf) entry.
    header_lines = (
        '% Character width according to Unicode 7.0.0.\n',
        '% - Default width is 1.\n',
        '% - Double-width characters have width 2; generated from\n',
        '% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n',
        '% - Non-spacing characters have width 0; generated from PropList.txt or\n',
        '% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' UnicodeData.txt"\n',
        '% - Format control characters have width 0; generated from\n',
        "% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n",
        "WIDTH\n",
    )
    for header_line in header_lines:
        outfile.write(header_line)
def process_width(outfile, ulines, elines):
    '''Write the entries of the WIDTH section to outfile.

    ulines -- lines from UnicodeData.txt
    elines -- lines from EastAsianWidth.txt, either a single code point
              ("XXXX;...") or a range ("XXXX..YYYY;...")

    Entries are written in ascending order of their (first) code point.
    '''
    width_dict = {}
    # Non-spacing marks and format control characters get width 0.
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] == "Cf":
            code_point = int(fields[0], 16)
            width_dict[code_point] = ucs_symbol(code_point) + '\t0'
    # Every line from EastAsianWidth.txt yields width 2 and takes
    # precedence over whatever UnicodeData.txt contributed above.
    for line in elines:
        spec = line.split(";")[0]
        if '..' not in spec:
            code_point = int(spec, 16)
            width_dict[code_point] = ucs_symbol(code_point) + '\t2'
            continue
        bounds = spec.split("..")
        low = int(bounds[0], 16)
        high = int(bounds[1], 16)
        # Drop individual entries inside the range; a single range
        # entry keyed by its first code point replaces them.
        for key in range(low, high + 1):
            width_dict.pop(key, None)
        width_dict[low] = '{:s}...{:s}\t2'.format(
            ucs_symbol(low), ucs_symbol(high))
    for key in sorted(width_dict):
        outfile.write(width_dict[key] + '\n')
if __name__ == "__main__":
    # Script entry point: read UnicodeData.txt and EastAsianWidth.txt
    # (paths given on the command line) and write the charmap to a
    # file named 'UTF-8' in the current directory.
    if len(sys.argv) >= 3:
        with open(sys.argv[1], mode='r') as UNIDATA_FILE:
            UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
        with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
            EAST_ASIAN_WIDTH_LINES = []
            for LINE in EAST_ASIAN_WIDTH_FILE:
                # Entries of EastAsianWidth.txt that describe reserved
                # ranges (i.e. code points not yet assigned) must not
                # reach the WIDTH section: such code points are absent
                # from the CHARMAP section, and "make check" would then
                # report "Unknown Character" errors for them.
                #
                # So reserved ranges are skipped, and of the remaining
                # lines only those with width class W or F are kept.
                if (re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE)
                        is None
                        and re.match(r'^[^;]*;[WF]', LINE)):
                    EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
        with open('UTF-8', mode='w') as OUTFILE:
            # CHARMAP section, generated from UnicodeData.txt.
            write_header_charmap(OUTFILE)
            process_charmap(UNICODE_DATA_LINES, OUTFILE)
            OUTFILE.write("END CHARMAP\n\n")
            # WIDTH section, generated from both input files.
            write_header_width(OUTFILE)
            process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
            OUTFILE.write("END WIDTH\n")
    else:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")