auto merge of #10621 : Florob/rust/unicode63, r=cmr

This updates the unicode.rs file to the latest Unicode version, released 2013-09-30.
This commit is contained in:
bors 2013-11-27 16:47:14 -08:00
commit 503e5df3f2
5 changed files with 1340 additions and 675 deletions

View File

@ -5,7 +5,7 @@
# code covering the core properties. Since this is a pretty rare event we
# just store this out-of-line and check the unicode.rs file into git.
#
# The emitted code is "the minimum we think is necessary for libcore", that
# The emitted code is "the minimum we think is necessary for libstd", that
# is, to support basic operations of the compiler and "most nontrivial rust
# programs". It is not meant to be a complete implementation of unicode.
# For that we recommend you use a proper binding to libicu.
@ -41,7 +41,7 @@ def load_unicode_data(f):
continue
[code, name, gencat, combine, bidi,
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcsae, titlecase ] = fields
old, iso, upcase, lowcase, titlecase ] = fields
code = int(code, 16)
@ -89,11 +89,9 @@ def load_unicode_data(f):
return (canon_decomp, compat_decomp, gencats, combines)
def load_derived_core_properties(f):
def load_properties(f, interestingprops):
fetch(f)
derivedprops = {}
interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"]
props = {}
re1 = re.compile("^([0-9A-F]+) +; (\w+)")
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")
@ -118,10 +116,10 @@ def load_derived_core_properties(f):
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
if prop not in derivedprops:
derivedprops[prop] = []
derivedprops[prop].append((d_lo, d_hi))
return derivedprops
if prop not in props:
props[prop] = []
props[prop].append((d_lo, d_hi))
return props
def escape_char(c):
if c <= 0xff:
@ -144,7 +142,7 @@ def emit_bsearch_range_table(f):
use cmp::{Equal, Less, Greater};
use vec::ImmutableVector;
use option::None;
(do r.bsearch |&(lo,hi)| {
r.bsearch(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
@ -302,14 +300,14 @@ def emit_decomp_module(f, canon, compat, combine):
ix += 1
f.write("\n ];\n")
f.write(" pub fn canonical(c: char, i: &fn(char)) "
f.write(" pub fn canonical(c: char, i: |char|) "
+ "{ d(c, i, false); }\n\n")
f.write(" pub fn compatibility(c: char, i: &fn(char)) "
f.write(" pub fn compatibility(c: char, i: |char|) "
+"{ d(c, i, true); }\n\n")
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
+ " bsearch_range_value_table(c, combining_class_table)\n"
+ " }\n\n")
f.write(" fn d(c: char, i: &fn(char), k: bool) {\n")
f.write(" fn d(c: char, i: |char|, k: bool) {\n")
f.write(" use iter::Iterator;\n");
f.write(" if c <= '\\x7f' { i(c); return; }\n")
@ -376,5 +374,9 @@ emit_property_module(rf, "general_category", gencats)
emit_decomp_module(rf, canon_decomp, compat_decomp, combines)
derived = load_derived_core_properties("DerivedCoreProperties.txt")
derived = load_properties("DerivedCoreProperties.txt",
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
emit_property_module(rf, "derived_property", derived)
props = load_properties("PropList.txt", ["White_Space"])
emit_property_module(rf, "property", props)

View File

@ -14,7 +14,7 @@ use cast::transmute;
use option::{None, Option, Some};
use iter::{Iterator, range_step};
use str::StrSlice;
use unicode::{derived_property, general_category, decompose};
use unicode::{derived_property, property, general_category, decompose};
use to_str::ToStr;
use str;
@ -89,30 +89,28 @@ pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
///
/// Indicates whether a character is in lower case, defined
/// in terms of the Unicode General Category 'Ll'
/// in terms of the Unicode Derived Core Property 'Lowercase'.
///
#[inline]
pub fn is_lowercase(c: char) -> bool { general_category::Ll(c) }
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
///
/// Indicates whether a character is in upper case, defined
/// in terms of the Unicode General Category 'Lu'.
/// in terms of the Unicode Derived Core Property 'Uppercase'.
///
#[inline]
pub fn is_uppercase(c: char) -> bool { general_category::Lu(c) }
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
///
/// Indicates whether a character is whitespace. Whitespace is defined in
/// terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
/// additional 'Cc'-category control codes in the range [0x09, 0x0d]
/// terms of the Unicode Property 'White_Space'.
///
#[inline]
pub fn is_whitespace(c: char) -> bool {
// As an optimization ASCII whitespace characters are checked separately
c == ' '
|| ('\x09' <= c && c <= '\x0d')
|| general_category::Zs(c)
|| general_category::Zl(c)
|| general_category::Zp(c)
|| property::White_Space(c)
}
///

File diff suppressed because it is too large Load Diff

View File

@ -51,34 +51,34 @@ fn f() {
CR4+2: (should align)
*/
/*
// (NEL deliberately omitted)
NEL4+2: (should align)
*/
/*
Ogham Space Mark 4+2: (should align)
*/
/*
Mongolian Vowel Separator 4+2: (should align)
Ogham Space Mark 4+2: (should align)
*/
/*
Four-per-em space 4+2: (should align)
*/
/*
Mongolian Vowel Sep count 1: (should align)
Mongolian Vowel Sep count 2: (should align)
Mongolian Vowel Sep count 3: (should align)
Mongolian Vowel Sep count 4: (should align)
Mongolian Vowel Sep count 5: (should align)
Mongolian Vowel Sep count 6: (should align)
Mongolian Vowel Sep count 7: (should align)
Mongolian Vowel Sep count 8: (should align)
Mongolian Vowel Sep count 9: (should align)
Mongolian Vowel Sep count A: (should align)
Mongolian Vowel Sep count B: (should align)
Mongolian Vowel Sep count C: (should align)
Mongolian Vowel Sep count D: (should align)
Mongolian Vowel Sep count E: (should align)
Mongolian Vowel Sep count F: (should align)
Ogham Space Mark count 1: (should align)
Ogham Space Mark count 2: (should align)
Ogham Space Mark count 3: (should align)
Ogham Space Mark count 4: (should align)
Ogham Space Mark count 5: (should align)
Ogham Space Mark count 6: (should align)
Ogham Space Mark count 7: (should align)
Ogham Space Mark count 8: (should align)
Ogham Space Mark count 9: (should align)
Ogham Space Mark count A: (should align)
Ogham Space Mark count B: (should align)
Ogham Space Mark count C: (should align)
Ogham Space Mark count D: (should align)
Ogham Space Mark count E: (should align)
Ogham Space Mark count F: (should align)
*/
@ -88,26 +88,25 @@ fn f() {
/*
Hello from offset 6
Space 6+2: compare A
Mongolian Vowel Separator 6+2: compare B
Ogham Space Mark 6+2: compare B
*/
/**/
/**/
/*
Hello from another offset 6 with wchars establishing column offset
Space 6+2: compare C
Mongolian Vowel Separator 6+2: compare D
Ogham Space Mark 6+2: compare D
*/
}
fn main() {
// Taken from http://en.wikipedia.org/wiki/Whitespace_character
// Taken from http://www.unicode.org/Public/UNIDATA/PropList.txt
let chars =
['\x0A', '\x0B', '\x0C', '\x0D', '\x20',
// '\x85', // for some reason Rust thinks NEL isn't whitespace
'\xA0', '\u1680', '\u180E', '\u2000', '\u2001', '\u2002', '\u2003',
'\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
'\u2028', '\u2029', '\u202F', '\u205F', '\u3000'];
['\x0A', '\x0B', '\x0C', '\x0D', '\x20', '\x85', '\xA0', '\u1680',
'\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006',
'\u2007', '\u2008', '\u2009', '\u200A', '\u2028', '\u2029', '\u202F',
'\u205F', '\u3000'];
for c in chars.iter() {
let ws = c.is_whitespace();
println!("{:?} {:?}" , c , ws);

View File

@ -51,55 +51,54 @@ fn f() {
CR4+2: (should align)
*/
/*
// (NEL deliberately omitted)
………… NEL4+2: (should align)
*/
/*
Ogham Space Mark 4+2: (should align)
*/
/*
Mongolian Vowel Separator 4+2: (should align)
Ogham Space Mark 4+2: (should align)
*/
/*
Four-per-em space 4+2: (should align)
*/
/*
Mongolian Vowel Sep count 1: (should align)
Mongolian Vowel Sep count 2: (should align)
Mongolian Vowel Sep count 3: (should align)
Mongolian Vowel Sep count 4: (should align)
Mongolian Vowel Sep count 5: (should align)
Mongolian Vowel Sep count 6: (should align)
Mongolian Vowel Sep count 7: (should align)
Mongolian Vowel Sep count 8: (should align)
Mongolian Vowel Sep count 9: (should align)
Mongolian Vowel Sep count A: (should align)
Mongolian Vowel Sep count B: (should align)
Mongolian Vowel Sep count C: (should align)
Mongolian Vowel Sep count D: (should align)
Mongolian Vowel Sep count E: (should align)
Mongolian Vowel Sep count F: (should align)
Ogham Space Mark count 1: (should align)
Ogham Space Mark count 2: (should align)
Ogham Space Mark count 3: (should align)
Ogham Space Mark count 4: (should align)
Ogham Space Mark count 5: (should align)
Ogham Space Mark count 6: (should align)
Ogham Space Mark count 7: (should align)
Ogham Space Mark count 8: (should align)
Ogham Space Mark count 9: (should align)
Ogham Space Mark count A: (should align)
Ogham Space Mark count B: (should align)
Ogham Space Mark count C: (should align)
Ogham Space Mark count D: (should align)
Ogham Space Mark count E: (should align)
Ogham Space Mark count F: (should align)
*/
/* */ /*
Hello from offset 6
Space 6+2: compare A
Mongolian Vowel Separator 6+2: compare B
Ogham Space Mark 6+2: compare B
*/
/**/ /*
/**/ /*
Hello from another offset 6 with wchars establishing column offset
Space 6+2: compare C
Mongolian Vowel Separator 6+2: compare D
Ogham Space Mark 6+2: compare D
*/
}
fn main() {
// Taken from http://en.wikipedia.org/wiki/Whitespace_character
// Taken from http://www.unicode.org/Public/UNIDATA/PropList.txt
let chars =
['\x0A', '\x0B', '\x0C', '\x0D', '\x20',
// '\x85', // for some reason Rust thinks NEL isn't whitespace
'\xA0', '\u1680', '\u180E', '\u2000', '\u2001', '\u2002', '\u2003',
['\x0A', '\x0B', '\x0C', '\x0D', '\x20', '\x85',
'\xA0', '\u1680', '\u2000', '\u2001', '\u2002', '\u2003',
'\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
'\u2028', '\u2029', '\u202F', '\u205F', '\u3000'];
for c in chars.iter() {