auto merge of #10621 : Florob/rust/unicode63, r=cmr

This updates the unicode.rs file to the latest Unicode version, released 2013-09-30.
2013-11-27 16:47:14 -08:00 · 2013-11-27 16:47:14 -08:00 · 503e5df3f2
parent d2c405eeff dfe38dbca4
commit 503e5df3f2
5 changed files with 1340 additions and 675 deletions
--- a/src/etc/unicode.py
+++ b/src/etc/unicode.py
@ -5,7 +5,7 @@
 # code covering the core properties. Since this is a pretty rare event we
 # just store this out-of-line and check the unicode.rs file into git.
 #
-# The emitted code is "the minimum we think is necessary for libcore", that
+# The emitted code is "the minimum we think is necessary for libstd", that
 # is, to support basic operations of the compiler and "most nontrivial rust
 # programs". It is not meant to be a complete implementation of unicode.
 # For that we recommend you use a proper binding to libicu.
@ -41,7 +41,7 @@ def load_unicode_data(f):
            continue
        [code, name, gencat, combine, bidi,
         decomp, deci, digit, num, mirror,
-         old, iso, upcase, lowcsae, titlecase ] = fields
+         old, iso, upcase, lowcase, titlecase ] = fields

        code = int(code, 16)

@ -89,11 +89,9 @@ def load_unicode_data(f):

    return (canon_decomp, compat_decomp, gencats, combines)

-
-def load_derived_core_properties(f):
+def load_properties(f, interestingprops):
    fetch(f)
-    derivedprops = {}
-    interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"]
+    props = {}
    re1 = re.compile("^([0-9A-F]+) +; (\w+)")
    re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")

@ -118,10 +116,10 @@ def load_derived_core_properties(f):
            continue
        d_lo = int(d_lo, 16)
        d_hi = int(d_hi, 16)
-        if prop not in derivedprops:
-            derivedprops[prop] = []
-        derivedprops[prop].append((d_lo, d_hi))
-    return derivedprops
+        if prop not in props:
+            props[prop] = []
+        props[prop].append((d_lo, d_hi))
+    return props

 def escape_char(c):
    if c <= 0xff:
@ -144,7 +142,7 @@ def emit_bsearch_range_table(f):
        use cmp::{Equal, Less, Greater};
        use vec::ImmutableVector;
        use option::None;
-        (do r.bsearch |&(lo,hi)| {
+        r.bsearch(|&(lo,hi)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
@ -302,14 +300,14 @@ def emit_decomp_module(f, canon, compat, combine):
        ix += 1
    f.write("\n    ];\n")

-    f.write("    pub fn canonical(c: char, i: &fn(char)) "
+    f.write("    pub fn canonical(c: char, i: |char|) "
        + "{ d(c, i, false); }\n\n")
-    f.write("    pub fn compatibility(c: char, i: &fn(char)) "
+    f.write("    pub fn compatibility(c: char, i: |char|) "
            +"{ d(c, i, true); }\n\n")
    f.write("    pub fn canonical_combining_class(c: char) -> u8 {\n"
        + "        bsearch_range_value_table(c, combining_class_table)\n"
        + "    }\n\n")
-    f.write("    fn d(c: char, i: &fn(char), k: bool) {\n")
+    f.write("    fn d(c: char, i: |char|, k: bool) {\n")
    f.write("        use iter::Iterator;\n");

    f.write("        if c <= '\\x7f' { i(c); return; }\n")
@ -376,5 +374,9 @@ emit_property_module(rf, "general_category", gencats)

 emit_decomp_module(rf, canon_decomp, compat_decomp, combines)

-derived = load_derived_core_properties("DerivedCoreProperties.txt")
+derived = load_properties("DerivedCoreProperties.txt",
+        ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
 emit_property_module(rf, "derived_property", derived)
+
+props = load_properties("PropList.txt", ["White_Space"])
+emit_property_module(rf, "property", props)
--- a/src/libstd/char.rs
+++ b/src/libstd/char.rs
@ -14,7 +14,7 @@ use cast::transmute;
 use option::{None, Option, Some};
 use iter::{Iterator, range_step};
 use str::StrSlice;
-use unicode::{derived_property, general_category, decompose};
+use unicode::{derived_property, property, general_category, decompose};
 use to_str::ToStr;
 use str;

@ -89,30 +89,28 @@ pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }

 ///
 /// Indicates whether a character is in lower case, defined
-/// in terms of the Unicode General Category 'Ll'
+/// in terms of the Unicode Derived Core Property 'Lowercase'.
 ///
 #[inline]
-pub fn is_lowercase(c: char) -> bool { general_category::Ll(c) }
+pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }

 ///
 /// Indicates whether a character is in upper case, defined
-/// in terms of the Unicode General Category 'Lu'.
+/// in terms of the Unicode Derived Core Property 'Uppercase'.
 ///
 #[inline]
-pub fn is_uppercase(c: char) -> bool { general_category::Lu(c) }
+pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }

 ///
 /// Indicates whether a character is whitespace. Whitespace is defined in
-/// terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
-/// additional 'Cc'-category control codes in the range [0x09, 0x0d]
+/// terms of the Unicode Property 'White_Space'.
 ///
 #[inline]
 pub fn is_whitespace(c: char) -> bool {
+    // As an optimization ASCII whitespace characters are checked separately
    c == ' '
        || ('\x09' <= c && c <= '\x0d')
-        || general_category::Zs(c)
-        || general_category::Zl(c)
-        || general_category::Zp(c)
+        || property::White_Space(c)
 }

 ///
--- a/src/libstd/unicode.rs
+++ b/src/libstd/unicode.rs
--- a/src/test/pretty/block-comment-wchar.pp
+++ b/src/test/pretty/block-comment-wchar.pp
@ -51,34 +51,34 @@ fn f() {
      CR4+2:                         (should align)
    */
    /*
-    // (NEL deliberately omitted)
+      NEL4+2:                        (should align)
    */
    /*
      Ogham Space Mark 4+2:          (should align)
    */
    /*
-      Mongolian Vowel Separator 4+2: (should align)
+      Ogham Space Mark 4+2: (should align)
    */
    /*
      Four-per-em space 4+2:         (should align)
    */

    /*
-      Mongolian Vowel Sep   count 1: (should align)
-      Mongolian Vowel Sep   count 2: (should align)
-      Mongolian Vowel Sep   count 3: (should align)
-      Mongolian Vowel Sep   count 4: (should align)
-      Mongolian Vowel Sep   count 5: (should align)
-      Mongolian Vowel Sep   count 6: (should align)
-      Mongolian Vowel Sep   count 7: (should align)
-      Mongolian Vowel Sep   count 8: (should align)
-      Mongolian Vowel Sep   count 9: (should align)
-      Mongolian Vowel Sep   count A: (should align)
-      Mongolian Vowel Sep   count B: (should align)
-      Mongolian Vowel Sep   count C: (should align)
-      Mongolian Vowel Sep   count D: (should align)
-      Mongolian Vowel Sep   count E: (should align)
-      Mongolian Vowel Sep   count F: (should align)
+      Ogham Space Mark   count 1: (should align)
+      Ogham Space Mark   count 2: (should align)
+      Ogham Space Mark   count 3: (should align)
+      Ogham Space Mark   count 4: (should align)
+      Ogham Space Mark   count 5: (should align)
+      Ogham Space Mark   count 6: (should align)
+      Ogham Space Mark   count 7: (should align)
+      Ogham Space Mark   count 8: (should align)
+      Ogham Space Mark   count 9: (should align)
+      Ogham Space Mark   count A: (should align)
+      Ogham Space Mark   count B: (should align)
+      Ogham Space Mark   count C: (should align)
+      Ogham Space Mark   count D: (should align)
+      Ogham Space Mark   count E: (should align)
+      Ogham Space Mark   count F: (should align)
    */


@ -88,26 +88,25 @@ fn f() {
    /*
      Hello from offset 6
      Space 6+2:                     compare A
-      Mongolian Vowel Separator 6+2: compare B
+      Ogham Space Mark 6+2: compare B
    */

-    /*᠎*/
+    /* */

    /*
      Hello from another offset 6 with wchars establishing column offset
      Space 6+2:                     compare C
-      Mongolian Vowel Separator 6+2: compare D
+      Ogham Space Mark 6+2: compare D
    */
 }

 fn main() {
-    // Taken from http://en.wikipedia.org/wiki/Whitespace_character
+    // Taken from http://www.unicode.org/Public/UNIDATA/PropList.txt
    let chars =
-        ['\x0A', '\x0B', '\x0C', '\x0D', '\x20',
-         // '\x85', // for some reason Rust thinks NEL isn't whitespace
-         '\xA0', '\u1680', '\u180E', '\u2000', '\u2001', '\u2002', '\u2003',
-         '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
-         '\u2028', '\u2029', '\u202F', '\u205F', '\u3000'];
+        ['\x0A', '\x0B', '\x0C', '\x0D', '\x20', '\x85', '\xA0', '\u1680',
+         '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006',
+         '\u2007', '\u2008', '\u2009', '\u200A', '\u2028', '\u2029', '\u202F',
+         '\u205F', '\u3000'];
    for c in chars.iter() {
        let ws = c.is_whitespace();
        println!("{:?} {:?}" , c , ws);
--- a/src/test/pretty/block-comment-wchar.rs
+++ b/src/test/pretty/block-comment-wchar.rs
@ -51,55 +51,54 @@ fn f() {
 



  CR4+2:                         (should align)
    */
    /*
-    // (NEL deliberately omitted)
+  NEL4+2:                        (should align)
    */
    /*
       Ogham Space Mark 4+2:          (should align)
    */
    /*
-᠎᠎᠎᠎  Mongolian Vowel Separator 4+2: (should align)
+      Ogham Space Mark 4+2: (should align)
    */
    /*
       Four-per-em space 4+2:         (should align)
    */

    /*
-   ᠎  Mongolian Vowel Sep   count 1: (should align)
-  ᠎   Mongolian Vowel Sep   count 2: (should align)
-  ᠎᠎  Mongolian Vowel Sep   count 3: (should align)
- ᠎    Mongolian Vowel Sep   count 4: (should align)
- ᠎ ᠎  Mongolian Vowel Sep   count 5: (should align)
- ᠎᠎   Mongolian Vowel Sep   count 6: (should align)
- ᠎᠎᠎  Mongolian Vowel Sep   count 7: (should align)
-᠎     Mongolian Vowel Sep   count 8: (should align)
-᠎  ᠎  Mongolian Vowel Sep   count 9: (should align)
-᠎ ᠎   Mongolian Vowel Sep   count A: (should align)
-᠎ ᠎᠎  Mongolian Vowel Sep   count B: (should align)
-᠎᠎    Mongolian Vowel Sep   count C: (should align)
-᠎᠎ ᠎  Mongolian Vowel Sep   count D: (should align)
-᠎᠎᠎   Mongolian Vowel Sep   count E: (should align)
-᠎᠎᠎᠎  Mongolian Vowel Sep   count F: (should align)
+      Ogham Space Mark   count 1: (should align)
+      Ogham Space Mark   count 2: (should align)
+      Ogham Space Mark   count 3: (should align)
+      Ogham Space Mark   count 4: (should align)
+      Ogham Space Mark   count 5: (should align)
+      Ogham Space Mark   count 6: (should align)
+      Ogham Space Mark   count 7: (should align)
+      Ogham Space Mark   count 8: (should align)
+      Ogham Space Mark   count 9: (should align)
+      Ogham Space Mark   count A: (should align)
+      Ogham Space Mark   count B: (should align)
+      Ogham Space Mark   count C: (should align)
+      Ogham Space Mark   count D: (should align)
+      Ogham Space Mark   count E: (should align)
+      Ogham Space Mark   count F: (should align)
    */


 /* */ /*
        Hello from offset 6
        Space 6+2:                     compare A
-᠎᠎᠎᠎᠎᠎  Mongolian Vowel Separator 6+2: compare B
+        Ogham Space Mark 6+2: compare B
      */
-/*᠎*/ /*
+/* */ /*
        Hello from another offset 6 with wchars establishing column offset
        Space 6+2:                     compare C
-᠎᠎᠎᠎᠎᠎  Mongolian Vowel Separator 6+2: compare D
+        Ogham Space Mark 6+2: compare D
      */
 }

 fn main() {
-    // Taken from http://en.wikipedia.org/wiki/Whitespace_character
+    // Taken from http://www.unicode.org/Public/UNIDATA/PropList.txt
    let chars =
-        ['\x0A', '\x0B', '\x0C', '\x0D', '\x20',
-         // '\x85', // for some reason Rust thinks NEL isn't whitespace
-         '\xA0', '\u1680', '\u180E', '\u2000', '\u2001', '\u2002', '\u2003',
+        ['\x0A', '\x0B', '\x0C', '\x0D', '\x20', '\x85',
+         '\xA0', '\u1680', '\u2000', '\u2001', '\u2002', '\u2003',
         '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
         '\u2028', '\u2029', '\u202F', '\u205F', '\u3000'];
    for c in chars.iter() {