auto merge of #15283 : kwantam/rust/master, r=alexcrichton

Add libunicode; move unicode functions from core

- created new crate, libunicode, below libstd
- split `Char` trait into `Char` (libcore) and `UnicodeChar` (libunicode)
  - Unicode-aware functions now live in libunicode
    - `is_alphabetic`, `is_XID_start`, `is_XID_continue`, `is_lowercase`,
      `is_uppercase`, `is_whitespace`, `is_alphanumeric`, `is_control`, `is_digit`,
      `to_uppercase`, `to_lowercase`
  - added `width` method in UnicodeChar trait
    - determines printed width of character in columns, or None if it is a non-NULL control character
    - takes a boolean argument indicating whether the present context is CJK or not (characters with 'A'mbiguous widths are double-wide in CJK contexts, single-wide otherwise)
- split `StrSlice` into `StrSlice` (libcore) and `UnicodeStrSlice` (libunicode)
  - functionality formerly in `StrSlice` that relied upon Unicode functionality from `Char` is now in `UnicodeStrSlice`
    - `words`, `is_whitespace`, `is_alphanumeric`, `trim`, `trim_left`, `trim_right`
  - also moved `Words` type alias into libunicode because `words` method is in `UnicodeStrSlice`
- unified Unicode tables from libcollections, libcore, and libregex into libunicode
- updated `unicode.py` in `src/etc` to generate aforementioned tables
- generated new tables based on latest Unicode data
- added `UnicodeChar` and `UnicodeStrSlice` traits to prelude
- libunicode is now the collection point for the `std::char` module, combining the libunicode functionality with the `Char` functionality from libcore
  - thus, moved doc comment for `char` from `core::char` to `unicode::char`
- libcollections remains the collection point for `std::str`

The Unicode-aware functions that previously lived in the `Char` and `StrSlice` traits are no longer available to programs that only use libcore. To regain use of these methods, include the libunicode crate and `use` the `UnicodeChar` and/or `UnicodeStrSlice` traits:

    extern crate unicode;
    use unicode::UnicodeChar;
    use unicode::UnicodeStrSlice;
    use unicode::Words; // if you want to use the words() method

NOTE: this does *not* impact programs that use libstd, since UnicodeChar and UnicodeStrSlice have been added to the prelude.

closes #15224
[breaking-change]
This commit is contained in:
bors 2014-07-09 18:36:30 +00:00
commit fa7cbb5a46
27 changed files with 7445 additions and 11597 deletions

View File

@ -51,17 +51,19 @@
TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
uuid serialize sync getopts collections num test time rand \
url log regex graphviz core rlibc alloc debug rustrt
url log regex graphviz core rlibc alloc debug rustrt \
unicode
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros fmt_macros
CRATES := $(TARGET_CRATES) $(HOST_CRATES)
TOOLS := compiletest rustdoc rustc
DEPS_core :=
DEPS_rlibc :=
DEPS_unicode := core
DEPS_alloc := core libc native:jemalloc
DEPS_debug := std
DEPS_rustrt := alloc core libc collections native:rustrt_native
DEPS_std := core libc rand alloc collections rustrt sync \
DEPS_std := core libc rand alloc collections rustrt sync unicode \
native:rust_builtin native:backtrace
DEPS_graphviz := std
DEPS_green := std native:context_switch
@ -82,7 +84,7 @@ DEPS_semver := std
DEPS_uuid := std serialize
DEPS_sync := core alloc rustrt collections
DEPS_getopts := std
DEPS_collections := core alloc
DEPS_collections := core alloc unicode
DEPS_fourcc := rustc syntax std
DEPS_hexfloat := rustc syntax std
DEPS_num := std
@ -108,6 +110,7 @@ ONLY_RLIB_rlibc := 1
ONLY_RLIB_alloc := 1
ONLY_RLIB_rand := 1
ONLY_RLIB_collections := 1
ONLY_RLIB_unicode := 1
################################################################################
# You should not need to edit below this line

View File

@ -15,11 +15,11 @@
# The names of crates that must be tested
# libcore tests are in a separate crate
# libcore/libunicode tests are in a separate crate
DEPS_coretest :=
$(eval $(call RUST_CRATE,coretest))
TEST_TARGET_CRATES = $(filter-out core,$(TARGET_CRATES)) coretest
TEST_TARGET_CRATES = $(filter-out core unicode,$(TARGET_CRATES)) coretest
TEST_DOC_CRATES = $(DOC_CRATES)
TEST_HOST_CRATES = $(HOST_CRATES)
TEST_CRATES = $(TEST_TARGET_CRATES) $(TEST_HOST_CRATES)

View File

@ -1,183 +0,0 @@
#!/usr/bin/env python2
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
from __future__ import absolute_import, division, print_function
import argparse
from collections import defaultdict
import csv
import datetime
import urllib2
BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # Fix: 'No' previously expanded to ['No'] (itself), so "other number"
    # characters were dropped from the derived 'N' class; per UAX#44
    # Table 12, N = Nd | Nl | No.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
def as_4byte_uni(n):
    """Render codepoint n as a '\\UXXXXXXXX' escape, zero-padded to 8 hex digits."""
    digits = hex(n)[2:]
    pad = '0' * (8 - len(digits))
    return '\\U' + pad + digits
def expand_cat(c):
    """Return category c's parent categories (if any) followed by c itself."""
    cats = list(expanded_categories.get(c, []))
    cats.append(c)
    return cats
def is_valid_unicode(n):
    """True iff n is a Unicode scalar value (0..0x10FFFF, excluding surrogates)."""
    if n < 0 or n > 0x10FFFF:
        return False
    # The surrogate block 0xD800..0xDFFF is not a scalar value.
    return not (0xD7FF < n < 0xE000)
def read_cats(f):
    """Parse UnicodeData.txt rows from f into {category: [codepoint, ...]}.

    Each codepoint is recorded under its own general category and every
    expanded (parent) category; invalid scalar values are skipped.
    """
    by_cat = defaultdict(list)
    for row in csv.reader(f, delimiter=';'):
        codepoint = int(row[0], 16)
        if not is_valid_unicode(codepoint):
            continue
        for cat in expand_cat(row[2]):
            by_cat[cat].append(codepoint)
    return by_cat
def read_scripts(f):
    """Parse Scripts.txt lines from f into {script_name: [codepoint, ...]}.

    Lines look like "XXXX ; Name # ..." for a single codepoint or
    "XXXX..YYYY ; Name # ..." for an inclusive range. Blank lines and
    comment lines are skipped, as are invalid scalar values.
    """
    assigned = defaultdict(list)
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        # Fix: map(...)[:2] is not subscriptable under Python 3 (map is
        # lazy); a list comprehension behaves identically on both versions.
        hexes, name = [part.strip() for part in line.split(';')][:2]
        name = name[:name.index('#')].strip()
        if '..' not in hexes:
            hex = int(hexes, 16)
            if is_valid_unicode(hex):
                assigned[name].append(hex)
        else:
            hex1, hex2 = [int(s, 16) for s in hexes.split('..')]
            # Fix: range instead of the Python-2-only xrange; iteration
            # results are identical.
            for hex in range(hex1, hex2 + 1):
                if is_valid_unicode(hex):
                    assigned[name].append(hex)
    return assigned
def group(letters):
    """Collapse codepoints into sorted inclusive (start, end) runs of
    consecutive values. Requires at least one element (raises IndexError
    on empty input, like the original)."""
    ordered = sorted(set(letters))
    start = end = ordered[0]
    ranges = []
    # ordered is strictly increasing, so each next value either extends
    # the current run by one or starts a new run.
    for cp in ordered[1:]:
        if cp == end + 1:
            end = cp
        else:
            ranges.append((start, end))
            start = end = cp
    ranges.append((start, end))
    return ranges
def ranges_to_rust(rs):
    """Format (start, end) codepoint ranges as Rust char-tuple literals,
    joined with the separator used throughout the generated file."""
    cells = []
    for s, e in rs:
        cells.append("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)))
    return ',\n '.join(cells)
def groups_to_rust(groups):
    """Render {name: ranges} as named Rust table entries, sorted by name."""
    entries = ['("%s", &[\n %s\n ]),'
               % (name, ranges_to_rust(groups[name]))
               for name in sorted(groups)]
    return '\n'.join(entries)
# Command-line entry point: read (or download) the Unicode data files and
# print a generated Rust source file with regex character-class tables.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    aa = parser.add_argument
    aa('--local', action='store_true',
       help='When set, Scripts.txt and UnicodeData.txt will be read from '
            'the CWD.')
    aa('--base-url', type=str, default=BASE_URL,
       help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()
    # Either read the data files from the working directory or fetch them
    # over HTTP (urllib2 — this script is Python 2 only).
    if args.local:
        cats = read_cats(open(DATA))
        scripts = read_scripts(open(SCRIPTS))
    else:
        cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA))
        scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS))
    # Get Rust code for all Unicode general categories and scripts.
    combined = dict(cats, **scripts)
    unigroups = groups_to_rust({k: group(letters)
                                for k, letters in combined.items()})
    # Now get Perl character classes that are Unicode friendly.
    # \d: ASCII digits plus the Unicode Nd (decimal number) category.
    perld = range(ord('0'), ord('9') + 1)
    dgroups = ranges_to_rust(group(perld + cats['Nd'][:]))
    # \s: ASCII whitespace plus the Unicode Z (separator) category.
    perls = map(ord, ['\t', '\n', '\x0C', '\r', ' '])
    sgroups = ranges_to_rust(group(perls + cats['Z'][:]))
    # \w: underscore, ASCII alphanumerics, plus the Unicode L (letter)
    # category. (range/map return lists under Python 2, so '+' works.)
    low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1))
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L'][:]))
    # Template for the emitted Rust source; {groups}/{dgroups}/{sgroups}/
    # {wgroups} are filled by str.format below ({{...}} are literal braces).
    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.
use parse::{{Class, NamedClasses}};
pub static UNICODE_CLASSES: NamedClasses = &[
{groups}
];
pub static PERLD: Class = &[
{dgroups}
];
pub static PERLS: Class = &[
{sgroups}
];
pub static PERLW: Class = &[
{wgroups}
];
'''
    # Timestamp the generated file so its provenance is visible.
    now = datetime.datetime.now()
    print(tpl.format(date=str(now), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))

View File

@ -10,17 +10,46 @@
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
# code covering the core properties. Since this is a pretty rare event we
# just store this out-of-line and check the unicode.rs file into git.
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - EastAsianWidth.txt
# - PropList.txt
# - Scripts.txt
# - UnicodeData.txt
#
# The emitted code is "the minimum we think is necessary for libstd", that
# is, to support basic operations of the compiler and "most nontrivial rust
# programs". It is not meant to be a complete implementation of unicode.
# For that we recommend you use a proper binding to libicu.
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
import fileinput, re, os, sys, operator
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
#![allow(missing_doc, non_uppercase_statics, non_snake_case_functions)]
'''
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # Fix: 'No' previously expanded to ['No'] (itself), so "other number"
    # characters never reached the derived 'N' class; per UAX#44
    # Table 12, N = Nd | Nl | No.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
def fetch(f):
if not os.path.exists(f):
@ -31,21 +60,17 @@ def fetch(f):
sys.stderr.write("cannot load %s" % f)
exit(1)
def is_valid_unicode(n):
    """True for Unicode scalar values: 0..0x10FFFF minus the surrogate
    block 0xD800..0xDFFF."""
    below_surrogates = 0 <= n <= 0xD7FF
    above_surrogates = 0xE000 <= n <= 0x10FFFF
    return below_surrogates or above_surrogates
def load_unicode_data(f):
fetch(f)
gencats = {}
upperlower = {}
lowerupper = {}
combines = []
combines = {}
canon_decomp = {}
compat_decomp = {}
curr_cat = ""
curr_combine = ""
c_lo = 0
c_hi = 0
com_lo = 0
com_hi = 0
for line in fileinput.input(f):
fields = line.split(";")
@ -58,6 +83,9 @@ def load_unicode_data(f):
code_org = code
code = int(code, 16)
if not is_valid_unicode(code):
continue
# generate char to char direct common and simple conversions
# uppercase to lowercase
if gencat == "Lu" and lowcase != "" and code_org != lowcase:
@ -67,6 +95,7 @@ def load_unicode_data(f):
if gencat == "Ll" and upcase != "" and code_org != upcase:
lowerupper[code] = int(upcase, 16)
# store decomposition, if given
if decomp != "":
if decomp.startswith('<'):
seq = []
@ -79,38 +108,76 @@ def load_unicode_data(f):
seq.append(int(i, 16))
canon_decomp[code] = seq
if curr_cat == "":
curr_cat = gencat
c_lo = code
c_hi = code
# place letter in categories as appropriate
for cat in [gencat] + expanded_categories.get(gencat, []):
if cat not in gencats:
gencats[cat] = []
gencats[cat].append(code)
if curr_cat == gencat:
c_hi = code
else:
if curr_cat not in gencats:
gencats[curr_cat] = []
# record combining class, if any
if combine != "0":
if combine not in combines:
combines[combine] = []
combines[combine].append(code)
gencats[curr_cat].append((c_lo, c_hi))
curr_cat = gencat
c_lo = code
c_hi = code
if curr_combine == "":
curr_combine = combine
com_lo = code
com_hi = code
if curr_combine == combine:
com_hi = code
else:
if curr_combine != "0":
combines.append((com_lo, com_hi, curr_combine))
curr_combine = combine
com_lo = code
com_hi = code
gencats = group_cats(gencats)
combines = to_combines(group_cats(combines))
return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
def group_cats(cats):
    """Apply group_cat to every value of a {category: [codepoint, ...]} map."""
    return dict((name, group_cat(codes)) for name, codes in cats.items())
def group_cat(cat):
    """Turn a collection of codepoints into sorted inclusive (first, last)
    runs of consecutive values. Raises IndexError on empty input, like
    the original."""
    codes = sorted(set(cat))
    first = codes[0]
    run_start, run_end = first, first
    runs = []
    for code in codes[1:]:
        # codes is strictly increasing: either extend the run or close it.
        if code == run_end + 1:
            run_end = code
        else:
            runs.append((run_start, run_end))
            run_start = run_end = code
    runs.append((run_start, run_end))
    return runs
def ungroup_cat(cat):
    """Expand inclusive (lo, hi) ranges back into a flat codepoint list."""
    flat = []
    for lo, hi in cat:
        flat.extend(range(lo, hi + 1))
    return flat
def to_combines(combs):
    """Flatten {combining_class: [(lo, hi), ...]} into one list of
    (lo, hi, combining_class) triples ordered by range start."""
    triples = [(lo, hi, cls)
               for cls in combs
               for (lo, hi) in combs[cls]]
    return sorted(triples, key=lambda t: t[0])
def format_table_content(f, content, indent):
    """Re-wrap a comma-separated list so each emitted line stays under
    98 columns, writing the result to f with the given indent."""
    pad = " " * indent
    line_buf = pad
    need_sep = False  # set once a chunk lands on the current line
    for piece in content.split(","):
        if len(line_buf) + len(piece) < 98:
            line_buf += (", " + piece) if need_sep else piece
            need_sep = True
        else:
            # Current line is full: flush it and start a fresh one.
            f.write(line_buf + ",\n")
            line_buf = pad + piece
    f.write(line_buf)
def load_properties(f, interestingprops):
fetch(f)
props = {}
@ -134,7 +201,7 @@ def load_properties(f, interestingprops):
prop = m.group(3)
else:
continue
if prop not in interestingprops:
if interestingprops and prop not in interestingprops:
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
@ -143,6 +210,43 @@ def load_properties(f, interestingprops):
props[prop].append((d_lo, d_hi))
return props
# load all widths of want_widths, except those in except_cats
def load_east_asian_width(want_widths, except_cats):
    """Parse EastAsianWidth.txt into {width_class: [(lo, hi), ...]}.

    Only width classes listed in want_widths are kept, and entries whose
    general category (from the trailing comment) is in except_cats are
    dropped.
    """
    fname = "EastAsianWidth.txt"
    fetch(fname)
    # Two line shapes: a single codepoint, or a lo..hi range.
    single = re.compile("^([0-9A-F]+);(\w+) +# (\w+)")
    ranged = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")
    widths = {}
    for line in fileinput.input(fname):
        m = single.match(line)
        if m:
            lo, hi = m.group(1), m.group(1)
            width, cat = m.group(2), m.group(3)
        else:
            m = ranged.match(line)
            if not m:
                continue
            lo, hi = m.group(1), m.group(2)
            width, cat = m.group(3), m.group(4)
        if cat in except_cats or width not in want_widths:
            continue
        widths.setdefault(width, []).append((int(lo, 16), int(hi, 16)))
    return widths
def escape_char(c):
if c <= 0xff:
return "'\\x%2.2x'" % c
@ -150,59 +254,72 @@ def escape_char(c):
return "'\\u%4.4x'" % c
return "'\\U%8.8x'" % c
def ch_prefix(ix):
    """Separator written before table entry ix: padding only for the
    first entry, a comma plus line break every second entry, and a plain
    comma otherwise. (Literals reproduced exactly as in the original.)"""
    if ix == 0:
        return " "
    return ",\n " if ix % 2 == 0 else ", "
def emit_bsearch_range_table(f):
f.write("""
fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use cmp::{Equal, Less, Greater};
use slice::ImmutableVector;
use option::None;
use core::cmp::{Equal, Less, Greater};
use core::slice::ImmutableVector;
use core::option::None;
r.bsearch(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) != None
}\n
""");
""")
def emit_property_module(f, mod, tbl):
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
    """Write one Rust static table named `name` to f, rendering each
    element of t_data with pfun and line-wrapping via
    format_table_content."""
    pub_string = "pub " if is_pub else ""
    f.write(" %sstatic %s: %s = &[\n" % (pub_string, name, t_type))
    # Join rendered entries with bare commas; the wrapper reinserts the
    # ", " spacing when it re-flows the line.
    rendered = ",".join(pfun(dat) for dat in t_data)
    format_table_content(f, rendered, 8)
    f.write("\n ];\n\n")
def emit_property_module(f, mod, tbl, emit_fn):
f.write("pub mod %s {\n" % mod)
keys = tbl.keys()
keys.sort()
for cat in keys:
if cat not in ["Nd", "Nl", "No", "Cc",
"XID_Start", "XID_Continue", "Alphabetic",
"Lowercase", "Uppercase", "White_Space"]:
continue
f.write(" static %s_table : &'static [(char,char)] = &[\n" % cat)
ix = 0
for pair in tbl[cat]:
f.write(ch_prefix(ix))
f.write("(%s, %s)" % (escape_char(pair[0]), escape_char(pair[1])))
ix += 1
f.write("\n ];\n\n")
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
emit_table(f, "%s_table" % cat, tbl[cat])
if cat in emit_fn:
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
f.write("}\n\n")
def emit_regex_module(f, cats, w_data):
f.write("pub mod regex {\n")
regex_class = "&'static [(char, char)]"
class_table = "&'static [(&'static str, %s)]" % regex_class
emit_table(f, "UNICODE_CLASSES", cats, class_table,
pfun=lambda x: "(\"%s\",super::%s::%s_table)" % (x[0], x[1], x[0]))
f.write(" pub static PERLD: %s = super::general_category::Nd_table;\n\n"
% regex_class)
f.write(" pub static PERLS: %s = super::property::White_Space_table;\n\n"
% regex_class)
emit_table(f, "PERLW", w_data, regex_class)
f.write("}\n\n")
def emit_conversions_module(f, lowerupper, upperlower):
f.write("pub mod conversions {")
f.write("""
use cmp::{Equal, Less, Greater};
use slice::ImmutableVector;
use tuple::Tuple2;
use option::{Option, Some, None};
use core::cmp::{Equal, Less, Greater};
use core::slice::ImmutableVector;
use core::tuple::Tuple2;
use core::option::{Option, Some, None};
pub fn to_lower(c: char) -> char {
match bsearch_case_table(c, LuLl_table) {
@ -226,189 +343,88 @@ def emit_conversions_module(f, lowerupper, upperlower):
})
}
""");
emit_caseconversion_table(f, "LuLl", upperlower)
emit_caseconversion_table(f, "LlLu", lowerupper)
""")
emit_table(f, "LuLl_table",
sorted(upperlower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
emit_table(f, "LlLu_table",
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
f.write("}\n\n")
def emit_charwidth_module(f, width_table):
f.write("pub mod charwidth {\n")
f.write(" use core::option::{Option, Some, None};\n")
f.write(" use core::slice::ImmutableVector;\n")
f.write("""
fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(lo, hi, _, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, r_ncjk, r_cjk) = r[idx];
if is_cjk { r_cjk } else { r_ncjk }
}
None => 1
}
}
""")
f.write("""
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
match c as uint {
_c @ 0 => Some(0), // null is zero width
cu if cu < 0x20 => None, // control sequences have no width
cu if cu < 0x7F => Some(1), // ASCII
cu if cu < 0xA0 => None, // more control sequences
_ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as uint)
}
}
""")
f.write(" // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n")
f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
f.write("}\n")
def emit_caseconversion_table(f, name, table):
f.write(" static %s_table : &'static [(char, char)] = &[\n" % name)
sorted_table = sorted(table.iteritems(), key=operator.itemgetter(0))
ix = 0
for key, value in sorted_table:
f.write(ch_prefix(ix))
f.write("(%s, %s)" % (escape_char(key), escape_char(value)))
ix += 1
f.write("\n ];\n\n")
def format_table_content(f, content, indent):
line = " "*indent
first = True
for chunk in content.split(","):
if len(line) + len(chunk) < 98:
if first:
line += chunk
else:
line += ", " + chunk
first = False
else:
f.write(line + ",\n")
line = " "*indent + chunk
f.write(line)
def emit_core_norm_module(f, canon, compat):
def emit_norm_module(f, canon, compat, combine):
canon_keys = canon.keys()
canon_keys.sort()
compat_keys = compat.keys()
compat_keys.sort()
f.write("pub mod normalization {\n");
f.write(" use option::Option;\n");
f.write(" use option::{Some, None};\n");
f.write(" use slice::ImmutableVector;\n");
f.write("""
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
use cmp::{Equal, Less, Greater};
match r.bsearch(|&(val, _)| {
if c == val { Equal }
else if val < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, result) = r[idx];
Some(result)
}
None => None
}
}\n\n
""")
f.write("pub mod normalization {\n")
def mkdata_fun(table):
def f(char):
data = "(%s,&[" % escape_char(char)
first = True
for d in table[char]:
if not first:
data += ","
first = False
data += escape_char(d)
data += "])"
return data
return f
f.write(" // Canonical decompositions\n")
f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
data = ""
first = True
for char in canon_keys:
if not first:
data += ","
first = False
data += "(%s,&[" % escape_char(char)
first2 = True
for d in canon[char]:
if not first2:
data += ","
first2 = False
data += escape_char(d)
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
pfun=mkdata_fun(canon))
f.write(" // Compatibility decompositions\n")
f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
data = ""
first = True
for char in compat_keys:
if not first:
data += ","
first = False
data += "(%s,&[" % escape_char(char)
first2 = True
for d in compat[char]:
if not first2:
data += ","
first2 = False
data += escape_char(d)
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
f.write("""
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
fn d(c: char, i: |char|, k: bool) {
use iter::Iterator;
// 7-bit ASCII never decomposes
if c <= '\\x7f' { i(c); return; }
// Perform decomposition for Hangul
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
decompose_hangul(c, i);
return;
}
// First check the canonical decompositions
match bsearch_table(c, canonical_table) {
Some(canon) => {
for x in canon.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Bottom out if we're not doing compat.
if !k { i(c); return; }
// Then check the compatibility decompositions
match bsearch_table(c, compatibility_table) {
Some(compat) => {
for x in compat.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Finally bottom out.
i(c);
}
// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: u32 = 0xAC00;
static L_BASE: u32 = 0x1100;
static V_BASE: u32 = 0x1161;
static T_BASE: u32 = 0x11A7;
static L_COUNT: u32 = 19;
static V_COUNT: u32 = 21;
static T_COUNT: u32 = 28;
static N_COUNT: u32 = (V_COUNT * T_COUNT);
static S_COUNT: u32 = (L_COUNT * N_COUNT);
// Decompose a precomposed Hangul syllable
fn decompose_hangul(s: char, f: |char|) {
use cast::transmute;
let si = s as u32 - S_BASE;
let li = si / N_COUNT;
unsafe {
f(transmute(L_BASE + li));
let vi = (si % N_COUNT) / T_COUNT;
f(transmute(V_BASE + vi));
let ti = si % T_COUNT;
if ti > 0 {
f(transmute(T_BASE + ti));
}
}
}
}
""")
def emit_std_norm_module(f, combine):
f.write("pub mod normalization {\n");
f.write(" use option::{Some, None};\n");
f.write(" use slice::ImmutableVector;\n");
emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
pfun=mkdata_fun(compat))
f.write("""
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
use cmp::{Equal, Less, Greater};
use core::option::{Some, None};
use core::cmp::{Equal, Less, Greater};
use core::slice::ImmutableVector;
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
@ -420,72 +436,122 @@ def emit_std_norm_module(f, combine):
}
None => 0
}
}\n\n
}\n
""")
f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n")
ix = 0
for pair in combine:
f.write(ch_prefix(ix))
f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
ix += 1
f.write("\n ];\n\n")
emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False,
pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
+ " bsearch_range_value_table(c, combining_class_table)\n"
+ " }\n")
f.write("}\n")
f.write("""
}
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
""")
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
def remove_from_wtable(wtable, val):
    """Return the width table with codepoint val removed, splitting or
    shrinking whichever (lo, hi, width, width_cjk) range contains it.
    Note: consumes entries from the input list via pop(0), like the
    original."""
    out = []
    while wtable:
        lo, hi = wtable[0][0], wtable[0][1]
        if hi < val:
            # Entirely below val: keep unchanged.
            out.append(wtable.pop(0))
        elif lo > val:
            # Table is sorted, so everything from here on is unaffected.
            break
        else:
            wt_lo, wt_hi, width, width_cjk = wtable.pop(0)
            if wt_lo == wt_hi == val:
                continue  # the range was exactly val: drop it
            if wt_lo == val:
                out.append((wt_lo + 1, wt_hi, width, width_cjk))
            elif wt_hi == val:
                out.append((wt_lo, wt_hi - 1, width, width_cjk))
            else:
                # val is strictly inside: split into two ranges around it.
                out.append((wt_lo, val - 1, width, width_cjk))
                out.append((val + 1, wt_hi, width, width_cjk))
    if wtable:
        out.extend(wtable)
    return out
#![allow(missing_doc, non_uppercase_statics)]
def optimize_width_table(wtable):
    """Merge adjacent width-table entries whose ranges touch and whose
    widths agree, taking the merged widths from the later entry.

    NOTE(review): only element 2 (the non-CJK width) is compared,
    mirroring the original's w_this[2:3] slice; this is safe only if
    the non-CJK width uniquely determines the CJK width in the generated
    table — confirm before reusing elsewhere. Consumes the input list
    via pop(0), like the original.
    """
    merged = []
    current = wtable.pop(0)
    while wtable:
        nxt = wtable[0]
        if current[1] == nxt[0] - 1 and current[2] == nxt[2]:
            wtable.pop(0)
            current = (current[0], nxt[1], nxt[2], nxt[3])
        else:
            merged.append(current)
            current = wtable.pop(0)
    merged.append(current)
    return merged
'''
(canon_decomp, compat_decomp, gencats,
combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
def gen_core_unicode():
r = "core_unicode.rs"
if __name__ == "__main__":
r = "unicode.rs"
if os.path.exists(r):
os.remove(r);
os.remove(r)
with open(r, "w") as rf:
# Preamble
# write the file's preamble
rf.write(preamble)
emit_bsearch_range_table(rf);
emit_property_module(rf, "general_category", gencats)
# download and parse all the data
(canon_decomp, compat_decomp, gencats, combines,
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
other_derived = ["Default_Ignorable_Code_Point"]
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
emit_core_norm_module(rf, canon_decomp, compat_decomp)
# bsearch_range_table is used in all the property modules below
emit_bsearch_range_table(rf)
derived = load_properties("DerivedCoreProperties.txt",
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
# all of these categories will also be available as \p{} in libregex
allcats = []
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
("derived_property", derived, want_derived), \
("script", scripts, []), \
("property", props, ["White_Space"]):
emit_property_module(rf, name, cat, pfuns)
allcats.extend(map(lambda x: (x, name), cat))
allcats.sort(key=lambda c: c[0])
emit_property_module(rf, "derived_property", derived)
# the \w regex corresponds to Alphabetic + Mark + Decimal_Number +
# Connector_Punctuation + Join-Control according to UTS#18
# http://www.unicode.org/reports/tr18/#Compatibility_Properties
perl_words = []
for cat in derived["Alphabetic"], gencats["M"], gencats["Nd"], \
gencats["Pc"], props["Join_Control"]:
perl_words.extend(ungroup_cat(cat))
perl_words = group_cat(perl_words)
props = load_properties("PropList.txt", ["White_Space"])
emit_property_module(rf, "property", props)
# emit lookup tables for \p{}, along with \d, \w, and \s for libregex
emit_regex_module(rf, allcats, perl_words)
# normalizations and conversions module
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
emit_conversions_module(rf, lowerupper, upperlower)
def gen_std_unicode():
r = "std_unicode.rs"
if os.path.exists(r):
os.remove(r);
with open(r, "w") as rf:
# Preamble
rf.write(preamble)
emit_std_norm_module(rf, combines)
# character width module
width_table = []
for zwcat in ["Me", "Mn", "Cf"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
width_table.append((4448, 4607, 0, 0))
gen_core_unicode()
gen_std_unicode()
# get widths, except those that are explicitly marked zero-width above
ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"])
# these are doublewidth
for dwcat in ["W", "F"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 2, 2), ea_widths[dwcat]))
width_table.extend(map(lambda (lo, hi): (lo, hi, 1, 2), ea_widths["A"]))
width_table.sort(key=lambda w: w[0])
# soft hyphen is not zero width in preformatted text; it's used to indicate
# a hyphen inserted to facilitate a linebreak.
width_table = remove_from_wtable(width_table, 173)
# optimize the width table by collapsing adjacent entities when possible
width_table = optimize_width_table(width_table)
emit_charwidth_module(rf, width_table)

View File

@ -28,6 +28,7 @@
#![allow(unused_attribute)] // NOTE: remove after stage0
#[phase(plugin, link)] extern crate core;
extern crate unicode;
extern crate alloc;
#[cfg(test)] extern crate native;
@ -69,9 +70,6 @@ pub mod string;
pub mod vec;
pub mod hash;
// Internal unicode fiddly bits for the str module
mod unicode;
mod deque;
/// A trait to represent mutable containers

View File

@ -69,7 +69,6 @@ is the same as `&[u8]`.
use core::prelude::*;
use core::char;
use core::default::Default;
use core::fmt;
use core::cmp;
@ -79,15 +78,17 @@ use core::mem;
use Collection;
use hash;
use string::String;
use unicode;
use vec::Vec;
pub use core::str::{from_utf8, CharEq, Chars, CharOffsets};
pub use core::str::{Bytes, CharSplits};
pub use core::str::{CharSplitsN, Words, AnyLines, MatchIndices, StrSplits};
pub use core::str::{CharSplitsN, AnyLines, MatchIndices, StrSplits};
pub use core::str::{eq_slice, is_utf8, is_utf16, Utf16Items};
pub use core::str::{Utf16Item, ScalarValue, LoneSurrogate, utf16_items};
pub use core::str::{truncate_utf16_at_nul, utf8_char_width, CharRange};
pub use core::str::{Str, StrSlice};
pub use unicode::{Words, UnicodeStrSlice};
/*
Section: Creating a string
@ -283,7 +284,7 @@ pub struct Decompositions<'a> {
impl<'a> Iterator<char> for Decompositions<'a> {
#[inline]
fn next(&mut self) -> Option<char> {
use unicode::normalization::canonical_combining_class;
use unicode::canonical_combining_class;
match self.buffer.as_slice().head() {
Some(&(c, 0)) => {
@ -299,8 +300,8 @@ impl<'a> Iterator<char> for Decompositions<'a> {
}
let decomposer = match self.kind {
Canonical => char::decompose_canonical,
Compatible => char::decompose_compatible
Canonical => unicode::char::decompose_canonical,
Compatible => unicode::char::decompose_compatible
};
if !self.sorted {
@ -973,6 +974,8 @@ mod tests {
use string::String;
use vec::Vec;
use unicode::UnicodeChar;
#[test]
fn test_eq_slice() {
assert!((eq_slice("foobar".slice(0, 3), "foo")));

View File

@ -1,183 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
#![allow(missing_doc, non_uppercase_statics)]
pub mod normalization {
use core::prelude::*;
// Binary-searches a sorted table of inclusive `(lo, hi, value)` ranges and
// returns the value paired with the range containing `c`, or 0 when `c`
// falls in no range (0 is the default combining class, "Not_Reordered").
// NOTE(review): uses the pre-1.0 `slice::bsearch` API and the bare
// `Equal`/`Less`/`Greater` orderings from the old core prelude.
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
match r.bsearch(|&(lo, hi, _)| {
// Both range endpoints are inclusive.
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, result) = r[idx];
result
}
// Not listed in the table: combining class 0.
None => 0
}
}
static combining_class_table : &'static [(char, char, u8)] = &[
('\u0300', '\u0314', 230), ('\u0315', '\u0315', 232),
('\u0316', '\u0319', 220), ('\u031a', '\u031a', 232),
('\u031b', '\u031b', 216), ('\u031c', '\u0320', 220),
('\u0321', '\u0322', 202), ('\u0323', '\u0326', 220),
('\u0327', '\u0328', 202), ('\u0329', '\u0333', 220),
('\u0334', '\u0338', 1), ('\u0339', '\u033c', 220),
('\u033d', '\u0344', 230), ('\u0345', '\u0345', 240),
('\u0346', '\u0346', 230), ('\u0347', '\u0349', 220),
('\u034a', '\u034c', 230), ('\u034d', '\u034e', 220),
('\u0350', '\u0352', 230), ('\u0353', '\u0356', 220),
('\u0357', '\u0357', 230), ('\u0358', '\u0358', 232),
('\u0359', '\u035a', 220), ('\u035b', '\u035b', 230),
('\u035c', '\u035c', 233), ('\u035d', '\u035e', 234),
('\u035f', '\u035f', 233), ('\u0360', '\u0361', 234),
('\u0362', '\u0362', 233), ('\u0363', '\u036f', 230),
('\u0483', '\u0487', 230), ('\u0591', '\u0591', 220),
('\u0592', '\u0595', 230), ('\u0596', '\u0596', 220),
('\u0597', '\u0599', 230), ('\u059a', '\u059a', 222),
('\u059b', '\u059b', 220), ('\u059c', '\u05a1', 230),
('\u05a2', '\u05a7', 220), ('\u05a8', '\u05a9', 230),
('\u05aa', '\u05aa', 220), ('\u05ab', '\u05ac', 230),
('\u05ad', '\u05ad', 222), ('\u05ae', '\u05ae', 228),
('\u05af', '\u05af', 230), ('\u05b0', '\u05b0', 10),
('\u05b1', '\u05b1', 11), ('\u05b2', '\u05b2', 12),
('\u05b3', '\u05b3', 13), ('\u05b4', '\u05b4', 14),
('\u05b5', '\u05b5', 15), ('\u05b6', '\u05b6', 16),
('\u05b7', '\u05b7', 17), ('\u05b8', '\u05b8', 18),
('\u05b9', '\u05ba', 19), ('\u05bb', '\u05bb', 20),
('\u05bc', '\u05bc', 21), ('\u05bd', '\u05bd', 22),
('\u05bf', '\u05bf', 23), ('\u05c1', '\u05c1', 24),
('\u05c2', '\u05c2', 25), ('\u05c4', '\u05c4', 230),
('\u05c5', '\u05c5', 220), ('\u05c7', '\u05c7', 18),
('\u0610', '\u0617', 230), ('\u0618', '\u0618', 30),
('\u0619', '\u0619', 31), ('\u061a', '\u061a', 32),
('\u064b', '\u064b', 27), ('\u064c', '\u064c', 28),
('\u064d', '\u064d', 29), ('\u064e', '\u064e', 30),
('\u064f', '\u064f', 31), ('\u0650', '\u0650', 32),
('\u0651', '\u0651', 33), ('\u0652', '\u0652', 34),
('\u0653', '\u0654', 230), ('\u0655', '\u0656', 220),
('\u0657', '\u065b', 230), ('\u065c', '\u065c', 220),
('\u065d', '\u065e', 230), ('\u065f', '\u065f', 220),
('\u0670', '\u0670', 35), ('\u06d6', '\u06dc', 230),
('\u06df', '\u06e2', 230), ('\u06e3', '\u06e3', 220),
('\u06e4', '\u06e4', 230), ('\u06e7', '\u06e8', 230),
('\u06ea', '\u06ea', 220), ('\u06eb', '\u06ec', 230),
('\u06ed', '\u06ed', 220), ('\u0711', '\u0711', 36),
('\u0730', '\u0730', 230), ('\u0731', '\u0731', 220),
('\u0732', '\u0733', 230), ('\u0734', '\u0734', 220),
('\u0735', '\u0736', 230), ('\u0737', '\u0739', 220),
('\u073a', '\u073a', 230), ('\u073b', '\u073c', 220),
('\u073d', '\u073d', 230), ('\u073e', '\u073e', 220),
('\u073f', '\u0741', 230), ('\u0742', '\u0742', 220),
('\u0743', '\u0743', 230), ('\u0744', '\u0744', 220),
('\u0745', '\u0745', 230), ('\u0746', '\u0746', 220),
('\u0747', '\u0747', 230), ('\u0748', '\u0748', 220),
('\u0749', '\u074a', 230), ('\u07eb', '\u07f1', 230),
('\u07f2', '\u07f2', 220), ('\u07f3', '\u07f3', 230),
('\u0816', '\u0819', 230), ('\u081b', '\u0823', 230),
('\u0825', '\u0827', 230), ('\u0829', '\u082d', 230),
('\u0859', '\u085b', 220), ('\u08e4', '\u08e5', 230),
('\u08e6', '\u08e6', 220), ('\u08e7', '\u08e8', 230),
('\u08e9', '\u08e9', 220), ('\u08ea', '\u08ec', 230),
('\u08ed', '\u08ef', 220), ('\u08f0', '\u08f0', 27),
('\u08f1', '\u08f1', 28), ('\u08f2', '\u08f2', 29),
('\u08f3', '\u08f5', 230), ('\u08f6', '\u08f6', 220),
('\u08f7', '\u08f8', 230), ('\u08f9', '\u08fa', 220),
('\u08fb', '\u08fe', 230), ('\u093c', '\u093c', 7),
('\u094d', '\u094d', 9), ('\u0951', '\u0951', 230),
('\u0952', '\u0952', 220), ('\u0953', '\u0954', 230),
('\u09bc', '\u09bc', 7), ('\u09cd', '\u09cd', 9),
('\u0a3c', '\u0a3c', 7), ('\u0a4d', '\u0a4d', 9),
('\u0abc', '\u0abc', 7), ('\u0acd', '\u0acd', 9),
('\u0b3c', '\u0b3c', 7), ('\u0b4d', '\u0b4d', 9),
('\u0bcd', '\u0bcd', 9), ('\u0c4d', '\u0c4d', 9),
('\u0c55', '\u0c55', 84), ('\u0c56', '\u0c56', 91),
('\u0cbc', '\u0cbc', 7), ('\u0ccd', '\u0ccd', 9),
('\u0d4d', '\u0d4d', 9), ('\u0dca', '\u0dca', 9),
('\u0e38', '\u0e39', 103), ('\u0e3a', '\u0e3a', 9),
('\u0e48', '\u0e4b', 107), ('\u0eb8', '\u0eb9', 118),
('\u0ec8', '\u0ecb', 122), ('\u0f18', '\u0f19', 220),
('\u0f35', '\u0f35', 220), ('\u0f37', '\u0f37', 220),
('\u0f39', '\u0f39', 216), ('\u0f71', '\u0f71', 129),
('\u0f72', '\u0f72', 130), ('\u0f74', '\u0f74', 132),
('\u0f7a', '\u0f7d', 130), ('\u0f80', '\u0f80', 130),
('\u0f82', '\u0f83', 230), ('\u0f84', '\u0f84', 9),
('\u0f86', '\u0f87', 230), ('\u0fc6', '\u0fc6', 220),
('\u1037', '\u1037', 7), ('\u1039', '\u103a', 9),
('\u108d', '\u108d', 220), ('\u135d', '\u135f', 230),
('\u1714', '\u1714', 9), ('\u1734', '\u1734', 9),
('\u17d2', '\u17d2', 9), ('\u17dd', '\u17dd', 230),
('\u18a9', '\u18a9', 228), ('\u1939', '\u1939', 222),
('\u193a', '\u193a', 230), ('\u193b', '\u193b', 220),
('\u1a17', '\u1a17', 230), ('\u1a18', '\u1a18', 220),
('\u1a60', '\u1a60', 9), ('\u1a75', '\u1a7c', 230),
('\u1a7f', '\u1a7f', 220), ('\u1b34', '\u1b34', 7),
('\u1b44', '\u1b44', 9), ('\u1b6b', '\u1b6b', 230),
('\u1b6c', '\u1b6c', 220), ('\u1b6d', '\u1b73', 230),
('\u1baa', '\u1bab', 9), ('\u1be6', '\u1be6', 7),
('\u1bf2', '\u1bf3', 9), ('\u1c37', '\u1c37', 7),
('\u1cd0', '\u1cd2', 230), ('\u1cd4', '\u1cd4', 1),
('\u1cd5', '\u1cd9', 220), ('\u1cda', '\u1cdb', 230),
('\u1cdc', '\u1cdf', 220), ('\u1ce0', '\u1ce0', 230),
('\u1ce2', '\u1ce8', 1), ('\u1ced', '\u1ced', 220),
('\u1cf4', '\u1cf4', 230), ('\u1dc0', '\u1dc1', 230),
('\u1dc2', '\u1dc2', 220), ('\u1dc3', '\u1dc9', 230),
('\u1dca', '\u1dca', 220), ('\u1dcb', '\u1dcc', 230),
('\u1dcd', '\u1dcd', 234), ('\u1dce', '\u1dce', 214),
('\u1dcf', '\u1dcf', 220), ('\u1dd0', '\u1dd0', 202),
('\u1dd1', '\u1de6', 230), ('\u1dfc', '\u1dfc', 233),
('\u1dfd', '\u1dfd', 220), ('\u1dfe', '\u1dfe', 230),
('\u1dff', '\u1dff', 220), ('\u20d0', '\u20d1', 230),
('\u20d2', '\u20d3', 1), ('\u20d4', '\u20d7', 230),
('\u20d8', '\u20da', 1), ('\u20db', '\u20dc', 230),
('\u20e1', '\u20e1', 230), ('\u20e5', '\u20e6', 1),
('\u20e7', '\u20e7', 230), ('\u20e8', '\u20e8', 220),
('\u20e9', '\u20e9', 230), ('\u20ea', '\u20eb', 1),
('\u20ec', '\u20ef', 220), ('\u20f0', '\u20f0', 230),
('\u2cef', '\u2cf1', 230), ('\u2d7f', '\u2d7f', 9),
('\u2de0', '\u2dff', 230), ('\u302a', '\u302a', 218),
('\u302b', '\u302b', 228), ('\u302c', '\u302c', 232),
('\u302d', '\u302d', 222), ('\u302e', '\u302f', 224),
('\u3099', '\u309a', 8), ('\ua66f', '\ua66f', 230),
('\ua674', '\ua67d', 230), ('\ua69f', '\ua69f', 230),
('\ua6f0', '\ua6f1', 230), ('\ua806', '\ua806', 9),
('\ua8c4', '\ua8c4', 9), ('\ua8e0', '\ua8f1', 230),
('\ua92b', '\ua92d', 220), ('\ua953', '\ua953', 9),
('\ua9b3', '\ua9b3', 7), ('\ua9c0', '\ua9c0', 9),
('\uaab0', '\uaab0', 230), ('\uaab2', '\uaab3', 230),
('\uaab4', '\uaab4', 220), ('\uaab7', '\uaab8', 230),
('\uaabe', '\uaabf', 230), ('\uaac1', '\uaac1', 230),
('\uaaf6', '\uaaf6', 9), ('\uabed', '\uabed', 9),
('\ufb1e', '\ufb1e', 26), ('\ufe20', '\ufe26', 230),
('\U000101fd', '\U000101fd', 220), ('\U00010a0d', '\U00010a0d', 220),
('\U00010a0f', '\U00010a0f', 230), ('\U00010a38', '\U00010a38', 230),
('\U00010a39', '\U00010a39', 1), ('\U00010a3a', '\U00010a3a', 220),
('\U00010a3f', '\U00010a3f', 9), ('\U00011046', '\U00011046', 9),
('\U000110b9', '\U000110b9', 9), ('\U000110ba', '\U000110ba', 7),
('\U00011100', '\U00011102', 230), ('\U00011133', '\U00011134', 9),
('\U000111c0', '\U000111c0', 9), ('\U000116b6', '\U000116b6', 9),
('\U000116b7', '\U000116b7', 7), ('\U0001d165', '\U0001d166', 216),
('\U0001d167', '\U0001d169', 1), ('\U0001d16d', '\U0001d16d', 226),
('\U0001d16e', '\U0001d172', 216), ('\U0001d17b', '\U0001d182', 220),
('\U0001d185', '\U0001d189', 230), ('\U0001d18a', '\U0001d18b', 220),
('\U0001d1aa', '\U0001d1ad', 230), ('\U0001d242', '\U0001d244', 230)
];
/// Returns the Unicode canonical combining class of `c`, looked up in the
/// generated `combining_class_table`; characters with no table entry
/// (the vast majority) get class 0.
pub fn canonical_combining_class(c: char) -> u8 {
bsearch_range_value_table(c, combining_class_table)
}
}

View File

@ -8,20 +8,9 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Character manipulation (`char` type, Unicode Scalar Value)
//! Character manipulation.
//!
//! This module provides the `Char` trait, as well as its implementation
//! for the primitive `char` type, in order to allow basic character manipulation.
//!
//! A `char` actually represents a
//! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
//! as it can contain any Unicode code point except high-surrogate and
//! low-surrogate code points.
//!
//! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
//! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
//! however the converse is not always true due to the above range limits
//! and, as such, should be performed via the `from_u32` function.
//! For more details, see ::unicode::char (a.k.a. std::char)
#![allow(non_snake_case_functions)]
#![doc(primitive = "char")]
@ -29,12 +18,6 @@
use mem::transmute;
use option::{None, Option, Some};
use iter::{Iterator, range_step};
use unicode::{derived_property, property, general_category, conversions};
/// Returns the canonical decomposition of a character.
pub use unicode::normalization::decompose_canonical;
/// Returns the compatibility decomposition of a character.
pub use unicode::normalization::decompose_compatible;
// UTF-8 ranges and tags for encoding characters
static TAG_CONT: u8 = 0b1000_0000u8;
@ -93,84 +76,6 @@ pub fn from_u32(i: u32) -> Option<char> {
}
}
/// Returns whether the specified `char` is considered a Unicode alphabetic
/// code point
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
///
/// Indicates whether a `char` is in lower case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
///
#[inline]
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
///
/// Indicates whether a `char` is in upper case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
///
#[inline]
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
///
/// Indicates whether a `char` is whitespace
///
/// Whitespace is defined in terms of the Unicode Property 'White_Space'.
///
#[inline]
pub fn is_whitespace(c: char) -> bool {
// As an optimization ASCII whitespace characters are checked separately
c == ' '
|| ('\x09' <= c && c <= '\x0d')
|| property::White_Space(c)
}
///
/// Indicates whether a `char` is alphanumeric
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
///
#[inline]
pub fn is_alphanumeric(c: char) -> bool {
derived_property::Alphabetic(c)
|| general_category::Nd(c)
|| general_category::Nl(c)
|| general_category::No(c)
}
///
/// Indicates whether a `char` is a control code point
///
/// Control code points are defined in terms of the Unicode General Category
/// 'Cc'.
///
#[inline]
pub fn is_control(c: char) -> bool { general_category::Cc(c) }
/// Indicates whether the `char` is numeric (Nd, Nl, or No)
#[inline]
pub fn is_digit(c: char) -> bool {
general_category::Nd(c)
|| general_category::Nl(c)
|| general_category::No(c)
}
///
/// Checks if a `char` parses as a numeric digit in the given radix
///
@ -227,38 +132,6 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
else { None }
}
/// Convert a char to its uppercase equivalent
///
/// The case-folding performed is the common or simple mapping:
/// it maps one unicode codepoint (one char in Rust) to its uppercase equivalent according
/// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
/// The additional SpecialCasing.txt is not considered here, as it expands to multiple
/// codepoints in some cases.
///
/// A full reference can be found here
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
///
/// # Return value
///
/// Returns the char itself if no conversion was made
#[inline]
pub fn to_uppercase(c: char) -> char {
conversions::to_upper(c)
}
/// Convert a char to its lowercase equivalent
///
/// The case-folding performed is the common or simple mapping
/// see `to_uppercase` for references and more information
///
/// # Return value
///
/// Returns the char itself if no conversion is possible
#[inline]
pub fn to_lowercase(c: char) -> char {
conversions::to_lower(c)
}
///
/// Converts a number to the character representing it
///
@ -355,61 +228,8 @@ pub fn len_utf8_bytes(c: char) -> uint {
}
}
/// Useful functions for Unicode characters.
/// Basic `char` manipulations.
pub trait Char {
/// Returns whether the specified character is considered a Unicode
/// alphabetic code point.
fn is_alphabetic(&self) -> bool;
/// Returns whether the specified character satisfies the 'XID_Start'
/// Unicode property.
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
fn is_XID_start(&self) -> bool;
/// Returns whether the specified `char` satisfies the 'XID_Continue'
/// Unicode property.
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
fn is_XID_continue(&self) -> bool;
/// Indicates whether a character is in lowercase.
///
/// This is defined according to the terms of the Unicode Derived Core
/// Property `Lowercase`.
fn is_lowercase(&self) -> bool;
/// Indicates whether a character is in uppercase.
///
/// This is defined according to the terms of the Unicode Derived Core
/// Property `Uppercase`.
fn is_uppercase(&self) -> bool;
/// Indicates whether a character is whitespace.
///
/// Whitespace is defined in terms of the Unicode Property `White_Space`.
fn is_whitespace(&self) -> bool;
/// Indicates whether a character is alphanumeric.
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
fn is_alphanumeric(&self) -> bool;
/// Indicates whether a character is a control code point.
///
/// Control code points are defined in terms of the Unicode General
/// Category `Cc`.
fn is_control(&self) -> bool;
/// Indicates whether the character is numeric (Nd, Nl, or No).
fn is_digit(&self) -> bool;
/// Checks if a `char` parses as a numeric digit in the given radix.
///
/// Compared to `is_digit()`, this function only recognizes the characters
@ -438,37 +258,6 @@ pub trait Char {
/// Fails if given a radix outside the range [0..36].
fn to_digit(&self, radix: uint) -> Option<uint>;
/// Converts a character to its lowercase equivalent.
///
/// The case-folding performed is the common or simple mapping. See
/// `to_uppercase()` for references and more information.
///
/// # Return value
///
/// Returns the lowercase equivalent of the character, or the character
/// itself if no conversion is possible.
fn to_lowercase(&self) -> char;
/// Converts a character to its uppercase equivalent.
///
/// The case-folding performed is the common or simple mapping: it maps
/// one unicode codepoint (one character in Rust) to its uppercase
/// equivalent according to the Unicode database [1]. The additional
/// `SpecialCasing.txt` is not considered here, as it expands to multiple
/// codepoints in some cases.
///
/// A full reference can be found here [2].
///
/// # Return value
///
/// Returns the uppercase equivalent of the character, or the character
/// itself if no conversion was made.
///
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
///
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
fn to_uppercase(&self) -> char;
/// Converts a number to the character representing it.
///
/// # Return value
@ -526,32 +315,10 @@ pub trait Char {
}
impl Char for char {
fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
fn is_XID_start(&self) -> bool { is_XID_start(*self) }
fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
fn is_lowercase(&self) -> bool { is_lowercase(*self) }
fn is_uppercase(&self) -> bool { is_uppercase(*self) }
fn is_whitespace(&self) -> bool { is_whitespace(*self) }
fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
fn is_control(&self) -> bool { is_control(*self) }
fn is_digit(&self) -> bool { is_digit(*self) }
fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
fn to_lowercase(&self) -> char { to_lowercase(*self) }
fn to_uppercase(&self) -> char { to_uppercase(*self) }
fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
@ -600,5 +367,3 @@ impl Char for char {
}
}
}

View File

@ -108,7 +108,6 @@ pub mod collections;
/* Core types and methods on primitives */
mod unicode;
pub mod any;
pub mod atomics;
pub mod bool;

View File

@ -22,7 +22,7 @@ use cmp;
use cmp::{PartialEq, Eq};
use collections::Collection;
use default::Default;
use iter::{Filter, Map, Iterator};
use iter::{Map, Iterator};
use iter::{DoubleEndedIterator, ExactSize};
use iter::range;
use num::{CheckedMul, Saturating};
@ -204,10 +204,6 @@ pub struct CharSplitsN<'a, Sep> {
invert: bool,
}
/// An iterator over the words of a string, separated by a sequence of whitespace
pub type Words<'a> =
Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
/// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
pub type AnyLines<'a> =
Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
@ -1209,48 +1205,6 @@ pub trait StrSlice<'a> {
/// ```
fn lines_any(&self) -> AnyLines<'a>;
/// An iterator over the words of a string (subsequences separated
/// by any sequence of whitespace). Sequences of whitespace are
/// collapsed, so empty "words" are not included.
///
/// # Example
///
/// ```rust
/// let some_words = " Mary had\ta little \n\t lamb";
/// let v: Vec<&str> = some_words.words().collect();
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
/// ```
fn words(&self) -> Words<'a>;
/// Returns true if the string contains only whitespace.
///
/// Whitespace characters are determined by `char::is_whitespace`.
///
/// # Example
///
/// ```rust
/// assert!(" \t\n".is_whitespace());
/// assert!("".is_whitespace());
///
/// assert!( !"abc".is_whitespace());
/// ```
fn is_whitespace(&self) -> bool;
/// Returns true if the string contains only alphanumeric code
/// points.
///
/// Alphanumeric characters are determined by `char::is_alphanumeric`.
///
/// # Example
///
/// ```rust
/// assert!("Löwe老虎Léopard123".is_alphanumeric());
/// assert!("".is_alphanumeric());
///
/// assert!( !" &*~".is_alphanumeric());
/// ```
fn is_alphanumeric(&self) -> bool;
/// Returns the number of Unicode code points (`char`) that a
/// string holds.
///
@ -1368,15 +1322,6 @@ pub trait StrSlice<'a> {
/// Returns true if `needle` is a suffix of the string.
fn ends_with(&self, needle: &str) -> bool;
/// Returns a string with leading and trailing whitespace removed.
fn trim(&self) -> &'a str;
/// Returns a string with leading whitespace removed.
fn trim_left(&self) -> &'a str;
/// Returns a string with trailing whitespace removed.
fn trim_right(&self) -> &'a str;
/// Returns a string with characters that match `to_trim` removed.
///
/// # Arguments
@ -1748,17 +1693,6 @@ impl<'a> StrSlice<'a> for &'a str {
})
}
#[inline]
fn words(&self) -> Words<'a> {
self.split(char::is_whitespace).filter(|s| !s.is_empty())
}
#[inline]
fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
#[inline]
fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
#[inline]
fn char_len(&self) -> uint { self.chars().count() }
@ -1817,21 +1751,6 @@ impl<'a> StrSlice<'a> for &'a str {
m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
}
#[inline]
fn trim(&self) -> &'a str {
self.trim_left().trim_right()
}
#[inline]
fn trim_left(&self) -> &'a str {
self.trim_left_chars(char::is_whitespace)
}
#[inline]
fn trim_right(&self) -> &'a str {
self.trim_right_chars(char::is_whitespace)
}
#[inline]
fn trim_chars<C: CharEq>(&self, mut to_trim: C) -> &'a str {
let cur = match self.find(|c: char| !to_trim.matches(c)) {

File diff suppressed because it is too large Load Diff

View File

@ -194,3 +194,30 @@ fn test_encode_utf16() {
check('\ua66e', [0xa66e]);
check('\U0001f4a9', [0xd83d, 0xdca9]);
}
#[test]
fn test_width() {
// NUL: the one control character reported with a (zero) width.
assert_eq!('\x00'.width(false),Some(0));
assert_eq!('\x00'.width(true),Some(0));
// Other control characters (here LF) have no defined width.
assert_eq!('\x0A'.width(false),None);
assert_eq!('\x0A'.width(true),None);
// Ordinary ASCII is single-wide in both contexts.
assert_eq!('w'.width(false),Some(1));
assert_eq!('w'.width(true),Some(1));
// NOTE(review): the character literal in the next two asserts appears to
// have been lost/garbled in this copy of the diff; given the expected
// Some(2) it was presumably an East Asian fullwidth ('W') character —
// confirm against the upstream commit.
assert_eq!(''.width(false),Some(2));
assert_eq!(''.width(true),Some(2));
// U+00AD SOFT HYPHEN is deliberately single-wide (it is printable in
// preformatted text), not zero-wide.
assert_eq!('\xAD'.width(false),Some(1));
assert_eq!('\xAD'.width(true),Some(1));
// U+1160 HANGUL JUNGSEONG FILLER is zero-wide.
assert_eq!('\u1160'.width(false),Some(0));
assert_eq!('\u1160'.width(true),Some(0));
// U+00A1 has 'A'mbiguous East Asian width: single-wide normally,
// double-wide when the context is CJK (the boolean argument).
assert_eq!('\u00a1'.width(false),Some(1));
assert_eq!('\u00a1'.width(true),Some(2));
// Combining marks (U+0300 COMBINING GRAVE ACCENT) occupy no columns.
assert_eq!('\u0300'.width(false),Some(0));
assert_eq!('\u0300'.width(true),Some(0));
}

View File

@ -306,12 +306,15 @@
//!
//! ## Perl character classes (Unicode friendly)
//!
//! These classes are based on the definitions provided in
//! [UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
//!
//! <pre class="rust">
//! \d digit ([0-9] + \p{Nd})
//! \d digit (\p{Nd})
//! \D not digit
//! \s whitespace ([\t\n\f\r ] + \p{Z})
//! \s whitespace (\p{White_Space})
//! \S not whitespace
//! \w word character ([0-9A-Za-z_] + \p{L})
//! \w word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
//! \W not word character
//! </pre>
//!
@ -378,6 +381,9 @@ extern crate rand;
#[cfg(test)]
extern crate regex;
// unicode tables for character classes are defined in libunicode
extern crate unicode;
pub use parse::Error;
pub use re::{Regex, Captures, SubCaptures, SubCapturesPos};
pub use re::{FindCaptures, FindMatches};

View File

@ -16,9 +16,7 @@ use std::num;
use std::str;
/// Static data containing Unicode ranges for general categories and scripts.
use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
#[allow(visible_private_types)]
pub mod unicode;
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
/// The maximum number of repetitions allowed with the `{n,m}` syntax.
static MAX_REPEAT: uint = 1000;

File diff suppressed because it is too large Load Diff

View File

@ -195,8 +195,8 @@ mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)))
// Test the Unicode friendliness of Perl character classes.
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)))
mat!(uni_perl_w_not, r"\w+", "", None)
mat!(uni_perl_w_neg, r"\W+", "", Some((0, 3)))
mat!(uni_perl_w_not, r"\w+", "", None)
mat!(uni_perl_w_neg, r"\W+", "", Some((0, 3)))
mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)))
mat!(uni_perl_d_not, r"\d+", "", None)
mat!(uni_perl_d_neg, r"\D+", "", Some((0, 3)))

View File

@ -42,7 +42,7 @@ use compile::{
Save, Jump, Split,
};
use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
use parse::unicode::PERLW;
use unicode::regex::PERLW;
pub type CaptureLocs = Vec<Option<uint>>;

View File

@ -237,6 +237,7 @@ use str::{Str, StrSlice};
use str;
use string::String;
use uint;
use unicode::UnicodeChar;
use vec::Vec;
// Reexports

View File

@ -126,6 +126,7 @@
#[cfg(test)] #[phase(plugin, link)] extern crate log;
extern crate alloc;
extern crate unicode;
extern crate core;
extern crate core_collections = "collections";
extern crate core_rand = "rand";
@ -148,7 +149,6 @@ extern crate rustrt;
pub use core::any;
pub use core::bool;
pub use core::cell;
pub use core::char;
pub use core::clone;
#[cfg(not(test))] pub use core::cmp;
pub use core::default;
@ -180,6 +180,8 @@ pub use core_collections::vec;
pub use rustrt::c_str;
pub use rustrt::local_data;
pub use unicode::char;
pub use core_sync::comm;
// Run tests with libgreen instead of libnative.

View File

@ -24,6 +24,7 @@ use option::{Option, Some, None};
use slice::{Vector, ImmutableVector};
use str::{CharSplits, Str, StrAllocating, StrVector, StrSlice};
use string::String;
use unicode::UnicodeChar;
use vec::Vec;
use super::{contains_nul, BytesContainer, GenericPath, GenericPathUnsafe};
@ -997,7 +998,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
let idx = path.find('\\');
if idx == Some(2) && path.as_bytes()[1] == ':' as u8 {
let c = path.as_bytes()[0];
if c.is_ascii() && ::char::is_alphabetic(c as char) {
if c.is_ascii() && (c as char).is_alphabetic() {
// \\?\C:\ path
return Some(VerbatimDiskPrefix);
}
@ -1021,7 +1022,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
} else if path.len() > 1 && path.as_bytes()[1] == ':' as u8 {
// C:
let c = path.as_bytes()[0];
if c.is_ascii() && ::char::is_alphabetic(c as char) {
if c.is_ascii() && (c as char).is_alphabetic() {
return Some(DiskPrefix);
}
}

View File

@ -89,6 +89,7 @@
#[doc(no_inline)] pub use slice::{Vector, VectorVector};
#[doc(no_inline)] pub use slice::MutableVectorAllocating;
#[doc(no_inline)] pub use string::String;
#[doc(no_inline)] pub use unicode::{UnicodeChar, UnicodeStrSlice};
#[doc(no_inline)] pub use vec::Vec;
// Reexported runtime types

View File

@ -12,7 +12,6 @@
#![allow(non_camel_case_types)]
use char::Char;
use collections::Collection;
use from_str::from_str;
use io::{IoResult, Writer};
@ -22,6 +21,7 @@ use os;
use result::{Ok, Err};
use str::StrSlice;
use sync::atomics;
use unicode::UnicodeChar;
pub use self::imp::write;

111
src/libunicode/decompose.rs Normal file
View File

@ -0,0 +1,111 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
Functions for computing canonical and compatible decompositions
for Unicode characters.
*/
use core::option::{Option, Some, None};
use core::slice::ImmutableVector;
use tables::normalization::{canonical_table, compatibility_table};
// Binary-searches a table sorted on its first (char) element and returns the
// decomposition slice paired with `c`, or `None` when `c` has no entry.
// NOTE(review): relies on the pre-1.0 `slice::bsearch` API.
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(val, _)| {
if c == val { Equal }
else if val < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, result) = r[idx];
Some(result)
}
None => None
}
}
/// Compute canonical Unicode decomposition for character.
/// Each decomposed `char` is fed to the callback `i`, in order.
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
/// Compute canonical or compatible Unicode decomposition for character.
/// Like `decompose_canonical`, but additionally applies the compatibility
/// decomposition table.
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
// Shared worker for the two public entry points: recursively decomposes `c`,
// emitting each resulting scalar through `i`. `k` selects whether the
// compatibility table is consulted after the canonical one.
fn d(c: char, i: |char|, k: bool) {
use core::iter::Iterator;
// 7-bit ASCII never decomposes
if c <= '\x7f' { i(c); return; }
// Perform decomposition for Hangul
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
decompose_hangul(c, i);
return;
}
// First check the canonical decompositions
match bsearch_table(c, canonical_table) {
Some(canon) => {
for x in canon.iter() {
// Recurse so multi-level decompositions are fully expanded;
// `|b| i(b)` re-borrows the callback for the nested call.
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Bottom out if we're not doing compat.
if !k { i(c); return; }
// Then check the compatibility decompositions
match bsearch_table(c, compatibility_table) {
Some(compat) => {
for x in compat.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Finally bottom out.
i(c);
}
// Constants from Unicode 6.3.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: u32 = 0xAC00; // first precomposed Hangul syllable
static L_BASE: u32 = 0x1100; // first leading consonant (choseong)
static V_BASE: u32 = 0x1161; // first vowel (jungseong)
static T_BASE: u32 = 0x11A7; // trailing consonant (jongseong) base
static L_COUNT: u32 = 19; // number of leading consonants
static V_COUNT: u32 = 21; // number of vowels
static T_COUNT: u32 = 28; // number of trailing consonants (incl. none)
static N_COUNT: u32 = (V_COUNT * T_COUNT); // syllables per leading consonant
static S_COUNT: u32 = (L_COUNT * N_COUNT); // total precomposed syllables
// Decompose a precomposed Hangul syllable into its jamo arithmetically
// (Hangul decompositions are not stored in the tables). Callers (`d`) have
// already verified `s` lies in [S_BASE, S_BASE + S_COUNT).
fn decompose_hangul(s: char, f: |char|) {
use core::mem::transmute;
let si = s as u32 - S_BASE;
let li = si / N_COUNT;
// SAFETY-style note: each transmuted value is a valid scalar because the
// arithmetic keeps results within the jamo ranges starting at L_BASE,
// V_BASE and T_BASE respectively.
unsafe {
f(transmute(L_BASE + li));
let vi = (si % N_COUNT) / T_COUNT;
f(transmute(V_BASE + vi));
let ti = si % T_COUNT;
// A zero trailing index means the syllable has no jongseong.
if ti > 0 {
f(transmute(T_BASE + ti));
}
}
}

77
src/libunicode/lib.rs Normal file
View File

@ -0,0 +1,77 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! # The Unicode Library
//!
//! Unicode-intensive functions for `char` and `str` types.
//!
//! This crate provides a collection of Unicode-related functionality,
//! including decompositions, conversions, etc., and provides traits
//! implementing these functions for the `char` and `str` types.
//!
//! The functionality included here is only that which is necessary to
//! provide for basic string-related manipulations. This crate does not
//! (yet) aim to provide a full set of Unicode tables.
#![crate_id = "unicode#0.11.0"]
// NOTE(review): `crate_name` duplicates the name in `crate_id` — presumably
// both are kept for a compiler transition; confirm before removing either.
#![crate_name = "unicode"]
#![experimental]
#![license = "MIT/ASL2"]
#![crate_type = "rlib"]
#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
       html_favicon_url = "http://www.rust-lang.org/favicon.ico",
       html_root_url = "http://doc.rust-lang.org/",
       html_playground_url = "http://play.rust-lang.org/")]
// This crate sits below libstd in the crate hierarchy and links only
// against libcore.
#![no_std]
#![allow(unused_attribute)] // NOTE: remove after stage0
extern crate core;
// Re-exports consumed by downstream crates (libstd, libregex, ...).
pub use tables::normalization::canonical_combining_class;
pub use tables::regex;
pub use u_char::UnicodeChar;
pub use u_str::UnicodeStrSlice;
pub use u_str::Words;
mod decompose;
mod tables; // generated by src/etc/unicode.py from the Unicode data files
mod u_char;
mod u_str;
// re-export char so that std et al see it correctly
/// Character manipulation (`char` type, Unicode Scalar Value)
///
/// This module provides the `Char` and `UnicodeChar` traits, as well as their
/// implementation for the primitive `char` type, in order to allow basic character
/// manipulation.
///
/// A `char` actually represents a
/// *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
/// as it can contain any Unicode code point except high-surrogate and
/// low-surrogate code points.
///
/// As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
/// (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
/// however the converse is not always true due to the above range limits
/// and, as such, should be performed via the `from_u32` function.
pub mod char {
    // Unicode-unaware basics re-exported from libcore...
    pub use core::char::{MAX, from_u32, is_digit_radix, to_digit};
    pub use core::char::{from_digit, escape_unicode, escape_default};
    pub use core::char::{len_utf8_bytes, Char};
    // ...plus the Unicode-aware functionality defined in this crate.
    pub use decompose::decompose_canonical;
    pub use decompose::decompose_compatible;
    pub use u_char::{is_alphabetic, is_XID_start, is_XID_continue};
    pub use u_char::{is_lowercase, is_uppercase, is_whitespace};
    pub use u_char::{is_alphanumeric, is_control, is_digit};
    pub use u_char::{to_uppercase, to_lowercase, width, UnicodeChar};
}

6445
src/libunicode/tables.rs Normal file

File diff suppressed because it is too large Load Diff

266
src/libunicode/u_char.rs Normal file
View File

@ -0,0 +1,266 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
* Unicode-intensive `char` methods.
*
* These methods implement functionality for `char` that requires knowledge of
* Unicode definitions, including normalization, categorization, and display information.
*/
use core::option::Option;
use tables::{derived_property, property, general_category, conversions, charwidth};
/// Returns whether the specified `char` is considered a Unicode alphabetic
/// code point (Derived Core Property 'Alphabetic')
// `#[inline]` added for consistency with the other one-line wrappers in this
// module; without it this non-generic fn cannot inline across crates.
#[inline]
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
// `#[inline]` added for consistency with the other one-line wrappers.
#[inline]
#[allow(non_snake_case_functions)]
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
// `#[inline]` added for consistency with the other one-line wrappers.
#[inline]
#[allow(non_snake_case_functions)]
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
/// Indicates whether a `char` is in lower case, per the Unicode Derived
/// Core Property 'Lowercase'
#[inline]
pub fn is_lowercase(ch: char) -> bool {
    derived_property::Lowercase(ch)
}
/// Indicates whether a `char` is in upper case, per the Unicode Derived
/// Core Property 'Uppercase'
#[inline]
pub fn is_uppercase(ch: char) -> bool {
    derived_property::Uppercase(ch)
}
/// Indicates whether a `char` is whitespace, in terms of the Unicode
/// Property 'White_Space'
#[inline]
pub fn is_whitespace(ch: char) -> bool {
    // Fast path: handle the ASCII whitespace characters (space plus
    // \t, \n, \v, \f, \r) without consulting the Unicode tables.
    if ch == ' ' || ('\x09' <= ch && ch <= '\x0d') {
        return true;
    }
    property::White_Space(ch)
}
///
/// Indicates whether a `char` is alphanumeric
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
///
#[inline]
pub fn is_alphanumeric(c: char) -> bool {
derived_property::Alphabetic(c)
|| general_category::N(c)
}
/// Indicates whether a `char` is a control code point, in terms of the
/// Unicode General Category 'Cc'
#[inline]
pub fn is_control(ch: char) -> bool {
    general_category::Cc(ch)
}
/// Indicates whether the `char` is numeric, i.e. in one of the Unicode
/// General Categories 'Nd', 'Nl', or 'No'
#[inline]
pub fn is_digit(ch: char) -> bool {
    general_category::N(ch)
}
/// Convert a char to its uppercase equivalent
///
/// Only the common (simple, single-codepoint) mapping from the Unicode
/// database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt is
/// applied; the multi-codepoint expansions of SpecialCasing.txt are not
/// considered here.
///
/// A full reference can be found here
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
///
/// # Return value
///
/// The uppercase equivalent, or the char itself if no conversion was made
#[inline]
pub fn to_uppercase(ch: char) -> char {
    conversions::to_upper(ch)
}
/// Convert a char to its lowercase equivalent
///
/// Only the common (simple, single-codepoint) case mapping is applied;
/// see `to_uppercase` for references and more information.
///
/// # Return value
///
/// The lowercase equivalent, or the char itself if no conversion is possible
#[inline]
pub fn to_lowercase(ch: char) -> char {
    conversions::to_lower(ch)
}
/// Returns this character's displayed width in columns, or `None` if it is a
/// control character other than `'\x00'`.
///
/// `is_cjk` determines behavior for characters in the Ambiguous category:
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
/// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// recommends that these characters be treated as 1 column (i.e.,
/// `is_cjk` = `false`) if the context cannot be reliably determined.
// `#[inline]` added for consistency with the other one-line wrappers in this
// module; the body is a plain table lookup forwarder.
#[inline]
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
    charwidth::width(c, is_cjk)
}
/// Useful functions for Unicode characters.
///
/// Implemented for `char` below; each method mirrors the free function of
/// the same name in this module.
pub trait UnicodeChar {
    /// Returns whether the specified character is considered a Unicode
    /// alphabetic code point.
    fn is_alphabetic(&self) -> bool;

    /// Returns whether the specified character satisfies the 'XID_Start'
    /// Unicode property.
    ///
    /// 'XID_Start' is a Unicode Derived Property specified in
    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
    /// mostly similar to ID_Start but modified for closure under NFKx.
    #[allow(non_snake_case_functions)]
    fn is_XID_start(&self) -> bool;

    /// Returns whether the specified `char` satisfies the 'XID_Continue'
    /// Unicode property.
    ///
    /// 'XID_Continue' is a Unicode Derived Property specified in
    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
    /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
    #[allow(non_snake_case_functions)]
    fn is_XID_continue(&self) -> bool;

    /// Indicates whether a character is in lowercase.
    ///
    /// This is defined according to the terms of the Unicode Derived Core
    /// Property `Lowercase`.
    fn is_lowercase(&self) -> bool;

    /// Indicates whether a character is in uppercase.
    ///
    /// This is defined according to the terms of the Unicode Derived Core
    /// Property `Uppercase`.
    fn is_uppercase(&self) -> bool;

    /// Indicates whether a character is whitespace.
    ///
    /// Whitespace is defined in terms of the Unicode Property `White_Space`.
    fn is_whitespace(&self) -> bool;

    /// Indicates whether a character is alphanumeric.
    ///
    /// Alphanumericness is defined in terms of the Unicode General Categories
    /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
    fn is_alphanumeric(&self) -> bool;

    /// Indicates whether a character is a control code point.
    ///
    /// Control code points are defined in terms of the Unicode General
    /// Category `Cc`.
    fn is_control(&self) -> bool;

    /// Indicates whether the character is numeric (Nd, Nl, or No).
    fn is_digit(&self) -> bool;

    /// Converts a character to its lowercase equivalent.
    ///
    /// The case-folding performed is the common or simple mapping. See
    /// `to_uppercase()` for references and more information.
    ///
    /// # Return value
    ///
    /// Returns the lowercase equivalent of the character, or the character
    /// itself if no conversion is possible.
    fn to_lowercase(&self) -> char;

    /// Converts a character to its uppercase equivalent.
    ///
    /// The case-folding performed is the common or simple mapping: it maps
    /// one unicode codepoint (one character in Rust) to its uppercase
    /// equivalent according to the Unicode database [1]. The additional
    /// `SpecialCasing.txt` is not considered here, as it expands to multiple
    /// codepoints in some cases.
    ///
    /// A full reference can be found here [2].
    ///
    /// # Return value
    ///
    /// Returns the uppercase equivalent of the character, or the character
    /// itself if no conversion was made.
    ///
    /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
    ///
    /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
    fn to_uppercase(&self) -> char;

    /// Returns this character's displayed width in columns, or `None` if it is a
    /// control character other than `'\x00'`.
    ///
    /// `is_cjk` determines behavior for characters in the Ambiguous category:
    /// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
    /// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
    /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
    /// recommends that these characters be treated as 1 column (i.e.,
    /// `is_cjk` = `false`) if the context cannot be reliably determined.
    fn width(&self, is_cjk: bool) -> Option<uint>;
}
impl UnicodeChar for char {
    // Each method simply forwards to the corresponding free function in this
    // module. `#[inline]` mirrors the hints on those functions so the
    // one-line forwarders can be inlined across crate boundaries.
    #[inline]
    fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
    #[inline]
    fn is_XID_start(&self) -> bool { is_XID_start(*self) }
    #[inline]
    fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
    #[inline]
    fn is_lowercase(&self) -> bool { is_lowercase(*self) }
    #[inline]
    fn is_uppercase(&self) -> bool { is_uppercase(*self) }
    #[inline]
    fn is_whitespace(&self) -> bool { is_whitespace(*self) }
    #[inline]
    fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
    #[inline]
    fn is_control(&self) -> bool { is_control(*self) }
    #[inline]
    fn is_digit(&self) -> bool { is_digit(*self) }
    #[inline]
    fn to_lowercase(&self) -> char { to_lowercase(*self) }
    #[inline]
    fn to_uppercase(&self) -> char { to_uppercase(*self) }
    #[inline]
    fn width(&self, is_cjk: bool) -> Option<uint> { width(*self, is_cjk) }
}

119
src/libunicode/u_str.rs Normal file
View File

@ -0,0 +1,119 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
* Unicode-intensive string manipulations.
*
* This module provides functionality to `str` that requires the Unicode
* methods provided by the UnicodeChar trait.
*/
use core::collections::Collection;
use core::iter::{Filter};
use core::str::{CharSplits, StrSlice};
use core::iter::Iterator;
use u_char;
/// An iterator over the words of a string, separated by a sequence of whitespace
///
/// Produced by `UnicodeStrSlice::words`. The `extern "Rust" fn(char) -> bool`
/// parameter must match the exact type that `u_char::is_whitespace` coerces
/// to in that method's body.
pub type Words<'a> =
    Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
/// Methods for Unicode string slices
pub trait UnicodeStrSlice<'a> {
    /// An iterator over the words of a string (subsequences separated
    /// by any sequence of whitespace). Sequences of whitespace are
    /// collapsed, so empty "words" are not included.
    ///
    /// # Example
    ///
    /// ```rust
    /// let some_words = " Mary had\ta little \n\t lamb";
    /// let v: Vec<&str> = some_words.words().collect();
    /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
    /// ```
    fn words(&self) -> Words<'a>;

    /// Returns true if the string contains only whitespace.
    ///
    /// Whitespace characters are determined by `char::is_whitespace`.
    ///
    /// # Example
    ///
    /// ```rust
    /// assert!(" \t\n".is_whitespace());
    /// assert!("".is_whitespace());
    ///
    /// assert!( !"abc".is_whitespace());
    /// ```
    fn is_whitespace(&self) -> bool;

    /// Returns true if the string contains only alphanumeric code
    /// points.
    ///
    /// Alphanumeric characters are determined by `char::is_alphanumeric`.
    ///
    /// # Example
    ///
    /// ```rust
    /// assert!("Löwe老虎Léopard123".is_alphanumeric());
    /// assert!("".is_alphanumeric());
    ///
    /// assert!( !" &*~".is_alphanumeric());
    /// ```
    fn is_alphanumeric(&self) -> bool;

    // NOTE(review): `width` is currently disabled. Its documentation is kept
    // as ordinary `//` comments (not `///`) so that rustdoc does not attach
    // it to the following item, `trim`.
    //
    // Returns a string's displayed width in columns, treating control
    // characters as zero-width.
    //
    // `is_cjk` determines behavior for characters in the Ambiguous category:
    // if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
    // In CJK locales, `is_cjk` should be `true`, else it should be `false`.
    // [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
    // recommends that these characters be treated as 1 column (i.e.,
    // `is_cjk` = `false`) if the locale is unknown.
    //fn width(&self, is_cjk: bool) -> uint;

    /// Returns a string with leading and trailing whitespace removed.
    fn trim(&self) -> &'a str;

    /// Returns a string with leading whitespace removed.
    fn trim_left(&self) -> &'a str;

    /// Returns a string with trailing whitespace removed.
    fn trim_right(&self) -> &'a str;
}
impl<'a> UnicodeStrSlice<'a> for &'a str {
    #[inline]
    fn words(&self) -> Words<'a> {
        // `u_char::is_whitespace` is passed as a fn item so it coerces to
        // the `extern "Rust" fn(char) -> bool` type baked into the `Words`
        // alias; a closure here would change the return type.
        self.split(u_char::is_whitespace).filter(|s| !s.is_empty())
    }
    // True iff every char of the slice is Unicode whitespace (vacuously
    // true for the empty string).
    #[inline]
    fn is_whitespace(&self) -> bool { self.chars().all(u_char::is_whitespace) }
    // True iff every char is alphanumeric (vacuously true for "").
    #[inline]
    fn is_alphanumeric(&self) -> bool { self.chars().all(u_char::is_alphanumeric) }
    #[inline]
    fn trim(&self) -> &'a str {
        // Trimming both ends is just trimming left, then right.
        self.trim_left().trim_right()
    }
    #[inline]
    fn trim_left(&self) -> &'a str {
        self.trim_left_chars(u_char::is_whitespace)
    }
    #[inline]
    fn trim_right(&self) -> &'a str {
        self.trim_right_chars(u_char::is_whitespace)
    }
}