auto merge of #15283 : kwantam/rust/master, r=alexcrichton

Add libunicode; move unicode functions from core

- created new crate, libunicode, below libstd
- split `Char` trait into `Char` (libcore) and `UnicodeChar` (libunicode)
  - Unicode-aware functions now live in libunicode
    - `is_alphabetic`, `is_XID_start`, `is_XID_continue`, `is_lowercase`,
      `is_uppercase`, `is_whitespace`, `is_alphanumeric`, `is_control`, `is_digit`,
      `to_uppercase`, `to_lowercase`
  - added `width` method in UnicodeChar trait
    - determines printed width of character in columns, or None if it is a non-NULL control character
    - takes a boolean argument indicating whether the present context is CJK or not (characters with 'A'mbiguous widths are double-wide in CJK contexts, single-wide otherwise)
- split `StrSlice` into `StrSlice` (libcore) and `UnicodeStrSlice` (libunicode)
  - functionality formerly in `StrSlice` that relied upon Unicode functionality from `Char` is now in `UnicodeStrSlice`
    - `words`, `is_whitespace`, `is_alphanumeric`, `trim`, `trim_left`, `trim_right`
  - also moved `Words` type alias into libunicode because `words` method is in `UnicodeStrSlice`
- unified Unicode tables from libcollections, libcore, and libregex into libunicode
- updated `unicode.py` in `src/etc` to generate aforementioned tables
- generated new tables based on latest Unicode data
- added `UnicodeChar` and `UnicodeStrSlice` traits to prelude
- libunicode is now the collection point for the `std::char` module, combining the libunicode functionality with the `Char` functionality from libcore
  - thus, moved doc comment for `char` from `core::char` to `unicode::char`
- libcollections remains the collection point for `std::str`

The Unicode-aware functions that previously lived in the `Char` and `StrSlice` traits are no longer available to programs that only use libcore. To regain use of these methods, include the libunicode crate and `use` the `UnicodeChar` and/or `UnicodeStrSlice` traits:

    extern crate unicode;
    use unicode::UnicodeChar;
    use unicode::UnicodeStrSlice;
    use unicode::Words; // if you want to use the words() method

NOTE: this does *not* impact programs that use libstd, since UnicodeChar and UnicodeStrSlice have been added to the prelude.

closes #15224
[breaking-change]
This commit is contained in:
bors 2014-07-09 18:36:30 +00:00
commit fa7cbb5a46
27 changed files with 7445 additions and 11597 deletions

View File

@ -51,17 +51,19 @@
TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
uuid serialize sync getopts collections num test time rand \
url log regex graphviz core rlibc alloc debug rustrt
url log regex graphviz core rlibc alloc debug rustrt \
unicode
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros fmt_macros
CRATES := $(TARGET_CRATES) $(HOST_CRATES)
TOOLS := compiletest rustdoc rustc
DEPS_core :=
DEPS_rlibc :=
DEPS_unicode := core
DEPS_alloc := core libc native:jemalloc
DEPS_debug := std
DEPS_rustrt := alloc core libc collections native:rustrt_native
DEPS_std := core libc rand alloc collections rustrt sync \
DEPS_std := core libc rand alloc collections rustrt sync unicode \
native:rust_builtin native:backtrace
DEPS_graphviz := std
DEPS_green := std native:context_switch
@ -82,7 +84,7 @@ DEPS_semver := std
DEPS_uuid := std serialize
DEPS_sync := core alloc rustrt collections
DEPS_getopts := std
DEPS_collections := core alloc
DEPS_collections := core alloc unicode
DEPS_fourcc := rustc syntax std
DEPS_hexfloat := rustc syntax std
DEPS_num := std
@ -108,6 +110,7 @@ ONLY_RLIB_rlibc := 1
ONLY_RLIB_alloc := 1
ONLY_RLIB_rand := 1
ONLY_RLIB_collections := 1
ONLY_RLIB_unicode := 1
################################################################################
# You should not need to edit below this line

View File

@ -15,11 +15,11 @@
# The names of crates that must be tested
# libcore tests are in a separate crate
# libcore/libunicode tests are in a separate crate
DEPS_coretest :=
$(eval $(call RUST_CRATE,coretest))
TEST_TARGET_CRATES = $(filter-out core,$(TARGET_CRATES)) coretest
TEST_TARGET_CRATES = $(filter-out core unicode,$(TARGET_CRATES)) coretest
TEST_DOC_CRATES = $(DOC_CRATES)
TEST_HOST_CRATES = $(HOST_CRATES)
TEST_CRATES = $(TEST_TARGET_CRATES) $(TEST_HOST_CRATES)

View File

@ -1,183 +0,0 @@
#!/usr/bin/env python2
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
from __future__ import absolute_import, division, print_function
import argparse
from collections import defaultdict
import csv
import datetime
import urllib2
BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # Fix: 'No' previously expanded to ['No'] (itself), so "other number"
    # characters were dropped from the derived 'N' class; per UAX#44
    # Table 12, N = Nd | Nl | No.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
def as_4byte_uni(n):
    """Render codepoint n as a '\\UXXXXXXXX' escape, zero-padded to 8 hex digits."""
    digits = hex(n)[2:]
    pad = '0' * (8 - len(digits))
    return '\\U' + pad + digits
def expand_cat(c):
    """Return category c's parent categories (if any) followed by c itself."""
    cats = list(expanded_categories.get(c, []))
    cats.append(c)
    return cats
def is_valid_unicode(n):
    """True iff n is a Unicode scalar value (0..0x10FFFF, excluding surrogates)."""
    if n < 0 or n > 0x10FFFF:
        return False
    # The surrogate block 0xD800..0xDFFF is not a scalar value.
    return not (0xD7FF < n < 0xE000)
def read_cats(f):
    """Parse UnicodeData.txt rows from f into {category: [codepoint, ...]}.

    Each codepoint is recorded under its own general category and every
    expanded (parent) category; invalid scalar values are skipped.
    """
    by_cat = defaultdict(list)
    for row in csv.reader(f, delimiter=';'):
        codepoint = int(row[0], 16)
        if not is_valid_unicode(codepoint):
            continue
        for cat in expand_cat(row[2]):
            by_cat[cat].append(codepoint)
    return by_cat
def read_scripts(f):
    """Parse Scripts.txt lines from f into {script_name: [codepoint, ...]}.

    Lines look like "XXXX ; Name # ..." for a single codepoint or
    "XXXX..YYYY ; Name # ..." for an inclusive range. Blank lines and
    comment lines are skipped, as are invalid scalar values.
    """
    assigned = defaultdict(list)
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        # Fix: map(...)[:2] is not subscriptable under Python 3 (map is
        # lazy); a list comprehension behaves identically on both versions.
        hexes, name = [part.strip() for part in line.split(';')][:2]
        name = name[:name.index('#')].strip()
        if '..' not in hexes:
            hex = int(hexes, 16)
            if is_valid_unicode(hex):
                assigned[name].append(hex)
        else:
            hex1, hex2 = [int(s, 16) for s in hexes.split('..')]
            # Fix: range instead of the Python-2-only xrange; iteration
            # results are identical.
            for hex in range(hex1, hex2 + 1):
                if is_valid_unicode(hex):
                    assigned[name].append(hex)
    return assigned
def group(letters):
    """Collapse codepoints into sorted inclusive (start, end) runs of
    consecutive values. Requires at least one element (raises IndexError
    on empty input, like the original)."""
    ordered = sorted(set(letters))
    start = end = ordered[0]
    ranges = []
    # ordered is strictly increasing, so each next value either extends
    # the current run by one or starts a new run.
    for cp in ordered[1:]:
        if cp == end + 1:
            end = cp
        else:
            ranges.append((start, end))
            start = end = cp
    ranges.append((start, end))
    return ranges
def ranges_to_rust(rs):
    """Format (start, end) codepoint ranges as Rust char-tuple literals,
    joined with the separator used throughout the generated file."""
    cells = []
    for s, e in rs:
        cells.append("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)))
    return ',\n '.join(cells)
def groups_to_rust(groups):
    """Render {name: ranges} as named Rust table entries, sorted by name."""
    entries = ['("%s", &[\n %s\n ]),'
               % (name, ranges_to_rust(groups[name]))
               for name in sorted(groups)]
    return '\n'.join(entries)
# Command-line entry point: read (or download) the Unicode data files and
# print a generated Rust source file with regex character-class tables.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    aa = parser.add_argument
    aa('--local', action='store_true',
       help='When set, Scripts.txt and UnicodeData.txt will be read from '
            'the CWD.')
    aa('--base-url', type=str, default=BASE_URL,
       help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()
    # Either read the data files from the working directory or fetch them
    # over HTTP (urllib2 — this script is Python 2 only).
    if args.local:
        cats = read_cats(open(DATA))
        scripts = read_scripts(open(SCRIPTS))
    else:
        cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA))
        scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS))
    # Get Rust code for all Unicode general categories and scripts.
    combined = dict(cats, **scripts)
    unigroups = groups_to_rust({k: group(letters)
                                for k, letters in combined.items()})
    # Now get Perl character classes that are Unicode friendly.
    # \d: ASCII digits plus the Unicode Nd (decimal number) category.
    perld = range(ord('0'), ord('9') + 1)
    dgroups = ranges_to_rust(group(perld + cats['Nd'][:]))
    # \s: ASCII whitespace plus the Unicode Z (separator) category.
    perls = map(ord, ['\t', '\n', '\x0C', '\r', ' '])
    sgroups = ranges_to_rust(group(perls + cats['Z'][:]))
    # \w: underscore, ASCII alphanumerics, plus the Unicode L (letter)
    # category. (range/map return lists under Python 2, so '+' works.)
    low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1))
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L'][:]))
    # Template for the emitted Rust source; {groups}/{dgroups}/{sgroups}/
    # {wgroups} are filled by str.format below ({{...}} are literal braces).
    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.
use parse::{{Class, NamedClasses}};
pub static UNICODE_CLASSES: NamedClasses = &[
{groups}
];
pub static PERLD: Class = &[
{dgroups}
];
pub static PERLS: Class = &[
{sgroups}
];
pub static PERLW: Class = &[
{wgroups}
];
'''
    # Timestamp the generated file so its provenance is visible.
    now = datetime.datetime.now()
    print(tpl.format(date=str(now), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))

View File

@ -10,17 +10,46 @@
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
# code covering the core properties. Since this is a pretty rare event we
# just store this out-of-line and check the unicode.rs file into git.
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - EastAsianWidth.txt
# - PropList.txt
# - Scripts.txt
# - UnicodeData.txt
#
# The emitted code is "the minimum we think is necessary for libstd", that
# is, to support basic operations of the compiler and "most nontrivial rust
# programs". It is not meant to be a complete implementation of unicode.
# For that we recommend you use a proper binding to libicu.
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
import fileinput, re, os, sys, operator
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
#![allow(missing_doc, non_uppercase_statics, non_snake_case_functions)]
'''
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # Fix: 'No' previously expanded to ['No'] (itself), so "other number"
    # characters never reached the derived 'N' class; per UAX#44
    # Table 12, N = Nd | Nl | No.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
def fetch(f):
if not os.path.exists(f):
@ -31,21 +60,17 @@ def fetch(f):
sys.stderr.write("cannot load %s" % f)
exit(1)
def is_valid_unicode(n):
    """True for Unicode scalar values: 0..0x10FFFF minus the surrogate
    block 0xD800..0xDFFF."""
    below_surrogates = 0 <= n <= 0xD7FF
    above_surrogates = 0xE000 <= n <= 0x10FFFF
    return below_surrogates or above_surrogates
def load_unicode_data(f):
fetch(f)
gencats = {}
upperlower = {}
lowerupper = {}
combines = []
combines = {}
canon_decomp = {}
compat_decomp = {}
curr_cat = ""
curr_combine = ""
c_lo = 0
c_hi = 0
com_lo = 0
com_hi = 0
for line in fileinput.input(f):
fields = line.split(";")
@ -58,6 +83,9 @@ def load_unicode_data(f):
code_org = code
code = int(code, 16)
if not is_valid_unicode(code):
continue
# generate char to char direct common and simple conversions
# uppercase to lowercase
if gencat == "Lu" and lowcase != "" and code_org != lowcase:
@ -67,6 +95,7 @@ def load_unicode_data(f):
if gencat == "Ll" and upcase != "" and code_org != upcase:
lowerupper[code] = int(upcase, 16)
# store decomposition, if given
if decomp != "":
if decomp.startswith('<'):
seq = []
@ -79,38 +108,76 @@ def load_unicode_data(f):
seq.append(int(i, 16))
canon_decomp[code] = seq
if curr_cat == "":
curr_cat = gencat
c_lo = code
c_hi = code
# place letter in categories as appropriate
for cat in [gencat] + expanded_categories.get(gencat, []):
if cat not in gencats:
gencats[cat] = []
gencats[cat].append(code)
if curr_cat == gencat:
c_hi = code
else:
if curr_cat not in gencats:
gencats[curr_cat] = []
# record combining class, if any
if combine != "0":
if combine not in combines:
combines[combine] = []
combines[combine].append(code)
gencats[curr_cat].append((c_lo, c_hi))
curr_cat = gencat
c_lo = code
c_hi = code
if curr_combine == "":
curr_combine = combine
com_lo = code
com_hi = code
if curr_combine == combine:
com_hi = code
else:
if curr_combine != "0":
combines.append((com_lo, com_hi, curr_combine))
curr_combine = combine
com_lo = code
com_hi = code
gencats = group_cats(gencats)
combines = to_combines(group_cats(combines))
return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
def group_cats(cats):
    """Apply group_cat to every value of a {category: [codepoint, ...]} map."""
    return dict((name, group_cat(codes)) for name, codes in cats.items())
def group_cat(cat):
    """Turn a collection of codepoints into sorted inclusive (first, last)
    runs of consecutive values. Raises IndexError on empty input, like
    the original."""
    codes = sorted(set(cat))
    first = codes[0]
    run_start, run_end = first, first
    runs = []
    for code in codes[1:]:
        # codes is strictly increasing: either extend the run or close it.
        if code == run_end + 1:
            run_end = code
        else:
            runs.append((run_start, run_end))
            run_start = run_end = code
    runs.append((run_start, run_end))
    return runs
def ungroup_cat(cat):
    """Expand inclusive (lo, hi) ranges back into a flat codepoint list."""
    flat = []
    for lo, hi in cat:
        flat.extend(range(lo, hi + 1))
    return flat
def to_combines(combs):
    """Flatten {combining_class: [(lo, hi), ...]} into one list of
    (lo, hi, combining_class) triples ordered by range start."""
    triples = [(lo, hi, cls)
               for cls in combs
               for (lo, hi) in combs[cls]]
    return sorted(triples, key=lambda t: t[0])
def format_table_content(f, content, indent):
    """Re-wrap a comma-separated list so each emitted line stays under
    98 columns, writing the result to f with the given indent."""
    pad = " " * indent
    line_buf = pad
    need_sep = False  # set once a chunk lands on the current line
    for piece in content.split(","):
        if len(line_buf) + len(piece) < 98:
            line_buf += (", " + piece) if need_sep else piece
            need_sep = True
        else:
            # Current line is full: flush it and start a fresh one.
            f.write(line_buf + ",\n")
            line_buf = pad + piece
    f.write(line_buf)
def load_properties(f, interestingprops):
fetch(f)
props = {}
@ -134,7 +201,7 @@ def load_properties(f, interestingprops):
prop = m.group(3)
else:
continue
if prop not in interestingprops:
if interestingprops and prop not in interestingprops:
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
@ -143,6 +210,43 @@ def load_properties(f, interestingprops):
props[prop].append((d_lo, d_hi))
return props
# load all widths of want_widths, except those in except_cats
def load_east_asian_width(want_widths, except_cats):
    """Parse EastAsianWidth.txt into {width_class: [(lo, hi), ...]}.

    Only width classes listed in want_widths are kept, and entries whose
    general category (from the trailing comment) is in except_cats are
    dropped.
    """
    fname = "EastAsianWidth.txt"
    fetch(fname)
    # Two line shapes: a single codepoint, or a lo..hi range.
    single = re.compile("^([0-9A-F]+);(\w+) +# (\w+)")
    ranged = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")
    widths = {}
    for line in fileinput.input(fname):
        m = single.match(line)
        if m:
            lo, hi = m.group(1), m.group(1)
            width, cat = m.group(2), m.group(3)
        else:
            m = ranged.match(line)
            if not m:
                continue
            lo, hi = m.group(1), m.group(2)
            width, cat = m.group(3), m.group(4)
        if cat in except_cats or width not in want_widths:
            continue
        widths.setdefault(width, []).append((int(lo, 16), int(hi, 16)))
    return widths
def escape_char(c):
if c <= 0xff:
return "'\\x%2.2x'" % c
@ -150,59 +254,72 @@ def escape_char(c):
return "'\\u%4.4x'" % c
return "'\\U%8.8x'" % c
def ch_prefix(ix):
    """Separator written before table entry ix: padding only for the
    first entry, a comma plus line break every second entry, and a plain
    comma otherwise. (Literals reproduced exactly as in the original.)"""
    if ix == 0:
        return " "
    return ",\n " if ix % 2 == 0 else ", "
def emit_bsearch_range_table(f):
f.write("""
fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use cmp::{Equal, Less, Greater};
use slice::ImmutableVector;
use option::None;
use core::cmp::{Equal, Less, Greater};
use core::slice::ImmutableVector;
use core::option::None;
r.bsearch(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) != None
}\n
""");
""")
def emit_property_module(f, mod, tbl):
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
    """Write one Rust static table named `name` to f, rendering each
    element of t_data with pfun and line-wrapping via
    format_table_content."""
    pub_string = "pub " if is_pub else ""
    f.write(" %sstatic %s: %s = &[\n" % (pub_string, name, t_type))
    # Join rendered entries with bare commas; the wrapper reinserts the
    # ", " spacing when it re-flows the line.
    rendered = ",".join(pfun(dat) for dat in t_data)
    format_table_content(f, rendered, 8)
    f.write("\n ];\n\n")
def emit_property_module(f, mod, tbl, emit_fn):
f.write("pub mod %s {\n" % mod)
keys = tbl.keys()
keys.sort()
for cat in keys:
if cat not in ["Nd", "Nl", "No", "Cc",
"XID_Start", "XID_Continue", "Alphabetic",
"Lowercase", "Uppercase", "White_Space"]:
continue
f.write(" static %s_table : &'static [(char,char)] = &[\n" % cat)
ix = 0
for pair in tbl[cat]:
f.write(ch_prefix(ix))
f.write("(%s, %s)" % (escape_char(pair[0]), escape_char(pair[1])))
ix += 1
f.write("\n ];\n\n")
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
emit_table(f, "%s_table" % cat, tbl[cat])
if cat in emit_fn:
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
f.write("}\n\n")
def emit_regex_module(f, cats, w_data):
f.write("pub mod regex {\n")
regex_class = "&'static [(char, char)]"
class_table = "&'static [(&'static str, %s)]" % regex_class
emit_table(f, "UNICODE_CLASSES", cats, class_table,
pfun=lambda x: "(\"%s\",super::%s::%s_table)" % (x[0], x[1], x[0]))
f.write(" pub static PERLD: %s = super::general_category::Nd_table;\n\n"
% regex_class)
f.write(" pub static PERLS: %s = super::property::White_Space_table;\n\n"
% regex_class)
emit_table(f, "PERLW", w_data, regex_class)
f.write("}\n\n")
def emit_conversions_module(f, lowerupper, upperlower):
f.write("pub mod conversions {")
f.write("""
use cmp::{Equal, Less, Greater};
use slice::ImmutableVector;
use tuple::Tuple2;
use option::{Option, Some, None};
use core::cmp::{Equal, Less, Greater};
use core::slice::ImmutableVector;
use core::tuple::Tuple2;
use core::option::{Option, Some, None};
pub fn to_lower(c: char) -> char {
match bsearch_case_table(c, LuLl_table) {
@ -226,189 +343,88 @@ def emit_conversions_module(f, lowerupper, upperlower):
})
}
""");
emit_caseconversion_table(f, "LuLl", upperlower)
emit_caseconversion_table(f, "LlLu", lowerupper)
""")
emit_table(f, "LuLl_table",
sorted(upperlower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
emit_table(f, "LlLu_table",
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
f.write("}\n\n")
def emit_charwidth_module(f, width_table):
f.write("pub mod charwidth {\n")
f.write(" use core::option::{Option, Some, None};\n")
f.write(" use core::slice::ImmutableVector;\n")
f.write("""
fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(lo, hi, _, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, r_ncjk, r_cjk) = r[idx];
if is_cjk { r_cjk } else { r_ncjk }
}
None => 1
}
}
""")
f.write("""
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
match c as uint {
_c @ 0 => Some(0), // null is zero width
cu if cu < 0x20 => None, // control sequences have no width
cu if cu < 0x7F => Some(1), // ASCII
cu if cu < 0xA0 => None, // more control sequences
_ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as uint)
}
}
""")
f.write(" // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n")
f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
f.write("}\n")
def emit_caseconversion_table(f, name, table):
f.write(" static %s_table : &'static [(char, char)] = &[\n" % name)
sorted_table = sorted(table.iteritems(), key=operator.itemgetter(0))
ix = 0
for key, value in sorted_table:
f.write(ch_prefix(ix))
f.write("(%s, %s)" % (escape_char(key), escape_char(value)))
ix += 1
f.write("\n ];\n\n")
def format_table_content(f, content, indent):
line = " "*indent
first = True
for chunk in content.split(","):
if len(line) + len(chunk) < 98:
if first:
line += chunk
else:
line += ", " + chunk
first = False
else:
f.write(line + ",\n")
line = " "*indent + chunk
f.write(line)
def emit_core_norm_module(f, canon, compat):
def emit_norm_module(f, canon, compat, combine):
canon_keys = canon.keys()
canon_keys.sort()
compat_keys = compat.keys()
compat_keys.sort()
f.write("pub mod normalization {\n");
f.write(" use option::Option;\n");
f.write(" use option::{Some, None};\n");
f.write(" use slice::ImmutableVector;\n");
f.write("""
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
use cmp::{Equal, Less, Greater};
match r.bsearch(|&(val, _)| {
if c == val { Equal }
else if val < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, result) = r[idx];
Some(result)
}
None => None
}
}\n\n
""")
f.write("pub mod normalization {\n")
def mkdata_fun(table):
def f(char):
data = "(%s,&[" % escape_char(char)
first = True
for d in table[char]:
if not first:
data += ","
first = False
data += escape_char(d)
data += "])"
return data
return f
f.write(" // Canonical decompositions\n")
f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
data = ""
first = True
for char in canon_keys:
if not first:
data += ","
first = False
data += "(%s,&[" % escape_char(char)
first2 = True
for d in canon[char]:
if not first2:
data += ","
first2 = False
data += escape_char(d)
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
pfun=mkdata_fun(canon))
f.write(" // Compatibility decompositions\n")
f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
data = ""
first = True
for char in compat_keys:
if not first:
data += ","
first = False
data += "(%s,&[" % escape_char(char)
first2 = True
for d in compat[char]:
if not first2:
data += ","
first2 = False
data += escape_char(d)
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
f.write("""
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
fn d(c: char, i: |char|, k: bool) {
use iter::Iterator;
// 7-bit ASCII never decomposes
if c <= '\\x7f' { i(c); return; }
// Perform decomposition for Hangul
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
decompose_hangul(c, i);
return;
}
// First check the canonical decompositions
match bsearch_table(c, canonical_table) {
Some(canon) => {
for x in canon.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Bottom out if we're not doing compat.
if !k { i(c); return; }
// Then check the compatibility decompositions
match bsearch_table(c, compatibility_table) {
Some(compat) => {
for x in compat.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Finally bottom out.
i(c);
}
// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: u32 = 0xAC00;
static L_BASE: u32 = 0x1100;
static V_BASE: u32 = 0x1161;
static T_BASE: u32 = 0x11A7;
static L_COUNT: u32 = 19;
static V_COUNT: u32 = 21;
static T_COUNT: u32 = 28;
static N_COUNT: u32 = (V_COUNT * T_COUNT);
static S_COUNT: u32 = (L_COUNT * N_COUNT);
// Decompose a precomposed Hangul syllable
fn decompose_hangul(s: char, f: |char|) {
use cast::transmute;
let si = s as u32 - S_BASE;
let li = si / N_COUNT;
unsafe {
f(transmute(L_BASE + li));
let vi = (si % N_COUNT) / T_COUNT;
f(transmute(V_BASE + vi));
let ti = si % T_COUNT;
if ti > 0 {
f(transmute(T_BASE + ti));
}
}
}
}
""")
def emit_std_norm_module(f, combine):
f.write("pub mod normalization {\n");
f.write(" use option::{Some, None};\n");
f.write(" use slice::ImmutableVector;\n");
emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
pfun=mkdata_fun(compat))
f.write("""
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
use cmp::{Equal, Less, Greater};
use core::option::{Some, None};
use core::cmp::{Equal, Less, Greater};
use core::slice::ImmutableVector;
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
@ -420,72 +436,122 @@ def emit_std_norm_module(f, combine):
}
None => 0
}
}\n\n
}\n
""")
f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n")
ix = 0
for pair in combine:
f.write(ch_prefix(ix))
f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
ix += 1
f.write("\n ];\n\n")
emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False,
pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
+ " bsearch_range_value_table(c, combining_class_table)\n"
+ " }\n")
f.write("}\n")
f.write("""
}
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
""")
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
def remove_from_wtable(wtable, val):
    """Return the width table with codepoint val removed, splitting or
    shrinking whichever (lo, hi, width, width_cjk) range contains it.
    Note: consumes entries from the input list via pop(0), like the
    original."""
    out = []
    while wtable:
        lo, hi = wtable[0][0], wtable[0][1]
        if hi < val:
            # Entirely below val: keep unchanged.
            out.append(wtable.pop(0))
        elif lo > val:
            # Table is sorted, so everything from here on is unaffected.
            break
        else:
            wt_lo, wt_hi, width, width_cjk = wtable.pop(0)
            if wt_lo == wt_hi == val:
                continue  # the range was exactly val: drop it
            if wt_lo == val:
                out.append((wt_lo + 1, wt_hi, width, width_cjk))
            elif wt_hi == val:
                out.append((wt_lo, wt_hi - 1, width, width_cjk))
            else:
                # val is strictly inside: split into two ranges around it.
                out.append((wt_lo, val - 1, width, width_cjk))
                out.append((val + 1, wt_hi, width, width_cjk))
    if wtable:
        out.extend(wtable)
    return out
#![allow(missing_doc, non_uppercase_statics)]
def optimize_width_table(wtable):
    """Merge adjacent width-table entries whose ranges touch and whose
    widths agree, taking the merged widths from the later entry.

    NOTE(review): only element 2 (the non-CJK width) is compared,
    mirroring the original's w_this[2:3] slice; this is safe only if
    the non-CJK width uniquely determines the CJK width in the generated
    table — confirm before reusing elsewhere. Consumes the input list
    via pop(0), like the original.
    """
    merged = []
    current = wtable.pop(0)
    while wtable:
        nxt = wtable[0]
        if current[1] == nxt[0] - 1 and current[2] == nxt[2]:
            wtable.pop(0)
            current = (current[0], nxt[1], nxt[2], nxt[3])
        else:
            merged.append(current)
            current = wtable.pop(0)
    merged.append(current)
    return merged
'''
(canon_decomp, compat_decomp, gencats,
combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
def gen_core_unicode():
r = "core_unicode.rs"
if __name__ == "__main__":
r = "unicode.rs"
if os.path.exists(r):
os.remove(r);
os.remove(r)
with open(r, "w") as rf:
# Preamble
# write the file's preamble
rf.write(preamble)
emit_bsearch_range_table(rf);
emit_property_module(rf, "general_category", gencats)
# download and parse all the data
(canon_decomp, compat_decomp, gencats, combines,
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
other_derived = ["Default_Ignorable_Code_Point"]
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
emit_core_norm_module(rf, canon_decomp, compat_decomp)
# bsearch_range_table is used in all the property modules below
emit_bsearch_range_table(rf)
derived = load_properties("DerivedCoreProperties.txt",
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
# all of these categories will also be available as \p{} in libregex
allcats = []
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
("derived_property", derived, want_derived), \
("script", scripts, []), \
("property", props, ["White_Space"]):
emit_property_module(rf, name, cat, pfuns)
allcats.extend(map(lambda x: (x, name), cat))
allcats.sort(key=lambda c: c[0])
emit_property_module(rf, "derived_property", derived)
# the \w regex corresponds to Alphabetic + Mark + Decimal_Number +
# Connector_Punctuation + Join-Control according to UTS#18
# http://www.unicode.org/reports/tr18/#Compatibility_Properties
perl_words = []
for cat in derived["Alphabetic"], gencats["M"], gencats["Nd"], \
gencats["Pc"], props["Join_Control"]:
perl_words.extend(ungroup_cat(cat))
perl_words = group_cat(perl_words)
props = load_properties("PropList.txt", ["White_Space"])
emit_property_module(rf, "property", props)
# emit lookup tables for \p{}, along with \d, \w, and \s for libregex
emit_regex_module(rf, allcats, perl_words)
# normalizations and conversions module
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
emit_conversions_module(rf, lowerupper, upperlower)
def gen_std_unicode():
r = "std_unicode.rs"
if os.path.exists(r):
os.remove(r);
with open(r, "w") as rf:
# Preamble
rf.write(preamble)
emit_std_norm_module(rf, combines)
# character width module
width_table = []
for zwcat in ["Me", "Mn", "Cf"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
width_table.append((4448, 4607, 0, 0))
gen_core_unicode()
gen_std_unicode()
# get widths, except those that are explicitly marked zero-width above
ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"])
# these are doublewidth
for dwcat in ["W", "F"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 2, 2), ea_widths[dwcat]))
width_table.extend(map(lambda (lo, hi): (lo, hi, 1, 2), ea_widths["A"]))
width_table.sort(key=lambda w: w[0])
# soft hyphen is not zero width in preformatted text; it's used to indicate
# a hyphen inserted to facilitate a linebreak.
width_table = remove_from_wtable(width_table, 173)
# optimize the width table by collapsing adjacent entities when possible
width_table = optimize_width_table(width_table)
emit_charwidth_module(rf, width_table)

View File

@ -28,6 +28,7 @@
#![allow(unused_attribute)] // NOTE: remove after stage0
#[phase(plugin, link)] extern crate core;
extern crate unicode;
extern crate alloc;
#[cfg(test)] extern crate native;
@ -69,9 +70,6 @@ pub mod string;
pub mod vec;
pub mod hash;
// Internal unicode fiddly bits for the str module
mod unicode;
mod deque;
/// A trait to represent mutable containers

View File

@ -69,7 +69,6 @@ is the same as `&[u8]`.
use core::prelude::*;
use core::char;
use core::default::Default;
use core::fmt;
use core::cmp;
@ -79,15 +78,17 @@ use core::mem;
use Collection;
use hash;
use string::String;
use unicode;
use vec::Vec;
pub use core::str::{from_utf8, CharEq, Chars, CharOffsets};
pub use core::str::{Bytes, CharSplits};
pub use core::str::{CharSplitsN, Words, AnyLines, MatchIndices, StrSplits};
pub use core::str::{CharSplitsN, AnyLines, MatchIndices, StrSplits};
pub use core::str::{eq_slice, is_utf8, is_utf16, Utf16Items};
pub use core::str::{Utf16Item, ScalarValue, LoneSurrogate, utf16_items};
pub use core::str::{truncate_utf16_at_nul, utf8_char_width, CharRange};
pub use core::str::{Str, StrSlice};
pub use unicode::{Words, UnicodeStrSlice};
/*
Section: Creating a string
@ -283,7 +284,7 @@ pub struct Decompositions<'a> {
impl<'a> Iterator<char> for Decompositions<'a> {
#[inline]
fn next(&mut self) -> Option<char> {
use unicode::normalization::canonical_combining_class;
use unicode::canonical_combining_class;
match self.buffer.as_slice().head() {
Some(&(c, 0)) => {
@ -299,8 +300,8 @@ impl<'a> Iterator<char> for Decompositions<'a> {
}
let decomposer = match self.kind {
Canonical => char::decompose_canonical,
Compatible => char::decompose_compatible
Canonical => unicode::char::decompose_canonical,
Compatible => unicode::char::decompose_compatible
};
if !self.sorted {
@ -973,6 +974,8 @@ mod tests {
use string::String;
use vec::Vec;
use unicode::UnicodeChar;
#[test]
fn test_eq_slice() {
assert!((eq_slice("foobar".slice(0, 3), "foo")));

View File

@ -1,183 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
#![allow(missing_doc, non_uppercase_statics)]
pub mod normalization {
use core::prelude::*;
// Binary-searches a sorted table of inclusive `(lo, hi, value)` ranges and
// returns the value paired with the range containing `c`, or 0 when `c`
// falls in no range (0 is the default combining class, "Not_Reordered").
// NOTE(review): uses the pre-1.0 `slice::bsearch` API and the bare
// `Equal`/`Less`/`Greater` orderings from the old core prelude.
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
match r.bsearch(|&(lo, hi, _)| {
// Both range endpoints are inclusive.
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, result) = r[idx];
result
}
// Not listed in the table: combining class 0.
None => 0
}
}
static combining_class_table : &'static [(char, char, u8)] = &[
('\u0300', '\u0314', 230), ('\u0315', '\u0315', 232),
('\u0316', '\u0319', 220), ('\u031a', '\u031a', 232),
('\u031b', '\u031b', 216), ('\u031c', '\u0320', 220),
('\u0321', '\u0322', 202), ('\u0323', '\u0326', 220),
('\u0327', '\u0328', 202), ('\u0329', '\u0333', 220),
('\u0334', '\u0338', 1), ('\u0339', '\u033c', 220),
('\u033d', '\u0344', 230), ('\u0345', '\u0345', 240),
('\u0346', '\u0346', 230), ('\u0347', '\u0349', 220),
('\u034a', '\u034c', 230), ('\u034d', '\u034e', 220),
('\u0350', '\u0352', 230), ('\u0353', '\u0356', 220),
('\u0357', '\u0357', 230), ('\u0358', '\u0358', 232),
('\u0359', '\u035a', 220), ('\u035b', '\u035b', 230),
('\u035c', '\u035c', 233), ('\u035d', '\u035e', 234),
('\u035f', '\u035f', 233), ('\u0360', '\u0361', 234),
('\u0362', '\u0362', 233), ('\u0363', '\u036f', 230),
('\u0483', '\u0487', 230), ('\u0591', '\u0591', 220),
('\u0592', '\u0595', 230), ('\u0596', '\u0596', 220),
('\u0597', '\u0599', 230), ('\u059a', '\u059a', 222),
('\u059b', '\u059b', 220), ('\u059c', '\u05a1', 230),
('\u05a2', '\u05a7', 220), ('\u05a8', '\u05a9', 230),
('\u05aa', '\u05aa', 220), ('\u05ab', '\u05ac', 230),
('\u05ad', '\u05ad', 222), ('\u05ae', '\u05ae', 228),
('\u05af', '\u05af', 230), ('\u05b0', '\u05b0', 10),
('\u05b1', '\u05b1', 11), ('\u05b2', '\u05b2', 12),
('\u05b3', '\u05b3', 13), ('\u05b4', '\u05b4', 14),
('\u05b5', '\u05b5', 15), ('\u05b6', '\u05b6', 16),
('\u05b7', '\u05b7', 17), ('\u05b8', '\u05b8', 18),
('\u05b9', '\u05ba', 19), ('\u05bb', '\u05bb', 20),
('\u05bc', '\u05bc', 21), ('\u05bd', '\u05bd', 22),
('\u05bf', '\u05bf', 23), ('\u05c1', '\u05c1', 24),
('\u05c2', '\u05c2', 25), ('\u05c4', '\u05c4', 230),
('\u05c5', '\u05c5', 220), ('\u05c7', '\u05c7', 18),
('\u0610', '\u0617', 230), ('\u0618', '\u0618', 30),
('\u0619', '\u0619', 31), ('\u061a', '\u061a', 32),
('\u064b', '\u064b', 27), ('\u064c', '\u064c', 28),
('\u064d', '\u064d', 29), ('\u064e', '\u064e', 30),
('\u064f', '\u064f', 31), ('\u0650', '\u0650', 32),
('\u0651', '\u0651', 33), ('\u0652', '\u0652', 34),
('\u0653', '\u0654', 230), ('\u0655', '\u0656', 220),
('\u0657', '\u065b', 230), ('\u065c', '\u065c', 220),
('\u065d', '\u065e', 230), ('\u065f', '\u065f', 220),
('\u0670', '\u0670', 35), ('\u06d6', '\u06dc', 230),
('\u06df', '\u06e2', 230), ('\u06e3', '\u06e3', 220),
('\u06e4', '\u06e4', 230), ('\u06e7', '\u06e8', 230),
('\u06ea', '\u06ea', 220), ('\u06eb', '\u06ec', 230),
('\u06ed', '\u06ed', 220), ('\u0711', '\u0711', 36),
('\u0730', '\u0730', 230), ('\u0731', '\u0731', 220),
('\u0732', '\u0733', 230), ('\u0734', '\u0734', 220),
('\u0735', '\u0736', 230), ('\u0737', '\u0739', 220),
('\u073a', '\u073a', 230), ('\u073b', '\u073c', 220),
('\u073d', '\u073d', 230), ('\u073e', '\u073e', 220),
('\u073f', '\u0741', 230), ('\u0742', '\u0742', 220),
('\u0743', '\u0743', 230), ('\u0744', '\u0744', 220),
('\u0745', '\u0745', 230), ('\u0746', '\u0746', 220),
('\u0747', '\u0747', 230), ('\u0748', '\u0748', 220),
('\u0749', '\u074a', 230), ('\u07eb', '\u07f1', 230),
('\u07f2', '\u07f2', 220), ('\u07f3', '\u07f3', 230),
('\u0816', '\u0819', 230), ('\u081b', '\u0823', 230),
('\u0825', '\u0827', 230), ('\u0829', '\u082d', 230),
('\u0859', '\u085b', 220), ('\u08e4', '\u08e5', 230),
('\u08e6', '\u08e6', 220), ('\u08e7', '\u08e8', 230),
('\u08e9', '\u08e9', 220), ('\u08ea', '\u08ec', 230),
('\u08ed', '\u08ef', 220), ('\u08f0', '\u08f0', 27),
('\u08f1', '\u08f1', 28), ('\u08f2', '\u08f2', 29),
('\u08f3', '\u08f5', 230), ('\u08f6', '\u08f6', 220),
('\u08f7', '\u08f8', 230), ('\u08f9', '\u08fa', 220),
('\u08fb', '\u08fe', 230), ('\u093c', '\u093c', 7),
('\u094d', '\u094d', 9), ('\u0951', '\u0951', 230),
('\u0952', '\u0952', 220), ('\u0953', '\u0954', 230),
('\u09bc', '\u09bc', 7), ('\u09cd', '\u09cd', 9),
('\u0a3c', '\u0a3c', 7), ('\u0a4d', '\u0a4d', 9),
('\u0abc', '\u0abc', 7), ('\u0acd', '\u0acd', 9),
('\u0b3c', '\u0b3c', 7), ('\u0b4d', '\u0b4d', 9),
('\u0bcd', '\u0bcd', 9), ('\u0c4d', '\u0c4d', 9),
('\u0c55', '\u0c55', 84), ('\u0c56', '\u0c56', 91),
('\u0cbc', '\u0cbc', 7), ('\u0ccd', '\u0ccd', 9),
('\u0d4d', '\u0d4d', 9), ('\u0dca', '\u0dca', 9),
('\u0e38', '\u0e39', 103), ('\u0e3a', '\u0e3a', 9),
('\u0e48', '\u0e4b', 107), ('\u0eb8', '\u0eb9', 118),
('\u0ec8', '\u0ecb', 122), ('\u0f18', '\u0f19', 220),
('\u0f35', '\u0f35', 220), ('\u0f37', '\u0f37', 220),
('\u0f39', '\u0f39', 216), ('\u0f71', '\u0f71', 129),
('\u0f72', '\u0f72', 130), ('\u0f74', '\u0f74', 132),
('\u0f7a', '\u0f7d', 130), ('\u0f80', '\u0f80', 130),
('\u0f82', '\u0f83', 230), ('\u0f84', '\u0f84', 9),
('\u0f86', '\u0f87', 230), ('\u0fc6', '\u0fc6', 220),
('\u1037', '\u1037', 7), ('\u1039', '\u103a', 9),
('\u108d', '\u108d', 220), ('\u135d', '\u135f', 230),
('\u1714', '\u1714', 9), ('\u1734', '\u1734', 9),
('\u17d2', '\u17d2', 9), ('\u17dd', '\u17dd', 230),
('\u18a9', '\u18a9', 228), ('\u1939', '\u1939', 222),
('\u193a', '\u193a', 230), ('\u193b', '\u193b', 220),
('\u1a17', '\u1a17', 230), ('\u1a18', '\u1a18', 220),
('\u1a60', '\u1a60', 9), ('\u1a75', '\u1a7c', 230),
('\u1a7f', '\u1a7f', 220), ('\u1b34', '\u1b34', 7),
('\u1b44', '\u1b44', 9), ('\u1b6b', '\u1b6b', 230),
('\u1b6c', '\u1b6c', 220), ('\u1b6d', '\u1b73', 230),
('\u1baa', '\u1bab', 9), ('\u1be6', '\u1be6', 7),
('\u1bf2', '\u1bf3', 9), ('\u1c37', '\u1c37', 7),
('\u1cd0', '\u1cd2', 230), ('\u1cd4', '\u1cd4', 1),
('\u1cd5', '\u1cd9', 220), ('\u1cda', '\u1cdb', 230),
('\u1cdc', '\u1cdf', 220), ('\u1ce0', '\u1ce0', 230),
('\u1ce2', '\u1ce8', 1), ('\u1ced', '\u1ced', 220),
('\u1cf4', '\u1cf4', 230), ('\u1dc0', '\u1dc1', 230),
('\u1dc2', '\u1dc2', 220), ('\u1dc3', '\u1dc9', 230),
('\u1dca', '\u1dca', 220), ('\u1dcb', '\u1dcc', 230),
('\u1dcd', '\u1dcd', 234), ('\u1dce', '\u1dce', 214),
('\u1dcf', '\u1dcf', 220), ('\u1dd0', '\u1dd0', 202),
('\u1dd1', '\u1de6', 230), ('\u1dfc', '\u1dfc', 233),
('\u1dfd', '\u1dfd', 220), ('\u1dfe', '\u1dfe', 230),
('\u1dff', '\u1dff', 220), ('\u20d0', '\u20d1', 230),
('\u20d2', '\u20d3', 1), ('\u20d4', '\u20d7', 230),
('\u20d8', '\u20da', 1), ('\u20db', '\u20dc', 230),
('\u20e1', '\u20e1', 230), ('\u20e5', '\u20e6', 1),
('\u20e7', '\u20e7', 230), ('\u20e8', '\u20e8', 220),
('\u20e9', '\u20e9', 230), ('\u20ea', '\u20eb', 1),
('\u20ec', '\u20ef', 220), ('\u20f0', '\u20f0', 230),
('\u2cef', '\u2cf1', 230), ('\u2d7f', '\u2d7f', 9),
('\u2de0', '\u2dff', 230), ('\u302a', '\u302a', 218),
('\u302b', '\u302b', 228), ('\u302c', '\u302c', 232),
('\u302d', '\u302d', 222), ('\u302e', '\u302f', 224),
('\u3099', '\u309a', 8), ('\ua66f', '\ua66f', 230),
('\ua674', '\ua67d', 230), ('\ua69f', '\ua69f', 230),
('\ua6f0', '\ua6f1', 230), ('\ua806', '\ua806', 9),
('\ua8c4', '\ua8c4', 9), ('\ua8e0', '\ua8f1', 230),
('\ua92b', '\ua92d', 220), ('\ua953', '\ua953', 9),
('\ua9b3', '\ua9b3', 7), ('\ua9c0', '\ua9c0', 9),
('\uaab0', '\uaab0', 230), ('\uaab2', '\uaab3', 230),
('\uaab4', '\uaab4', 220), ('\uaab7', '\uaab8', 230),
('\uaabe', '\uaabf', 230), ('\uaac1', '\uaac1', 230),
('\uaaf6', '\uaaf6', 9), ('\uabed', '\uabed', 9),
('\ufb1e', '\ufb1e', 26), ('\ufe20', '\ufe26', 230),
('\U000101fd', '\U000101fd', 220), ('\U00010a0d', '\U00010a0d', 220),
('\U00010a0f', '\U00010a0f', 230), ('\U00010a38', '\U00010a38', 230),
('\U00010a39', '\U00010a39', 1), ('\U00010a3a', '\U00010a3a', 220),
('\U00010a3f', '\U00010a3f', 9), ('\U00011046', '\U00011046', 9),
('\U000110b9', '\U000110b9', 9), ('\U000110ba', '\U000110ba', 7),
('\U00011100', '\U00011102', 230), ('\U00011133', '\U00011134', 9),
('\U000111c0', '\U000111c0', 9), ('\U000116b6', '\U000116b6', 9),
('\U000116b7', '\U000116b7', 7), ('\U0001d165', '\U0001d166', 216),
('\U0001d167', '\U0001d169', 1), ('\U0001d16d', '\U0001d16d', 226),
('\U0001d16e', '\U0001d172', 216), ('\U0001d17b', '\U0001d182', 220),
('\U0001d185', '\U0001d189', 230), ('\U0001d18a', '\U0001d18b', 220),
('\U0001d1aa', '\U0001d1ad', 230), ('\U0001d242', '\U0001d244', 230)
];
/// Returns the Unicode canonical combining class of `c`, looked up in the
/// generated `combining_class_table`; characters with no table entry
/// (the vast majority) get class 0.
pub fn canonical_combining_class(c: char) -> u8 {
bsearch_range_value_table(c, combining_class_table)
}
}

View File

@ -8,20 +8,9 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Character manipulation (`char` type, Unicode Scalar Value)
//! Character manipulation.
//!
//! This module provides the `Char` trait, as well as its implementation
//! for the primitive `char` type, in order to allow basic character manipulation.
//!
//! A `char` actually represents a
//! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
//! as it can contain any Unicode code point except high-surrogate and
//! low-surrogate code points.
//!
//! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
//! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
//! however the converse is not always true due to the above range limits
//! and, as such, should be performed via the `from_u32` function.
//! For more details, see ::unicode::char (a.k.a. std::char)
#![allow(non_snake_case_functions)]
#![doc(primitive = "char")]
@ -29,12 +18,6 @@
use mem::transmute;
use option::{None, Option, Some};
use iter::{Iterator, range_step};
use unicode::{derived_property, property, general_category, conversions};
/// Returns the canonical decomposition of a character.
pub use unicode::normalization::decompose_canonical;
/// Returns the compatibility decomposition of a character.
pub use unicode::normalization::decompose_compatible;
// UTF-8 ranges and tags for encoding characters
static TAG_CONT: u8 = 0b1000_0000u8;
@ -93,84 +76,6 @@ pub fn from_u32(i: u32) -> Option<char> {
}
}
/// Returns whether the specified `char` is considered a Unicode alphabetic
/// code point
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
///
/// Indicates whether a `char` is in lower case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
///
#[inline]
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
///
/// Indicates whether a `char` is in upper case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
///
#[inline]
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
///
/// Indicates whether a `char` is whitespace
///
/// Whitespace is defined in terms of the Unicode Property 'White_Space'.
///
#[inline]
pub fn is_whitespace(c: char) -> bool {
// As an optimization ASCII whitespace characters are checked separately
c == ' '
|| ('\x09' <= c && c <= '\x0d')
|| property::White_Space(c)
}
///
/// Indicates whether a `char` is alphanumeric
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
///
#[inline]
pub fn is_alphanumeric(c: char) -> bool {
derived_property::Alphabetic(c)
|| general_category::Nd(c)
|| general_category::Nl(c)
|| general_category::No(c)
}
///
/// Indicates whether a `char` is a control code point
///
/// Control code points are defined in terms of the Unicode General Category
/// 'Cc'.
///
#[inline]
pub fn is_control(c: char) -> bool { general_category::Cc(c) }
/// Indicates whether the `char` is numeric (Nd, Nl, or No)
#[inline]
pub fn is_digit(c: char) -> bool {
general_category::Nd(c)
|| general_category::Nl(c)
|| general_category::No(c)
}
///
/// Checks if a `char` parses as a numeric digit in the given radix
///
@ -227,38 +132,6 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
else { None }
}
/// Convert a char to its uppercase equivalent
///
/// The case-folding performed is the common or simple mapping:
/// it maps one unicode codepoint (one char in Rust) to its uppercase equivalent according
/// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
/// The additional SpecialCasing.txt is not considered here, as it expands to multiple
/// codepoints in some cases.
///
/// A full reference can be found here
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
///
/// # Return value
///
/// Returns the char itself if no conversion was made
#[inline]
pub fn to_uppercase(c: char) -> char {
conversions::to_upper(c)
}
/// Convert a char to its lowercase equivalent
///
/// The case-folding performed is the common or simple mapping
/// see `to_uppercase` for references and more information
///
/// # Return value
///
/// Returns the char itself if no conversion is possible
#[inline]
pub fn to_lowercase(c: char) -> char {
conversions::to_lower(c)
}
///
/// Converts a number to the character representing it
///
@ -355,61 +228,8 @@ pub fn len_utf8_bytes(c: char) -> uint {
}
}
/// Useful functions for Unicode characters.
/// Basic `char` manipulations.
pub trait Char {
/// Returns whether the specified character is considered a Unicode
/// alphabetic code point.
fn is_alphabetic(&self) -> bool;
/// Returns whether the specified character satisfies the 'XID_Start'
/// Unicode property.
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
fn is_XID_start(&self) -> bool;
/// Returns whether the specified `char` satisfies the 'XID_Continue'
/// Unicode property.
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
fn is_XID_continue(&self) -> bool;
/// Indicates whether a character is in lowercase.
///
/// This is defined according to the terms of the Unicode Derived Core
/// Property `Lowercase`.
fn is_lowercase(&self) -> bool;
/// Indicates whether a character is in uppercase.
///
/// This is defined according to the terms of the Unicode Derived Core
/// Property `Uppercase`.
fn is_uppercase(&self) -> bool;
/// Indicates whether a character is whitespace.
///
/// Whitespace is defined in terms of the Unicode Property `White_Space`.
fn is_whitespace(&self) -> bool;
/// Indicates whether a character is alphanumeric.
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
fn is_alphanumeric(&self) -> bool;
/// Indicates whether a character is a control code point.
///
/// Control code points are defined in terms of the Unicode General
/// Category `Cc`.
fn is_control(&self) -> bool;
/// Indicates whether the character is numeric (Nd, Nl, or No).
fn is_digit(&self) -> bool;
/// Checks if a `char` parses as a numeric digit in the given radix.
///
/// Compared to `is_digit()`, this function only recognizes the characters
@ -438,37 +258,6 @@ pub trait Char {
/// Fails if given a radix outside the range [0..36].
fn to_digit(&self, radix: uint) -> Option<uint>;
/// Converts a character to its lowercase equivalent.
///
/// The case-folding performed is the common or simple mapping. See
/// `to_uppercase()` for references and more information.
///
/// # Return value
///
/// Returns the lowercase equivalent of the character, or the character
/// itself if no conversion is possible.
fn to_lowercase(&self) -> char;
/// Converts a character to its uppercase equivalent.
///
/// The case-folding performed is the common or simple mapping: it maps
/// one unicode codepoint (one character in Rust) to its uppercase
/// equivalent according to the Unicode database [1]. The additional
/// `SpecialCasing.txt` is not considered here, as it expands to multiple
/// codepoints in some cases.
///
/// A full reference can be found here [2].
///
/// # Return value
///
/// Returns the uppercase equivalent of the character, or the character
/// itself if no conversion was made.
///
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
///
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
fn to_uppercase(&self) -> char;
/// Converts a number to the character representing it.
///
/// # Return value
@ -526,32 +315,10 @@ pub trait Char {
}
impl Char for char {
fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
fn is_XID_start(&self) -> bool { is_XID_start(*self) }
fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
fn is_lowercase(&self) -> bool { is_lowercase(*self) }
fn is_uppercase(&self) -> bool { is_uppercase(*self) }
fn is_whitespace(&self) -> bool { is_whitespace(*self) }
fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
fn is_control(&self) -> bool { is_control(*self) }
fn is_digit(&self) -> bool { is_digit(*self) }
fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
fn to_lowercase(&self) -> char { to_lowercase(*self) }
fn to_uppercase(&self) -> char { to_uppercase(*self) }
fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
@ -600,5 +367,3 @@ impl Char for char {
}
}
}

View File

@ -108,7 +108,6 @@ pub mod collections;
/* Core types and methods on primitives */
mod unicode;
pub mod any;
pub mod atomics;
pub mod bool;

View File

@ -22,7 +22,7 @@ use cmp;
use cmp::{PartialEq, Eq};
use collections::Collection;
use default::Default;
use iter::{Filter, Map, Iterator};
use iter::{Map, Iterator};
use iter::{DoubleEndedIterator, ExactSize};
use iter::range;
use num::{CheckedMul, Saturating};
@ -204,10 +204,6 @@ pub struct CharSplitsN<'a, Sep> {
invert: bool,
}
/// An iterator over the words of a string, separated by a sequence of whitespace
pub type Words<'a> =
Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
/// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
pub type AnyLines<'a> =
Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
@ -1209,48 +1205,6 @@ pub trait StrSlice<'a> {
/// ```
fn lines_any(&self) -> AnyLines<'a>;
/// An iterator over the words of a string (subsequences separated
/// by any sequence of whitespace). Sequences of whitespace are
/// collapsed, so empty "words" are not included.
///
/// # Example
///
/// ```rust
/// let some_words = " Mary had\ta little \n\t lamb";
/// let v: Vec<&str> = some_words.words().collect();
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
/// ```
fn words(&self) -> Words<'a>;
/// Returns true if the string contains only whitespace.
///
/// Whitespace characters are determined by `char::is_whitespace`.
///
/// # Example
///
/// ```rust
/// assert!(" \t\n".is_whitespace());
/// assert!("".is_whitespace());
///
/// assert!( !"abc".is_whitespace());
/// ```
fn is_whitespace(&self) -> bool;
/// Returns true if the string contains only alphanumeric code
/// points.
///
/// Alphanumeric characters are determined by `char::is_alphanumeric`.
///
/// # Example
///
/// ```rust
/// assert!("Löwe老虎Léopard123".is_alphanumeric());
/// assert!("".is_alphanumeric());
///
/// assert!( !" &*~".is_alphanumeric());
/// ```
fn is_alphanumeric(&self) -> bool;
/// Returns the number of Unicode code points (`char`) that a
/// string holds.
///
@ -1368,15 +1322,6 @@ pub trait StrSlice<'a> {
/// Returns true if `needle` is a suffix of the string.
fn ends_with(&self, needle: &str) -> bool;
/// Returns a string with leading and trailing whitespace removed.
fn trim(&self) -> &'a str;
/// Returns a string with leading whitespace removed.
fn trim_left(&self) -> &'a str;
/// Returns a string with trailing whitespace removed.
fn trim_right(&self) -> &'a str;
/// Returns a string with characters that match `to_trim` removed.
///
/// # Arguments
@ -1748,17 +1693,6 @@ impl<'a> StrSlice<'a> for &'a str {
})
}
#[inline]
fn words(&self) -> Words<'a> {
self.split(char::is_whitespace).filter(|s| !s.is_empty())
}
#[inline]
fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
#[inline]
fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
#[inline]
fn char_len(&self) -> uint { self.chars().count() }
@ -1817,21 +1751,6 @@ impl<'a> StrSlice<'a> for &'a str {
m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
}
#[inline]
fn trim(&self) -> &'a str {
self.trim_left().trim_right()
}
#[inline]
fn trim_left(&self) -> &'a str {
self.trim_left_chars(char::is_whitespace)
}
#[inline]
fn trim_right(&self) -> &'a str {
self.trim_right_chars(char::is_whitespace)
}
#[inline]
fn trim_chars<C: CharEq>(&self, mut to_trim: C) -> &'a str {
let cur = match self.find(|c: char| !to_trim.matches(c)) {

File diff suppressed because it is too large Load Diff

View File

@ -194,3 +194,30 @@ fn test_encode_utf16() {
check('\ua66e', [0xa66e]);
check('\U0001f4a9', [0xd83d, 0xdca9]);
}
#[test]
fn test_width() {
// NUL: the one control character reported with a (zero) width.
assert_eq!('\x00'.width(false),Some(0));
assert_eq!('\x00'.width(true),Some(0));
// Other control characters (here LF) have no defined width.
assert_eq!('\x0A'.width(false),None);
assert_eq!('\x0A'.width(true),None);
// Ordinary ASCII is single-wide in both contexts.
assert_eq!('w'.width(false),Some(1));
assert_eq!('w'.width(true),Some(1));
// NOTE(review): the character literal in the next two asserts appears to
// have been lost/garbled in this copy of the diff; given the expected
// Some(2) it was presumably an East Asian fullwidth ('W') character —
// confirm against the upstream commit.
assert_eq!(''.width(false),Some(2));
assert_eq!(''.width(true),Some(2));
// U+00AD SOFT HYPHEN is deliberately single-wide (it is printable in
// preformatted text), not zero-wide.
assert_eq!('\xAD'.width(false),Some(1));
assert_eq!('\xAD'.width(true),Some(1));
// U+1160 HANGUL JUNGSEONG FILLER is zero-wide.
assert_eq!('\u1160'.width(false),Some(0));
assert_eq!('\u1160'.width(true),Some(0));
// U+00A1 has 'A'mbiguous East Asian width: single-wide normally,
// double-wide when the context is CJK (the boolean argument).
assert_eq!('\u00a1'.width(false),Some(1));
assert_eq!('\u00a1'.width(true),Some(2));
// Combining marks (U+0300 COMBINING GRAVE ACCENT) occupy no columns.
assert_eq!('\u0300'.width(false),Some(0));
assert_eq!('\u0300'.width(true),Some(0));
}

View File

@ -306,12 +306,15 @@
//!
//! ## Perl character classes (Unicode friendly)
//!
//! These classes are based on the definitions provided in
//! [UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
//!
//! <pre class="rust">
//! \d digit ([0-9] + \p{Nd})
//! \d digit (\p{Nd})
//! \D not digit
//! \s whitespace ([\t\n\f\r ] + \p{Z})
//! \s whitespace (\p{White_Space})
//! \S not whitespace
//! \w word character ([0-9A-Za-z_] + \p{L})
//! \w word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
//! \W not word character
//! </pre>
//!
@ -378,6 +381,9 @@ extern crate rand;
#[cfg(test)]
extern crate regex;
// unicode tables for character classes are defined in libunicode
extern crate unicode;
pub use parse::Error;
pub use re::{Regex, Captures, SubCaptures, SubCapturesPos};
pub use re::{FindCaptures, FindMatches};

View File

@ -16,9 +16,7 @@ use std::num;
use std::str;
/// Static data containing Unicode ranges for general categories and scripts.
use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
#[allow(visible_private_types)]
pub mod unicode;
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
/// The maximum number of repetitions allowed with the `{n,m}` syntax.
static MAX_REPEAT: uint = 1000;

File diff suppressed because it is too large Load Diff

View File

@ -195,8 +195,8 @@ mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)))
// Test the Unicode friendliness of Perl character classes.
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)))
mat!(uni_perl_w_not, r"\w+", "", None)
mat!(uni_perl_w_neg, r"\W+", "", Some((0, 3)))
mat!(uni_perl_w_not, r"\w+", "", None)
mat!(uni_perl_w_neg, r"\W+", "", Some((0, 3)))
mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)))
mat!(uni_perl_d_not, r"\d+", "", None)
mat!(uni_perl_d_neg, r"\D+", "", Some((0, 3)))

View File

@ -42,7 +42,7 @@ use compile::{
Save, Jump, Split,
};
use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
use parse::unicode::PERLW;
use unicode::regex::PERLW;
pub type CaptureLocs = Vec<Option<uint>>;

View File

@ -237,6 +237,7 @@ use str::{Str, StrSlice};
use str;
use string::String;
use uint;
use unicode::UnicodeChar;
use vec::Vec;
// Reexports

View File

@ -126,6 +126,7 @@
#[cfg(test)] #[phase(plugin, link)] extern crate log;
extern crate alloc;
extern crate unicode;
extern crate core;
extern crate core_collections = "collections";
extern crate core_rand = "rand";
@ -148,7 +149,6 @@ extern crate rustrt;
pub use core::any;
pub use core::bool;
pub use core::cell;
pub use core::char;
pub use core::clone;
#[cfg(not(test))] pub use core::cmp;
pub use core::default;
@ -180,6 +180,8 @@ pub use core_collections::vec;
pub use rustrt::c_str;
pub use rustrt::local_data;
pub use unicode::char;
pub use core_sync::comm;
// Run tests with libgreen instead of libnative.

View File

@ -24,6 +24,7 @@ use option::{Option, Some, None};
use slice::{Vector, ImmutableVector};
use str::{CharSplits, Str, StrAllocating, StrVector, StrSlice};
use string::String;
use unicode::UnicodeChar;
use vec::Vec;
use super::{contains_nul, BytesContainer, GenericPath, GenericPathUnsafe};
@ -997,7 +998,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
let idx = path.find('\\');
if idx == Some(2) && path.as_bytes()[1] == ':' as u8 {
let c = path.as_bytes()[0];
if c.is_ascii() && ::char::is_alphabetic(c as char) {
if c.is_ascii() && (c as char).is_alphabetic() {
// \\?\C:\ path
return Some(VerbatimDiskPrefix);
}
@ -1021,7 +1022,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
} else if path.len() > 1 && path.as_bytes()[1] == ':' as u8 {
// C:
let c = path.as_bytes()[0];
if c.is_ascii() && ::char::is_alphabetic(c as char) {
if c.is_ascii() && (c as char).is_alphabetic() {
return Some(DiskPrefix);
}
}

View File

@ -89,6 +89,7 @@
#[doc(no_inline)] pub use slice::{Vector, VectorVector};
#[doc(no_inline)] pub use slice::MutableVectorAllocating;
#[doc(no_inline)] pub use string::String;
#[doc(no_inline)] pub use unicode::{UnicodeChar, UnicodeStrSlice};
#[doc(no_inline)] pub use vec::Vec;
// Reexported runtime types

View File

@ -12,7 +12,6 @@
#![allow(non_camel_case_types)]
use char::Char;
use collections::Collection;
use from_str::from_str;
use io::{IoResult, Writer};
@ -22,6 +21,7 @@ use os;
use result::{Ok, Err};
use str::StrSlice;
use sync::atomics;
use unicode::UnicodeChar;
pub use self::imp::write;

111
src/libunicode/decompose.rs Normal file
View File

@ -0,0 +1,111 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
Functions for computing canonical and compatible decompositions
for Unicode characters.
*/
use core::option::{Option, Some, None};
use core::slice::ImmutableVector;
use tables::normalization::{canonical_table, compatibility_table};
// Binary-searches a table sorted on its first (char) element and returns the
// decomposition slice paired with `c`, or `None` when `c` has no entry.
// NOTE(review): relies on the pre-1.0 `slice::bsearch` API.
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(val, _)| {
if c == val { Equal }
else if val < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, result) = r[idx];
Some(result)
}
None => None
}
}
/// Compute canonical Unicode decomposition for character.
/// Each decomposed `char` is fed to the callback `i`, in order.
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
/// Compute canonical or compatible Unicode decomposition for character.
/// Like `decompose_canonical`, but additionally applies the compatibility
/// decomposition table.
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
// Shared worker for the two public entry points: recursively decomposes `c`,
// emitting each resulting scalar through `i`. `k` selects whether the
// compatibility table is consulted after the canonical one.
fn d(c: char, i: |char|, k: bool) {
use core::iter::Iterator;
// 7-bit ASCII never decomposes
if c <= '\x7f' { i(c); return; }
// Perform decomposition for Hangul
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
decompose_hangul(c, i);
return;
}
// First check the canonical decompositions
match bsearch_table(c, canonical_table) {
Some(canon) => {
for x in canon.iter() {
// Recurse so multi-level decompositions are fully expanded;
// `|b| i(b)` re-borrows the callback for the nested call.
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Bottom out if we're not doing compat.
if !k { i(c); return; }
// Then check the compatibility decompositions
match bsearch_table(c, compatibility_table) {
Some(compat) => {
for x in compat.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Finally bottom out.
i(c);
}
// Constants from Unicode 6.3.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: u32 = 0xAC00; // first precomposed Hangul syllable
static L_BASE: u32 = 0x1100; // first leading consonant (choseong)
static V_BASE: u32 = 0x1161; // first vowel (jungseong)
static T_BASE: u32 = 0x11A7; // trailing consonant (jongseong) base
static L_COUNT: u32 = 19; // number of leading consonants
static V_COUNT: u32 = 21; // number of vowels
static T_COUNT: u32 = 28; // number of trailing consonants (incl. none)
static N_COUNT: u32 = (V_COUNT * T_COUNT); // syllables per leading consonant
static S_COUNT: u32 = (L_COUNT * N_COUNT); // total precomposed syllables
// Decompose a precomposed Hangul syllable into its jamo arithmetically
// (Hangul decompositions are not stored in the tables). Callers (`d`) have
// already verified `s` lies in [S_BASE, S_BASE + S_COUNT).
fn decompose_hangul(s: char, f: |char|) {
use core::mem::transmute;
let si = s as u32 - S_BASE;
let li = si / N_COUNT;
// SAFETY-style note: each transmuted value is a valid scalar because the
// arithmetic keeps results within the jamo ranges starting at L_BASE,
// V_BASE and T_BASE respectively.
unsafe {
f(transmute(L_BASE + li));
let vi = (si % N_COUNT) / T_COUNT;
f(transmute(V_BASE + vi));
let ti = si % T_COUNT;
// A zero trailing index means the syllable has no jongseong.
if ti > 0 {
f(transmute(T_BASE + ti));
}
}
}

77
src/libunicode/lib.rs Normal file
View File

@ -0,0 +1,77 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! # The Unicode Library
//!
//! Unicode-intensive functions for `char` and `str` types.
//!
//! This crate provides a collection of Unicode-related functionality,
//! including decompositions, conversions, etc., and provides traits
//! implementing these functions for the `char` and `str` types.
//!
//! The functionality included here is only that which is necessary to
//! provide for basic string-related manipulations. This crate does not
//! (yet) aim to provide a full set of Unicode tables.
#![crate_id = "unicode#0.11.0"]
// NOTE(review): `crate_name` duplicates the name in `crate_id` — presumably
// both are kept for a compiler transition; confirm before removing either.
#![crate_name = "unicode"]
#![experimental]
#![license = "MIT/ASL2"]
#![crate_type = "rlib"]
#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
       html_favicon_url = "http://www.rust-lang.org/favicon.ico",
       html_root_url = "http://doc.rust-lang.org/",
       html_playground_url = "http://play.rust-lang.org/")]
// This crate sits below libstd in the crate hierarchy and links only
// against libcore.
#![no_std]
#![allow(unused_attribute)] // NOTE: remove after stage0
extern crate core;
// Re-exports consumed by downstream crates (libstd, libregex, ...).
pub use tables::normalization::canonical_combining_class;
pub use tables::regex;
pub use u_char::UnicodeChar;
pub use u_str::UnicodeStrSlice;
pub use u_str::Words;
mod decompose;
mod tables; // generated by src/etc/unicode.py from the Unicode data files
mod u_char;
mod u_str;
// re-export char so that std et al see it correctly
/// Character manipulation (`char` type, Unicode Scalar Value)
///
/// This module provides the `Char` and `UnicodeChar` traits, as well as their
/// implementation for the primitive `char` type, in order to allow basic character
/// manipulation.
///
/// A `char` actually represents a
/// *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
/// as it can contain any Unicode code point except high-surrogate and
/// low-surrogate code points.
///
/// As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
/// (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
/// however the converse is not always true due to the above range limits
/// and, as such, should be performed via the `from_u32` function.
pub mod char {
    // Unicode-unaware basics re-exported from libcore...
    pub use core::char::{MAX, from_u32, is_digit_radix, to_digit};
    pub use core::char::{from_digit, escape_unicode, escape_default};
    pub use core::char::{len_utf8_bytes, Char};
    // ...plus the Unicode-aware functionality defined in this crate.
    pub use decompose::decompose_canonical;
    pub use decompose::decompose_compatible;
    pub use u_char::{is_alphabetic, is_XID_start, is_XID_continue};
    pub use u_char::{is_lowercase, is_uppercase, is_whitespace};
    pub use u_char::{is_alphanumeric, is_control, is_digit};
    pub use u_char::{to_uppercase, to_lowercase, width, UnicodeChar};
}

6445
src/libunicode/tables.rs Normal file

File diff suppressed because it is too large Load Diff

266
src/libunicode/u_char.rs Normal file
View File

@ -0,0 +1,266 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
* Unicode-intensive `char` methods.
*
* These methods implement functionality for `char` that requires knowledge of
* Unicode definitions, including normalization, categorization, and display information.
*/
use core::option::Option;
use tables::{derived_property, property, general_category, conversions, charwidth};
/// Returns whether the specified `char` is considered a Unicode alphabetic
/// code point (Derived Core Property 'Alphabetic')
// `#[inline]` added for consistency with the other one-line wrappers in this
// module; without it this non-generic fn cannot inline across crates.
#[inline]
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
// `#[inline]` added for consistency with the other one-line wrappers.
#[inline]
#[allow(non_snake_case_functions)]
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
// `#[inline]` added for consistency with the other one-line wrappers.
#[inline]
#[allow(non_snake_case_functions)]
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
/// Indicates whether a `char` is in lower case, per the Unicode Derived
/// Core Property 'Lowercase'
#[inline]
pub fn is_lowercase(ch: char) -> bool {
    derived_property::Lowercase(ch)
}
/// Indicates whether a `char` is in upper case, per the Unicode Derived
/// Core Property 'Uppercase'
#[inline]
pub fn is_uppercase(ch: char) -> bool {
    derived_property::Uppercase(ch)
}
/// Indicates whether a `char` is whitespace, in terms of the Unicode
/// Property 'White_Space'
#[inline]
pub fn is_whitespace(ch: char) -> bool {
    // Fast path: handle the ASCII whitespace characters (space plus
    // \t, \n, \v, \f, \r) without consulting the Unicode tables.
    if ch == ' ' || ('\x09' <= ch && ch <= '\x0d') {
        return true;
    }
    property::White_Space(ch)
}
///
/// Indicates whether a `char` is alphanumeric
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
///
#[inline]
pub fn is_alphanumeric(c: char) -> bool {
derived_property::Alphabetic(c)
|| general_category::N(c)
}
/// Indicates whether a `char` is a control code point, in terms of the
/// Unicode General Category 'Cc'
#[inline]
pub fn is_control(ch: char) -> bool {
    general_category::Cc(ch)
}
/// Indicates whether the `char` is numeric, i.e. in one of the Unicode
/// General Categories 'Nd', 'Nl', or 'No'
#[inline]
pub fn is_digit(ch: char) -> bool {
    general_category::N(ch)
}
/// Convert a char to its uppercase equivalent
///
/// Only the common (simple, single-codepoint) mapping from the Unicode
/// database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt is
/// applied; the multi-codepoint expansions of SpecialCasing.txt are not
/// considered here.
///
/// A full reference can be found here
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
///
/// # Return value
///
/// The uppercase equivalent, or the char itself if no conversion was made
#[inline]
pub fn to_uppercase(ch: char) -> char {
    conversions::to_upper(ch)
}
/// Convert a char to its lowercase equivalent
///
/// Only the common (simple, single-codepoint) case mapping is applied;
/// see `to_uppercase` for references and more information.
///
/// # Return value
///
/// The lowercase equivalent, or the char itself if no conversion is possible
#[inline]
pub fn to_lowercase(ch: char) -> char {
    conversions::to_lower(ch)
}
/// Returns this character's displayed width in columns, or `None` if it is a
/// control character other than `'\x00'`.
///
/// `is_cjk` determines behavior for characters in the Ambiguous category:
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
/// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// recommends that these characters be treated as 1 column (i.e.,
/// `is_cjk` = `false`) if the context cannot be reliably determined.
// `#[inline]` added for consistency with the other one-line wrappers in this
// module; the body is a plain table lookup forwarder.
#[inline]
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
    charwidth::width(c, is_cjk)
}
/// Useful functions for Unicode characters.
///
/// Implemented for `char` below; each method mirrors the free function of
/// the same name in this module.
pub trait UnicodeChar {
    /// Returns whether the specified character is considered a Unicode
    /// alphabetic code point.
    fn is_alphabetic(&self) -> bool;

    /// Returns whether the specified character satisfies the 'XID_Start'
    /// Unicode property.
    ///
    /// 'XID_Start' is a Unicode Derived Property specified in
    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
    /// mostly similar to ID_Start but modified for closure under NFKx.
    #[allow(non_snake_case_functions)]
    fn is_XID_start(&self) -> bool;

    /// Returns whether the specified `char` satisfies the 'XID_Continue'
    /// Unicode property.
    ///
    /// 'XID_Continue' is a Unicode Derived Property specified in
    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
    /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
    #[allow(non_snake_case_functions)]
    fn is_XID_continue(&self) -> bool;

    /// Indicates whether a character is in lowercase.
    ///
    /// This is defined according to the terms of the Unicode Derived Core
    /// Property `Lowercase`.
    fn is_lowercase(&self) -> bool;

    /// Indicates whether a character is in uppercase.
    ///
    /// This is defined according to the terms of the Unicode Derived Core
    /// Property `Uppercase`.
    fn is_uppercase(&self) -> bool;

    /// Indicates whether a character is whitespace.
    ///
    /// Whitespace is defined in terms of the Unicode Property `White_Space`.
    fn is_whitespace(&self) -> bool;

    /// Indicates whether a character is alphanumeric.
    ///
    /// Alphanumericness is defined in terms of the Unicode General Categories
    /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
    fn is_alphanumeric(&self) -> bool;

    /// Indicates whether a character is a control code point.
    ///
    /// Control code points are defined in terms of the Unicode General
    /// Category `Cc`.
    fn is_control(&self) -> bool;

    /// Indicates whether the character is numeric (Nd, Nl, or No).
    fn is_digit(&self) -> bool;

    /// Converts a character to its lowercase equivalent.
    ///
    /// The case-folding performed is the common or simple mapping. See
    /// `to_uppercase()` for references and more information.
    ///
    /// # Return value
    ///
    /// Returns the lowercase equivalent of the character, or the character
    /// itself if no conversion is possible.
    fn to_lowercase(&self) -> char;

    /// Converts a character to its uppercase equivalent.
    ///
    /// The case-folding performed is the common or simple mapping: it maps
    /// one unicode codepoint (one character in Rust) to its uppercase
    /// equivalent according to the Unicode database [1]. The additional
    /// `SpecialCasing.txt` is not considered here, as it expands to multiple
    /// codepoints in some cases.
    ///
    /// A full reference can be found here [2].
    ///
    /// # Return value
    ///
    /// Returns the uppercase equivalent of the character, or the character
    /// itself if no conversion was made.
    ///
    /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
    ///
    /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
    fn to_uppercase(&self) -> char;

    /// Returns this character's displayed width in columns, or `None` if it is a
    /// control character other than `'\x00'`.
    ///
    /// `is_cjk` determines behavior for characters in the Ambiguous category:
    /// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
    /// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
    /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
    /// recommends that these characters be treated as 1 column (i.e.,
    /// `is_cjk` = `false`) if the context cannot be reliably determined.
    fn width(&self, is_cjk: bool) -> Option<uint>;
}
impl UnicodeChar for char {
    // Each method simply forwards to the corresponding free function in this
    // module. `#[inline]` mirrors the hints on those functions so the
    // one-line forwarders can be inlined across crate boundaries.
    #[inline]
    fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
    #[inline]
    fn is_XID_start(&self) -> bool { is_XID_start(*self) }
    #[inline]
    fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
    #[inline]
    fn is_lowercase(&self) -> bool { is_lowercase(*self) }
    #[inline]
    fn is_uppercase(&self) -> bool { is_uppercase(*self) }
    #[inline]
    fn is_whitespace(&self) -> bool { is_whitespace(*self) }
    #[inline]
    fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
    #[inline]
    fn is_control(&self) -> bool { is_control(*self) }
    #[inline]
    fn is_digit(&self) -> bool { is_digit(*self) }
    #[inline]
    fn to_lowercase(&self) -> char { to_lowercase(*self) }
    #[inline]
    fn to_uppercase(&self) -> char { to_uppercase(*self) }
    #[inline]
    fn width(&self, is_cjk: bool) -> Option<uint> { width(*self, is_cjk) }
}

119
src/libunicode/u_str.rs Normal file
View File

@ -0,0 +1,119 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
* Unicode-intensive string manipulations.
*
* This module provides functionality to `str` that requires the Unicode
* methods provided by the UnicodeChar trait.
*/
use core::collections::Collection;
use core::iter::{Filter};
use core::str::{CharSplits, StrSlice};
use core::iter::Iterator;
use u_char;
/// An iterator over the words of a string, separated by a sequence of whitespace
///
/// Produced by `UnicodeStrSlice::words`. The `extern "Rust" fn(char) -> bool`
/// parameter must match the exact type that `u_char::is_whitespace` coerces
/// to in that method's body.
pub type Words<'a> =
    Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
/// Methods for Unicode string slices
pub trait UnicodeStrSlice<'a> {
    /// An iterator over the words of a string (subsequences separated
    /// by any sequence of whitespace). Sequences of whitespace are
    /// collapsed, so empty "words" are not included.
    ///
    /// # Example
    ///
    /// ```rust
    /// let some_words = " Mary had\ta little \n\t lamb";
    /// let v: Vec<&str> = some_words.words().collect();
    /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
    /// ```
    fn words(&self) -> Words<'a>;

    /// Returns true if the string contains only whitespace.
    ///
    /// Whitespace characters are determined by `char::is_whitespace`.
    ///
    /// # Example
    ///
    /// ```rust
    /// assert!(" \t\n".is_whitespace());
    /// assert!("".is_whitespace());
    ///
    /// assert!( !"abc".is_whitespace());
    /// ```
    fn is_whitespace(&self) -> bool;

    /// Returns true if the string contains only alphanumeric code
    /// points.
    ///
    /// Alphanumeric characters are determined by `char::is_alphanumeric`.
    ///
    /// # Example
    ///
    /// ```rust
    /// assert!("Löwe老虎Léopard123".is_alphanumeric());
    /// assert!("".is_alphanumeric());
    ///
    /// assert!( !" &*~".is_alphanumeric());
    /// ```
    fn is_alphanumeric(&self) -> bool;

    // NOTE(review): `width` is currently disabled. Its documentation is kept
    // as ordinary `//` comments (not `///`) so that rustdoc does not attach
    // it to the following item, `trim`.
    //
    // Returns a string's displayed width in columns, treating control
    // characters as zero-width.
    //
    // `is_cjk` determines behavior for characters in the Ambiguous category:
    // if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
    // In CJK locales, `is_cjk` should be `true`, else it should be `false`.
    // [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
    // recommends that these characters be treated as 1 column (i.e.,
    // `is_cjk` = `false`) if the locale is unknown.
    //fn width(&self, is_cjk: bool) -> uint;

    /// Returns a string with leading and trailing whitespace removed.
    fn trim(&self) -> &'a str;

    /// Returns a string with leading whitespace removed.
    fn trim_left(&self) -> &'a str;

    /// Returns a string with trailing whitespace removed.
    fn trim_right(&self) -> &'a str;
}
impl<'a> UnicodeStrSlice<'a> for &'a str {
    #[inline]
    fn words(&self) -> Words<'a> {
        // `u_char::is_whitespace` is passed as a fn item so it coerces to
        // the `extern "Rust" fn(char) -> bool` type baked into the `Words`
        // alias; a closure here would change the return type.
        self.split(u_char::is_whitespace).filter(|s| !s.is_empty())
    }
    // True iff every char of the slice is Unicode whitespace (vacuously
    // true for the empty string).
    #[inline]
    fn is_whitespace(&self) -> bool { self.chars().all(u_char::is_whitespace) }
    // True iff every char is alphanumeric (vacuously true for "").
    #[inline]
    fn is_alphanumeric(&self) -> bool { self.chars().all(u_char::is_alphanumeric) }
    #[inline]
    fn trim(&self) -> &'a str {
        // Trimming both ends is just trimming left, then right.
        self.trim_left().trim_right()
    }
    #[inline]
    fn trim_left(&self) -> &'a str {
        self.trim_left_chars(u_char::is_whitespace)
    }
    #[inline]
    fn trim_right(&self) -> &'a str {
        self.trim_right_chars(u_char::is_whitespace)
    }
}