auto merge of #15283 : kwantam/rust/master, r=alexcrichton
Add libunicode; move unicode functions from core

- created new crate, libunicode, below libstd
- split `Char` trait into `Char` (libcore) and `UnicodeChar` (libunicode)
  - Unicode-aware functions now live in libunicode
    - `is_alphabetic`, `is_XID_start`, `is_XID_continue`, `is_lowercase`, `is_uppercase`, `is_whitespace`, `is_alphanumeric`, `is_control`, `is_digit`, `to_uppercase`, `to_lowercase`
  - added `width` method in UnicodeChar trait
    - determines printed width of character in columns, or None if it is a non-NULL control character
    - takes a boolean argument indicating whether the present context is CJK or not (characters with 'A'mbiguous widths are double-wide in CJK contexts, single-wide otherwise)
- split `StrSlice` into `StrSlice` (libcore) and `UnicodeStrSlice` (libunicode)
  - functionality formerly in `StrSlice` that relied upon Unicode functionality from `Char` is now in `UnicodeStrSlice`
    - `words`, `is_whitespace`, `is_alphanumeric`, `trim`, `trim_left`, `trim_right`
  - also moved `Words` type alias into libunicode because `words` method is in `UnicodeStrSlice`
- unified Unicode tables from libcollections, libcore, and libregex into libunicode
- updated `unicode.py` in `src/etc` to generate aforementioned tables
- generated new tables based on latest Unicode data
- added `UnicodeChar` and `UnicodeStrSlice` traits to prelude
- libunicode is now the collection point for the `std::char` module, combining the libunicode functionality with the `Char` functionality from libcore
  - thus, moved doc comment for `char` from `core::char` to `unicode::char`
- libcollections remains the collection point for `std::str`

The Unicode-aware functions that previously lived in the `Char` and `StrSlice` traits are no longer available to programs that only use libcore.
To regain use of these methods, include the libunicode crate and `use` the `UnicodeChar` and/or `UnicodeStrSlice` traits:

    extern crate unicode;
    use unicode::UnicodeChar;
    use unicode::UnicodeStrSlice;
    use unicode::Words; // if you want to use the words() method

NOTE: this does *not* impact programs that use libstd, since UnicodeChar and UnicodeStrSlice have been added to the prelude.

closes #15224

[breaking-change]
This commit is contained in:
commit
fa7cbb5a46
|
@ -51,17 +51,19 @@
|
|||
|
||||
TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
|
||||
uuid serialize sync getopts collections num test time rand \
|
||||
url log regex graphviz core rlibc alloc debug rustrt
|
||||
url log regex graphviz core rlibc alloc debug rustrt \
|
||||
unicode
|
||||
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros fmt_macros
|
||||
CRATES := $(TARGET_CRATES) $(HOST_CRATES)
|
||||
TOOLS := compiletest rustdoc rustc
|
||||
|
||||
DEPS_core :=
|
||||
DEPS_rlibc :=
|
||||
DEPS_unicode := core
|
||||
DEPS_alloc := core libc native:jemalloc
|
||||
DEPS_debug := std
|
||||
DEPS_rustrt := alloc core libc collections native:rustrt_native
|
||||
DEPS_std := core libc rand alloc collections rustrt sync \
|
||||
DEPS_std := core libc rand alloc collections rustrt sync unicode \
|
||||
native:rust_builtin native:backtrace
|
||||
DEPS_graphviz := std
|
||||
DEPS_green := std native:context_switch
|
||||
|
@ -82,7 +84,7 @@ DEPS_semver := std
|
|||
DEPS_uuid := std serialize
|
||||
DEPS_sync := core alloc rustrt collections
|
||||
DEPS_getopts := std
|
||||
DEPS_collections := core alloc
|
||||
DEPS_collections := core alloc unicode
|
||||
DEPS_fourcc := rustc syntax std
|
||||
DEPS_hexfloat := rustc syntax std
|
||||
DEPS_num := std
|
||||
|
@ -108,6 +110,7 @@ ONLY_RLIB_rlibc := 1
|
|||
ONLY_RLIB_alloc := 1
|
||||
ONLY_RLIB_rand := 1
|
||||
ONLY_RLIB_collections := 1
|
||||
ONLY_RLIB_unicode := 1
|
||||
|
||||
################################################################################
|
||||
# You should not need to edit below this line
|
||||
|
|
|
@ -15,11 +15,11 @@
|
|||
|
||||
# The names of crates that must be tested
|
||||
|
||||
# libcore tests are in a separate crate
|
||||
# libcore/libunicode tests are in a separate crate
|
||||
DEPS_coretest :=
|
||||
$(eval $(call RUST_CRATE,coretest))
|
||||
|
||||
TEST_TARGET_CRATES = $(filter-out core,$(TARGET_CRATES)) coretest
|
||||
TEST_TARGET_CRATES = $(filter-out core unicode,$(TARGET_CRATES)) coretest
|
||||
TEST_DOC_CRATES = $(DOC_CRATES)
|
||||
TEST_HOST_CRATES = $(HOST_CRATES)
|
||||
TEST_CRATES = $(TEST_TARGET_CRATES) $(TEST_HOST_CRATES)
|
||||
|
|
|
@ -1,183 +0,0 @@
|
|||
#!/usr/bin/env python2
|
||||
|
||||
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
||||
# file at the top-level directory of this distribution and at
|
||||
# http://rust-lang.org/COPYRIGHT.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
# option. This file may not be copied, modified, or distributed
|
||||
# except according to those terms.
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
import csv
|
||||
import datetime
|
||||
import urllib2
|
||||
|
||||
# Default download location and file names of the Unicode data files read by
# this script.
BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
#
# Each two-letter general category maps to the grouped categories it is also
# filed under (see expand_cat). 'No' belongs to the group 'N', like 'Nd' and
# 'Nl'; mapping it to itself left 'No' code points out of 'N'.
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
|
||||
|
||||
|
||||
def as_4byte_uni(n):
    """Format the non-negative code point ``n`` as a backslash, ``U`` and
    eight zero-padded lowercase hex digits."""
    return '\\U%08x' % n
|
||||
|
||||
|
||||
def expand_cat(c):
    """Return the grouped categories listed for ``c`` in
    ``expanded_categories`` (none if ``c`` is unknown), followed by ``c``
    itself."""
    return expanded_categories.get(c, []) + [c]
|
||||
|
||||
|
||||
def is_valid_unicode(n):
    """Return True if ``n`` lies in 0..0xD7FF or 0xE000..0x10FFFF, i.e. is a
    code point outside the 0xD800..0xDFFF gap."""
    return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
|
||||
|
||||
|
||||
def read_cats(f):
    """Collect code points per general category from ``f``.

    ``f`` is iterated as ';'-separated rows; field 0 is a hex code point and
    field 2 its category. Returns a defaultdict mapping every category name
    produced by ``expand_cat`` to the list of code points seen for it, in
    file order. Rows whose code point fails ``is_valid_unicode`` are skipped.
    """
    assigned = defaultdict(list)
    for row in csv.reader(f, delimiter=';'):
        # Named `code` rather than `hex` so the builtin is not shadowed.
        code = int(row[0], 16)
        if not is_valid_unicode(code):
            continue
        for cat in expand_cat(row[2]):
            assigned[cat].append(code)
    return assigned
|
||||
|
||||
|
||||
def read_scripts(f):
    """Collect code points per script name from ``f``.

    Blank lines and lines starting with '#' are ignored. Every other line is
    read as ``<hex>[..<hex>] ; <name> [# comment]``. Returns a defaultdict
    mapping each name to the list of code points (ranges expanded,
    inclusive) that pass ``is_valid_unicode``.
    """
    assigned = defaultdict(list)
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        # Only the first two ';'-separated fields matter.
        hexes, name = [field.strip() for field in line.split(';')[:2]]
        # Drop a trailing comment; partition() also copes with its absence,
        # where index('#') used to raise ValueError.
        name = name.partition('#')[0].strip()
        first, is_range, last = hexes.partition('..')
        lo = int(first, 16)
        hi = int(last, 16) if is_range else lo
        for code in range(lo, hi + 1):
            if is_valid_unicode(code):
                assigned[name].append(code)
    return assigned
|
||||
|
||||
|
||||
def group(letters):
    """Collapse integers into sorted, inclusive ``(start, end)`` runs.

    Duplicates are ignored. Consecutive values are merged into one run.
    Returns ``[]`` for empty input (the previous version raised IndexError).
    """
    grouped = []
    for letter in sorted(set(letters)):
        if grouped and letter == grouped[-1][1] + 1:
            # Extends the run currently being built.
            grouped[-1] = (grouped[-1][0], letter)
        else:
            grouped.append((letter, letter))
    return grouped
|
||||
|
||||
|
||||
def ranges_to_rust(rs):
    """Render ``(start, end)`` pairs as Rust char-tuple source text, one
    ``('..', '..')`` item per pair, joined by a comma and a newline."""
    rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs)
    return ',\n '.join(rs)
|
||||
|
||||
|
||||
def groups_to_rust(groups):
    """Render a ``{name: ranges}`` mapping as Rust source text: one
    ``("name", &[ ... ]),`` entry per name, in sorted name order, separated
    by newlines."""
    rust_groups = []
    for group_name in sorted(groups):
        rust_groups.append('("%s", &[\n %s\n ]),'
                           % (group_name, ranges_to_rust(groups[group_name])))
    return '\n'.join(rust_groups)
|
||||
|
||||
|
||||
# Script entry point: read the category and script data (from the CWD with
# --local, otherwise from --base-url), then print a Rust module holding the
# named Unicode classes and the Perl classes \d, \s and \w to stdout.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    aa = parser.add_argument
    aa('--local', action='store_true',
       help='When set, Scripts.txt and UnicodeData.txt will be read from '
            'the CWD.')
    aa('--base-url', type=str, default=BASE_URL,
       help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()

    if args.local:
        # `with` closes the local files once they have been parsed.
        with open(DATA) as data_file:
            cats = read_cats(data_file)
        with open(SCRIPTS) as scripts_file:
            scripts = read_scripts(scripts_file)
    else:
        cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA))
        scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS))

    # Get Rust code for all Unicode general categories and scripts.
    combined = dict(cats, **scripts)
    unigroups = groups_to_rust({k: group(letters)
                                for k, letters in combined.items()})

    # Now get Perl character classes that are Unicode friendly.
    # Explicit lists keep the concatenations below valid even where
    # range()/map() do not return lists.
    perld = list(range(ord('0'), ord('9') + 1))
    dgroups = ranges_to_rust(group(perld + cats['Nd'][:]))

    perls = [ord(c) for c in ['\t', '\n', '\x0C', '\r', ' ']]
    sgroups = ranges_to_rust(group(perls + cats['Z'][:]))

    low = list(range(ord('a'), ord('z') + 1))
    up = list(range(ord('A'), ord('Z') + 1))
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L'][:]))

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.

use parse::{{Class, NamedClasses}};

pub static UNICODE_CLASSES: NamedClasses = &[

{groups}

];

pub static PERLD: Class = &[
{dgroups}
];

pub static PERLS: Class = &[
{sgroups}
];

pub static PERLW: Class = &[
{wgroups}
];
'''
    now = datetime.datetime.now()
    print(tpl.format(date=str(now), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))
|
|
@ -10,17 +10,46 @@
|
|||
# option. This file may not be copied, modified, or distributed
|
||||
# except according to those terms.
|
||||
|
||||
# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
|
||||
# code covering the core properties. Since this is a pretty rare event we
|
||||
# just store this out-of-line and check the unicode.rs file into git.
|
||||
# This script uses the following Unicode tables:
|
||||
# - DerivedCoreProperties.txt
|
||||
# - EastAsianWidth.txt
|
||||
# - PropList.txt
|
||||
# - Scripts.txt
|
||||
# - UnicodeData.txt
|
||||
#
|
||||
# The emitted code is "the minimum we think is necessary for libstd", that
|
||||
# is, to support basic operations of the compiler and "most nontrivial rust
|
||||
# programs". It is not meant to be a complete implementation of unicode.
|
||||
# For that we recommend you use a proper binding to libicu.
|
||||
# Since this should not require frequent updates, we just store this
|
||||
# out-of-line and check the unicode.rs file into git.
|
||||
|
||||
import fileinput, re, os, sys, operator
|
||||
|
||||
# Text written at the very top of the generated Rust file.
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly

#![allow(missing_doc, non_uppercase_statics, non_snake_case_functions)]
'''

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
#
# Each two-letter general category maps to the grouped categories it is also
# filed under. 'No' belongs to the group 'N', like 'Nd' and 'Nl'; mapping it
# to itself left 'No' code points out of 'N'.
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
|
||||
|
||||
def fetch(f):
|
||||
if not os.path.exists(f):
|
||||
|
@ -31,21 +60,17 @@ def fetch(f):
|
|||
sys.stderr.write("cannot load %s" % f)
|
||||
exit(1)
|
||||
|
||||
def is_valid_unicode(n):
    """Return True if ``n`` lies in 0..0xD7FF or 0xE000..0x10FFFF, i.e. is a
    code point outside the 0xD800..0xDFFF gap."""
    return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
|
||||
|
||||
def load_unicode_data(f):
|
||||
fetch(f)
|
||||
gencats = {}
|
||||
upperlower = {}
|
||||
lowerupper = {}
|
||||
combines = []
|
||||
combines = {}
|
||||
canon_decomp = {}
|
||||
compat_decomp = {}
|
||||
curr_cat = ""
|
||||
curr_combine = ""
|
||||
c_lo = 0
|
||||
c_hi = 0
|
||||
com_lo = 0
|
||||
com_hi = 0
|
||||
|
||||
for line in fileinput.input(f):
|
||||
fields = line.split(";")
|
||||
|
@ -58,6 +83,9 @@ def load_unicode_data(f):
|
|||
code_org = code
|
||||
code = int(code, 16)
|
||||
|
||||
if not is_valid_unicode(code):
|
||||
continue
|
||||
|
||||
# generate char to char direct common and simple conversions
|
||||
# uppercase to lowercase
|
||||
if gencat == "Lu" and lowcase != "" and code_org != lowcase:
|
||||
|
@ -67,6 +95,7 @@ def load_unicode_data(f):
|
|||
if gencat == "Ll" and upcase != "" and code_org != upcase:
|
||||
lowerupper[code] = int(upcase, 16)
|
||||
|
||||
# store decomposition, if given
|
||||
if decomp != "":
|
||||
if decomp.startswith('<'):
|
||||
seq = []
|
||||
|
@ -79,38 +108,76 @@ def load_unicode_data(f):
|
|||
seq.append(int(i, 16))
|
||||
canon_decomp[code] = seq
|
||||
|
||||
if curr_cat == "":
|
||||
curr_cat = gencat
|
||||
c_lo = code
|
||||
c_hi = code
|
||||
# place letter in categories as appropriate
|
||||
for cat in [gencat] + expanded_categories.get(gencat, []):
|
||||
if cat not in gencats:
|
||||
gencats[cat] = []
|
||||
gencats[cat].append(code)
|
||||
|
||||
if curr_cat == gencat:
|
||||
c_hi = code
|
||||
else:
|
||||
if curr_cat not in gencats:
|
||||
gencats[curr_cat] = []
|
||||
# record combining class, if any
|
||||
if combine != "0":
|
||||
if combine not in combines:
|
||||
combines[combine] = []
|
||||
combines[combine].append(code)
|
||||
|
||||
gencats[curr_cat].append((c_lo, c_hi))
|
||||
curr_cat = gencat
|
||||
c_lo = code
|
||||
c_hi = code
|
||||
|
||||
if curr_combine == "":
|
||||
curr_combine = combine
|
||||
com_lo = code
|
||||
com_hi = code
|
||||
|
||||
if curr_combine == combine:
|
||||
com_hi = code
|
||||
else:
|
||||
if curr_combine != "0":
|
||||
combines.append((com_lo, com_hi, curr_combine))
|
||||
curr_combine = combine
|
||||
com_lo = code
|
||||
com_hi = code
|
||||
gencats = group_cats(gencats)
|
||||
combines = to_combines(group_cats(combines))
|
||||
|
||||
return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
|
||||
|
||||
def group_cats(cats):
    """Return a new dict with the same keys as ``cats`` in which every list
    of code points has been collapsed into ranges by ``group_cat``."""
    cats_out = {}
    for cat in cats:
        cats_out[cat] = group_cat(cats[cat])
    return cats_out
|
||||
|
||||
def group_cat(cat):
    """Collapse the code points in ``cat`` into sorted, inclusive
    ``(lo, hi)`` ranges.

    Duplicates are ignored and consecutive values are merged. Returns ``[]``
    for an empty category (the previous version raised IndexError).
    """
    cat_out = []
    for letter in sorted(set(cat)):
        if cat_out and letter == cat_out[-1][1] + 1:
            # Extends the range currently being built.
            cat_out[-1] = (cat_out[-1][0], letter)
        else:
            cat_out.append((letter, letter))
    return cat_out
|
||||
|
||||
def ungroup_cat(cat):
    """Expand inclusive ``(lo, hi)`` ranges into a flat list of every value
    they cover, in the order the ranges are given."""
    cat_out = []
    for (lo, hi) in cat:
        cat_out.extend(range(lo, hi + 1))
    return cat_out
|
||||
|
||||
def to_combines(combs):
    """Flatten ``{value: [(lo, hi), ...]}`` into a single list of
    ``(lo, hi, value)`` triples sorted by ``lo``."""
    combs_out = []
    for comb in combs:
        for (lo, hi) in combs[comb]:
            combs_out.append((lo, hi, comb))
    combs_out.sort(key=lambda comb: comb[0])
    return combs_out
|
||||
|
||||
def format_table_content(f, content, indent):
    """Write the comma-separated ``content`` to ``f`` as wrapped lines.

    Each line starts with ``indent`` copies of the indent string and holds
    as many ", "-joined chunks as fit under the 98-column check. Every line
    except the last is terminated with ",\\n"; the last is written without a
    trailing newline.
    """
    line = " "*indent
    first = True
    for chunk in content.split(","):
        if len(line) + len(chunk) < 98:
            if first:
                line += chunk
            else:
                line += ", " + chunk
            first = False
        else:
            # Chunk does not fit: flush the current line and start a new one.
            f.write(line + ",\n")
            line = " "*indent + chunk
    f.write(line)
|
||||
|
||||
def load_properties(f, interestingprops):
|
||||
fetch(f)
|
||||
props = {}
|
||||
|
@ -134,7 +201,7 @@ def load_properties(f, interestingprops):
|
|||
prop = m.group(3)
|
||||
else:
|
||||
continue
|
||||
if prop not in interestingprops:
|
||||
if interestingprops and prop not in interestingprops:
|
||||
continue
|
||||
d_lo = int(d_lo, 16)
|
||||
d_hi = int(d_hi, 16)
|
||||
|
@ -143,6 +210,43 @@ def load_properties(f, interestingprops):
|
|||
props[prop].append((d_lo, d_hi))
|
||||
return props
|
||||
|
||||
# load all widths of want_widths, except those in except_cats
def load_east_asian_width(want_widths, except_cats):
    """Parse EastAsianWidth.txt into ``{width: [(lo, hi), ...]}``.

    Matching lines look like ``<hex>[..<hex>];<width> # <word> ...``. A line
    is kept only if its width is in ``want_widths`` and the first word of its
    trailing comment (presumably the general category -- confirm against the
    data file) is not in ``except_cats``. Single code points are stored as
    ``(c, c)``.
    """
    f = "EastAsianWidth.txt"
    fetch(f)
    widths = {}
    # Raw strings: same patterns as before, without relying on Python passing
    # unknown escapes such as \w through unchanged.
    re1 = re.compile(r"^([0-9A-F]+);(\w+) +# (\w+)")
    re2 = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")

    for line in fileinput.input(f):
        m = re1.match(line)
        if m:
            # Single code point: it is both ends of the range.
            d_lo = d_hi = m.group(1)
            width, cat = m.group(2), m.group(3)
        else:
            m = re2.match(line)
            if not m:
                continue
            d_lo, d_hi, width, cat = m.groups()
        if cat in except_cats or width not in want_widths:
            continue
        widths.setdefault(width, []).append((int(d_lo, 16), int(d_hi, 16)))
    return widths
|
||||
|
||||
def escape_char(c):
|
||||
if c <= 0xff:
|
||||
return "'\\x%2.2x'" % c
|
||||
|
@ -150,59 +254,72 @@ def escape_char(c):
|
|||
return "'\\u%4.4x'" % c
|
||||
return "'\\U%8.8x'" % c
|
||||
|
||||
def ch_prefix(ix):
    """Return the text to write before table entry number ``ix``: plain
    indentation for the first entry, a comma plus line break before every
    other even entry, and a comma plus space before odd entries (two entries
    per line)."""
    if ix == 0:
        return " "
    if ix % 2 == 0:
        return ",\n "
    else:
        return ", "
|
||||
|
||||
def emit_bsearch_range_table(f):
|
||||
f.write("""
|
||||
fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
|
||||
use cmp::{Equal, Less, Greater};
|
||||
use slice::ImmutableVector;
|
||||
use option::None;
|
||||
use core::cmp::{Equal, Less, Greater};
|
||||
use core::slice::ImmutableVector;
|
||||
use core::option::None;
|
||||
r.bsearch(|&(lo,hi)| {
|
||||
if lo <= c && c <= hi { Equal }
|
||||
else if hi < c { Less }
|
||||
else { Greater }
|
||||
}) != None
|
||||
}\n
|
||||
""");
|
||||
""")
|
||||
|
||||
def emit_property_module(f, mod, tbl):
|
||||
def emit_table(f, name, t_data, t_type="&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
    """Write a Rust ``static`` slice named ``name`` to ``f``.

    f       -- writable file object
    name    -- identifier of the static
    t_data  -- iterable of entries
    t_type  -- Rust type written after the name
    is_pub  -- prefix the item with ``pub`` when true
    pfun    -- renders one entry as Rust source; the default renders a pair
               of chars via ``escape_char``
    """
    pub_string = "pub " if is_pub else ""
    f.write(" %sstatic %s: %s = &[\n" % (pub_string, name, t_type))
    # One join instead of repeated string concatenation.
    data = ",".join(pfun(dat) for dat in t_data)
    format_table_content(f, data, 8)
    f.write("\n ];\n\n")
|
||||
|
||||
def emit_property_module(f, mod, tbl, emit_fn):
|
||||
f.write("pub mod %s {\n" % mod)
|
||||
keys = tbl.keys()
|
||||
keys.sort()
|
||||
|
||||
for cat in keys:
|
||||
if cat not in ["Nd", "Nl", "No", "Cc",
|
||||
"XID_Start", "XID_Continue", "Alphabetic",
|
||||
"Lowercase", "Uppercase", "White_Space"]:
|
||||
continue
|
||||
f.write(" static %s_table : &'static [(char,char)] = &[\n" % cat)
|
||||
ix = 0
|
||||
for pair in tbl[cat]:
|
||||
f.write(ch_prefix(ix))
|
||||
f.write("(%s, %s)" % (escape_char(pair[0]), escape_char(pair[1])))
|
||||
ix += 1
|
||||
f.write("\n ];\n\n")
|
||||
|
||||
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
|
||||
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
|
||||
f.write(" }\n\n")
|
||||
emit_table(f, "%s_table" % cat, tbl[cat])
|
||||
if cat in emit_fn:
|
||||
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
|
||||
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
|
||||
f.write(" }\n\n")
|
||||
f.write("}\n\n")
|
||||
|
||||
def emit_regex_module(f, cats, w_data):
    """Write the Rust ``pub mod regex`` to ``f``.

    cats   -- entries indexed as ``x[0]`` (class name) and ``x[1]`` (the
              sibling module holding ``<name>_table``); emitted as
              UNICODE_CLASSES
    w_data -- char ranges emitted as the PERLW table

    PERLD and PERLS are emitted as aliases of ``general_category::Nd_table``
    and ``property::White_Space_table``.
    """
    f.write("pub mod regex {\n")
    regex_class = "&'static [(char, char)]"
    class_table = "&'static [(&'static str, %s)]" % regex_class

    emit_table(f, "UNICODE_CLASSES", cats, class_table,
        pfun=lambda x: "(\"%s\",super::%s::%s_table)" % (x[0], x[1], x[0]))

    f.write(" pub static PERLD: %s = super::general_category::Nd_table;\n\n"
            % regex_class)
    f.write(" pub static PERLS: %s = super::property::White_Space_table;\n\n"
            % regex_class)

    emit_table(f, "PERLW", w_data, regex_class)

    f.write("}\n\n")
|
||||
|
||||
def emit_conversions_module(f, lowerupper, upperlower):
|
||||
f.write("pub mod conversions {")
|
||||
f.write("""
|
||||
use cmp::{Equal, Less, Greater};
|
||||
use slice::ImmutableVector;
|
||||
use tuple::Tuple2;
|
||||
use option::{Option, Some, None};
|
||||
use core::cmp::{Equal, Less, Greater};
|
||||
use core::slice::ImmutableVector;
|
||||
use core::tuple::Tuple2;
|
||||
use core::option::{Option, Some, None};
|
||||
|
||||
pub fn to_lower(c: char) -> char {
|
||||
match bsearch_case_table(c, LuLl_table) {
|
||||
|
@ -226,189 +343,88 @@ def emit_conversions_module(f, lowerupper, upperlower):
|
|||
})
|
||||
}
|
||||
|
||||
""");
|
||||
emit_caseconversion_table(f, "LuLl", upperlower)
|
||||
emit_caseconversion_table(f, "LlLu", lowerupper)
|
||||
""")
|
||||
emit_table(f, "LuLl_table",
|
||||
sorted(upperlower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
|
||||
emit_table(f, "LlLu_table",
|
||||
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
|
||||
f.write("}\n\n")
|
||||
|
||||
def emit_charwidth_module(f, width_table):
    """Write the Rust ``pub mod charwidth`` to ``f``.

    The module contains a private binary-search helper, the public
    ``width(c, is_cjk)`` function, and the private ``charwidth_table`` built
    from ``width_table``. Each ``width_table`` entry is indexed as
    ``(lo, hi, x[2], x[3])``; the generated lookup returns the fourth field
    when ``is_cjk`` is set and the third otherwise.
    """
    f.write("pub mod charwidth {\n")
    f.write(" use core::option::{Option, Some, None};\n")
    f.write(" use core::slice::ImmutableVector;\n")
    f.write("""
fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(lo, hi, _, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, r_ncjk, r_cjk) = r[idx];
if is_cjk { r_cjk } else { r_ncjk }
}
None => 1
}
}
""")

    f.write("""
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
match c as uint {
_c @ 0 => Some(0), // null is zero width
cu if cu < 0x20 => None, // control sequences have no width
cu if cu < 0x7F => Some(1), // ASCII
cu if cu < 0xA0 => None, // more control sequences
_ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as uint)
}
}

""")

    f.write(" // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n")
    f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
    emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
        pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
    f.write("}\n")
|
||||
|
||||
def emit_caseconversion_table(f, name, table):
    """Write ``table`` (a dict of char code -> char code) to ``f`` as the
    Rust static ``<name>_table``, entries sorted by key, two per line."""
    f.write(" static %s_table : &'static [(char, char)] = &[\n" % name)
    # items() instead of iteritems(): same pairs, and not tied to Python 2.
    sorted_table = sorted(table.items(), key=operator.itemgetter(0))
    for ix, (key, value) in enumerate(sorted_table):
        f.write(ch_prefix(ix))
        f.write("(%s, %s)" % (escape_char(key), escape_char(value)))
    f.write("\n ];\n\n")
|
||||
|
||||
def format_table_content(f, content, indent):
    """Write the comma-separated ``content`` to ``f`` as wrapped lines.

    Each line starts with ``indent`` copies of the indent string and holds
    as many ", "-joined chunks as fit under the 98-column check. Every line
    except the last is terminated with ",\\n"; the last is written without a
    trailing newline.
    """
    line = " "*indent
    first = True
    for chunk in content.split(","):
        if len(line) + len(chunk) < 98:
            if first:
                line += chunk
            else:
                line += ", " + chunk
            first = False
        else:
            # Chunk does not fit: flush the current line and start a new one.
            f.write(line + ",\n")
            line = " "*indent + chunk
    f.write(line)
|
||||
|
||||
def emit_core_norm_module(f, canon, compat):
|
||||
def emit_norm_module(f, canon, compat, combine):
|
||||
canon_keys = canon.keys()
|
||||
canon_keys.sort()
|
||||
|
||||
compat_keys = compat.keys()
|
||||
compat_keys.sort()
|
||||
f.write("pub mod normalization {\n");
|
||||
f.write(" use option::Option;\n");
|
||||
f.write(" use option::{Some, None};\n");
|
||||
f.write(" use slice::ImmutableVector;\n");
|
||||
f.write("""
|
||||
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
|
||||
use cmp::{Equal, Less, Greater};
|
||||
match r.bsearch(|&(val, _)| {
|
||||
if c == val { Equal }
|
||||
else if val < c { Less }
|
||||
else { Greater }
|
||||
}) {
|
||||
Some(idx) => {
|
||||
let (_, result) = r[idx];
|
||||
Some(result)
|
||||
}
|
||||
None => None
|
||||
}
|
||||
}\n\n
|
||||
""")
|
||||
|
||||
f.write("pub mod normalization {\n")
|
||||
|
||||
def mkdata_fun(table):
|
||||
def f(char):
|
||||
data = "(%s,&[" % escape_char(char)
|
||||
first = True
|
||||
for d in table[char]:
|
||||
if not first:
|
||||
data += ","
|
||||
first = False
|
||||
data += escape_char(d)
|
||||
data += "])"
|
||||
return data
|
||||
return f
|
||||
|
||||
f.write(" // Canonical decompositions\n")
|
||||
f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
|
||||
data = ""
|
||||
first = True
|
||||
for char in canon_keys:
|
||||
if not first:
|
||||
data += ","
|
||||
first = False
|
||||
data += "(%s,&[" % escape_char(char)
|
||||
first2 = True
|
||||
for d in canon[char]:
|
||||
if not first2:
|
||||
data += ","
|
||||
first2 = False
|
||||
data += escape_char(d)
|
||||
data += "])"
|
||||
format_table_content(f, data, 8)
|
||||
f.write("\n ];\n\n")
|
||||
emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
|
||||
pfun=mkdata_fun(canon))
|
||||
|
||||
f.write(" // Compatibility decompositions\n")
|
||||
f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
|
||||
data = ""
|
||||
first = True
|
||||
for char in compat_keys:
|
||||
if not first:
|
||||
data += ","
|
||||
first = False
|
||||
data += "(%s,&[" % escape_char(char)
|
||||
first2 = True
|
||||
for d in compat[char]:
|
||||
if not first2:
|
||||
data += ","
|
||||
first2 = False
|
||||
data += escape_char(d)
|
||||
data += "])"
|
||||
format_table_content(f, data, 8)
|
||||
f.write("\n ];\n\n")
|
||||
|
||||
f.write("""
|
||||
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
|
||||
|
||||
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
|
||||
|
||||
fn d(c: char, i: |char|, k: bool) {
|
||||
use iter::Iterator;
|
||||
|
||||
// 7-bit ASCII never decomposes
|
||||
if c <= '\\x7f' { i(c); return; }
|
||||
|
||||
// Perform decomposition for Hangul
|
||||
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
|
||||
decompose_hangul(c, i);
|
||||
return;
|
||||
}
|
||||
|
||||
// First check the canonical decompositions
|
||||
match bsearch_table(c, canonical_table) {
|
||||
Some(canon) => {
|
||||
for x in canon.iter() {
|
||||
d(*x, |b| i(b), k);
|
||||
}
|
||||
return;
|
||||
}
|
||||
None => ()
|
||||
}
|
||||
|
||||
// Bottom out if we're not doing compat.
|
||||
if !k { i(c); return; }
|
||||
|
||||
// Then check the compatibility decompositions
|
||||
match bsearch_table(c, compatibility_table) {
|
||||
Some(compat) => {
|
||||
for x in compat.iter() {
|
||||
d(*x, |b| i(b), k);
|
||||
}
|
||||
return;
|
||||
}
|
||||
None => ()
|
||||
}
|
||||
|
||||
// Finally bottom out.
|
||||
i(c);
|
||||
}
|
||||
|
||||
// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
|
||||
static S_BASE: u32 = 0xAC00;
|
||||
static L_BASE: u32 = 0x1100;
|
||||
static V_BASE: u32 = 0x1161;
|
||||
static T_BASE: u32 = 0x11A7;
|
||||
static L_COUNT: u32 = 19;
|
||||
static V_COUNT: u32 = 21;
|
||||
static T_COUNT: u32 = 28;
|
||||
static N_COUNT: u32 = (V_COUNT * T_COUNT);
|
||||
static S_COUNT: u32 = (L_COUNT * N_COUNT);
|
||||
|
||||
// Decompose a precomposed Hangul syllable
|
||||
fn decompose_hangul(s: char, f: |char|) {
|
||||
use cast::transmute;
|
||||
|
||||
let si = s as u32 - S_BASE;
|
||||
|
||||
let li = si / N_COUNT;
|
||||
unsafe {
|
||||
f(transmute(L_BASE + li));
|
||||
|
||||
let vi = (si % N_COUNT) / T_COUNT;
|
||||
f(transmute(V_BASE + vi));
|
||||
|
||||
let ti = si % T_COUNT;
|
||||
if ti > 0 {
|
||||
f(transmute(T_BASE + ti));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
""")
|
||||
|
||||
def emit_std_norm_module(f, combine):
|
||||
f.write("pub mod normalization {\n");
|
||||
f.write(" use option::{Some, None};\n");
|
||||
f.write(" use slice::ImmutableVector;\n");
|
||||
emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
|
||||
pfun=mkdata_fun(compat))
|
||||
|
||||
f.write("""
|
||||
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
|
||||
use cmp::{Equal, Less, Greater};
|
||||
use core::option::{Some, None};
|
||||
use core::cmp::{Equal, Less, Greater};
|
||||
use core::slice::ImmutableVector;
|
||||
match r.bsearch(|&(lo, hi, _)| {
|
||||
if lo <= c && c <= hi { Equal }
|
||||
else if hi < c { Less }
|
||||
|
@ -420,72 +436,122 @@ def emit_std_norm_module(f, combine):
|
|||
}
|
||||
None => 0
|
||||
}
|
||||
}\n\n
|
||||
}\n
|
||||
""")
|
||||
|
||||
f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n")
|
||||
ix = 0
|
||||
for pair in combine:
|
||||
f.write(ch_prefix(ix))
|
||||
f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
|
||||
ix += 1
|
||||
f.write("\n ];\n\n")
|
||||
emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False,
|
||||
pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
|
||||
|
||||
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
|
||||
+ " bsearch_range_value_table(c, combining_class_table)\n"
|
||||
+ " }\n")
|
||||
f.write("}\n")
|
||||
|
||||
f.write("""
|
||||
}
|
||||
|
||||
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
""")
|
||||
|
||||
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
|
||||
def remove_from_wtable(wtable, val):
    """Return a copy of `wtable` in which code point `val` is covered by no range.

    `wtable` is a list of `(lo, hi, width, width_cjk)` entries with inclusive
    bounds. Scanning stops at the first entry whose lower bound exceeds `val`,
    so the table is expected to be sorted by lower bound.

    An entry containing `val` is dropped (single-point range), trimmed by one
    (`val` sits on a boundary) or split in two (`val` is strictly inside).
    All other entries are carried over unchanged.

    The input list is not modified; a new list is returned.
    """
    wtable_out = []
    for idx, entry in enumerate(wtable):
        if entry[1] < val:
            # entirely below val: keep as-is
            wtable_out.append(entry)
        elif entry[0] > val:
            # entirely above val: nothing further can contain it
            wtable_out.extend(wtable[idx:])
            break
        else:
            (wt_lo, wt_hi, width, width_cjk) = entry
            if wt_lo == wt_hi == val:
                # the range is exactly val: drop it
                continue
            elif wt_lo == val:
                wtable_out.append((wt_lo+1, wt_hi, width, width_cjk))
            elif wt_hi == val:
                wtable_out.append((wt_lo, wt_hi-1, width, width_cjk))
            else:
                # val is strictly inside: split around it
                wtable_out.append((wt_lo, val-1, width, width_cjk))
                wtable_out.append((val+1, wt_hi, width, width_cjk))
    return wtable_out
|
||||
|
||||
#![allow(missing_doc, non_uppercase_statics)]
|
||||
def optimize_width_table(wtable):
    """Collapse runs of adjacent ranges that carry identical widths.

    `wtable` is a list of `(lo, hi, width, width_cjk)` entries with inclusive
    bounds. Two consecutive entries are merged when the second starts right
    after the first ends (`hi + 1 == next lo`) and BOTH the normal width and
    the CJK width agree.

    Returns a new list; the input list is not modified. An empty table yields
    an empty list.
    """
    if not wtable:
        return []
    wtable_out = []
    w_this = wtable[0]
    for w_next in wtable[1:]:
        # compare elements 2 and 3 (width and width_cjk); a [2:3] slice would
        # look at width only and wrongly merge ranges whose CJK widths differ
        if w_this[1] == w_next[0] - 1 and w_this[2:4] == w_next[2:4]:
            w_this = (w_this[0], w_next[1], w_next[2], w_next[3])
        else:
            wtable_out.append(w_this)
            w_this = w_next
    wtable_out.append(w_this)
    return wtable_out
|
||||
|
||||
'''
|
||||
|
||||
(canon_decomp, compat_decomp, gencats,
|
||||
combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
|
||||
|
||||
def gen_core_unicode():
|
||||
r = "core_unicode.rs"
|
||||
if __name__ == "__main__":
|
||||
r = "unicode.rs"
|
||||
if os.path.exists(r):
|
||||
os.remove(r);
|
||||
os.remove(r)
|
||||
with open(r, "w") as rf:
|
||||
# Preamble
|
||||
# write the file's preamble
|
||||
rf.write(preamble)
|
||||
|
||||
emit_bsearch_range_table(rf);
|
||||
emit_property_module(rf, "general_category", gencats)
|
||||
# download and parse all the data
|
||||
(canon_decomp, compat_decomp, gencats, combines,
|
||||
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
|
||||
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
|
||||
other_derived = ["Default_Ignorable_Code_Point"]
|
||||
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
|
||||
scripts = load_properties("Scripts.txt", [])
|
||||
props = load_properties("PropList.txt",
|
||||
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
|
||||
|
||||
emit_core_norm_module(rf, canon_decomp, compat_decomp)
|
||||
# bsearch_range_table is used in all the property modules below
|
||||
emit_bsearch_range_table(rf)
|
||||
|
||||
derived = load_properties("DerivedCoreProperties.txt",
|
||||
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
|
||||
# all of these categories will also be available as \p{} in libregex
|
||||
allcats = []
|
||||
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
|
||||
("derived_property", derived, want_derived), \
|
||||
("script", scripts, []), \
|
||||
("property", props, ["White_Space"]):
|
||||
emit_property_module(rf, name, cat, pfuns)
|
||||
allcats.extend(map(lambda x: (x, name), cat))
|
||||
allcats.sort(key=lambda c: c[0])
|
||||
|
||||
emit_property_module(rf, "derived_property", derived)
|
||||
# the \w regex corresponds to Alphabetic + Mark + Decimal_Number +
|
||||
# Connector_Punctuation + Join-Control according to UTS#18
|
||||
# http://www.unicode.org/reports/tr18/#Compatibility_Properties
|
||||
perl_words = []
|
||||
for cat in derived["Alphabetic"], gencats["M"], gencats["Nd"], \
|
||||
gencats["Pc"], props["Join_Control"]:
|
||||
perl_words.extend(ungroup_cat(cat))
|
||||
perl_words = group_cat(perl_words)
|
||||
|
||||
props = load_properties("PropList.txt", ["White_Space"])
|
||||
emit_property_module(rf, "property", props)
|
||||
# emit lookup tables for \p{}, along with \d, \w, and \s for libregex
|
||||
emit_regex_module(rf, allcats, perl_words)
|
||||
|
||||
# normalizations and conversions module
|
||||
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
|
||||
emit_conversions_module(rf, lowerupper, upperlower)
|
||||
|
||||
def gen_std_unicode():
|
||||
r = "std_unicode.rs"
|
||||
if os.path.exists(r):
|
||||
os.remove(r);
|
||||
with open(r, "w") as rf:
|
||||
# Preamble
|
||||
rf.write(preamble)
|
||||
emit_std_norm_module(rf, combines)
|
||||
# character width module
|
||||
width_table = []
|
||||
for zwcat in ["Me", "Mn", "Cf"]:
|
||||
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
|
||||
width_table.append((4448, 4607, 0, 0))
|
||||
|
||||
gen_core_unicode()
|
||||
gen_std_unicode()
|
||||
# get widths, except those that are explicitly marked zero-width above
|
||||
ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"])
|
||||
# these are doublewidth
|
||||
for dwcat in ["W", "F"]:
|
||||
width_table.extend(map(lambda (lo, hi): (lo, hi, 2, 2), ea_widths[dwcat]))
|
||||
width_table.extend(map(lambda (lo, hi): (lo, hi, 1, 2), ea_widths["A"]))
|
||||
|
||||
width_table.sort(key=lambda w: w[0])
|
||||
|
||||
# soft hyphen is not zero width in preformatted text; it's used to indicate
|
||||
# a hyphen inserted to facilitate a linebreak.
|
||||
width_table = remove_from_wtable(width_table, 173)
|
||||
|
||||
# optimize the width table by collapsing adjacent entries when possible
|
||||
width_table = optimize_width_table(width_table)
|
||||
emit_charwidth_module(rf, width_table)
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#![allow(unused_attribute)] // NOTE: remove after stage0
|
||||
|
||||
#[phase(plugin, link)] extern crate core;
|
||||
extern crate unicode;
|
||||
extern crate alloc;
|
||||
|
||||
#[cfg(test)] extern crate native;
|
||||
|
@ -69,9 +70,6 @@ pub mod string;
|
|||
pub mod vec;
|
||||
pub mod hash;
|
||||
|
||||
// Internal unicode fiddly bits for the str module
|
||||
mod unicode;
|
||||
|
||||
mod deque;
|
||||
|
||||
/// A trait to represent mutable containers
|
||||
|
|
|
@ -69,7 +69,6 @@ is the same as `&[u8]`.
|
|||
|
||||
use core::prelude::*;
|
||||
|
||||
use core::char;
|
||||
use core::default::Default;
|
||||
use core::fmt;
|
||||
use core::cmp;
|
||||
|
@ -79,15 +78,17 @@ use core::mem;
|
|||
use Collection;
|
||||
use hash;
|
||||
use string::String;
|
||||
use unicode;
|
||||
use vec::Vec;
|
||||
|
||||
pub use core::str::{from_utf8, CharEq, Chars, CharOffsets};
|
||||
pub use core::str::{Bytes, CharSplits};
|
||||
pub use core::str::{CharSplitsN, Words, AnyLines, MatchIndices, StrSplits};
|
||||
pub use core::str::{CharSplitsN, AnyLines, MatchIndices, StrSplits};
|
||||
pub use core::str::{eq_slice, is_utf8, is_utf16, Utf16Items};
|
||||
pub use core::str::{Utf16Item, ScalarValue, LoneSurrogate, utf16_items};
|
||||
pub use core::str::{truncate_utf16_at_nul, utf8_char_width, CharRange};
|
||||
pub use core::str::{Str, StrSlice};
|
||||
pub use unicode::{Words, UnicodeStrSlice};
|
||||
|
||||
/*
|
||||
Section: Creating a string
|
||||
|
@ -283,7 +284,7 @@ pub struct Decompositions<'a> {
|
|||
impl<'a> Iterator<char> for Decompositions<'a> {
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<char> {
|
||||
use unicode::normalization::canonical_combining_class;
|
||||
use unicode::canonical_combining_class;
|
||||
|
||||
match self.buffer.as_slice().head() {
|
||||
Some(&(c, 0)) => {
|
||||
|
@ -299,8 +300,8 @@ impl<'a> Iterator<char> for Decompositions<'a> {
|
|||
}
|
||||
|
||||
let decomposer = match self.kind {
|
||||
Canonical => char::decompose_canonical,
|
||||
Compatible => char::decompose_compatible
|
||||
Canonical => unicode::char::decompose_canonical,
|
||||
Compatible => unicode::char::decompose_compatible
|
||||
};
|
||||
|
||||
if !self.sorted {
|
||||
|
@ -973,6 +974,8 @@ mod tests {
|
|||
use string::String;
|
||||
use vec::Vec;
|
||||
|
||||
use unicode::UnicodeChar;
|
||||
|
||||
#[test]
|
||||
fn test_eq_slice() {
|
||||
assert!((eq_slice("foobar".slice(0, 3), "foo")));
|
||||
|
|
|
@ -1,183 +0,0 @@
|
|||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
|
||||
|
||||
#![allow(missing_doc, non_uppercase_statics)]
|
||||
|
||||
pub mod normalization {
|
||||
use core::prelude::*;
|
||||
|
||||
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
|
||||
match r.bsearch(|&(lo, hi, _)| {
|
||||
if lo <= c && c <= hi { Equal }
|
||||
else if hi < c { Less }
|
||||
else { Greater }
|
||||
}) {
|
||||
Some(idx) => {
|
||||
let (_, _, result) = r[idx];
|
||||
result
|
||||
}
|
||||
None => 0
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static combining_class_table : &'static [(char, char, u8)] = &[
|
||||
('\u0300', '\u0314', 230), ('\u0315', '\u0315', 232),
|
||||
('\u0316', '\u0319', 220), ('\u031a', '\u031a', 232),
|
||||
('\u031b', '\u031b', 216), ('\u031c', '\u0320', 220),
|
||||
('\u0321', '\u0322', 202), ('\u0323', '\u0326', 220),
|
||||
('\u0327', '\u0328', 202), ('\u0329', '\u0333', 220),
|
||||
('\u0334', '\u0338', 1), ('\u0339', '\u033c', 220),
|
||||
('\u033d', '\u0344', 230), ('\u0345', '\u0345', 240),
|
||||
('\u0346', '\u0346', 230), ('\u0347', '\u0349', 220),
|
||||
('\u034a', '\u034c', 230), ('\u034d', '\u034e', 220),
|
||||
('\u0350', '\u0352', 230), ('\u0353', '\u0356', 220),
|
||||
('\u0357', '\u0357', 230), ('\u0358', '\u0358', 232),
|
||||
('\u0359', '\u035a', 220), ('\u035b', '\u035b', 230),
|
||||
('\u035c', '\u035c', 233), ('\u035d', '\u035e', 234),
|
||||
('\u035f', '\u035f', 233), ('\u0360', '\u0361', 234),
|
||||
('\u0362', '\u0362', 233), ('\u0363', '\u036f', 230),
|
||||
('\u0483', '\u0487', 230), ('\u0591', '\u0591', 220),
|
||||
('\u0592', '\u0595', 230), ('\u0596', '\u0596', 220),
|
||||
('\u0597', '\u0599', 230), ('\u059a', '\u059a', 222),
|
||||
('\u059b', '\u059b', 220), ('\u059c', '\u05a1', 230),
|
||||
('\u05a2', '\u05a7', 220), ('\u05a8', '\u05a9', 230),
|
||||
('\u05aa', '\u05aa', 220), ('\u05ab', '\u05ac', 230),
|
||||
('\u05ad', '\u05ad', 222), ('\u05ae', '\u05ae', 228),
|
||||
('\u05af', '\u05af', 230), ('\u05b0', '\u05b0', 10),
|
||||
('\u05b1', '\u05b1', 11), ('\u05b2', '\u05b2', 12),
|
||||
('\u05b3', '\u05b3', 13), ('\u05b4', '\u05b4', 14),
|
||||
('\u05b5', '\u05b5', 15), ('\u05b6', '\u05b6', 16),
|
||||
('\u05b7', '\u05b7', 17), ('\u05b8', '\u05b8', 18),
|
||||
('\u05b9', '\u05ba', 19), ('\u05bb', '\u05bb', 20),
|
||||
('\u05bc', '\u05bc', 21), ('\u05bd', '\u05bd', 22),
|
||||
('\u05bf', '\u05bf', 23), ('\u05c1', '\u05c1', 24),
|
||||
('\u05c2', '\u05c2', 25), ('\u05c4', '\u05c4', 230),
|
||||
('\u05c5', '\u05c5', 220), ('\u05c7', '\u05c7', 18),
|
||||
('\u0610', '\u0617', 230), ('\u0618', '\u0618', 30),
|
||||
('\u0619', '\u0619', 31), ('\u061a', '\u061a', 32),
|
||||
('\u064b', '\u064b', 27), ('\u064c', '\u064c', 28),
|
||||
('\u064d', '\u064d', 29), ('\u064e', '\u064e', 30),
|
||||
('\u064f', '\u064f', 31), ('\u0650', '\u0650', 32),
|
||||
('\u0651', '\u0651', 33), ('\u0652', '\u0652', 34),
|
||||
('\u0653', '\u0654', 230), ('\u0655', '\u0656', 220),
|
||||
('\u0657', '\u065b', 230), ('\u065c', '\u065c', 220),
|
||||
('\u065d', '\u065e', 230), ('\u065f', '\u065f', 220),
|
||||
('\u0670', '\u0670', 35), ('\u06d6', '\u06dc', 230),
|
||||
('\u06df', '\u06e2', 230), ('\u06e3', '\u06e3', 220),
|
||||
('\u06e4', '\u06e4', 230), ('\u06e7', '\u06e8', 230),
|
||||
('\u06ea', '\u06ea', 220), ('\u06eb', '\u06ec', 230),
|
||||
('\u06ed', '\u06ed', 220), ('\u0711', '\u0711', 36),
|
||||
('\u0730', '\u0730', 230), ('\u0731', '\u0731', 220),
|
||||
('\u0732', '\u0733', 230), ('\u0734', '\u0734', 220),
|
||||
('\u0735', '\u0736', 230), ('\u0737', '\u0739', 220),
|
||||
('\u073a', '\u073a', 230), ('\u073b', '\u073c', 220),
|
||||
('\u073d', '\u073d', 230), ('\u073e', '\u073e', 220),
|
||||
('\u073f', '\u0741', 230), ('\u0742', '\u0742', 220),
|
||||
('\u0743', '\u0743', 230), ('\u0744', '\u0744', 220),
|
||||
('\u0745', '\u0745', 230), ('\u0746', '\u0746', 220),
|
||||
('\u0747', '\u0747', 230), ('\u0748', '\u0748', 220),
|
||||
('\u0749', '\u074a', 230), ('\u07eb', '\u07f1', 230),
|
||||
('\u07f2', '\u07f2', 220), ('\u07f3', '\u07f3', 230),
|
||||
('\u0816', '\u0819', 230), ('\u081b', '\u0823', 230),
|
||||
('\u0825', '\u0827', 230), ('\u0829', '\u082d', 230),
|
||||
('\u0859', '\u085b', 220), ('\u08e4', '\u08e5', 230),
|
||||
('\u08e6', '\u08e6', 220), ('\u08e7', '\u08e8', 230),
|
||||
('\u08e9', '\u08e9', 220), ('\u08ea', '\u08ec', 230),
|
||||
('\u08ed', '\u08ef', 220), ('\u08f0', '\u08f0', 27),
|
||||
('\u08f1', '\u08f1', 28), ('\u08f2', '\u08f2', 29),
|
||||
('\u08f3', '\u08f5', 230), ('\u08f6', '\u08f6', 220),
|
||||
('\u08f7', '\u08f8', 230), ('\u08f9', '\u08fa', 220),
|
||||
('\u08fb', '\u08fe', 230), ('\u093c', '\u093c', 7),
|
||||
('\u094d', '\u094d', 9), ('\u0951', '\u0951', 230),
|
||||
('\u0952', '\u0952', 220), ('\u0953', '\u0954', 230),
|
||||
('\u09bc', '\u09bc', 7), ('\u09cd', '\u09cd', 9),
|
||||
('\u0a3c', '\u0a3c', 7), ('\u0a4d', '\u0a4d', 9),
|
||||
('\u0abc', '\u0abc', 7), ('\u0acd', '\u0acd', 9),
|
||||
('\u0b3c', '\u0b3c', 7), ('\u0b4d', '\u0b4d', 9),
|
||||
('\u0bcd', '\u0bcd', 9), ('\u0c4d', '\u0c4d', 9),
|
||||
('\u0c55', '\u0c55', 84), ('\u0c56', '\u0c56', 91),
|
||||
('\u0cbc', '\u0cbc', 7), ('\u0ccd', '\u0ccd', 9),
|
||||
('\u0d4d', '\u0d4d', 9), ('\u0dca', '\u0dca', 9),
|
||||
('\u0e38', '\u0e39', 103), ('\u0e3a', '\u0e3a', 9),
|
||||
('\u0e48', '\u0e4b', 107), ('\u0eb8', '\u0eb9', 118),
|
||||
('\u0ec8', '\u0ecb', 122), ('\u0f18', '\u0f19', 220),
|
||||
('\u0f35', '\u0f35', 220), ('\u0f37', '\u0f37', 220),
|
||||
('\u0f39', '\u0f39', 216), ('\u0f71', '\u0f71', 129),
|
||||
('\u0f72', '\u0f72', 130), ('\u0f74', '\u0f74', 132),
|
||||
('\u0f7a', '\u0f7d', 130), ('\u0f80', '\u0f80', 130),
|
||||
('\u0f82', '\u0f83', 230), ('\u0f84', '\u0f84', 9),
|
||||
('\u0f86', '\u0f87', 230), ('\u0fc6', '\u0fc6', 220),
|
||||
('\u1037', '\u1037', 7), ('\u1039', '\u103a', 9),
|
||||
('\u108d', '\u108d', 220), ('\u135d', '\u135f', 230),
|
||||
('\u1714', '\u1714', 9), ('\u1734', '\u1734', 9),
|
||||
('\u17d2', '\u17d2', 9), ('\u17dd', '\u17dd', 230),
|
||||
('\u18a9', '\u18a9', 228), ('\u1939', '\u1939', 222),
|
||||
('\u193a', '\u193a', 230), ('\u193b', '\u193b', 220),
|
||||
('\u1a17', '\u1a17', 230), ('\u1a18', '\u1a18', 220),
|
||||
('\u1a60', '\u1a60', 9), ('\u1a75', '\u1a7c', 230),
|
||||
('\u1a7f', '\u1a7f', 220), ('\u1b34', '\u1b34', 7),
|
||||
('\u1b44', '\u1b44', 9), ('\u1b6b', '\u1b6b', 230),
|
||||
('\u1b6c', '\u1b6c', 220), ('\u1b6d', '\u1b73', 230),
|
||||
('\u1baa', '\u1bab', 9), ('\u1be6', '\u1be6', 7),
|
||||
('\u1bf2', '\u1bf3', 9), ('\u1c37', '\u1c37', 7),
|
||||
('\u1cd0', '\u1cd2', 230), ('\u1cd4', '\u1cd4', 1),
|
||||
('\u1cd5', '\u1cd9', 220), ('\u1cda', '\u1cdb', 230),
|
||||
('\u1cdc', '\u1cdf', 220), ('\u1ce0', '\u1ce0', 230),
|
||||
('\u1ce2', '\u1ce8', 1), ('\u1ced', '\u1ced', 220),
|
||||
('\u1cf4', '\u1cf4', 230), ('\u1dc0', '\u1dc1', 230),
|
||||
('\u1dc2', '\u1dc2', 220), ('\u1dc3', '\u1dc9', 230),
|
||||
('\u1dca', '\u1dca', 220), ('\u1dcb', '\u1dcc', 230),
|
||||
('\u1dcd', '\u1dcd', 234), ('\u1dce', '\u1dce', 214),
|
||||
('\u1dcf', '\u1dcf', 220), ('\u1dd0', '\u1dd0', 202),
|
||||
('\u1dd1', '\u1de6', 230), ('\u1dfc', '\u1dfc', 233),
|
||||
('\u1dfd', '\u1dfd', 220), ('\u1dfe', '\u1dfe', 230),
|
||||
('\u1dff', '\u1dff', 220), ('\u20d0', '\u20d1', 230),
|
||||
('\u20d2', '\u20d3', 1), ('\u20d4', '\u20d7', 230),
|
||||
('\u20d8', '\u20da', 1), ('\u20db', '\u20dc', 230),
|
||||
('\u20e1', '\u20e1', 230), ('\u20e5', '\u20e6', 1),
|
||||
('\u20e7', '\u20e7', 230), ('\u20e8', '\u20e8', 220),
|
||||
('\u20e9', '\u20e9', 230), ('\u20ea', '\u20eb', 1),
|
||||
('\u20ec', '\u20ef', 220), ('\u20f0', '\u20f0', 230),
|
||||
('\u2cef', '\u2cf1', 230), ('\u2d7f', '\u2d7f', 9),
|
||||
('\u2de0', '\u2dff', 230), ('\u302a', '\u302a', 218),
|
||||
('\u302b', '\u302b', 228), ('\u302c', '\u302c', 232),
|
||||
('\u302d', '\u302d', 222), ('\u302e', '\u302f', 224),
|
||||
('\u3099', '\u309a', 8), ('\ua66f', '\ua66f', 230),
|
||||
('\ua674', '\ua67d', 230), ('\ua69f', '\ua69f', 230),
|
||||
('\ua6f0', '\ua6f1', 230), ('\ua806', '\ua806', 9),
|
||||
('\ua8c4', '\ua8c4', 9), ('\ua8e0', '\ua8f1', 230),
|
||||
('\ua92b', '\ua92d', 220), ('\ua953', '\ua953', 9),
|
||||
('\ua9b3', '\ua9b3', 7), ('\ua9c0', '\ua9c0', 9),
|
||||
('\uaab0', '\uaab0', 230), ('\uaab2', '\uaab3', 230),
|
||||
('\uaab4', '\uaab4', 220), ('\uaab7', '\uaab8', 230),
|
||||
('\uaabe', '\uaabf', 230), ('\uaac1', '\uaac1', 230),
|
||||
('\uaaf6', '\uaaf6', 9), ('\uabed', '\uabed', 9),
|
||||
('\ufb1e', '\ufb1e', 26), ('\ufe20', '\ufe26', 230),
|
||||
('\U000101fd', '\U000101fd', 220), ('\U00010a0d', '\U00010a0d', 220),
|
||||
('\U00010a0f', '\U00010a0f', 230), ('\U00010a38', '\U00010a38', 230),
|
||||
('\U00010a39', '\U00010a39', 1), ('\U00010a3a', '\U00010a3a', 220),
|
||||
('\U00010a3f', '\U00010a3f', 9), ('\U00011046', '\U00011046', 9),
|
||||
('\U000110b9', '\U000110b9', 9), ('\U000110ba', '\U000110ba', 7),
|
||||
('\U00011100', '\U00011102', 230), ('\U00011133', '\U00011134', 9),
|
||||
('\U000111c0', '\U000111c0', 9), ('\U000116b6', '\U000116b6', 9),
|
||||
('\U000116b7', '\U000116b7', 7), ('\U0001d165', '\U0001d166', 216),
|
||||
('\U0001d167', '\U0001d169', 1), ('\U0001d16d', '\U0001d16d', 226),
|
||||
('\U0001d16e', '\U0001d172', 216), ('\U0001d17b', '\U0001d182', 220),
|
||||
('\U0001d185', '\U0001d189', 230), ('\U0001d18a', '\U0001d18b', 220),
|
||||
('\U0001d1aa', '\U0001d1ad', 230), ('\U0001d242', '\U0001d244', 230)
|
||||
];
|
||||
|
||||
pub fn canonical_combining_class(c: char) -> u8 {
|
||||
bsearch_range_value_table(c, combining_class_table)
|
||||
}
|
||||
}
|
|
@ -8,20 +8,9 @@
|
|||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
//! Character manipulation (`char` type, Unicode Scalar Value)
|
||||
//! Character manipulation.
|
||||
//!
|
||||
//! This module provides the `Char` trait, as well as its implementation
|
||||
//! for the primitive `char` type, in order to allow basic character manipulation.
|
||||
//!
|
||||
//! A `char` actually represents a
|
||||
//! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
|
||||
//! as it can contain any Unicode code point except high-surrogate and
|
||||
//! low-surrogate code points.
|
||||
//!
|
||||
//! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
|
||||
//! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
|
||||
//! however the converse is not always true due to the above range limits
|
||||
//! and, as such, should be performed via the `from_u32` function.
|
||||
//! For more details, see ::unicode::char (a.k.a. std::char)
|
||||
|
||||
#![allow(non_snake_case_functions)]
|
||||
#![doc(primitive = "char")]
|
||||
|
@ -29,12 +18,6 @@
|
|||
use mem::transmute;
|
||||
use option::{None, Option, Some};
|
||||
use iter::{Iterator, range_step};
|
||||
use unicode::{derived_property, property, general_category, conversions};
|
||||
|
||||
/// Returns the canonical decomposition of a character.
|
||||
pub use unicode::normalization::decompose_canonical;
|
||||
/// Returns the compatibility decomposition of a character.
|
||||
pub use unicode::normalization::decompose_compatible;
|
||||
|
||||
// UTF-8 ranges and tags for encoding characters
|
||||
static TAG_CONT: u8 = 0b1000_0000u8;
|
||||
|
@ -93,84 +76,6 @@ pub fn from_u32(i: u32) -> Option<char> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns whether the specified `char` is considered a Unicode alphabetic
|
||||
/// code point
|
||||
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
|
||||
///
|
||||
/// 'XID_Start' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to ID_Start but modified for closure under NFKx.
|
||||
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
|
||||
///
|
||||
/// 'XID_Continue' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
|
||||
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is in lower case
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is in upper case
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is whitespace
|
||||
///
|
||||
/// Whitespace is defined in terms of the Unicode Property 'White_Space'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_whitespace(c: char) -> bool {
|
||||
// As an optimization ASCII whitespace characters are checked separately
|
||||
c == ' '
|
||||
|| ('\x09' <= c && c <= '\x0d')
|
||||
|| property::White_Space(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is alphanumeric
|
||||
///
|
||||
/// Alphanumericness is defined in terms of the Unicode General Categories
|
||||
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_alphanumeric(c: char) -> bool {
|
||||
derived_property::Alphabetic(c)
|
||||
|| general_category::Nd(c)
|
||||
|| general_category::Nl(c)
|
||||
|| general_category::No(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is a control code point
|
||||
///
|
||||
/// Control code points are defined in terms of the Unicode General Category
|
||||
/// 'Cc'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_control(c: char) -> bool { general_category::Cc(c) }
|
||||
|
||||
/// Indicates whether the `char` is numeric (Nd, Nl, or No)
|
||||
#[inline]
|
||||
pub fn is_digit(c: char) -> bool {
|
||||
general_category::Nd(c)
|
||||
|| general_category::Nl(c)
|
||||
|| general_category::No(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Checks if a `char` parses as a numeric digit in the given radix
|
||||
///
|
||||
|
@ -227,38 +132,6 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
|
|||
else { None }
|
||||
}
|
||||
|
||||
/// Convert a char to its uppercase equivalent
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping:
|
||||
/// it maps one unicode codepoint (one char in Rust) to its uppercase equivalent according
|
||||
/// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
/// The additional SpecialCasing.txt is not considered here, as it expands to multiple
|
||||
/// codepoints in some cases.
|
||||
///
|
||||
/// A full reference can be found here
|
||||
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the char itself if no conversion was made
|
||||
#[inline]
|
||||
pub fn to_uppercase(c: char) -> char {
|
||||
conversions::to_upper(c)
|
||||
}
|
||||
|
||||
/// Convert a char to its lowercase equivalent
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping
|
||||
/// see `to_uppercase` for references and more information
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the char itself if no conversion is possible
|
||||
#[inline]
|
||||
pub fn to_lowercase(c: char) -> char {
|
||||
conversions::to_lower(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Converts a number to the character representing it
|
||||
///
|
||||
|
@ -355,61 +228,8 @@ pub fn len_utf8_bytes(c: char) -> uint {
|
|||
}
|
||||
}
|
||||
|
||||
/// Useful functions for Unicode characters.
|
||||
/// Basic `char` manipulations.
|
||||
pub trait Char {
|
||||
/// Returns whether the specified character is considered a Unicode
|
||||
/// alphabetic code point.
|
||||
fn is_alphabetic(&self) -> bool;
|
||||
|
||||
/// Returns whether the specified character satisfies the 'XID_Start'
|
||||
/// Unicode property.
|
||||
///
|
||||
/// 'XID_Start' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to ID_Start but modified for closure under NFKx.
|
||||
fn is_XID_start(&self) -> bool;
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Continue'
|
||||
/// Unicode property.
|
||||
///
|
||||
/// 'XID_Continue' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
|
||||
fn is_XID_continue(&self) -> bool;
|
||||
|
||||
|
||||
/// Indicates whether a character is in lowercase.
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core
|
||||
/// Property `Lowercase`.
|
||||
fn is_lowercase(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is in uppercase.
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core
|
||||
/// Property `Uppercase`.
|
||||
fn is_uppercase(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is whitespace.
|
||||
///
|
||||
/// Whitespace is defined in terms of the Unicode Property `White_Space`.
|
||||
fn is_whitespace(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is alphanumeric.
|
||||
///
|
||||
/// Alphanumericness is defined in terms of the Unicode General Categories
|
||||
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
|
||||
fn is_alphanumeric(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is a control code point.
|
||||
///
|
||||
/// Control code points are defined in terms of the Unicode General
|
||||
/// Category `Cc`.
|
||||
fn is_control(&self) -> bool;
|
||||
|
||||
/// Indicates whether the character is numeric (Nd, Nl, or No).
|
||||
fn is_digit(&self) -> bool;
|
||||
|
||||
/// Checks if a `char` parses as a numeric digit in the given radix.
|
||||
///
|
||||
/// Compared to `is_digit()`, this function only recognizes the characters
|
||||
|
@ -438,37 +258,6 @@ pub trait Char {
|
|||
/// Fails if given a radix outside the range [0..36].
|
||||
fn to_digit(&self, radix: uint) -> Option<uint>;
|
||||
|
||||
/// Converts a character to its lowercase equivalent.
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping. See
|
||||
/// `to_uppercase()` for references and more information.
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the lowercase equivalent of the character, or the character
|
||||
/// itself if no conversion is possible.
|
||||
fn to_lowercase(&self) -> char;
|
||||
|
||||
/// Converts a character to its uppercase equivalent.
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping: it maps
|
||||
/// one unicode codepoint (one character in Rust) to its uppercase
|
||||
/// equivalent according to the Unicode database [1]. The additional
|
||||
/// `SpecialCasing.txt` is not considered here, as it expands to multiple
|
||||
/// codepoints in some cases.
|
||||
///
|
||||
/// A full reference can be found here [2].
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the uppercase equivalent of the character, or the character
|
||||
/// itself if no conversion was made.
|
||||
///
|
||||
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
///
|
||||
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
|
||||
fn to_uppercase(&self) -> char;
|
||||
|
||||
/// Converts a number to the character representing it.
|
||||
///
|
||||
/// # Return value
|
||||
|
@ -526,32 +315,10 @@ pub trait Char {
|
|||
}
|
||||
|
||||
impl Char for char {
|
||||
fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
|
||||
|
||||
fn is_XID_start(&self) -> bool { is_XID_start(*self) }
|
||||
|
||||
fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
|
||||
|
||||
fn is_lowercase(&self) -> bool { is_lowercase(*self) }
|
||||
|
||||
fn is_uppercase(&self) -> bool { is_uppercase(*self) }
|
||||
|
||||
fn is_whitespace(&self) -> bool { is_whitespace(*self) }
|
||||
|
||||
fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
|
||||
|
||||
fn is_control(&self) -> bool { is_control(*self) }
|
||||
|
||||
fn is_digit(&self) -> bool { is_digit(*self) }
|
||||
|
||||
fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
|
||||
|
||||
fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
|
||||
|
||||
fn to_lowercase(&self) -> char { to_lowercase(*self) }
|
||||
|
||||
fn to_uppercase(&self) -> char { to_uppercase(*self) }
|
||||
|
||||
fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
|
||||
|
||||
fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
|
||||
|
@ -600,5 +367,3 @@ impl Char for char {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -108,7 +108,6 @@ pub mod collections;
|
|||
|
||||
/* Core types and methods on primitives */
|
||||
|
||||
mod unicode;
|
||||
pub mod any;
|
||||
pub mod atomics;
|
||||
pub mod bool;
|
||||
|
|
|
@ -22,7 +22,7 @@ use cmp;
|
|||
use cmp::{PartialEq, Eq};
|
||||
use collections::Collection;
|
||||
use default::Default;
|
||||
use iter::{Filter, Map, Iterator};
|
||||
use iter::{Map, Iterator};
|
||||
use iter::{DoubleEndedIterator, ExactSize};
|
||||
use iter::range;
|
||||
use num::{CheckedMul, Saturating};
|
||||
|
@ -204,10 +204,6 @@ pub struct CharSplitsN<'a, Sep> {
|
|||
invert: bool,
|
||||
}
|
||||
|
||||
/// An iterator over the words of a string, separated by a sequence of whitespace
|
||||
pub type Words<'a> =
|
||||
Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
|
||||
|
||||
/// An iterator over the lines of a string, separated by either `\n` or `\r\n`.
|
||||
pub type AnyLines<'a> =
|
||||
Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
|
||||
|
@ -1209,48 +1205,6 @@ pub trait StrSlice<'a> {
|
|||
/// ```
|
||||
fn lines_any(&self) -> AnyLines<'a>;
|
||||
|
||||
/// An iterator over the words of a string (subsequences separated
|
||||
/// by any sequence of whitespace). Sequences of whitespace are
|
||||
/// collapsed, so empty "words" are not included.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// let some_words = " Mary had\ta little \n\t lamb";
|
||||
/// let v: Vec<&str> = some_words.words().collect();
|
||||
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
|
||||
/// ```
|
||||
fn words(&self) -> Words<'a>;
|
||||
|
||||
/// Returns true if the string contains only whitespace.
|
||||
///
|
||||
/// Whitespace characters are determined by `char::is_whitespace`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// assert!(" \t\n".is_whitespace());
|
||||
/// assert!("".is_whitespace());
|
||||
///
|
||||
/// assert!( !"abc".is_whitespace());
|
||||
/// ```
|
||||
fn is_whitespace(&self) -> bool;
|
||||
|
||||
/// Returns true if the string contains only alphanumeric code
|
||||
/// points.
|
||||
///
|
||||
/// Alphanumeric characters are determined by `char::is_alphanumeric`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// assert!("Löwe老虎Léopard123".is_alphanumeric());
|
||||
/// assert!("".is_alphanumeric());
|
||||
///
|
||||
/// assert!( !" &*~".is_alphanumeric());
|
||||
/// ```
|
||||
fn is_alphanumeric(&self) -> bool;
|
||||
|
||||
/// Returns the number of Unicode code points (`char`) that a
|
||||
/// string holds.
|
||||
///
|
||||
|
@ -1368,15 +1322,6 @@ pub trait StrSlice<'a> {
|
|||
/// Returns true if `needle` is a suffix of the string.
|
||||
fn ends_with(&self, needle: &str) -> bool;
|
||||
|
||||
/// Returns a string with leading and trailing whitespace removed.
|
||||
fn trim(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with leading whitespace removed.
|
||||
fn trim_left(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with trailing whitespace removed.
|
||||
fn trim_right(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with characters that match `to_trim` removed.
|
||||
///
|
||||
/// # Arguments
|
||||
|
@ -1748,17 +1693,6 @@ impl<'a> StrSlice<'a> for &'a str {
|
|||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn words(&self) -> Words<'a> {
|
||||
self.split(char::is_whitespace).filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
|
||||
|
||||
#[inline]
|
||||
fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
|
||||
|
||||
#[inline]
|
||||
fn char_len(&self) -> uint { self.chars().count() }
|
||||
|
||||
|
@ -1817,21 +1751,6 @@ impl<'a> StrSlice<'a> for &'a str {
|
|||
m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim(&self) -> &'a str {
|
||||
self.trim_left().trim_right()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_left(&self) -> &'a str {
|
||||
self.trim_left_chars(char::is_whitespace)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_right(&self) -> &'a str {
|
||||
self.trim_right_chars(char::is_whitespace)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_chars<C: CharEq>(&self, mut to_trim: C) -> &'a str {
|
||||
let cur = match self.find(|c: char| !to_trim.matches(c)) {
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -194,3 +194,30 @@ fn test_encode_utf16() {
|
|||
check('\ua66e', [0xa66e]);
|
||||
check('\U0001f4a9', [0xd83d, 0xdca9]);
|
||||
}
|
||||
|
||||
#[test]
fn test_width() {
    // NUL is the only control character that reports a width (zero columns).
    assert_eq!('\x00'.width(false), Some(0));
    assert_eq!('\x00'.width(true), Some(0));

    // Any other control character has no displayed width.
    assert_eq!('\x0A'.width(false), None);
    assert_eq!('\x0A'.width(true), None);

    // Narrow ASCII letter: one column regardless of context.
    assert_eq!('w'.width(false), Some(1));
    assert_eq!('w'.width(true), Some(1));

    // FULLWIDTH LATIN SMALL LETTER H (U+FF48): two columns regardless of
    // context. Spelled as an escape so that it cannot be confused with, or
    // silently normalised to, the narrow ASCII 'h'.
    assert_eq!('\uff48'.width(false), Some(2));
    assert_eq!('\uff48'.width(true), Some(2));

    assert_eq!('\xAD'.width(false), Some(1));
    assert_eq!('\xAD'.width(true), Some(1));

    assert_eq!('\u1160'.width(false), Some(0));
    assert_eq!('\u1160'.width(true), Some(0));

    // The expected width of U+00A1 depends on `is_cjk`: one column outside
    // CJK contexts, two columns inside them.
    assert_eq!('\u00a1'.width(false), Some(1));
    assert_eq!('\u00a1'.width(true), Some(2));

    assert_eq!('\u0300'.width(false), Some(0));
    assert_eq!('\u0300'.width(true), Some(0));
}
|
||||
|
|
|
@ -306,12 +306,15 @@
|
|||
//!
|
||||
//! ## Perl character classes (Unicode friendly)
|
||||
//!
|
||||
//! These classes are based on the definitions provided in
|
||||
//! [UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! \d digit ([0-9] + \p{Nd})
|
||||
//! \d digit (\p{Nd})
|
||||
//! \D not digit
|
||||
//! \s whitespace ([\t\n\f\r ] + \p{Z})
|
||||
//! \s whitespace (\p{White_Space})
|
||||
//! \S not whitespace
|
||||
//! \w word character ([0-9A-Za-z_] + \p{L})
|
||||
//! \w word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
|
||||
//! \W not word character
|
||||
//! </pre>
|
||||
//!
|
||||
|
@ -378,6 +381,9 @@ extern crate rand;
|
|||
#[cfg(test)]
|
||||
extern crate regex;
|
||||
|
||||
// unicode tables for character classes are defined in libunicode
|
||||
extern crate unicode;
|
||||
|
||||
pub use parse::Error;
|
||||
pub use re::{Regex, Captures, SubCaptures, SubCapturesPos};
|
||||
pub use re::{FindCaptures, FindMatches};
|
||||
|
|
|
@ -16,9 +16,7 @@ use std::num;
|
|||
use std::str;
|
||||
|
||||
/// Static data containing Unicode ranges for general categories and scripts.
|
||||
use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
|
||||
#[allow(visible_private_types)]
|
||||
pub mod unicode;
|
||||
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
|
||||
|
||||
/// The maximum number of repetitions allowed with the `{n,m}` syntax.
|
||||
static MAX_REPEAT: uint = 1000;
|
File diff suppressed because it is too large
Load Diff
|
@ -195,8 +195,8 @@ mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)))
|
|||
|
||||
// Test the Unicode friendliness of Perl character classes.
|
||||
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)))
|
||||
mat!(uni_perl_w_not, r"\w+", "Ⅱ", None)
|
||||
mat!(uni_perl_w_neg, r"\W+", "Ⅱ", Some((0, 3)))
|
||||
mat!(uni_perl_w_not, r"\w+", "⥡", None)
|
||||
mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3)))
|
||||
mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)))
|
||||
mat!(uni_perl_d_not, r"\d+", "Ⅱ", None)
|
||||
mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)))
|
||||
|
|
|
@ -42,7 +42,7 @@ use compile::{
|
|||
Save, Jump, Split,
|
||||
};
|
||||
use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
|
||||
use parse::unicode::PERLW;
|
||||
use unicode::regex::PERLW;
|
||||
|
||||
pub type CaptureLocs = Vec<Option<uint>>;
|
||||
|
||||
|
|
|
@ -237,6 +237,7 @@ use str::{Str, StrSlice};
|
|||
use str;
|
||||
use string::String;
|
||||
use uint;
|
||||
use unicode::UnicodeChar;
|
||||
use vec::Vec;
|
||||
|
||||
// Reexports
|
||||
|
|
|
@ -126,6 +126,7 @@
|
|||
#[cfg(test)] #[phase(plugin, link)] extern crate log;
|
||||
|
||||
extern crate alloc;
|
||||
extern crate unicode;
|
||||
extern crate core;
|
||||
extern crate core_collections = "collections";
|
||||
extern crate core_rand = "rand";
|
||||
|
@ -148,7 +149,6 @@ extern crate rustrt;
|
|||
pub use core::any;
|
||||
pub use core::bool;
|
||||
pub use core::cell;
|
||||
pub use core::char;
|
||||
pub use core::clone;
|
||||
#[cfg(not(test))] pub use core::cmp;
|
||||
pub use core::default;
|
||||
|
@ -180,6 +180,8 @@ pub use core_collections::vec;
|
|||
pub use rustrt::c_str;
|
||||
pub use rustrt::local_data;
|
||||
|
||||
pub use unicode::char;
|
||||
|
||||
pub use core_sync::comm;
|
||||
|
||||
// Run tests with libgreen instead of libnative.
|
||||
|
|
|
@ -24,6 +24,7 @@ use option::{Option, Some, None};
|
|||
use slice::{Vector, ImmutableVector};
|
||||
use str::{CharSplits, Str, StrAllocating, StrVector, StrSlice};
|
||||
use string::String;
|
||||
use unicode::UnicodeChar;
|
||||
use vec::Vec;
|
||||
|
||||
use super::{contains_nul, BytesContainer, GenericPath, GenericPathUnsafe};
|
||||
|
@ -997,7 +998,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
|
|||
let idx = path.find('\\');
|
||||
if idx == Some(2) && path.as_bytes()[1] == ':' as u8 {
|
||||
let c = path.as_bytes()[0];
|
||||
if c.is_ascii() && ::char::is_alphabetic(c as char) {
|
||||
if c.is_ascii() && (c as char).is_alphabetic() {
|
||||
// \\?\C:\ path
|
||||
return Some(VerbatimDiskPrefix);
|
||||
}
|
||||
|
@ -1021,7 +1022,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
|
|||
} else if path.len() > 1 && path.as_bytes()[1] == ':' as u8 {
|
||||
// C:
|
||||
let c = path.as_bytes()[0];
|
||||
if c.is_ascii() && ::char::is_alphabetic(c as char) {
|
||||
if c.is_ascii() && (c as char).is_alphabetic() {
|
||||
return Some(DiskPrefix);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -89,6 +89,7 @@
|
|||
#[doc(no_inline)] pub use slice::{Vector, VectorVector};
|
||||
#[doc(no_inline)] pub use slice::MutableVectorAllocating;
|
||||
#[doc(no_inline)] pub use string::String;
|
||||
#[doc(no_inline)] pub use unicode::{UnicodeChar, UnicodeStrSlice};
|
||||
#[doc(no_inline)] pub use vec::Vec;
|
||||
|
||||
// Reexported runtime types
|
||||
|
|
|
@ -12,7 +12,6 @@
|
|||
|
||||
#![allow(non_camel_case_types)]
|
||||
|
||||
use char::Char;
|
||||
use collections::Collection;
|
||||
use from_str::from_str;
|
||||
use io::{IoResult, Writer};
|
||||
|
@ -22,6 +21,7 @@ use os;
|
|||
use result::{Ok, Err};
|
||||
use str::StrSlice;
|
||||
use sync::atomics;
|
||||
use unicode::UnicodeChar;
|
||||
|
||||
pub use self::imp::write;
|
||||
|
||||
|
|
|
@ -0,0 +1,111 @@
|
|||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
/*!
|
||||
Functions for computing canonical and compatible decompositions
|
||||
for Unicode characters.
|
||||
*/
|
||||
|
||||
use core::option::{Option, Some, None};
|
||||
use core::slice::ImmutableVector;
|
||||
use tables::normalization::{canonical_table, compatibility_table};
|
||||
|
||||
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
|
||||
use core::cmp::{Equal, Less, Greater};
|
||||
match r.bsearch(|&(val, _)| {
|
||||
if c == val { Equal }
|
||||
else if val < c { Less }
|
||||
else { Greater }
|
||||
}) {
|
||||
Some(idx) => {
|
||||
let (_, result) = r[idx];
|
||||
Some(result)
|
||||
}
|
||||
None => None
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute canonical Unicode decomposition for character
|
||||
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
|
||||
|
||||
/// Compute canonical or compatible Unicode decomposition for character
|
||||
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
|
||||
|
||||
fn d(c: char, i: |char|, k: bool) {
|
||||
use core::iter::Iterator;
|
||||
|
||||
// 7-bit ASCII never decomposes
|
||||
if c <= '\x7f' { i(c); return; }
|
||||
|
||||
// Perform decomposition for Hangul
|
||||
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
|
||||
decompose_hangul(c, i);
|
||||
return;
|
||||
}
|
||||
|
||||
// First check the canonical decompositions
|
||||
match bsearch_table(c, canonical_table) {
|
||||
Some(canon) => {
|
||||
for x in canon.iter() {
|
||||
d(*x, |b| i(b), k);
|
||||
}
|
||||
return;
|
||||
}
|
||||
None => ()
|
||||
}
|
||||
|
||||
// Bottom out if we're not doing compat.
|
||||
if !k { i(c); return; }
|
||||
|
||||
// Then check the compatibility decompositions
|
||||
match bsearch_table(c, compatibility_table) {
|
||||
Some(compat) => {
|
||||
for x in compat.iter() {
|
||||
d(*x, |b| i(b), k);
|
||||
}
|
||||
return;
|
||||
}
|
||||
None => ()
|
||||
}
|
||||
|
||||
// Finally bottom out.
|
||||
i(c);
|
||||
}
|
||||
|
||||
// Constants from Unicode 6.3.0 Section 3.12 Conjoining Jamo Behavior
//
// `S_BASE` is the first precomposed Hangul syllable; `L_BASE`, `V_BASE` and
// `T_BASE` are the bases that the three jamo indices computed by
// `decompose_hangul` are added to. A trailing index of zero means "no trailing
// jamo", so `T_BASE` itself is never emitted.
static S_BASE: u32 = 0xAC00;
static L_BASE: u32 = 0x1100;
static V_BASE: u32 = 0x1161;
static T_BASE: u32 = 0x11A7;

// Number of distinct values for each jamo index.
static L_COUNT: u32 = 19;
static V_COUNT: u32 = 21;
static T_COUNT: u32 = 28;

// Syllables sharing one leading jamo, and the total number of syllables.
static N_COUNT: u32 = V_COUNT * T_COUNT;
static S_COUNT: u32 = L_COUNT * N_COUNT;
|
||||
|
||||
// Decompose a precomposed Hangul syllable
|
||||
fn decompose_hangul(s: char, f: |char|) {
|
||||
use core::mem::transmute;
|
||||
|
||||
let si = s as u32 - S_BASE;
|
||||
|
||||
let li = si / N_COUNT;
|
||||
unsafe {
|
||||
f(transmute(L_BASE + li));
|
||||
|
||||
let vi = (si % N_COUNT) / T_COUNT;
|
||||
f(transmute(V_BASE + vi));
|
||||
|
||||
let ti = si % T_COUNT;
|
||||
if ti > 0 {
|
||||
f(transmute(T_BASE + ti));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
//! # The Unicode Library
|
||||
//!
|
||||
//! Unicode-intensive functions for `char` and `str` types.
|
||||
//!
|
||||
//! This crate provides a collection of Unicode-related functionality,
|
||||
//! including decompositions, conversions, etc., and provides traits
|
||||
//! implementing these functions for the `char` and `str` types.
|
||||
//!
|
||||
//! The functionality included here is only that which is necessary to
|
||||
//! provide for basic string-related manipulations. This crate does not
|
||||
//! (yet) aim to provide a full set of Unicode tables.
|
||||
|
||||
#![crate_id = "unicode#0.11.0"]
|
||||
#![crate_name = "unicode"]
|
||||
#![experimental]
|
||||
#![license = "MIT/ASL2"]
|
||||
#![crate_type = "rlib"]
|
||||
#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
|
||||
html_favicon_url = "http://www.rust-lang.org/favicon.ico",
|
||||
html_root_url = "http://doc.rust-lang.org/",
|
||||
html_playground_url = "http://play.rust-lang.org/")]
|
||||
#![no_std]
|
||||
#![allow(unused_attribute)] // NOTE: remove after stage0
|
||||
|
||||
extern crate core;
|
||||
|
||||
pub use tables::normalization::canonical_combining_class;
|
||||
pub use tables::regex;
|
||||
|
||||
pub use u_char::UnicodeChar;
|
||||
pub use u_str::UnicodeStrSlice;
|
||||
pub use u_str::Words;
|
||||
|
||||
mod decompose;
|
||||
mod tables;
|
||||
mod u_char;
|
||||
mod u_str;
|
||||
|
||||
// re-export char so that std et al see it correctly
|
||||
/// Character manipulation (`char` type, Unicode Scalar Value)
|
||||
///
|
||||
/// This module provides the `Char` and `UnicodeChar` traits, as well as their
|
||||
/// implementation for the primitive `char` type, in order to allow basic character
|
||||
/// manipulation.
|
||||
///
|
||||
/// A `char` actually represents a
|
||||
/// *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
|
||||
/// as it can contain any Unicode code point except high-surrogate and
|
||||
/// low-surrogate code points.
|
||||
///
|
||||
/// As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
|
||||
/// (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
|
||||
/// however the converse is not always true due to the above range limits
|
||||
/// and, as such, should be performed via the `from_u32` function..
|
||||
pub mod char {
|
||||
pub use core::char::{MAX, from_u32, is_digit_radix, to_digit};
|
||||
pub use core::char::{from_digit, escape_unicode, escape_default};
|
||||
pub use core::char::{len_utf8_bytes, Char};
|
||||
|
||||
pub use decompose::decompose_canonical;
|
||||
pub use decompose::decompose_compatible;
|
||||
|
||||
pub use u_char::{is_alphabetic, is_XID_start, is_XID_continue};
|
||||
pub use u_char::{is_lowercase, is_uppercase, is_whitespace};
|
||||
pub use u_char::{is_alphanumeric, is_control, is_digit};
|
||||
pub use u_char::{to_uppercase, to_lowercase, width, UnicodeChar};
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,266 @@
|
|||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
/*!
|
||||
* Unicode-intensive `char` methods.
|
||||
*
|
||||
* These methods implement functionality for `char` that requires knowledge of
|
||||
* Unicode definitions, including normalization, categorization, and display information.
|
||||
*/
|
||||
|
||||
use core::option::Option;
|
||||
use tables::{derived_property, property, general_category, conversions, charwidth};
|
||||
|
||||
/// Returns whether the specified `char` is considered a Unicode alphabetic
|
||||
/// code point
|
||||
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
|
||||
///
|
||||
/// 'XID_Start' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to ID_Start but modified for closure under NFKx.
|
||||
#[allow(non_snake_case_functions)]
|
||||
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
|
||||
///
|
||||
/// 'XID_Continue' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
|
||||
#[allow(non_snake_case_functions)]
|
||||
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is in lower case
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is in upper case
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is whitespace
|
||||
///
|
||||
/// Whitespace is defined in terms of the Unicode Property 'White_Space'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_whitespace(c: char) -> bool {
|
||||
// As an optimization ASCII whitespace characters are checked separately
|
||||
c == ' '
|
||||
|| ('\x09' <= c && c <= '\x0d')
|
||||
|| property::White_Space(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is alphanumeric
|
||||
///
|
||||
/// Alphanumericness is defined in terms of the Unicode General Categories
|
||||
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_alphanumeric(c: char) -> bool {
|
||||
derived_property::Alphabetic(c)
|
||||
|| general_category::N(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is a control code point
|
||||
///
|
||||
/// Control code points are defined in terms of the Unicode General Category
|
||||
/// 'Cc'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_control(c: char) -> bool { general_category::Cc(c) }
|
||||
|
||||
/// Indicates whether the `char` is numeric (Nd, Nl, or No)
|
||||
#[inline]
|
||||
pub fn is_digit(c: char) -> bool {
|
||||
general_category::N(c)
|
||||
}
|
||||
|
||||
/// Convert a char to its uppercase equivalent
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping:
|
||||
/// it maps one unicode codepoint (one char in Rust) to its uppercase equivalent according
|
||||
/// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
/// The additional SpecialCasing.txt is not considered here, as it expands to multiple
|
||||
/// codepoints in some cases.
|
||||
///
|
||||
/// A full reference can be found here
|
||||
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the char itself if no conversion was made
|
||||
#[inline]
|
||||
pub fn to_uppercase(c: char) -> char {
|
||||
conversions::to_upper(c)
|
||||
}
|
||||
|
||||
/// Convert a char to its lowercase equivalent
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping
|
||||
/// see `to_uppercase` for references and more information
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the char itself if no conversion if possible
|
||||
#[inline]
|
||||
pub fn to_lowercase(c: char) -> char {
|
||||
conversions::to_lower(c)
|
||||
}
|
||||
|
||||
/// Returns this character's displayed width in columns, or `None` if it is a
|
||||
/// control character other than `'\x00'`.
|
||||
///
|
||||
/// `is_cjk` determines behavior for characters in the Ambiguous category:
|
||||
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
|
||||
/// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
|
||||
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
|
||||
/// recommends that these characters be treated as 1 column (i.e.,
|
||||
/// `is_cjk` = `false`) if the context cannot be reliably determined.
|
||||
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
|
||||
charwidth::width(c, is_cjk)
|
||||
}
|
||||
|
||||
/// Useful functions for Unicode characters.
|
||||
pub trait UnicodeChar {
|
||||
/// Returns whether the specified character is considered a Unicode
|
||||
/// alphabetic code point.
|
||||
fn is_alphabetic(&self) -> bool;
|
||||
|
||||
/// Returns whether the specified character satisfies the 'XID_Start'
|
||||
/// Unicode property.
|
||||
///
|
||||
/// 'XID_Start' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to ID_Start but modified for closure under NFKx.
|
||||
#[allow(non_snake_case_functions)]
|
||||
fn is_XID_start(&self) -> bool;
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Continue'
|
||||
/// Unicode property.
|
||||
///
|
||||
/// 'XID_Continue' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
|
||||
#[allow(non_snake_case_functions)]
|
||||
fn is_XID_continue(&self) -> bool;
|
||||
|
||||
|
||||
/// Indicates whether a character is in lowercase.
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core
|
||||
/// Property `Lowercase`.
|
||||
fn is_lowercase(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is in uppercase.
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core
|
||||
/// Property `Uppercase`.
|
||||
fn is_uppercase(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is whitespace.
|
||||
///
|
||||
/// Whitespace is defined in terms of the Unicode Property `White_Space`.
|
||||
fn is_whitespace(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is alphanumeric.
|
||||
///
|
||||
/// Alphanumericness is defined in terms of the Unicode General Categories
|
||||
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
|
||||
fn is_alphanumeric(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is a control code point.
|
||||
///
|
||||
/// Control code points are defined in terms of the Unicode General
|
||||
/// Category `Cc`.
|
||||
fn is_control(&self) -> bool;
|
||||
|
||||
/// Indicates whether the character is numeric (Nd, Nl, or No).
|
||||
fn is_digit(&self) -> bool;
|
||||
|
||||
/// Converts a character to its lowercase equivalent.
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping. See
|
||||
/// `to_uppercase()` for references and more information.
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the lowercase equivalent of the character, or the character
|
||||
/// itself if no conversion is possible.
|
||||
fn to_lowercase(&self) -> char;
|
||||
|
||||
/// Converts a character to its uppercase equivalent.
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping: it maps
|
||||
/// one unicode codepoint (one character in Rust) to its uppercase
|
||||
/// equivalent according to the Unicode database [1]. The additional
|
||||
/// `SpecialCasing.txt` is not considered here, as it expands to multiple
|
||||
/// codepoints in some cases.
|
||||
///
|
||||
/// A full reference can be found here [2].
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the uppercase equivalent of the character, or the character
|
||||
/// itself if no conversion was made.
|
||||
///
|
||||
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
///
|
||||
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
|
||||
fn to_uppercase(&self) -> char;
|
||||
|
||||
/// Returns this character's displayed width in columns, or `None` if it is a
|
||||
/// control character other than `'\x00'`.
|
||||
///
|
||||
/// `is_cjk` determines behavior for characters in the Ambiguous category:
|
||||
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
|
||||
/// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
|
||||
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
|
||||
/// recommends that these characters be treated as 1 column (i.e.,
|
||||
/// `is_cjk` = `false`) if the context cannot be reliably determined.
|
||||
fn width(&self, is_cjk: bool) -> Option<uint>;
|
||||
}
|
||||
|
||||
impl UnicodeChar for char {
|
||||
fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
|
||||
|
||||
fn is_XID_start(&self) -> bool { is_XID_start(*self) }
|
||||
|
||||
fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
|
||||
|
||||
fn is_lowercase(&self) -> bool { is_lowercase(*self) }
|
||||
|
||||
fn is_uppercase(&self) -> bool { is_uppercase(*self) }
|
||||
|
||||
fn is_whitespace(&self) -> bool { is_whitespace(*self) }
|
||||
|
||||
fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
|
||||
|
||||
fn is_control(&self) -> bool { is_control(*self) }
|
||||
|
||||
fn is_digit(&self) -> bool { is_digit(*self) }
|
||||
|
||||
fn to_lowercase(&self) -> char { to_lowercase(*self) }
|
||||
|
||||
fn to_uppercase(&self) -> char { to_uppercase(*self) }
|
||||
|
||||
fn width(&self, is_cjk: bool) -> Option<uint> { width(*self, is_cjk) }
|
||||
}
|
|
@ -0,0 +1,119 @@
|
|||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
/*!
|
||||
* Unicode-intensive string manipulations.
|
||||
*
|
||||
* This module provides functionality to `str` that requires the Unicode
|
||||
* methods provided by the UnicodeChar trait.
|
||||
*/
|
||||
|
||||
use core::collections::Collection;
|
||||
use core::iter::{Filter};
|
||||
use core::str::{CharSplits, StrSlice};
|
||||
use core::iter::Iterator;
|
||||
use u_char;
|
||||
|
||||
/// An iterator over the words of a string, separated by a sequence of whitespace
|
||||
pub type Words<'a> =
|
||||
Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
|
||||
|
||||
/// Methods for Unicode string slices
|
||||
pub trait UnicodeStrSlice<'a> {
|
||||
/// An iterator over the words of a string (subsequences separated
|
||||
/// by any sequence of whitespace). Sequences of whitespace are
|
||||
/// collapsed, so empty "words" are not included.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// let some_words = " Mary had\ta little \n\t lamb";
|
||||
/// let v: Vec<&str> = some_words.words().collect();
|
||||
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
|
||||
/// ```
|
||||
fn words(&self) -> Words<'a>;
|
||||
|
||||
/// Returns true if the string contains only whitespace.
|
||||
///
|
||||
/// Whitespace characters are determined by `char::is_whitespace`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// assert!(" \t\n".is_whitespace());
|
||||
/// assert!("".is_whitespace());
|
||||
///
|
||||
/// assert!( !"abc".is_whitespace());
|
||||
/// ```
|
||||
fn is_whitespace(&self) -> bool;
|
||||
|
||||
/// Returns true if the string contains only alphanumeric code
|
||||
/// points.
|
||||
///
|
||||
/// Alphanumeric characters are determined by `char::is_alphanumeric`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// assert!("Löwe老虎Léopard123".is_alphanumeric());
|
||||
/// assert!("".is_alphanumeric());
|
||||
///
|
||||
/// assert!( !" &*~".is_alphanumeric());
|
||||
/// ```
|
||||
fn is_alphanumeric(&self) -> bool;
|
||||
|
||||
/// Returns a string's displayed width in columns, treating control
|
||||
/// characters as zero-width.
|
||||
///
|
||||
/// `is_cjk` determines behavior for characters in the Ambiguous category:
|
||||
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
|
||||
/// In CJK locales, `is_cjk` should be `true`, else it should be `false`.
|
||||
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
|
||||
/// recommends that these characters be treated as 1 column (i.e.,
|
||||
/// `is_cjk` = `false`) if the locale is unknown.
|
||||
//fn width(&self, is_cjk: bool) -> uint;
|
||||
|
||||
/// Returns a string with leading and trailing whitespace removed.
|
||||
fn trim(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with leading whitespace removed.
|
||||
fn trim_left(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with trailing whitespace removed.
|
||||
fn trim_right(&self) -> &'a str;
|
||||
}
|
||||
|
||||
impl<'a> UnicodeStrSlice<'a> for &'a str {
|
||||
#[inline]
|
||||
fn words(&self) -> Words<'a> {
|
||||
self.split(u_char::is_whitespace).filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_whitespace(&self) -> bool { self.chars().all(u_char::is_whitespace) }
|
||||
|
||||
#[inline]
|
||||
fn is_alphanumeric(&self) -> bool { self.chars().all(u_char::is_alphanumeric) }
|
||||
|
||||
#[inline]
|
||||
fn trim(&self) -> &'a str {
|
||||
self.trim_left().trim_right()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_left(&self) -> &'a str {
|
||||
self.trim_left_chars(u_char::is_whitespace)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_right(&self) -> &'a str {
|
||||
self.trim_right_chars(u_char::is_whitespace)
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue