2012-10-23 04:31:11 +00:00
|
|
|
// Copyright 2012 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
// +build ignore
|
|
|
|
|
|
|
|
// Collation table generator.
|
|
|
|
// Data read from the web.
|
|
|
|
|
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"archive/zip"
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
|
|
|
"encoding/xml"
|
|
|
|
"exp/locale/collate"
|
|
|
|
"exp/locale/collate/build"
|
|
|
|
"flag"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"io/ioutil"
|
|
|
|
"log"
|
|
|
|
"net/http"
|
|
|
|
"os"
|
|
|
|
"path"
|
|
|
|
"regexp"
|
|
|
|
"sort"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
"unicode"
|
|
|
|
"unicode/utf8"
|
|
|
|
)
|
|
|
|
|
|
|
|
var (
|
|
|
|
root = flag.String("root",
|
|
|
|
"http://unicode.org/Public/UCA/"+unicode.Version+"/CollationAuxiliary.zip",
|
|
|
|
`URL of the Default Unicode Collation Element Table (DUCET). This can be a zip
|
|
|
|
file containing the file allkeys_CLDR.txt or an allkeys.txt file.`)
|
|
|
|
cldr = flag.String("cldr",
|
2012-11-21 07:03:38 +00:00
|
|
|
"http://www.unicode.org/Public/cldr/22/core.zip",
|
2012-10-23 04:31:11 +00:00
|
|
|
"URL of CLDR archive.")
|
|
|
|
test = flag.Bool("test", false,
|
|
|
|
"test existing tables; can be used to compare web data with package data.")
|
|
|
|
localFiles = flag.Bool("local", false,
|
|
|
|
"data files have been copied to the current directory; for debugging only.")
|
|
|
|
short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
|
|
|
|
draft = flag.Bool("draft", false, `Use draft versions, when available.`)
|
|
|
|
tags = flag.String("tags", "", "build tags to be included after +build directive")
|
|
|
|
pkg = flag.String("package", "collate",
|
|
|
|
"the name of the package in which the generated file is to be included")
|
|
|
|
|
|
|
|
tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
|
|
|
|
"comma-spearated list of tables to generate.")
|
|
|
|
exclude = flagStringSet("exclude", "zh2", "",
|
|
|
|
"comma-separated list of languages to exclude.")
|
|
|
|
include = flagStringSet("include", "", "",
|
|
|
|
"comma-separated list of languages to include. Include trumps exclude.")
|
|
|
|
types = flagStringSetAllowAll("types", "", "",
|
|
|
|
"comma-separated list of types that should be included in addition to the standard type.")
|
|
|
|
)
|
|
|
|
|
|
|
|
// stringSet implements an ordered set based on a list. It implements flag.Value
|
|
|
|
// to allow a set to be specified as a comma-separated list.
|
|
|
|
type stringSet struct {
|
|
|
|
s []string
|
|
|
|
allowed *stringSet
|
|
|
|
dirty bool // needs compaction if true
|
|
|
|
all bool
|
|
|
|
allowAll bool
|
|
|
|
}
|
|
|
|
|
|
|
|
func flagStringSet(name, def, allowed, usage string) *stringSet {
|
|
|
|
ss := &stringSet{}
|
|
|
|
if allowed != "" {
|
|
|
|
usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
|
|
|
|
ss.allowed = &stringSet{}
|
|
|
|
failOnError(ss.allowed.Set(allowed))
|
|
|
|
}
|
|
|
|
ss.Set(def)
|
|
|
|
flag.Var(ss, name, usage)
|
|
|
|
return ss
|
|
|
|
}
|
|
|
|
|
|
|
|
func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
|
|
|
|
ss := &stringSet{allowAll: true}
|
|
|
|
if allowed == "" {
|
|
|
|
flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
|
|
|
|
} else {
|
|
|
|
ss.allowed = &stringSet{}
|
|
|
|
failOnError(ss.allowed.Set(allowed))
|
|
|
|
flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
|
|
|
|
}
|
|
|
|
ss.Set(def)
|
|
|
|
return ss
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ss stringSet) Len() int {
|
|
|
|
return len(ss.s)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ss stringSet) String() string {
|
|
|
|
return strings.Join(ss.s, ",")
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ss *stringSet) Set(s string) error {
|
|
|
|
if ss.allowAll && s == "all" {
|
|
|
|
ss.s = nil
|
|
|
|
ss.all = true
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
ss.s = ss.s[:0]
|
|
|
|
for _, s := range strings.Split(s, ",") {
|
|
|
|
if s := strings.TrimSpace(s); s != "" {
|
|
|
|
if ss.allowed != nil && !ss.allowed.contains(s) {
|
|
|
|
return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
|
|
|
|
}
|
|
|
|
ss.add(s)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ss.compact()
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ss *stringSet) add(s string) {
|
|
|
|
ss.s = append(ss.s, s)
|
|
|
|
ss.dirty = true
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ss *stringSet) values() []string {
|
|
|
|
ss.compact()
|
|
|
|
return ss.s
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ss *stringSet) contains(s string) bool {
|
|
|
|
if ss.all {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
for _, v := range ss.s {
|
|
|
|
if v == s {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ss *stringSet) compact() {
|
|
|
|
if !ss.dirty {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
a := ss.s
|
|
|
|
sort.Strings(a)
|
|
|
|
k := 0
|
|
|
|
for i := 1; i < len(a); i++ {
|
|
|
|
if a[k] != a[i] {
|
|
|
|
a[k+1] = a[i]
|
|
|
|
k++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ss.s = a[:k+1]
|
|
|
|
ss.dirty = false
|
|
|
|
}
|
|
|
|
|
|
|
|
func skipLang(l string) bool {
|
|
|
|
if include.Len() > 0 {
|
|
|
|
return !include.contains(l)
|
|
|
|
}
|
|
|
|
return exclude.contains(l)
|
|
|
|
}
|
|
|
|
|
|
|
|
func skipAlt(a string) bool {
|
|
|
|
if *draft && a == "proposed" {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if *short && a == "short" {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// failOnError logs e and panics if e is non-nil; a nil error is a no-op.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
|
|
|
|
|
|
|
|
// openReader opens the URL or file given by url and returns it as an io.ReadCloser
|
|
|
|
// or nil on error.
|
|
|
|
func openReader(url *string) (io.ReadCloser, error) {
|
|
|
|
if *localFiles {
|
|
|
|
pwd, _ := os.Getwd()
|
|
|
|
*url = "file://" + path.Join(pwd, path.Base(*url))
|
|
|
|
}
|
|
|
|
t := &http.Transport{}
|
|
|
|
t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
|
|
|
|
c := &http.Client{Transport: t}
|
|
|
|
resp, err := c.Get(*url)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if resp.StatusCode != 200 {
|
|
|
|
return nil, fmt.Errorf(`bad GET status for "%s": %s`, *url, resp.Status)
|
|
|
|
}
|
|
|
|
return resp.Body, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func openArchive(url *string) *zip.Reader {
|
|
|
|
f, err := openReader(url)
|
|
|
|
failOnError(err)
|
|
|
|
buffer, err := ioutil.ReadAll(f)
|
|
|
|
f.Close()
|
|
|
|
failOnError(err)
|
|
|
|
archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
|
|
|
|
failOnError(err)
|
|
|
|
return archive
|
|
|
|
}
|
|
|
|
|
|
|
|
// parseUCA parses a Default Unicode Collation Element Table of the format
|
|
|
|
// specified in http://www.unicode.org/reports/tr10/#File_Format.
|
|
|
|
// It returns the variable top.
|
|
|
|
func parseUCA(builder *build.Builder) {
|
|
|
|
var r io.ReadCloser
|
|
|
|
var err error
|
|
|
|
if strings.HasSuffix(*root, ".zip") {
|
|
|
|
for _, f := range openArchive(root).File {
|
|
|
|
if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
|
|
|
|
r, err = f.Open()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if r == nil {
|
|
|
|
err = fmt.Errorf("file allkeys_CLDR.txt not found in archive %q", *root)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
r, err = openReader(root)
|
|
|
|
}
|
|
|
|
failOnError(err)
|
|
|
|
defer r.Close()
|
|
|
|
input := bufio.NewReader(r)
|
|
|
|
colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
|
|
|
|
for i := 1; err == nil; i++ {
|
|
|
|
l, prefix, e := input.ReadLine()
|
|
|
|
err = e
|
|
|
|
line := string(l)
|
|
|
|
if prefix {
|
|
|
|
log.Fatalf("%d: buffer overflow", i)
|
|
|
|
}
|
|
|
|
if err != nil && err != io.EOF {
|
|
|
|
log.Fatalf("%d: %v", i, err)
|
|
|
|
}
|
|
|
|
if len(line) == 0 || line[0] == '#' {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if line[0] == '@' {
|
|
|
|
// parse properties
|
|
|
|
switch {
|
|
|
|
case strings.HasPrefix(line[1:], "version "):
|
|
|
|
a := strings.Split(line[1:], " ")
|
|
|
|
if a[1] != unicode.Version {
|
|
|
|
log.Fatalf("incompatible version %s; want %s", a[1], unicode.Version)
|
|
|
|
}
|
|
|
|
case strings.HasPrefix(line[1:], "backwards "):
|
|
|
|
log.Fatalf("%d: unsupported option backwards", i)
|
|
|
|
default:
|
|
|
|
log.Printf("%d: unknown option %s", i, line[1:])
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// parse entries
|
|
|
|
part := strings.Split(line, " ; ")
|
|
|
|
if len(part) != 2 {
|
|
|
|
log.Fatalf("%d: production rule without ';': %v", i, line)
|
|
|
|
}
|
|
|
|
lhs := []rune{}
|
|
|
|
for _, v := range strings.Split(part[0], " ") {
|
|
|
|
if v == "" {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
lhs = append(lhs, rune(convHex(i, v)))
|
|
|
|
}
|
|
|
|
var n int
|
|
|
|
var vars []int
|
|
|
|
rhs := [][]int{}
|
|
|
|
for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
|
|
|
|
n += len(m[0])
|
|
|
|
elem := []int{}
|
|
|
|
for _, h := range strings.Split(m[2], ".") {
|
|
|
|
elem = append(elem, convHex(i, h))
|
|
|
|
}
|
|
|
|
if m[1] == "*" {
|
|
|
|
vars = append(vars, i)
|
|
|
|
}
|
|
|
|
rhs = append(rhs, elem)
|
|
|
|
}
|
|
|
|
if len(part[1]) < n+3 || part[1][n+1] != '#' {
|
|
|
|
log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
|
|
|
|
}
|
|
|
|
if *test {
|
|
|
|
testInput.add(string(lhs))
|
|
|
|
}
|
|
|
|
failOnError(builder.Add(lhs, rhs, vars))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// convHex parses s as a hexadecimal number that fits in 32 bits and returns
// its value. On a parse error the program exits, reporting line as the
// position of the offending input.
func convHex(line int, s string) int {
	v, err := strconv.ParseInt(s, 16, 32)
	if err == nil {
		return int(v)
	}
	log.Fatalf("%d: %v", line, err)
	return int(v) // not reached: log.Fatalf exits
}
|
|
|
|
|
|
|
|
// testInput collects the strings whose sort keys testCollator compares. It is
// filled by parseUCA and insertTailoring when the -test flag is set, and by
// testCollator itself.
var testInput = stringSet{}
|
|
|
|
|
|
|
|
// LDML holds all collation information parsed from an LDML XML file.
|
|
|
|
// The format of these files is defined in http://unicode.org/reports/tr35/.
|
|
|
|
type LDML struct {
|
|
|
|
XMLName xml.Name `xml:"ldml"`
|
|
|
|
Language Attr `xml:"identity>language"`
|
|
|
|
Territory Attr `xml:"identity>territory"`
|
|
|
|
Chars *struct {
|
|
|
|
ExemplarCharacters []AttrValue `xml:"exemplarCharacters"`
|
|
|
|
MoreInformaton string `xml:"moreInformation,omitempty"`
|
|
|
|
} `xml:"characters"`
|
|
|
|
Default Attr `xml:"collations>default"`
|
|
|
|
Collations []Collation `xml:"collations>collation"`
|
|
|
|
}
|
|
|
|
|
|
|
|
// Attr captures the "type" attribute of an XML element, together with the
// element's name.
type Attr struct {
	XMLName xml.Name
	Attr    string `xml:"type,attr"`
}

// String returns the value of the type attribute.
func (a Attr) String() string {
	return a.Attr
}
|
|
|
|
|
|
|
|
// AttrValue holds the type, key and draft attributes of an XML element and
// the raw XML nested inside it.
type AttrValue struct {
	Type  string `xml:"type,attr"`
	Key   string `xml:"key,attr,omitempty"`
	Draft string `xml:"draft,attr,omitempty"`
	Value string `xml:",innerxml"` // unparsed content of the element
}
|
|
|
|
|
|
|
|
type Collation struct {
|
|
|
|
Type string `xml:"type,attr"`
|
|
|
|
Alt string `xml:"alt,attr"`
|
|
|
|
SuppressContraction string `xml:"suppress_contractions,omitempty"`
|
|
|
|
Settings *Settings `xml:"settings"`
|
|
|
|
Optimize string `xml:"optimize"`
|
|
|
|
Rules Rules `xml:"rules"`
|
|
|
|
}
|
|
|
|
|
|
|
|
// Optimize holds the text of an <optimize> element.
type Optimize struct {
	XMLName xml.Name `xml:"optimize"`
	// Data is the character data of the element. The ",chardata" option is
	// required for that: a plain "chardata" tag would select a child element
	// named chardata instead.
	Data string `xml:",chardata"`
}
|
|
|
|
|
|
|
|
// Suppression holds the text of a <suppress_contractions> element.
type Suppression struct {
	XMLName xml.Name `xml:"suppress_contractions"`
	// Data is the character data of the element. The ",chardata" option is
	// required for that: a plain "chardata" tag would select a child element
	// named chardata instead.
	Data string `xml:",chardata"`
}
|
|
|
|
|
|
|
|
// Settings holds the attributes of a collation <settings> element as raw
// strings. The attribute names follow the LDML specification; the spelling of
// the HiraganaQuarternary field name is kept as is so that existing
// references continue to compile.
type Settings struct {
	Strength            string `xml:"strength,attr,omitempty"`
	Backwards           string `xml:"backwards,attr,omitempty"`
	Normalization       string `xml:"normalization,attr,omitempty"`
	CaseLevel           string `xml:"caseLevel,attr,omitempty"`
	CaseFirst           string `xml:"caseFirst,attr,omitempty"`
	HiraganaQuarternary string `xml:"hiraganaQuaternary,attr,omitempty"`
	Numeric             string `xml:"numeric,attr,omitempty"`
	VariableTop         string `xml:"variableTop,attr,omitempty"`
}
|
|
|
|
|
|
|
|
type Rules struct {
|
|
|
|
XMLName xml.Name `xml:"rules"`
|
|
|
|
Any []RuleElem `xml:",any"`
|
|
|
|
}
|
|
|
|
|
|
|
|
type RuleElem struct {
|
|
|
|
XMLName xml.Name
|
|
|
|
Value string `xml:",innerxml"`
|
|
|
|
Before string `xml:"before,attr"`
|
|
|
|
Any []RuleElem `xml:",any"` // for <x> elements
|
|
|
|
}
|
|
|
|
|
|
|
|
var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
|
|
|
|
var tagRe = regexp.MustCompile(`<([a-z_]*) */>`)
|
|
|
|
|
|
|
|
func (r *RuleElem) rewrite() {
|
|
|
|
// Convert hexadecimal Unicode codepoint notation to a string.
|
|
|
|
if m := charRe.FindAllStringSubmatch(r.Value, -1); m != nil {
|
|
|
|
runes := []rune{}
|
|
|
|
for _, sa := range m {
|
|
|
|
runes = append(runes, rune(convHex(-1, sa[1])))
|
|
|
|
}
|
|
|
|
r.Value = string(runes)
|
|
|
|
}
|
|
|
|
// Strip spaces from reset positions.
|
|
|
|
if m := tagRe.FindStringSubmatch(r.Value); m != nil {
|
|
|
|
r.Value = fmt.Sprintf("<%s/>", m[1])
|
|
|
|
}
|
|
|
|
for _, rr := range r.Any {
|
|
|
|
rr.rewrite()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func decodeXML(f *zip.File) *LDML {
|
|
|
|
r, err := f.Open()
|
|
|
|
failOnError(err)
|
|
|
|
d := xml.NewDecoder(r)
|
|
|
|
var x LDML
|
|
|
|
err = d.Decode(&x)
|
|
|
|
failOnError(err)
|
|
|
|
return &x
|
|
|
|
}
|
|
|
|
|
|
|
|
// mainLocales lists the locales for which parseMain recorded exemplar
// characters, in the order in which they were first encountered.
var mainLocales = []string{}
|
|
|
|
|
|
|
|
// charSets holds a list of exemplar characters per category. The category is
// the type attribute of the exemplarCharacters element it was read from.
type charSets map[string][]string

// fprint writes p to w as the body of an [exN]string composite literal,
// followed by a comma. Each non-empty category is written as one
// space-separated string at the index given by its position in the list of
// known categories; categories outside that list are not written.
func (p charSets) fprint(w io.Writer) {
	categories := [...]string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"}

	fmt.Fprintln(w, "[exN]string{")
	for idx := 0; idx < len(categories); idx++ {
		chars := p[categories[idx]]
		if len(chars) == 0 {
			continue
		}
		fmt.Fprintf(w, "\t\t%d: %q,\n", idx, strings.Join(chars, " "))
	}
	fmt.Fprintln(w, "\t},")
}
|
|
|
|
|
|
|
|
// localeChars maps a locale name to the exemplar character sets that parseMain
// read for it.
var localeChars = make(map[string]charSets)
|
|
|
|
|
|
|
|
// exemplarHeader is the Go source emitted in front of the exemplar character
// table. It declares the index constants used by that table; their order
// matches the category order in charSets.fprint.
const exemplarHeader = `
type exemplarType int
const (
exCharacters exemplarType = iota
exContractions
exPunctuation
exAuxiliary
exCurrency
exIndex
exN
)
`
|
|
|
|
|
|
|
|
func printExemplarCharacters(w io.Writer) {
|
|
|
|
fmt.Fprintln(w, exemplarHeader)
|
|
|
|
fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
|
|
|
|
for _, loc := range mainLocales {
|
|
|
|
fmt.Fprintf(w, "\t%q: ", loc)
|
|
|
|
localeChars[loc].fprint(w)
|
|
|
|
}
|
|
|
|
fmt.Fprintln(w, "}")
|
|
|
|
}
|
|
|
|
|
|
|
|
// mainRe matches the archive path of an XML file in a "main" directory and
// captures the file's base name, which parseMain uses as the locale.
var mainRe = regexp.MustCompile(`.*/main/(.*)\.xml`)
|
|
|
|
|
|
|
|
// parseMain parses XML files in the main directory of the CLDR core.zip file.
|
|
|
|
func parseMain() {
|
|
|
|
for _, f := range openArchive(cldr).File {
|
|
|
|
if m := mainRe.FindStringSubmatch(f.Name); m != nil {
|
|
|
|
locale := m[1]
|
|
|
|
x := decodeXML(f)
|
|
|
|
if skipLang(x.Language.Attr) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if x.Chars != nil {
|
|
|
|
for _, ec := range x.Chars.ExemplarCharacters {
|
|
|
|
if ec.Draft != "" {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if _, ok := localeChars[locale]; !ok {
|
|
|
|
mainLocales = append(mainLocales, locale)
|
|
|
|
localeChars[locale] = make(charSets)
|
|
|
|
}
|
|
|
|
localeChars[locale][ec.Type] = parseCharacters(ec.Value)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func parseCharacters(chars string) []string {
|
|
|
|
parseSingle := func(s string) (r rune, tail string, escaped bool) {
|
|
|
|
if s[0] == '\\' {
|
|
|
|
if s[1] == 'u' || s[1] == 'U' {
|
|
|
|
r, _, tail, err := strconv.UnquoteChar(s, 0)
|
|
|
|
failOnError(err)
|
|
|
|
return r, tail, false
|
|
|
|
} else if strings.HasPrefix(s[1:], "&") {
|
|
|
|
return '&', s[6:], false
|
|
|
|
}
|
|
|
|
return rune(s[1]), s[2:], true
|
|
|
|
} else if strings.HasPrefix(s, """) {
|
|
|
|
return '"', s[6:], false
|
|
|
|
}
|
|
|
|
r, sz := utf8.DecodeRuneInString(s)
|
|
|
|
return r, s[sz:], false
|
|
|
|
}
|
|
|
|
chars = strings.Trim(chars, "[ ]")
|
|
|
|
list := []string{}
|
|
|
|
var r, last, end rune
|
|
|
|
for len(chars) > 0 {
|
|
|
|
if chars[0] == '{' { // character sequence
|
|
|
|
buf := []rune{}
|
|
|
|
for chars = chars[1:]; len(chars) > 0; {
|
|
|
|
r, chars, _ = parseSingle(chars)
|
|
|
|
if r == '}' {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
if r == ' ' {
|
|
|
|
log.Fatalf("space not supported in sequence %q", chars)
|
|
|
|
}
|
|
|
|
buf = append(buf, r)
|
|
|
|
}
|
|
|
|
list = append(list, string(buf))
|
|
|
|
last = 0
|
|
|
|
} else { // single character
|
|
|
|
escaped := false
|
|
|
|
r, chars, escaped = parseSingle(chars)
|
|
|
|
if r != ' ' {
|
|
|
|
if r == '-' && !escaped {
|
|
|
|
if last == 0 {
|
|
|
|
log.Fatal("'-' should be preceded by a character")
|
|
|
|
}
|
|
|
|
end, chars, _ = parseSingle(chars)
|
|
|
|
for ; last <= end; last++ {
|
|
|
|
list = append(list, string(last))
|
|
|
|
}
|
|
|
|
last = 0
|
|
|
|
} else {
|
|
|
|
list = append(list, string(r))
|
|
|
|
last = r
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return list
|
|
|
|
}
|
|
|
|
|
|
|
|
// fileRe matches the archive path of an XML file in a "collation" directory
// and captures the file's base name, which parseCollation uses as the
// language.
var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)
|
|
|
|
|
|
|
|
// parseCollation parses XML files in the collation directory of the CLDR core.zip file.
|
|
|
|
func parseCollation(b *build.Builder) {
|
|
|
|
for _, f := range openArchive(cldr).File {
|
|
|
|
if m := fileRe.FindStringSubmatch(f.Name); m != nil {
|
|
|
|
lang := m[1]
|
|
|
|
x := decodeXML(f)
|
|
|
|
if skipLang(x.Language.Attr) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
def := "standard"
|
|
|
|
if x.Default.Attr != "" {
|
|
|
|
def = x.Default.Attr
|
|
|
|
}
|
|
|
|
todo := make(map[string]Collation)
|
|
|
|
for _, c := range x.Collations {
|
|
|
|
if c.Type != def && !types.contains(c.Type) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if c.Alt != "" && skipAlt(c.Alt) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
for j := range c.Rules.Any {
|
|
|
|
c.Rules.Any[j].rewrite()
|
|
|
|
}
|
|
|
|
locale := lang
|
|
|
|
if c.Type != def {
|
|
|
|
locale += "_u_co_" + c.Type
|
|
|
|
}
|
|
|
|
_, exists := todo[locale]
|
|
|
|
if c.Alt != "" || !exists {
|
|
|
|
todo[locale] = c
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for _, c := range x.Collations {
|
|
|
|
locale := lang
|
|
|
|
if c.Type != def {
|
|
|
|
locale += "_u_co_" + c.Type
|
|
|
|
}
|
|
|
|
if d, ok := todo[locale]; ok && d.Alt == c.Alt {
|
|
|
|
insertCollation(b, locale, &c)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
var lmap = map[byte]collate.Level{
|
|
|
|
'p': collate.Primary,
|
|
|
|
's': collate.Secondary,
|
|
|
|
't': collate.Tertiary,
|
|
|
|
'i': collate.Identity,
|
|
|
|
}
|
|
|
|
|
|
|
|
// cldrIndex is a Unicode-reserved code point that CLDR uses as a sentinel value.
|
|
|
|
// We ignore any rule that starts with this rune.
|
|
|
|
// See http://unicode.org/reports/tr35/#Collation_Elements for details.
|
|
|
|
const cldrIndex = 0xFDD0 // the code point U+FDD0
|
|
|
|
|
|
|
|
func insertTailoring(t *build.Tailoring, r RuleElem, context, extend string) {
|
|
|
|
switch l := r.XMLName.Local; l {
|
|
|
|
case "p", "s", "t", "i":
|
|
|
|
if []rune(r.Value)[0] != cldrIndex {
|
|
|
|
str := context + r.Value
|
|
|
|
if *test {
|
|
|
|
testInput.add(str)
|
|
|
|
}
|
2012-11-21 07:03:38 +00:00
|
|
|
err := t.Insert(lmap[l[0]], str, context+extend)
|
2012-10-23 04:31:11 +00:00
|
|
|
failOnError(err)
|
|
|
|
}
|
|
|
|
case "pc", "sc", "tc", "ic":
|
|
|
|
level := lmap[l[0]]
|
|
|
|
for _, s := range r.Value {
|
|
|
|
str := context + string(s)
|
|
|
|
if *test {
|
|
|
|
testInput.add(str)
|
|
|
|
}
|
2012-11-21 07:03:38 +00:00
|
|
|
err := t.Insert(level, str, context+extend)
|
2012-10-23 04:31:11 +00:00
|
|
|
failOnError(err)
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
log.Fatalf("unsupported tag: %q", l)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func insertCollation(builder *build.Builder, locale string, c *Collation) {
|
|
|
|
t := builder.Tailoring(locale)
|
|
|
|
for _, r := range c.Rules.Any {
|
|
|
|
switch r.XMLName.Local {
|
|
|
|
case "reset":
|
|
|
|
if r.Before == "" {
|
|
|
|
failOnError(t.SetAnchor(r.Value))
|
|
|
|
} else {
|
|
|
|
failOnError(t.SetAnchorBefore(r.Value))
|
|
|
|
}
|
|
|
|
case "x":
|
|
|
|
var context, extend string
|
|
|
|
for _, r1 := range r.Any {
|
|
|
|
switch r1.XMLName.Local {
|
|
|
|
case "context":
|
|
|
|
context = r1.Value
|
|
|
|
case "extend":
|
|
|
|
extend = r1.Value
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for _, r1 := range r.Any {
|
|
|
|
if t := r1.XMLName.Local; t == "context" || t == "extend" {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
insertTailoring(t, r1, context, extend)
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
insertTailoring(t, r, "", "")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func testCollator(c *collate.Collator) {
|
|
|
|
c0 := collate.New("")
|
|
|
|
|
|
|
|
// iterator over all characters for all locales and check
|
|
|
|
// whether Key is equal.
|
|
|
|
buf := collate.Buffer{}
|
|
|
|
|
|
|
|
// Add all common and not too uncommon runes to the test set.
|
|
|
|
for i := rune(0); i < 0x30000; i++ {
|
|
|
|
testInput.add(string(i))
|
|
|
|
}
|
|
|
|
for i := rune(0xE0000); i < 0xF0000; i++ {
|
|
|
|
testInput.add(string(i))
|
|
|
|
}
|
|
|
|
for _, str := range testInput.values() {
|
|
|
|
k0 := c0.KeyFromString(&buf, str)
|
|
|
|
k := c.KeyFromString(&buf, str)
|
|
|
|
if bytes.Compare(k0, k) != 0 {
|
|
|
|
failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
|
|
|
|
}
|
2012-11-21 07:03:38 +00:00
|
|
|
buf.Reset()
|
2012-10-23 04:31:11 +00:00
|
|
|
}
|
|
|
|
fmt.Println("PASS")
|
|
|
|
}
|
|
|
|
|
|
|
|
func main() {
|
|
|
|
flag.Parse()
|
|
|
|
b := build.NewBuilder()
|
|
|
|
if *root != "" {
|
|
|
|
parseUCA(b)
|
|
|
|
}
|
|
|
|
if *cldr != "" {
|
|
|
|
if tables.contains("chars") {
|
|
|
|
parseMain()
|
|
|
|
}
|
|
|
|
parseCollation(b)
|
|
|
|
}
|
|
|
|
|
|
|
|
c, err := b.Build()
|
|
|
|
failOnError(err)
|
|
|
|
|
|
|
|
if *test {
|
|
|
|
testCollator(c)
|
|
|
|
} else {
|
|
|
|
fmt.Println("// Generated by running")
|
|
|
|
fmt.Printf("// maketables -root=%s -cldr=%s\n", *root, *cldr)
|
|
|
|
fmt.Println("// DO NOT EDIT")
|
|
|
|
fmt.Println("// TODO: implement more compact representation for sparse blocks.")
|
|
|
|
if *tags != "" {
|
|
|
|
fmt.Printf("// +build %s\n", *tags)
|
|
|
|
}
|
|
|
|
fmt.Println("")
|
|
|
|
fmt.Printf("package %s\n", *pkg)
|
|
|
|
if tables.contains("collate") {
|
|
|
|
fmt.Println("")
|
|
|
|
_, err = b.Print(os.Stdout)
|
|
|
|
failOnError(err)
|
|
|
|
}
|
|
|
|
if tables.contains("chars") {
|
|
|
|
printExemplarCharacters(os.Stdout)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|