593f74bbab
From-SVN: r185010
175 lines
5.6 KiB
Go
175 lines
5.6 KiB
Go
// Copyright 2011 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package norm
|
|
|
|
// This file contains Form-specific logic and wrappers for data in tables.go.
|
|
|
|
// Rune info is stored in a separate trie per composing form. A composing form
|
|
// and its corresponding decomposing form share the same trie. Each trie maps
|
|
// a rune to a uint16. The values take two forms. For v >= 0x8000:
|
|
// bits
|
|
//   0..7:   ccc
//   8..11:  qcInfo (see below). isYesD is always true (no decomposition).
//   15:     1
|
|
// For v < 0x8000, the respective rune has a decomposition and v is an index
|
|
// into a byte array of UTF-8 decomposition sequences and additional info and
|
|
// has the form:
|
|
// <header> <decomp_byte>* [<tccc> [<lccc>]]
|
|
// The header contains the number of bytes in the decomposition (excluding this
|
|
// length byte). The two most significant bits of this length byte correspond
|
|
// to bit 2 and 3 of qcInfo (see below). The byte sequence itself starts at v+1.
|
|
// The byte sequence is followed by a trailing and leading CCC if the values
|
|
// for these are not zero. The value of v determines which ccc are appended
|
|
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
|
|
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
|
|
// there is an additional leading ccc.
|
|
|
|
// Bit masks used when decoding trie values and decomposition headers.
const (
	// qcInfoMask clears all but the relevant (low four) bits in a qcInfo.
	qcInfoMask = 0xF

	// headerLenMask extracts the length value (low six bits) from the
	// header byte of a decomposition.
	headerLenMask = 0x3F

	// headerFlagsMask extracts the qcInfo bits (top two bits) from the
	// header byte of a decomposition.
	headerFlagsMask = 0xC0
)
|
|
|
|
// runeInfo is a representation for the data stored in charinfoTrie.
|
|
type runeInfo struct {
|
|
pos uint8 // start position in reorderBuffer; used in composition.go
|
|
size uint8 // length of UTF-8 encoding of this rune
|
|
ccc uint8 // leading canonical combining class (ccc if not decomposition)
|
|
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
|
|
flags qcInfo // quick check flags
|
|
index uint16
|
|
}
|
|
|
|
// functions dispatchable per form
|
|
type lookupFunc func(b input, i int) runeInfo
|
|
|
|
// formInfo holds Form-specific functions and tables.
|
|
type formInfo struct {
|
|
form Form
|
|
composing, compatibility bool // form type
|
|
info lookupFunc
|
|
}
|
|
|
|
// formTable holds one formInfo per Form, indexed by the Form's integer
// value. It is populated by init.
var formTable []*formInfo
|
|
|
|
func init() {
|
|
formTable = make([]*formInfo, 4)
|
|
|
|
for i := range formTable {
|
|
f := &formInfo{}
|
|
formTable[i] = f
|
|
f.form = Form(i)
|
|
if Form(i) == NFKD || Form(i) == NFKC {
|
|
f.compatibility = true
|
|
f.info = lookupInfoNFKC
|
|
} else {
|
|
f.info = lookupInfoNFC
|
|
}
|
|
if Form(i) == NFC || Form(i) == NFKC {
|
|
f.composing = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
|
|
// unexpected behavior for the user. For example, in NFD, there is a boundary
|
|
// after 'a'. However, a might combine with modifiers, so from the application's
|
|
// perspective it is not a good boundary. We will therefore always use the
|
|
// boundaries for the combining variants.
|
|
func (i runeInfo) boundaryBefore() bool {
|
|
if i.ccc == 0 && !i.combinesBackward() {
|
|
return true
|
|
}
|
|
// We assume that the CCC of the first character in a decomposition
|
|
// is always non-zero if different from info.ccc and that we can return
|
|
// false at this point. This is verified by maketables.
|
|
return false
|
|
}
|
|
|
|
func (i runeInfo) boundaryAfter() bool {
|
|
return i.isInert()
|
|
}
|
|
|
|
// qcInfo packs the quick check data of a rune into 4 bits:
//   0:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//   1..2: NFC_QC Yes(00), No (10), or Maybe (11)
//   3:    Combines forward  (0 == false, 1 == true)
//
// When all 4 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
type qcInfo uint8
|
|
|
|
// isYesC reports whether bit 2 of the quick check flags is clear, i.e.
// NFC_QC is Yes rather than No or Maybe.
func (r runeInfo) isYesC() bool {
	return r.flags&0x4 == 0
}
|
|
// isYesD reports whether bit 0 of the quick check flags is clear, i.e.
// NFD_QC is Yes.
func (r runeInfo) isYesD() bool {
	return r.flags&0x1 == 0
}
|
|
|
|
// combinesForward reports whether bit 3 of the quick check flags is set.
func (r runeInfo) combinesForward() bool {
	return r.flags&0x8 != 0
}
|
|
// combinesBackward reports whether bit 1 of the quick check flags is set,
// which is the same as NFC_QC being Maybe.
func (r runeInfo) combinesBackward() bool {
	return r.flags&0x2 != 0
}
|
|
// hasDecomposition reports whether bit 0 of the quick check flags is set,
// which is the same as NFD_QC being No.
func (r runeInfo) hasDecomposition() bool {
	return r.flags&0x1 != 0
}
|
|
|
|
func (r runeInfo) isInert() bool {
|
|
return r.flags&0xf == 0 && r.ccc == 0
|
|
}
|
|
|
|
func (r runeInfo) decomposition() []byte {
|
|
if r.index == 0 {
|
|
return nil
|
|
}
|
|
p := r.index
|
|
n := decomps[p] & 0x3F
|
|
p++
|
|
return decomps[p : p+uint16(n)]
|
|
}
|
|
|
|
// Recomposition
|
|
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
|
|
// This clips off the bits of three entries, but we know this will not
|
|
// result in a collision. In the unlikely event that changes to
|
|
// UnicodeData.txt introduce collisions, the compiler will catch it.
|
|
// Note that the recomposition map for NFC and NFKC are identical.
|
|
|
|
// combine returns the combined rune or 0 if it doesn't exist.
|
|
func combine(a, b rune) rune {
|
|
key := uint32(uint16(a))<<16 + uint32(uint16(b))
|
|
return recompMap[key]
|
|
}
|
|
|
|
func lookupInfoNFC(b input, i int) runeInfo {
|
|
v, sz := b.charinfoNFC(i)
|
|
return compInfo(v, sz)
|
|
}
|
|
|
|
func lookupInfoNFKC(b input, i int) runeInfo {
|
|
v, sz := b.charinfoNFKC(i)
|
|
return compInfo(v, sz)
|
|
}
|
|
|
|
// compInfo converts the information contained in v and sz
|
|
// to a runeInfo. See the comment at the top of the file
|
|
// for more information on the format.
|
|
func compInfo(v uint16, sz int) runeInfo {
|
|
if v == 0 {
|
|
return runeInfo{size: uint8(sz)}
|
|
} else if v >= 0x8000 {
|
|
return runeInfo{
|
|
size: uint8(sz),
|
|
ccc: uint8(v),
|
|
tccc: uint8(v),
|
|
flags: qcInfo(v>>8) & qcInfoMask,
|
|
}
|
|
}
|
|
// has decomposition
|
|
h := decomps[v]
|
|
f := (qcInfo(h&headerFlagsMask) >> 4) | 0x1
|
|
ri := runeInfo{size: uint8(sz), flags: f, index: v}
|
|
if v >= firstCCC {
|
|
v += uint16(h&headerLenMask) + 1
|
|
ri.tccc = decomps[v]
|
|
if v >= firstLeadingCCC {
|
|
ri.ccc = decomps[v+1]
|
|
}
|
|
}
|
|
return ri
|
|
}
|