d8f412571f
From-SVN: r180552
238 lines
4.9 KiB
Go
238 lines
4.9 KiB
Go
// Copyright 2011 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package norm
|
|
|
|
type valueRange struct {
|
|
value uint16 // header: value:stride
|
|
lo, hi byte // header: lo:n
|
|
}
|
|
|
|
type trie struct {
|
|
index []uint8
|
|
values []uint16
|
|
sparse []valueRange
|
|
sparseOffset []uint16
|
|
cutoff uint8 // indices >= cutoff are sparse
|
|
}
|
|
|
|
// lookupValue determines the type of block n and looks up the value for b.
|
|
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
|
|
// is a list of ranges with an accompanying value. Given a matching range r,
|
|
// the value for b is by r.value + (b - r.lo) * stride.
|
|
func (t *trie) lookupValue(n uint8, b byte) uint16 {
|
|
if n < t.cutoff {
|
|
return t.values[uint16(n)<<6+uint16(b&maskx)]
|
|
}
|
|
offset := t.sparseOffset[n-t.cutoff]
|
|
header := t.sparse[offset]
|
|
lo := offset + 1
|
|
hi := lo + uint16(header.lo)
|
|
for lo < hi {
|
|
m := lo + (hi-lo)/2
|
|
r := t.sparse[m]
|
|
if r.lo <= b && b <= r.hi {
|
|
return r.value + uint16(b-r.lo)*header.value
|
|
}
|
|
if b < r.lo {
|
|
hi = m
|
|
} else {
|
|
lo = m + 1
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
const (
|
|
t1 = 0x00 // 0000 0000
|
|
tx = 0x80 // 1000 0000
|
|
t2 = 0xC0 // 1100 0000
|
|
t3 = 0xE0 // 1110 0000
|
|
t4 = 0xF0 // 1111 0000
|
|
t5 = 0xF8 // 1111 1000
|
|
t6 = 0xFC // 1111 1100
|
|
te = 0xFE // 1111 1110
|
|
|
|
maskx = 0x3F // 0011 1111
|
|
mask2 = 0x1F // 0001 1111
|
|
mask3 = 0x0F // 0000 1111
|
|
mask4 = 0x07 // 0000 0111
|
|
)
|
|
|
|
// lookup returns the trie value for the first UTF-8 encoding in s and
|
|
// the width in bytes of this encoding. The size will be 0 if s does not
|
|
// hold enough bytes to complete the encoding. len(s) must be greater than 0.
|
|
func (t *trie) lookup(s []byte) (v uint16, sz int) {
|
|
c0 := s[0]
|
|
switch {
|
|
case c0 < tx:
|
|
return t.values[c0], 1
|
|
case c0 < t2:
|
|
return 0, 1
|
|
case c0 < t3:
|
|
if len(s) < 2 {
|
|
return 0, 0
|
|
}
|
|
i := t.index[c0]
|
|
c1 := s[1]
|
|
if c1 < tx || t2 <= c1 {
|
|
return 0, 1
|
|
}
|
|
return t.lookupValue(i, c1), 2
|
|
case c0 < t4:
|
|
if len(s) < 3 {
|
|
return 0, 0
|
|
}
|
|
i := t.index[c0]
|
|
c1 := s[1]
|
|
if c1 < tx || t2 <= c1 {
|
|
return 0, 1
|
|
}
|
|
o := uint16(i)<<6 + uint16(c1)&maskx
|
|
i = t.index[o]
|
|
c2 := s[2]
|
|
if c2 < tx || t2 <= c2 {
|
|
return 0, 2
|
|
}
|
|
return t.lookupValue(i, c2), 3
|
|
case c0 < t5:
|
|
if len(s) < 4 {
|
|
return 0, 0
|
|
}
|
|
i := t.index[c0]
|
|
c1 := s[1]
|
|
if c1 < tx || t2 <= c1 {
|
|
return 0, 1
|
|
}
|
|
o := uint16(i)<<6 + uint16(c1)&maskx
|
|
i = t.index[o]
|
|
c2 := s[2]
|
|
if c2 < tx || t2 <= c2 {
|
|
return 0, 2
|
|
}
|
|
o = uint16(i)<<6 + uint16(c2)&maskx
|
|
i = t.index[o]
|
|
c3 := s[3]
|
|
if c3 < tx || t2 <= c3 {
|
|
return 0, 3
|
|
}
|
|
return t.lookupValue(i, c3), 4
|
|
}
|
|
// Illegal rune
|
|
return 0, 1
|
|
}
|
|
|
|
// lookupString returns the trie value for the first UTF-8 encoding in s and
|
|
// the width in bytes of this encoding. The size will be 0 if s does not
|
|
// hold enough bytes to complete the encoding. len(s) must be greater than 0.
|
|
func (t *trie) lookupString(s string) (v uint16, sz int) {
|
|
c0 := s[0]
|
|
switch {
|
|
case c0 < tx:
|
|
return t.values[c0], 1
|
|
case c0 < t2:
|
|
return 0, 1
|
|
case c0 < t3:
|
|
if len(s) < 2 {
|
|
return 0, 0
|
|
}
|
|
i := t.index[c0]
|
|
c1 := s[1]
|
|
if c1 < tx || t2 <= c1 {
|
|
return 0, 1
|
|
}
|
|
return t.lookupValue(i, c1), 2
|
|
case c0 < t4:
|
|
if len(s) < 3 {
|
|
return 0, 0
|
|
}
|
|
i := t.index[c0]
|
|
c1 := s[1]
|
|
if c1 < tx || t2 <= c1 {
|
|
return 0, 1
|
|
}
|
|
o := uint16(i)<<6 + uint16(c1)&maskx
|
|
i = t.index[o]
|
|
c2 := s[2]
|
|
if c2 < tx || t2 <= c2 {
|
|
return 0, 2
|
|
}
|
|
return t.lookupValue(i, c2), 3
|
|
case c0 < t5:
|
|
if len(s) < 4 {
|
|
return 0, 0
|
|
}
|
|
i := t.index[c0]
|
|
c1 := s[1]
|
|
if c1 < tx || t2 <= c1 {
|
|
return 0, 1
|
|
}
|
|
o := uint16(i)<<6 + uint16(c1)&maskx
|
|
i = t.index[o]
|
|
c2 := s[2]
|
|
if c2 < tx || t2 <= c2 {
|
|
return 0, 2
|
|
}
|
|
o = uint16(i)<<6 + uint16(c2)&maskx
|
|
i = t.index[o]
|
|
c3 := s[3]
|
|
if c3 < tx || t2 <= c3 {
|
|
return 0, 3
|
|
}
|
|
return t.lookupValue(i, c3), 4
|
|
}
|
|
// Illegal rune
|
|
return 0, 1
|
|
}
|
|
|
|
// lookupUnsafe returns the trie value for the first UTF-8 encoding in s.
|
|
// s must hold a full encoding.
|
|
func (t *trie) lookupUnsafe(s []byte) uint16 {
|
|
c0 := s[0]
|
|
if c0 < tx {
|
|
return t.values[c0]
|
|
}
|
|
if c0 < t2 {
|
|
return 0
|
|
}
|
|
i := t.index[c0]
|
|
if c0 < t3 {
|
|
return t.lookupValue(i, s[1])
|
|
}
|
|
i = t.index[uint16(i)<<6+uint16(s[1])&maskx]
|
|
if c0 < t4 {
|
|
return t.lookupValue(i, s[2])
|
|
}
|
|
i = t.index[uint16(i)<<6+uint16(s[2])&maskx]
|
|
if c0 < t5 {
|
|
return t.lookupValue(i, s[3])
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// lookupStringUnsafe returns the trie value for the first UTF-8 encoding in s.
|
|
// s must hold a full encoding.
|
|
func (t *trie) lookupStringUnsafe(s string) uint16 {
|
|
c0 := s[0]
|
|
if c0 < tx {
|
|
return t.values[c0]
|
|
}
|
|
if c0 < t2 {
|
|
return 0
|
|
}
|
|
i := t.index[c0]
|
|
if c0 < t3 {
|
|
return t.lookupValue(i, s[1])
|
|
}
|
|
i = t.index[uint16(i)<<6+uint16(s[1])&maskx]
|
|
if c0 < t4 {
|
|
return t.lookupValue(i, s[2])
|
|
}
|
|
i = t.index[uint16(i)<<6+uint16(s[2])&maskx]
|
|
if c0 < t5 {
|
|
return t.lookupValue(i, s[3])
|
|
}
|
|
return 0
|
|
}
|