// Copyright 2012 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // The trie in this file is used to associate the first full character // in a UTF-8 string to a collation element. // All but the last byte in a UTF-8 byte sequence are // used to look up offsets in the index table to be used for the next byte. // The last byte is used to index into a table of collation elements. // This file contains the code for the generation of the trie. package build import ( "fmt" "hash/fnv" "io" "reflect" ) const ( blockSize = 64 blockOffset = 2 // Subtract 2 blocks to compensate for the 0x80 added to continuation bytes. ) type trieHandle struct { lookupStart uint16 // offset in table for first byte valueStart uint16 // offset in table for first byte } type trie struct { index []uint16 values []uint32 } // trieNode is the intermediate trie structure used for generating a trie. type trieNode struct { index []*trieNode value []uint32 b byte refValue uint16 refIndex uint16 } func newNode() *trieNode { return &trieNode{ index: make([]*trieNode, 64), value: make([]uint32, 128), // root node size is 128 instead of 64 } } func (n *trieNode) isInternal() bool { return n.value != nil } func (n *trieNode) insert(r rune, value uint32) { const maskx = 0x3F // mask out two most-significant bits str := string(r) if len(str) == 1 { n.value[str[0]] = value return } for i := 0; i < len(str)-1; i++ { b := str[i] & maskx if n.index == nil { n.index = make([]*trieNode, blockSize) } nn := n.index[b] if nn == nil { nn = &trieNode{} nn.b = b n.index[b] = nn } n = nn } if n.value == nil { n.value = make([]uint32, blockSize) } b := str[len(str)-1] & maskx n.value[b] = value } type trieBuilder struct { t *trie roots []*trieHandle lookupBlocks []*trieNode valueBlocks []*trieNode lookupBlockIdx map[uint32]*trieNode valueBlockIdx map[uint32]*trieNode } func newTrieBuilder() *trieBuilder { index := &trieBuilder{} index.lookupBlocks = make([]*trieNode, 0) index.valueBlocks = make([]*trieNode, 0) index.lookupBlockIdx = make(map[uint32]*trieNode) index.valueBlockIdx = make(map[uint32]*trieNode) // The third nil is the default null block. The other two blocks // are used to guarantee an offset of at least 3 for each block. index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil) index.t = &trie{} return index } func (b *trieBuilder) computeOffsets(n *trieNode) *trieNode { hasher := fnv.New32() if n.index != nil { for i, nn := range n.index { var vi, vv uint16 if nn != nil { nn = b.computeOffsets(nn) n.index[i] = nn vi = nn.refIndex vv = nn.refValue } hasher.Write([]byte{byte(vi >> 8), byte(vi)}) hasher.Write([]byte{byte(vv >> 8), byte(vv)}) } h := hasher.Sum32() nn, ok := b.lookupBlockIdx[h] if !ok { n.refIndex = uint16(len(b.lookupBlocks)) - blockOffset b.lookupBlocks = append(b.lookupBlocks, n) b.lookupBlockIdx[h] = n } else { n = nn } } else { for _, v := range n.value { hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}) } h := hasher.Sum32() nn, ok := b.valueBlockIdx[h] if !ok { n.refValue = uint16(len(b.valueBlocks)) - blockOffset n.refIndex = n.refValue b.valueBlocks = append(b.valueBlocks, n) b.valueBlockIdx[h] = n } else { n = nn } } return n } func (b *trieBuilder) addStartValueBlock(n *trieNode) uint16 { hasher := fnv.New32() for _, v := range n.value[:2*blockSize] { hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}) } h := hasher.Sum32() nn, ok := b.valueBlockIdx[h] if !ok { n.refValue = uint16(len(b.valueBlocks)) n.refIndex = n.refValue b.valueBlocks = append(b.valueBlocks, n) // Add a dummy block to accommodate the double block size. b.valueBlocks = append(b.valueBlocks, nil) b.valueBlockIdx[h] = n } else { n = nn } return n.refValue } func genValueBlock(t *trie, n *trieNode) { if n != nil { for _, v := range n.value { t.values = append(t.values, v) } } } func genLookupBlock(t *trie, n *trieNode) { for _, nn := range n.index { v := uint16(0) if nn != nil { if n.index != nil { v = nn.refIndex } else { v = nn.refValue } } t.index = append(t.index, v) } } func (b *trieBuilder) addTrie(n *trieNode) *trieHandle { h := &trieHandle{} b.roots = append(b.roots, h) h.valueStart = b.addStartValueBlock(n) if len(b.roots) == 1 { // We insert a null block after the first start value block. // This ensures that continuation bytes UTF-8 sequences of length // greater than 2 will automatically hit a null block if there // was an undefined entry. b.valueBlocks = append(b.valueBlocks, nil) } n = b.computeOffsets(n) // Offset by one extra block as the first byte starts at 0xC0 instead of 0x80. h.lookupStart = n.refIndex - 1 return h } // generate generates and returns the trie for n. func (b *trieBuilder) generate() (t *trie, err error) { t = b.t if len(b.valueBlocks) >= 1<<16 { return nil, fmt.Errorf("maximum number of value blocks exceeded (%d > %d)", len(b.valueBlocks), 1<<16) } if len(b.lookupBlocks) >= 1<<16 { return nil, fmt.Errorf("maximum number of lookup blocks exceeded (%d > %d)", len(b.lookupBlocks), 1<<16) } genValueBlock(t, b.valueBlocks[0]) genValueBlock(t, &trieNode{value: make([]uint32, 64)}) for i := 2; i < len(b.valueBlocks); i++ { genValueBlock(t, b.valueBlocks[i]) } n := &trieNode{index: make([]*trieNode, 64)} genLookupBlock(t, n) genLookupBlock(t, n) genLookupBlock(t, n) for i := 3; i < len(b.lookupBlocks); i++ { genLookupBlock(t, b.lookupBlocks[i]) } return b.t, nil } func (t *trie) printArrays(w io.Writer, name string) (n, size int, err error) { p := func(f string, a ...interface{}) { nn, e := fmt.Fprintf(w, f, a...) n += nn if err == nil { err = e } } nv := len(t.values) p("// %sValues: %d entries, %d bytes\n", name, nv, nv*4) p("// Block 2 is the null block.\n") p("var %sValues = [%d]uint32 {", name, nv) var printnewline bool for i, v := range t.values { if i%blockSize == 0 { p("\n\t// Block %#x, offset %#x", i/blockSize, i) } if i%4 == 0 { printnewline = true } if v != 0 { if printnewline { p("\n\t") printnewline = false } p("%#04x:%#08x, ", i, v) } } p("\n}\n\n") ni := len(t.index) p("// %sLookup: %d entries, %d bytes\n", name, ni, ni*2) p("// Block 0 is the null block.\n") p("var %sLookup = [%d]uint16 {", name, ni) printnewline = false for i, v := range t.index { if i%blockSize == 0 { p("\n\t// Block %#x, offset %#x", i/blockSize, i) } if i%8 == 0 { printnewline = true } if v != 0 { if printnewline { p("\n\t") printnewline = false } p("%#03x:%#02x, ", i, v) } } p("\n}\n\n") return n, nv*4 + ni*2, err } func (t *trie) printStruct(w io.Writer, handle *trieHandle, name string) (n, sz int, err error) { const msg = "trie{ %sLookup[%d:], %sValues[%d:], %sLookup[:], %sValues[:]}" n, err = fmt.Fprintf(w, msg, name, handle.lookupStart*blockSize, name, handle.valueStart*blockSize, name, name) sz += int(reflect.TypeOf(trie{}).Size()) return }