// Copyright 2012 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package build // import "golang.org/x/text/collate/build" import ( "fmt" "io" "log" "sort" "strings" "unicode/utf8" "golang.org/x/text/internal/colltab" "golang.org/x/text/language" "golang.org/x/text/unicode/norm" ) // TODO: optimizations: // - expandElem is currently 20K. By putting unique colElems in a separate // table and having a byte array of indexes into this table, we can reduce // the total size to about 7K. By also factoring out the length bytes, we // can reduce this to about 6K. // - trie valueBlocks are currently 100K. There are a lot of sparse blocks // and many consecutive values with the same stride. This can be further // compacted. // - Compress secondary weights into 8 bits. // - Some LDML specs specify a context element. Currently we simply concatenate // those. Context can be implemented using the contraction trie. If Builder // could analyze and detect when using a context makes sense, there is no // need to expose this construct in the API. // A Builder builds a root collation table. The user must specify the // collation elements for each entry. A common use will be to base the weights // on those specified in the allkeys* file as provided by the UCA or CLDR. type Builder struct { index *trieBuilder root ordering locale []*Tailoring t *table err error built bool minNonVar int // lowest primary recorded for a variable varTop int // highest primary recorded for a non-variable // indexes used for reusing expansions and contractions expIndex map[string]int // positions of expansions keyed by their string representation ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes ctElem map[string]int // contraction elements keyed by their string representation } // A Tailoring builds a collation table based on another collation table. // The table is defined by specifying tailorings to the underlying table. // See http://unicode.org/reports/tr35/ for an overview of tailoring // collation tables. The CLDR contains pre-defined tailorings for a variety // of languages (See http://www.unicode.org/Public/cldr//core.zip.) type Tailoring struct { id string builder *Builder index *ordering anchor *entry before bool } // NewBuilder returns a new Builder. func NewBuilder() *Builder { return &Builder{ index: newTrieBuilder(), root: makeRootOrdering(), expIndex: make(map[string]int), ctHandle: make(map[string]ctHandle), ctElem: make(map[string]int), } } // Tailoring returns a Tailoring for the given locale. One should // have completed all calls to Add before calling Tailoring. func (b *Builder) Tailoring(loc language.Tag) *Tailoring { t := &Tailoring{ id: loc.String(), builder: b, index: b.root.clone(), } t.index.id = t.id b.locale = append(b.locale, t) return t } // Add adds an entry to the collation element table, mapping // a slice of runes to a sequence of collation elements. // A collation element is specified as list of weights: []int{primary, secondary, ...}. // The entries are typically obtained from a collation element table // as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format. // Note that the collation elements specified by colelems are only used // as a guide. The actual weights generated by Builder may differ. // The argument variables is a list of indices into colelems that should contain // a value for each colelem that is a variable. (See the reference above.) func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error { str := string(runes) elems := make([]rawCE, len(colelems)) for i, ce := range colelems { if len(ce) == 0 { break } elems[i] = makeRawCE(ce, 0) if len(ce) == 1 { elems[i].w[1] = defaultSecondary } if len(ce) <= 2 { elems[i].w[2] = defaultTertiary } if len(ce) <= 3 { elems[i].w[3] = ce[0] } } for i, ce := range elems { p := ce.w[0] isvar := false for _, j := range variables { if i == j { isvar = true } } if isvar { if p >= b.minNonVar && b.minNonVar > 0 { return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar) } if p > b.varTop { b.varTop = p } } else if p > 1 { // 1 is a special primary value reserved for FFFE if p <= b.varTop { return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop) } if b.minNonVar == 0 || p < b.minNonVar { b.minNonVar = p } } } elems, err := convertLargeWeights(elems) if err != nil { return err } cccs := []uint8{} nfd := norm.NFD.String(str) for i := range nfd { cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC()) } if len(cccs) < len(elems) { if len(cccs) > 2 { return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems)) } p := len(elems) - 1 for ; p > 0 && elems[p].w[0] == 0; p-- { elems[p].ccc = cccs[len(cccs)-1] } for ; p >= 0; p-- { elems[p].ccc = cccs[0] } } else { for i := range elems { elems[i].ccc = cccs[i] } } // doNorm in collate.go assumes that the following conditions hold. if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] { return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs) } b.root.newEntry(str, elems) return nil } func (t *Tailoring) setAnchor(anchor string) error { anchor = norm.NFC.String(anchor) a := t.index.find(anchor) if a == nil { a = t.index.newEntry(anchor, nil) a.implicit = true a.modified = true for _, r := range []rune(anchor) { e := t.index.find(string(r)) e.lock = true } } t.anchor = a return nil } // SetAnchor sets the point after which elements passed in subsequent calls to // Insert will be inserted. It is equivalent to the reset directive in an LDML // specification. See Insert for an example. // SetAnchor supports the following logical reset positions: // , , , // and . func (t *Tailoring) SetAnchor(anchor string) error { if err := t.setAnchor(anchor); err != nil { return err } t.before = false return nil } // SetAnchorBefore is similar to SetAnchor, except that subsequent calls to // Insert will insert entries before the anchor. func (t *Tailoring) SetAnchorBefore(anchor string) error { if err := t.setAnchor(anchor); err != nil { return err } t.before = true return nil } // Insert sets the ordering of str relative to the entry set by the previous // call to SetAnchor or Insert. The argument extend corresponds // to the extend elements as defined in LDML. A non-empty value for extend // will cause the collation elements corresponding to extend to be appended // to the collation elements generated for the entry added by Insert. // This has the same net effect as sorting str after the string anchor+extend. // See http://www.unicode.org/reports/tr10/#Tailoring_Example for details // on parametric tailoring and http://unicode.org/reports/tr35/#Collation_Elements // for full details on LDML. // // Examples: create a tailoring for Swedish, where "ä" is ordered after "z" // at the primary sorting level: // t := b.Tailoring("se") // t.SetAnchor("z") // t.Insert(colltab.Primary, "ä", "") // Order "ü" after "ue" at the secondary sorting level: // t.SetAnchor("ue") // t.Insert(colltab.Secondary, "ü","") // or // t.SetAnchor("u") // t.Insert(colltab.Secondary, "ü", "e") // Order "q" afer "ab" at the secondary level and "Q" after "q" // at the tertiary level: // t.SetAnchor("ab") // t.Insert(colltab.Secondary, "q", "") // t.Insert(colltab.Tertiary, "Q", "") // Order "b" before "a": // t.SetAnchorBefore("a") // t.Insert(colltab.Primary, "b", "") // Order "0" after the last primary ignorable: // t.SetAnchor("") // t.Insert(colltab.Primary, "0", "") func (t *Tailoring) Insert(level colltab.Level, str, extend string) error { if t.anchor == nil { return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str) } str = norm.NFC.String(str) e := t.index.find(str) if e == nil { e = t.index.newEntry(str, nil) } else if e.logical != noAnchor { return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str) } if e.lock { return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str) } a := t.anchor // Find the first element after the anchor which differs at a level smaller or // equal to the given level. Then insert at this position. // See http://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details. e.before = t.before if t.before { t.before = false if a.prev == nil { a.insertBefore(e) } else { for a = a.prev; a.level > level; a = a.prev { } a.insertAfter(e) } e.level = level } else { for ; a.level > level; a = a.next { } e.level = a.level if a != e { a.insertAfter(e) a.level = level } else { // We don't set a to prev itself. This has the effect of the entry // getting new collation elements that are an increment of itself. // This is intentional. a.prev.level = level } } e.extend = norm.NFD.String(extend) e.exclude = false e.modified = true e.elems = nil t.anchor = e return nil } func (o *ordering) getWeight(e *entry) []rawCE { if len(e.elems) == 0 && e.logical == noAnchor { if e.implicit { for _, r := range e.runes { e.elems = append(e.elems, o.getWeight(o.find(string(r)))...) } } else if e.before { count := [colltab.Identity + 1]int{} a := e for ; a.elems == nil && !a.implicit; a = a.next { count[a.level]++ } e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)} for i := colltab.Primary; i < colltab.Quaternary; i++ { if count[i] != 0 { e.elems[0].w[i] -= count[i] break } } if e.prev != nil { o.verifyWeights(e.prev, e, e.prev.level) } } else { prev := e.prev e.elems = nextWeight(prev.level, o.getWeight(prev)) o.verifyWeights(e, e.next, e.level) } } return e.elems } func (o *ordering) addExtension(e *entry) { if ex := o.find(e.extend); ex != nil { e.elems = append(e.elems, ex.elems...) } else { for _, r := range []rune(e.extend) { e.elems = append(e.elems, o.find(string(r)).elems...) } } e.extend = "" } func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error { if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil { return nil } for i := colltab.Primary; i < level; i++ { if a.elems[0].w[i] < b.elems[0].w[i] { return nil } } if a.elems[0].w[level] >= b.elems[0].w[level] { err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems) log.Println(err) // TODO: return the error instead, or better, fix the conflicting entry by making room. } return nil } func (b *Builder) error(e error) { if e != nil { b.err = e } } func (b *Builder) errorID(locale string, e error) { if e != nil { b.err = fmt.Errorf("%s:%v", locale, e) } } // patchNorm ensures that NFC and NFD counterparts are consistent. func (o *ordering) patchNorm() { // Insert the NFD counterparts, if necessary. for _, e := range o.ordered { nfd := norm.NFD.String(e.str) if nfd != e.str { if e0 := o.find(nfd); e0 != nil && !e0.modified { e0.elems = e.elems } else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) { e := o.newEntry(nfd, e.elems) e.modified = true } } } // Update unchanged composed forms if one of their parts changed. for _, e := range o.ordered { nfd := norm.NFD.String(e.str) if e.modified || nfd == e.str { continue } if e0 := o.find(nfd); e0 != nil { e.elems = e0.elems } else { e.elems = o.genColElems(nfd) if norm.NFD.LastBoundary([]byte(nfd)) == 0 { r := []rune(nfd) head := string(r[0]) tail := "" for i := 1; i < len(r); i++ { s := norm.NFC.String(head + string(r[i])) if e0 := o.find(s); e0 != nil && e0.modified { head = s } else { tail += string(r[i]) } } e.elems = append(o.genColElems(head), o.genColElems(tail)...) } } } // Exclude entries for which the individual runes generate the same collation elements. for _, e := range o.ordered { if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) { e.exclude = true } } } func (b *Builder) buildOrdering(o *ordering) { for _, e := range o.ordered { o.getWeight(e) } for _, e := range o.ordered { o.addExtension(e) } o.patchNorm() o.sort() simplify(o) b.processExpansions(o) // requires simplify b.processContractions(o) // requires simplify t := newNode() for e := o.front(); e != nil; e, _ = e.nextIndexed() { if !e.skip() { ce, err := e.encode() b.errorID(o.id, err) t.insert(e.runes[0], ce) } } o.handle = b.index.addTrie(t) } func (b *Builder) build() (*table, error) { if b.built { return b.t, b.err } b.built = true b.t = &table{ Table: colltab.Table{ MaxContractLen: utf8.UTFMax, VariableTop: uint32(b.varTop), }, } b.buildOrdering(&b.root) b.t.root = b.root.handle for _, t := range b.locale { b.buildOrdering(t.index) if b.err != nil { break } } i, err := b.index.generate() b.t.trie = *i b.t.Index = colltab.Trie{ Index: i.index, Values: i.values, Index0: i.index[blockSize*b.t.root.lookupStart:], Values0: i.values[blockSize*b.t.root.valueStart:], } b.error(err) return b.t, b.err } // Build builds the root Collator. func (b *Builder) Build() (colltab.Weighter, error) { table, err := b.build() if err != nil { return nil, err } return table, nil } // Build builds a Collator for Tailoring t. func (t *Tailoring) Build() (colltab.Weighter, error) { // TODO: implement. return nil, nil } // Print prints the tables for b and all its Tailorings as a Go file // that can be included in the Collate package. func (b *Builder) Print(w io.Writer) (n int, err error) { p := func(nn int, e error) { n += nn if err == nil { err = e } } t, err := b.build() if err != nil { return 0, err } p(fmt.Fprintf(w, `var availableLocales = "und`)) for _, loc := range b.locale { if loc.id != "und" { p(fmt.Fprintf(w, ",%s", loc.id)) } } p(fmt.Fprint(w, "\"\n\n")) p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop)) p(fmt.Fprintln(w, "var locales = [...]tableIndex{")) for _, loc := range b.locale { if loc.id == "und" { p(t.fprintIndex(w, loc.index.handle, loc.id)) } } for _, loc := range b.locale { if loc.id != "und" { p(t.fprintIndex(w, loc.index.handle, loc.id)) } } p(fmt.Fprint(w, "}\n\n")) n, _, err = t.fprint(w, "main") return } // reproducibleFromNFKD checks whether the given expansion could be generated // from an NFKD expansion. func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool { // Length must be equal. if len(exp) != len(nfkd) { return false } for i, ce := range exp { // Primary and secondary values should be equal. if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] { return false } // Tertiary values should be equal to maxTertiary for third element onwards. // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can // simply be dropped. Try this out by dropping the following code. if i >= 2 && ce.w[2] != maxTertiary { return false } if _, err := makeCE(ce); err != nil { // Simply return false. The error will be caught elsewhere. return false } } return true } func simplify(o *ordering) { // Runes that are a starter of a contraction should not be removed. // (To date, there is only Kannada character 0CCA.) keep := make(map[rune]bool) for e := o.front(); e != nil; e, _ = e.nextIndexed() { if len(e.runes) > 1 { keep[e.runes[0]] = true } } // Tag entries for which the runes NFKD decompose to identical values. for e := o.front(); e != nil; e, _ = e.nextIndexed() { s := e.str nfkd := norm.NFKD.String(s) nfd := norm.NFD.String(s) if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd { continue } if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) { e.decompose = true } } } // appendExpansion converts the given collation sequence to // collation elements and adds them to the expansion table. // It returns an index to the expansion table. func (b *Builder) appendExpansion(e *entry) int { t := b.t i := len(t.ExpandElem) ce := uint32(len(e.elems)) t.ExpandElem = append(t.ExpandElem, ce) for _, w := range e.elems { ce, err := makeCE(w) if err != nil { b.error(err) return -1 } t.ExpandElem = append(t.ExpandElem, ce) } return i } // processExpansions extracts data necessary to generate // the extraction tables. func (b *Builder) processExpansions(o *ordering) { for e := o.front(); e != nil; e, _ = e.nextIndexed() { if !e.expansion() { continue } key := fmt.Sprintf("%v", e.elems) i, ok := b.expIndex[key] if !ok { i = b.appendExpansion(e) b.expIndex[key] = i } e.expansionIndex = i } } func (b *Builder) processContractions(o *ordering) { // Collate contractions per starter rune. starters := []rune{} cm := make(map[rune][]*entry) for e := o.front(); e != nil; e, _ = e.nextIndexed() { if e.contraction() { if len(e.str) > b.t.MaxContractLen { b.t.MaxContractLen = len(e.str) } r := e.runes[0] if _, ok := cm[r]; !ok { starters = append(starters, r) } cm[r] = append(cm[r], e) } } // Add entries of single runes that are at a start of a contraction. for e := o.front(); e != nil; e, _ = e.nextIndexed() { if !e.contraction() { r := e.runes[0] if _, ok := cm[r]; ok { cm[r] = append(cm[r], e) } } } // Build the tries for the contractions. t := b.t for _, r := range starters { l := cm[r] // Compute suffix strings. There are 31 different contraction suffix // sets for 715 contractions and 82 contraction starter runes as of // version 6.0.0. sufx := []string{} hasSingle := false for _, e := range l { if len(e.runes) > 1 { sufx = append(sufx, string(e.runes[1:])) } else { hasSingle = true } } if !hasSingle { b.error(fmt.Errorf("no single entry for starter rune %U found", r)) continue } // Unique the suffix set. sort.Strings(sufx) key := strings.Join(sufx, "\n") handle, ok := b.ctHandle[key] if !ok { var err error handle, err = appendTrie(&t.ContractTries, sufx) if err != nil { b.error(err) } b.ctHandle[key] = handle } // Bucket sort entries in index order. es := make([]*entry, len(l)) for _, e := range l { var p, sn int if len(e.runes) > 1 { str := []byte(string(e.runes[1:])) p, sn = lookup(&t.ContractTries, handle, str) if sn != len(str) { log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str)) } } if es[p] != nil { log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0]) } es[p] = e } // Create collation elements for contractions. elems := []uint32{} for _, e := range es { ce, err := e.encodeBase() b.errorID(o.id, err) elems = append(elems, ce) } key = fmt.Sprintf("%v", elems) i, ok := b.ctElem[key] if !ok { i = len(t.ContractElem) b.ctElem[key] = i t.ContractElem = append(t.ContractElem, elems...) } // Store info in entry for starter rune. es[0].contractionIndex = i es[0].contractionHandle = handle } }