// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package colltab import ( "unicode" "unicode/utf8" ) // NewNumericWeighter wraps w to replace individual digits to sort based on their // numeric value. // // Weighter w must have a free primary weight after the primary weight for 9. // If this is not the case, numeric value will sort at the same primary level // as the first primary sorting after 9. func NewNumericWeighter(w Weighter) Weighter { getElem := func(s string) Elem { elems, _ := w.AppendNextString(nil, s) return elems[0] } nine := getElem("9") // Numbers should order before zero, but the DUCET has no room for this. // TODO: move before zero once we use fractional collation elements. ns, _ := MakeElem(nine.Primary()+1, nine.Secondary(), int(nine.Tertiary()), 0) return &numericWeighter{ Weighter: w, // We assume that w sorts digits of different kinds in order of numeric // value and that the tertiary weight order is preserved. // // TODO: evaluate whether it is worth basing the ranges on the Elem // encoding itself once the move to fractional weights is complete. zero: getElem("0"), zeroSpecialLo: getElem("0"), // U+FF10 FULLWIDTH DIGIT ZERO zeroSpecialHi: getElem("₀"), // U+2080 SUBSCRIPT ZERO nine: nine, nineSpecialHi: getElem("₉"), // U+2089 SUBSCRIPT NINE numberStart: ns, } } // A numericWeighter translates a stream of digits into a stream of weights // representing the numeric value. type numericWeighter struct { Weighter // The Elems below all demarcate boundaries of specific ranges. With the // current element encoding digits are in two ranges: normal (default // tertiary value) and special. For most languages, digits have collation // elements in the normal range. // // Note: the range tests are very specific for the element encoding used by // this implementation. The tests in collate_test.go are designed to fail // if this code is not updated when an encoding has changed. zero Elem // normal digit zero zeroSpecialLo Elem // special digit zero, low tertiary value zeroSpecialHi Elem // special digit zero, high tertiary value nine Elem // normal digit nine nineSpecialHi Elem // special digit nine numberStart Elem } // AppendNext calls the namesake of the underlying weigher, but replaces single // digits with weights representing their value. func (nw *numericWeighter) AppendNext(buf []Elem, s []byte) (ce []Elem, n int) { ce, n = nw.Weighter.AppendNext(buf, s) nc := numberConverter{ elems: buf, w: nw, b: s, } isZero, ok := nc.checkNextDigit(ce) if !ok { return ce, n } // ce might have been grown already, so take it instead of buf. nc.init(ce, len(buf), isZero) for n < len(s) { ce, sz := nw.Weighter.AppendNext(nc.elems, s[n:]) nc.b = s n += sz if !nc.update(ce) { break } } return nc.result(), n } // AppendNextString calls the namesake of the underlying weigher, but replaces // single digits with weights representing their value. func (nw *numericWeighter) AppendNextString(buf []Elem, s string) (ce []Elem, n int) { ce, n = nw.Weighter.AppendNextString(buf, s) nc := numberConverter{ elems: buf, w: nw, s: s, } isZero, ok := nc.checkNextDigit(ce) if !ok { return ce, n } nc.init(ce, len(buf), isZero) for n < len(s) { ce, sz := nw.Weighter.AppendNextString(nc.elems, s[n:]) nc.s = s n += sz if !nc.update(ce) { break } } return nc.result(), n } type numberConverter struct { w *numericWeighter elems []Elem nDigits int lenIndex int s string // set if the input was of type string b []byte // set if the input was of type []byte } // init completes initialization of a numberConverter and prepares it for adding // more digits. elems is assumed to have a digit starting at oldLen. func (nc *numberConverter) init(elems []Elem, oldLen int, isZero bool) { // Insert a marker indicating the start of a number and and a placeholder // for the number of digits. if isZero { elems = append(elems[:oldLen], nc.w.numberStart, 0) } else { elems = append(elems, 0, 0) copy(elems[oldLen+2:], elems[oldLen:]) elems[oldLen] = nc.w.numberStart elems[oldLen+1] = 0 nc.nDigits = 1 } nc.elems = elems nc.lenIndex = oldLen + 1 } // checkNextDigit reports whether bufNew adds a single digit relative to the old // buffer. If it does, it also reports whether this digit is zero. func (nc *numberConverter) checkNextDigit(bufNew []Elem) (isZero, ok bool) { if len(nc.elems) >= len(bufNew) { return false, false } e := bufNew[len(nc.elems)] if e < nc.w.zeroSpecialLo || nc.w.nine < e { // Not a number. return false, false } if e < nc.w.zero { if e > nc.w.nineSpecialHi { // Not a number. return false, false } if !nc.isDigit() { return false, false } isZero = e <= nc.w.zeroSpecialHi } else { // This is the common case if we encounter a digit. isZero = e == nc.w.zero } // Test the remaining added collation elements have a zero primary value. if n := len(bufNew) - len(nc.elems); n > 1 { for i := len(nc.elems) + 1; i < len(bufNew); i++ { if bufNew[i].Primary() != 0 { return false, false } } // In some rare cases, collation elements will encode runes in // unicode.No as a digit. For example Ethiopic digits (U+1369 - U+1371) // are not in Nd. Also some digits that clearly belong in unicode.No, // like U+0C78 TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR, have // collation elements indistinguishable from normal digits. // Unfortunately, this means we need to make this check for nearly all // non-Latin digits. // // TODO: check the performance impact and find something better if it is // an issue. if !nc.isDigit() { return false, false } } return isZero, true } func (nc *numberConverter) isDigit() bool { if nc.b != nil { r, _ := utf8.DecodeRune(nc.b) return unicode.In(r, unicode.Nd) } r, _ := utf8.DecodeRuneInString(nc.s) return unicode.In(r, unicode.Nd) } // We currently support a maximum of about 2M digits (the number of primary // values). Such numbers will compare correctly against small numbers, but their // comparison against other large numbers is undefined. // // TODO: define a proper fallback, such as comparing large numbers textually or // actually allowing numbers of unlimited length. // // TODO: cap this to a lower number (like 100) and maybe allow a larger number // in an option? const maxDigits = 1<