// Copyright 2015 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package colltab // An Iter incrementally converts chunks of the input text to collation // elements, while ensuring that the collation elements are in normalized order // (that is, they are in the order as if the input text were normalized first). type Iter struct { Weighter Weighter Elems []Elem // N is the number of elements in Elems that will not be reordered on // subsequent iterations, N <= len(Elems). N int bytes []byte str string // Because the Elems buffer may contain collation elements that are needed // for look-ahead, we need two positions in the text (bytes or str): one for // the end position in the text for the current iteration and one for the // start of the next call to appendNext. pEnd int // end position in text corresponding to N. pNext int // pEnd <= pNext. } // Reset sets the position in the current input text to p and discards any // results obtained so far. func (i *Iter) Reset(p int) { i.Elems = i.Elems[:0] i.N = 0 i.pEnd = p i.pNext = p } // Len returns the length of the input text. func (i *Iter) Len() int { if i.bytes != nil { return len(i.bytes) } return len(i.str) } // Discard removes the collation elements up to N. func (i *Iter) Discard() { // TODO: change this such that only modifiers following starters will have // to be copied. i.Elems = i.Elems[:copy(i.Elems, i.Elems[i.N:])] i.N = 0 } // End returns the end position of the input text for which Next has returned // results. func (i *Iter) End() int { return i.pEnd } // SetInput resets i to input s. func (i *Iter) SetInput(s []byte) { i.bytes = s i.str = "" i.Reset(0) } // SetInputString resets i to input s. func (i *Iter) SetInputString(s string) { i.str = s i.bytes = nil i.Reset(0) } func (i *Iter) done() bool { return i.pNext >= len(i.str) && i.pNext >= len(i.bytes) } func (i *Iter) appendNext() bool { if i.done() { return false } var sz int if i.bytes == nil { i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str[i.pNext:]) } else { i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes[i.pNext:]) } if sz == 0 { sz = 1 } i.pNext += sz return true } // Next appends Elems to the internal array. On each iteration, it will either // add starters or modifiers. In the majority of cases, an Elem with a primary // value > 0 will have a CCC of 0. The CCC values of collation elements are also // used to detect if the input string was not normalized and to adjust the // result accordingly. func (i *Iter) Next() bool { if i.N == len(i.Elems) && !i.appendNext() { return false } // Check if the current segment starts with a starter. prevCCC := i.Elems[len(i.Elems)-1].CCC() if prevCCC == 0 { i.N = len(i.Elems) i.pEnd = i.pNext return true } else if i.Elems[i.N].CCC() == 0 { // set i.N to only cover part of i.Elems for which prevCCC == 0 and // use rest for the next call to next. for i.N++; i.N < len(i.Elems) && i.Elems[i.N].CCC() == 0; i.N++ { } i.pEnd = i.pNext return true } // The current (partial) segment starts with modifiers. We need to collect // all successive modifiers to ensure that they are normalized. for { p := len(i.Elems) i.pEnd = i.pNext if !i.appendNext() { break } if ccc := i.Elems[p].CCC(); ccc == 0 || len(i.Elems)-i.N > maxCombiningCharacters { // Leave the starter for the next iteration. This ensures that we // do not return sequences of collation elements that cross two // segments. // // TODO: handle large number of combining characters by fully // normalizing the input segment before iteration. This ensures // results are consistent across the text repo. i.N = p return true } else if ccc < prevCCC { i.doNorm(p, ccc) // should be rare, never occurs for NFD and FCC. } else { prevCCC = ccc } } done := len(i.Elems) != i.N i.N = len(i.Elems) return done } // nextNoNorm is the same as next, but does not "normalize" the collation // elements. func (i *Iter) nextNoNorm() bool { // TODO: remove this function. Using this instead of next does not seem // to improve performance in any significant way. We retain this until // later for evaluation purposes. if i.done() { return false } i.appendNext() i.N = len(i.Elems) return true } const maxCombiningCharacters = 30 // doNorm reorders the collation elements in i.Elems. // It assumes that blocks of collation elements added with appendNext // either start and end with the same CCC or start with CCC == 0. // This allows for a single insertion point for the entire block. // The correctness of this assumption is verified in builder.go. func (i *Iter) doNorm(p int, ccc uint8) { n := len(i.Elems) k := p for p--; p > i.N && ccc < i.Elems[p-1].CCC(); p-- { } i.Elems = append(i.Elems, i.Elems[p:k]...) copy(i.Elems[p:], i.Elems[k:]) i.Elems = i.Elems[:n] }