// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package ucd provides a parser for Unicode Character Database files, the // format of which is defined in http://www.unicode.org/reports/tr44/. See // http://www.unicode.org/Public/UCD/latest/ucd/ for example files. // // It currently does not support substitutions of missing fields. package ucd // import "golang.org/x/text/internal/ucd" import ( "bufio" "bytes" "errors" "fmt" "io" "log" "regexp" "strconv" "strings" ) // UnicodeData.txt fields. const ( CodePoint = iota Name GeneralCategory CanonicalCombiningClass BidiClass DecompMapping DecimalValue DigitValue NumericValue BidiMirrored Unicode1Name ISOComment SimpleUppercaseMapping SimpleLowercaseMapping SimpleTitlecaseMapping ) // Parse calls f for each entry in the given reader of a UCD file. It will close // the reader upon return. It will call log.Fatal if any error occurred. // // This implements the most common usage pattern of using Parser. func Parse(r io.ReadCloser, f func(p *Parser)) { defer r.Close() p := New(r) for p.Next() { f(p) } if err := p.Err(); err != nil { r.Close() // os.Exit will cause defers not to be called. log.Fatal(err) } } // An Option is used to configure a Parser. type Option func(p *Parser) func keepRanges(p *Parser) { p.keepRanges = true } var ( // KeepRanges prevents the expansion of ranges. The raw ranges can be // obtained by calling Range(0) on the parser. KeepRanges Option = keepRanges ) // The Part option register a handler for lines starting with a '@'. The text // after a '@' is available as the first field. Comments are handled as usual. func Part(f func(p *Parser)) Option { return func(p *Parser) { p.partHandler = f } } // The CommentHandler option passes comments that are on a line by itself to // a given handler. 
func CommentHandler(f func(s string)) Option {
	return func(p *Parser) {
		p.commentHandler = f
	}
}

// A Parser parses Unicode Character Database (UCD) files.
type Parser struct {
	scanner *bufio.Scanner

	keepRanges bool // Don't expand rune ranges in field 0.

	err     error    // First error recorded through setError.
	comment []byte   // Trailing comment of the current line.
	field   [][]byte // Fields of the current line.

	// parsedRange is needed in case Range(0) is called more than once for one
	// field. In some cases this requires scanning ahead.
	parsedRange          bool
	rangeStart, rangeEnd rune

	partHandler    func(p *Parser)
	commentHandler func(s string)
}

// setError records err unless an error has already been recorded, so the
// first error wins. Passing a nil err is harmless.
func (p *Parser) setError(err error) {
	if p.err == nil {
		p.err = err
	}
}

// getField returns field i of the current line. If i does not identify an
// existing field (it is negative or not smaller than the number of fields),
// an out-of-bounds error is recorded and nil is returned.
func (p *Parser) getField(i int) []byte {
	// The i < 0 guard turns a would-be index panic into the same recorded
	// error that an overly large index produces.
	if i < 0 || i >= len(p.field) {
		p.setError(fmt.Errorf("ucd: index of field %d out of bounds", i))
		return nil
	}
	return p.field[i]
}

// Err returns a non-nil error if any error occurred during parsing.
func (p *Parser) Err() error {
	return p.err
}

// New returns a Parser for the given Reader. The options are applied to the
// new Parser in the order in which they are passed.
func New(r io.Reader, o ...Option) *Parser {
	p := &Parser{
		scanner: bufio.NewScanner(r),
	}
	for _, f := range o {
		f(p)
	}
	return p
}

// Next parses the next line in the file. It returns true if a line was parsed
// and false if it reached the end of the file.
func (p *Parser) Next() bool { if !p.keepRanges && p.rangeStart < p.rangeEnd { p.rangeStart++ return true } p.comment = nil p.field = p.field[:0] p.parsedRange = false for p.scanner.Scan() { b := p.scanner.Bytes() if len(b) == 0 { continue } if b[0] == '#' { if p.commentHandler != nil { p.commentHandler(strings.TrimSpace(string(b[1:]))) } continue } // Parse line if i := bytes.IndexByte(b, '#'); i != -1 { p.comment = bytes.TrimSpace(b[i+1:]) b = b[:i] } if b[0] == '@' { if p.partHandler != nil { p.field = append(p.field, bytes.TrimSpace(b[1:])) p.partHandler(p) p.field = p.field[:0] } p.comment = nil continue } for { i := bytes.IndexByte(b, ';') if i == -1 { p.field = append(p.field, bytes.TrimSpace(b)) break } p.field = append(p.field, bytes.TrimSpace(b[:i])) b = b[i+1:] } if !p.keepRanges { p.rangeStart, p.rangeEnd = p.getRange(0) } return true } p.setError(p.scanner.Err()) return false } func parseRune(b []byte) (rune, error) { if len(b) > 2 && b[0] == 'U' && b[1] == '+' { b = b[2:] } x, err := strconv.ParseUint(string(b), 16, 32) return rune(x), err } func (p *Parser) parseRune(b []byte) rune { x, err := parseRune(b) p.setError(err) return x } // Rune parses and returns field i as a rune. func (p *Parser) Rune(i int) rune { if i > 0 || p.keepRanges { return p.parseRune(p.getField(i)) } return p.rangeStart } // Runes interprets and returns field i as a sequence of runes. func (p *Parser) Runes(i int) (runes []rune) { add := func(b []byte) { if b = bytes.TrimSpace(b); len(b) > 0 { runes = append(runes, p.parseRune(b)) } } for b := p.getField(i); ; { i := bytes.IndexByte(b, ' ') if i == -1 { add(b) break } add(b[:i]) b = b[i+1:] } return } var ( errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>") // reRange matches one line of a legacy rune range. reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$") ) // Range parses and returns field i as a rune range. A range is inclusive at // both ends. 
If the field only has one rune, first and last will be identical. // It supports the legacy format for ranges used in UnicodeData.txt. func (p *Parser) Range(i int) (first, last rune) { if !p.keepRanges { return p.rangeStart, p.rangeStart } return p.getRange(i) } func (p *Parser) getRange(i int) (first, last rune) { b := p.getField(i) if k := bytes.Index(b, []byte("..")); k != -1 { return p.parseRune(b[:k]), p.parseRune(b[k+2:]) } // The first field may not be a rune, in which case we may ignore any error // and set the range as 0..0. x, err := parseRune(b) if err != nil { // Disable range parsing henceforth. This ensures that an error will be // returned if the user subsequently will try to parse this field as // a Rune. p.keepRanges = true } // Special case for UnicodeData that was retained for backwards compatibility. if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) { if p.parsedRange { return p.rangeStart, p.rangeEnd } mf := reRange.FindStringSubmatch(p.scanner.Text()) if mf == nil || !p.scanner.Scan() { p.setError(errIncorrectLegacyRange) return x, x } // Using Bytes would be more efficient here, but Text is a lot easier // and this is not a frequent case. ml := reRange.FindStringSubmatch(p.scanner.Text()) if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] { p.setError(errIncorrectLegacyRange) return x, x } p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])]) p.parsedRange = true return p.rangeStart, p.rangeEnd } return x, x } // bools recognizes all valid UCD boolean values. var bools = map[string]bool{ "": false, "N": false, "No": false, "F": false, "False": false, "Y": true, "Yes": true, "T": true, "True": true, } // Bool parses and returns field i as a boolean value. func (p *Parser) Bool(i int) bool { b := p.getField(i) for s, v := range bools { if bstrEq(b, s) { return v } } p.setError(strconv.ErrSyntax) return false } // Int parses and returns field i as an integer value. 
func (p *Parser) Int(i int) int { x, err := strconv.ParseInt(string(p.getField(i)), 10, 64) p.setError(err) return int(x) } // Uint parses and returns field i as an unsigned integer value. func (p *Parser) Uint(i int) uint { x, err := strconv.ParseUint(string(p.getField(i)), 10, 64) p.setError(err) return uint(x) } // Float parses and returns field i as a decimal value. func (p *Parser) Float(i int) float64 { x, err := strconv.ParseFloat(string(p.getField(i)), 64) p.setError(err) return x } // String parses and returns field i as a string value. func (p *Parser) String(i int) string { return string(p.getField(i)) } // Strings parses and returns field i as a space-separated list of strings. func (p *Parser) Strings(i int) []string { ss := strings.Split(string(p.getField(i)), " ") for i, s := range ss { ss[i] = strings.TrimSpace(s) } return ss } // Comment returns the comments for the current line. func (p *Parser) Comment() string { return string(p.comment) } var errUndefinedEnum = errors.New("ucd: undefined enum value") // Enum interprets and returns field i as a value that must be one of the values // in enum. func (p *Parser) Enum(i int, enum ...string) string { b := p.getField(i) for _, s := range enum { if bstrEq(b, s) { return s } } p.setError(errUndefinedEnum) return "" } func bstrEq(b []byte, s string) bool { if len(b) != len(s) { return false } for i, c := range b { if c != s[i] { return false } } return true }