From 921818bca208f0c70e85ec670074cb3905cbbc82 Mon Sep 17 00:00:00 2001
From: Niall Sheridan
Date: Sat, 27 Aug 2016 01:32:30 +0100
Subject: Update dependencies

---
 vendor/golang.org/x/text/internal/gen/code.go      | 338 ++++++++++++++
 vendor/golang.org/x/text/internal/gen/gen.go       | 226 ++++
 .../golang.org/x/text/internal/triegen/compact.go  |  58 +++
 vendor/golang.org/x/text/internal/triegen/print.go | 251 +++++++++++
 .../golang.org/x/text/internal/triegen/triegen.go  | 494 +++++++++++++++++++++
 vendor/golang.org/x/text/internal/ucd/ucd.go       | 363 +++++++++++++++
 6 files changed, 1730 insertions(+)
 create mode 100644 vendor/golang.org/x/text/internal/gen/code.go
 create mode 100644 vendor/golang.org/x/text/internal/gen/gen.go
 create mode 100644 vendor/golang.org/x/text/internal/triegen/compact.go
 create mode 100644 vendor/golang.org/x/text/internal/triegen/print.go
 create mode 100644 vendor/golang.org/x/text/internal/triegen/triegen.go
 create mode 100644 vendor/golang.org/x/text/internal/ucd/ucd.go
(limited to 'vendor/golang.org/x/text/internal')

diff --git a/vendor/golang.org/x/text/internal/gen/code.go b/vendor/golang.org/x/text/internal/gen/code.go
new file mode 100644
index 0000000..ca917d2
--- /dev/null
+++ b/vendor/golang.org/x/text/internal/gen/code.go
@@ -0,0 +1,338 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gen
+
+import (
+	"bytes"
+	"encoding/gob"
+	"fmt"
+	"hash"
+	"hash/fnv"
+	"io"
+	"log"
+	"os"
+	"reflect"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// This file contains utilities for generating code.
+
+// TODO: other write methods like:
+// - slices, maps, types, etc.
+
+// CodeWriter is a utility for writing structured code. It computes the content
+// hash and size of written content. It ensures there are newlines between
+// written code blocks.
+type CodeWriter struct {
+	buf  bytes.Buffer
+	Size int
+	Hash hash.Hash32 // content hash
+	gob  *gob.Encoder
+	// For comments we skip the usual one-line separator if they are followed by
+	// a code block.
+	skipSep bool
+}
+
+func (w *CodeWriter) Write(p []byte) (n int, err error) {
+	return w.buf.Write(p)
+}
+
+// NewCodeWriter returns a new CodeWriter.
+func NewCodeWriter() *CodeWriter {
+	h := fnv.New32()
+	return &CodeWriter{Hash: h, gob: gob.NewEncoder(h)}
+}
+
+// WriteGoFile appends the total size of all created structures to the buffer
+// and writes it as a Go file to the given file with the given package name.
+func (w *CodeWriter) WriteGoFile(filename, pkg string) {
+	f, err := os.Create(filename)
+	if err != nil {
+		log.Fatalf("Could not create file %s: %v", filename, err)
+	}
+	defer f.Close()
+	if _, err = w.WriteGo(f, pkg); err != nil {
+		log.Fatalf("Error writing file %s: %v", filename, err)
+	}
+}
+
+// WriteGo appends the total size of all created structures to the buffer and
+// writes it as a Go file to the given writer with the given package name.
+func (w *CodeWriter) WriteGo(out io.Writer, pkg string) (n int, err error) {
+	sz := w.Size
+	w.WriteComment("Total table size %d bytes (%dKiB); checksum: %X\n", sz, sz/1024, w.Hash.Sum32())
+	defer w.buf.Reset()
+	return WriteGo(out, pkg, w.buf.Bytes())
+}
+
+func (w *CodeWriter) printf(f string, x ...interface{}) {
+	fmt.Fprintf(w, f, x...)
+}
+
+func (w *CodeWriter) insertSep() {
+	if w.skipSep {
+		w.skipSep = false
+		return
+	}
+	// Use at least two newlines to ensure a blank space between the previous
+	// block. WriteGoFile will remove extraneous newlines.
+	w.printf("\n\n")
+}
+
+// WriteComment writes a comment block. All line starts are prefixed with "//".
+// Initial empty lines are gobbled. The indentation for the first line is
+// stripped from consecutive lines.
+func (w *CodeWriter) WriteComment(comment string, args ...interface{}) {
+	s := fmt.Sprintf(comment, args...)
+	s = strings.Trim(s, "\n")
+
+	// Use at least two newlines to ensure a blank space between the previous
+	// block. WriteGoFile will remove extraneous newlines.
+	w.printf("\n\n// ")
+	w.skipSep = true
+
+	// strip first indent level.
+	sep := "\n"
+	for ; len(s) > 0 && (s[0] == '\t' || s[0] == ' '); s = s[1:] {
+		sep += s[:1]
+	}
+
+	strings.NewReplacer(sep, "\n// ", "\n", "\n// ").WriteString(w, s)
+
+	w.printf("\n")
+}
+
+func (w *CodeWriter) writeSizeInfo(size int) {
+	w.printf("// Size: %d bytes\n", size)
+}
+
+// WriteConst writes a constant of the given name and value.
+func (w *CodeWriter) WriteConst(name string, x interface{}) {
+	w.insertSep()
+	v := reflect.ValueOf(x)
+
+	switch v.Type().Kind() {
+	case reflect.String:
+		// See golang.org/issue/13145.
+		const arbitraryCutoff = 16
+		if v.Len() > arbitraryCutoff {
+			w.printf("var %s %s = ", name, typeName(x))
+		} else {
+			w.printf("const %s %s = ", name, typeName(x))
+		}
+		w.WriteString(v.String())
+		w.printf("\n")
+	default:
+		w.printf("const %s = %#v\n", name, x)
+	}
+}
+
+// WriteVar writes a variable of the given name and value.
+func (w *CodeWriter) WriteVar(name string, x interface{}) {
+	w.insertSep()
+	v := reflect.ValueOf(x)
+	oldSize := w.Size
+	sz := int(v.Type().Size())
+	w.Size += sz
+
+	switch v.Type().Kind() {
+	case reflect.String:
+		w.printf("var %s %s = ", name, typeName(x))
+		w.WriteString(v.String())
+	case reflect.Struct:
+		w.gob.Encode(x)
+		fallthrough
+	case reflect.Slice, reflect.Array:
+		w.printf("var %s = ", name)
+		w.writeValue(v)
+		w.writeSizeInfo(w.Size - oldSize)
+	default:
+		w.printf("var %s %s = ", name, typeName(x))
+		w.gob.Encode(x)
+		w.writeValue(v)
+		w.writeSizeInfo(w.Size - oldSize)
+	}
+	w.printf("\n")
+}
+
+func (w *CodeWriter) writeValue(v reflect.Value) {
+	x := v.Interface()
+	switch v.Kind() {
+	case reflect.String:
+		w.WriteString(v.String())
+	case reflect.Array:
+		// Don't double count: callers of WriteArray count on the size being
+		// added, so we need to discount it here.
+		w.Size -= int(v.Type().Size())
+		w.writeSlice(x, true)
+	case reflect.Slice:
+		w.writeSlice(x, false)
+	case reflect.Struct:
+		w.printf("%s{\n", typeName(v.Interface()))
+		t := v.Type()
+		for i := 0; i < v.NumField(); i++ {
+			w.printf("%s: ", t.Field(i).Name)
+			w.writeValue(v.Field(i))
+			w.printf(",\n")
+		}
+		w.printf("}")
+	default:
+		w.printf("%#v", x)
+	}
+}
+
+// WriteString writes a string literal.
+func (w *CodeWriter) WriteString(s string) {
+	io.WriteString(w.Hash, s) // content hash
+	w.Size += len(s)
+
+	const maxInline = 40
+	if len(s) <= maxInline {
+		w.printf("%q", s)
+		return
+	}
+
+	// We will render the string as a multi-line string.
+	const maxWidth = 80 - 4 - len(`"`) - len(`" +`)
+
+	// When starting on its own line, go fmt indents line 2+ an extra level.
+	n, max := maxWidth, maxWidth-4
+
+	// Print "" +\n, if a string does not start on its own line.
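+	// Peek at what has been written so far to see whether we are mid-line.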
+ b := w.buf.Bytes() + if p := len(bytes.TrimRight(b, " \t")); p > 0 && b[p-1] != '\n' { + w.printf("\"\" + // Size: %d bytes\n", len(s)) + n, max = maxWidth, maxWidth + } + + w.printf(`"`) + + for sz, p := 0, 0; p < len(s); { + var r rune + r, sz = utf8.DecodeRuneInString(s[p:]) + out := s[p : p+sz] + chars := 1 + if !unicode.IsPrint(r) || r == utf8.RuneError || r == '"' { + switch sz { + case 1: + out = fmt.Sprintf("\\x%02x", s[p]) + case 2, 3: + out = fmt.Sprintf("\\u%04x", r) + case 4: + out = fmt.Sprintf("\\U%08x", r) + } + chars = len(out) + } + if n -= chars; n < 0 { + w.printf("\" +\n\"") + n = max - len(out) + } + w.printf("%s", out) + p += sz + } + w.printf(`"`) +} + +// WriteSlice writes a slice value. +func (w *CodeWriter) WriteSlice(x interface{}) { + w.writeSlice(x, false) +} + +// WriteArray writes an array value. +func (w *CodeWriter) WriteArray(x interface{}) { + w.writeSlice(x, true) +} + +func (w *CodeWriter) writeSlice(x interface{}, isArray bool) { + v := reflect.ValueOf(x) + w.gob.Encode(v.Len()) + w.Size += v.Len() * int(v.Type().Elem().Size()) + name := typeName(x) + if isArray { + name = fmt.Sprintf("[%d]%s", v.Len(), name[strings.Index(name, "]")+1:]) + } + if isArray { + w.printf("%s{\n", name) + } else { + w.printf("%s{ // %d elements\n", name, v.Len()) + } + + switch kind := v.Type().Elem().Kind(); kind { + case reflect.String: + for _, s := range x.([]string) { + w.WriteString(s) + w.printf(",\n") + } + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, + reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: + // nLine and nBlock are the number of elements per line and block. + nLine, nBlock, format := 8, 64, "%d," + switch kind { + case reflect.Uint8: + format = "%#02x," + case reflect.Uint16: + format = "%#04x," + case reflect.Uint32: + nLine, nBlock, format = 4, 32, "%#08x," + case reflect.Uint, reflect.Uint64: + nLine, nBlock, format = 4, 32, "%#016x," + case reflect.Int8: + nLine = 16 + } + n := nLine + for i := 0; i < v.Len(); i++ { + if i%nBlock == 0 && v.Len() > nBlock { + w.printf("// Entry %X - %X\n", i, i+nBlock-1) + } + x := v.Index(i).Interface() + w.gob.Encode(x) + w.printf(format, x) + if n--; n == 0 { + n = nLine + w.printf("\n") + } + } + w.printf("\n") + case reflect.Struct: + zero := reflect.Zero(v.Type().Elem()).Interface() + for i := 0; i < v.Len(); i++ { + x := v.Index(i).Interface() + w.gob.EncodeValue(v) + if !reflect.DeepEqual(zero, x) { + line := fmt.Sprintf("%#v,\n", x) + line = line[strings.IndexByte(line, '{'):] + w.printf("%d: ", i) + w.printf(line) + } + } + case reflect.Array: + for i := 0; i < v.Len(); i++ { + w.printf("%d: %#v,\n", i, v.Index(i).Interface()) + } + default: + panic("gen: slice elem type not supported") + } + w.printf("}") +} + +// WriteType writes a definition of the type of the given value and returns the +// type name. +func (w *CodeWriter) WriteType(x interface{}) string { + t := reflect.TypeOf(x) + w.printf("type %s struct {\n", t.Name()) + for i := 0; i < t.NumField(); i++ { + w.printf("\t%s %s\n", t.Field(i).Name, t.Field(i).Type) + } + w.printf("}\n") + return t.Name() +} + +// typeName returns the name of the go type of x. 
+func typeName(x interface{}) string {
+	t := reflect.ValueOf(x).Type()
+	return strings.Replace(fmt.Sprint(t), "main.", "", 1)
+}
diff --git a/vendor/golang.org/x/text/internal/gen/gen.go b/vendor/golang.org/x/text/internal/gen/gen.go
new file mode 100644
index 0000000..dfaa278
--- /dev/null
+++ b/vendor/golang.org/x/text/internal/gen/gen.go
@@ -0,0 +1,226 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package gen contains common code for the various code generation tools in the
+// text repository. Its usage ensures consistency between tools.
+//
+// This package defines command line flags that are common to most generation
+// tools. The flags allow for specifying specific Unicode and CLDR versions
+// in the public Unicode data repository (http://www.unicode.org/Public).
+//
+// A local Unicode data mirror can be set through the flag -local or the
+// environment variable UNICODE_DIR. The former takes precedence. The local
+// directory should follow the same structure as the public repository.
+//
+// IANA data can also optionally be mirrored by putting it in the iana directory
+// rooted at the top of the local mirror. Beware, though, that IANA data is not
+// versioned, so it is up to the developer to use the right version.
+package gen // import "golang.org/x/text/internal/gen"
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"go/format"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"os"
+	"path"
+	"path/filepath"
+	"unicode"
+
+	"golang.org/x/text/unicode/cldr"
+)
+
+var (
+	url = flag.String("url",
+		"http://www.unicode.org/Public",
+		"URL of Unicode database directory")
+	iana = flag.String("iana",
+		"http://www.iana.org",
+		"URL of the IANA repository")
+	unicodeVersion = flag.String("unicode",
+		getEnv("UNICODE_VERSION", unicode.Version),
+		"unicode version to use")
+	cldrVersion = flag.String("cldr",
+		getEnv("CLDR_VERSION", cldr.Version),
+		"cldr version to use")
+	// Allow an environment variable to specify the local directory.
+	// go generate doesn't allow specifying arguments; this is a useful
+	// alternative to specifying a local mirror.
+	localDir = flag.String("local",
+		os.Getenv("UNICODE_DIR"),
+		"directory containing local data files; for debugging only.")
+)
+
+func getEnv(name, def string) string {
+	if v := os.Getenv(name); v != "" {
+		return v
+	}
+	return def
+}
+
+// Init performs common initialization for a gen command. It parses the flags
+// and sets up the standard logging parameters.
+func Init() {
+	log.SetPrefix("")
+	log.SetFlags(log.Lshortfile)
+	flag.Parse()
+}
+
+const header = `// This file was generated by go generate; DO NOT EDIT
+
+package %s
+
+`
+
+// UnicodeVersion reports the requested Unicode version.
+func UnicodeVersion() string {
+	return *unicodeVersion
+}
+
+// CLDRVersion reports the requested CLDR version.
+func CLDRVersion() string {
+	return *cldrVersion
+}
+
+// IsLocal reports whether the user specified a local directory.
+func IsLocal() bool {
+	return *localDir != ""
+}
+
+// OpenUCDFile opens the requested UCD file. The file is specified relative to
+// the public Unicode root directory. It will call log.Fatal if there are any
+// errors.
+func OpenUCDFile(file string) io.ReadCloser {
+	return openUnicode(path.Join(*unicodeVersion, "ucd", file))
+}
+
+// OpenCLDRCoreZip opens the CLDR core zip file. It will call log.Fatal if there
+// are any errors.
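+//
+// A typical call site looks like this (illustrative sketch only; decoding the
+// zip is left to the caller, e.g. via the golang.org/x/text/unicode/cldr
+// package):
+//
+//	r := gen.OpenCLDRCoreZip()
+//	defer r.Close()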
+func OpenCLDRCoreZip() io.ReadCloser { + return OpenUnicodeFile("cldr", *cldrVersion, "core.zip") +} + +// OpenUnicodeFile opens the requested file of the requested category from the +// root of the Unicode data archive. The file is specified relative to the +// public Unicode root directory. If version is "", it will use the default +// Unicode version. It will call log.Fatal if there are any errors. +func OpenUnicodeFile(category, version, file string) io.ReadCloser { + if version == "" { + version = UnicodeVersion() + } + return openUnicode(path.Join(category, version, file)) +} + +// OpenIANAFile opens the requested IANA file. The file is specified relative +// to the IANA root, which is typically either http://www.iana.org or the +// iana directory in the local mirror. It will call log.Fatal if there are any +// errors. +func OpenIANAFile(path string) io.ReadCloser { + return Open(*iana, "iana", path) +} + +// Open opens subdir/path if a local directory is specified and the file exists, +// where subdir is a directory relative to the local root, or fetches it from +// urlRoot/path otherwise. It will call log.Fatal if there are any errors. +func Open(urlRoot, subdir, path string) io.ReadCloser { + if *localDir != "" { + path = filepath.FromSlash(path) + if f, err := os.Open(filepath.Join(*localDir, subdir, path)); err == nil { + return f + } + } + return get(urlRoot, path) +} + +func openUnicode(path string) io.ReadCloser { + if *localDir != "" { + path = filepath.FromSlash(path) + f, err := os.Open(filepath.Join(*localDir, path)) + if err != nil { + log.Fatal(err) + } + return f + } + return get(*url, path) +} + +func get(root, path string) io.ReadCloser { + url := root + "/" + path + fmt.Printf("Fetching %s...", url) + defer fmt.Println(" done.") + resp, err := http.Get(url) + if err != nil { + log.Fatalf("HTTP GET: %v", err) + } + if resp.StatusCode != 200 { + log.Fatalf("Bad GET status for %q: %q", url, resp.Status) + } + return resp.Body +} + +// TODO: use Write*Version in all applicable packages. + +// WriteUnicodeVersion writes a constant for the Unicode version from which the +// tables are generated. +func WriteUnicodeVersion(w io.Writer) { + fmt.Fprintf(w, "// UnicodeVersion is the Unicode version from which the tables in this package are derived.\n") + fmt.Fprintf(w, "const UnicodeVersion = %q\n\n", UnicodeVersion()) +} + +// WriteCLDRVersion writes a constant for the CLDR version from which the +// tables are generated. +func WriteCLDRVersion(w io.Writer) { + fmt.Fprintf(w, "// CLDRVersion is the CLDR version from which the tables in this package are derived.\n") + fmt.Fprintf(w, "const CLDRVersion = %q\n\n", CLDRVersion()) +} + +// WriteGoFile prepends a standard file comment and package statement to the +// given bytes, applies gofmt, and writes them to a file with the given name. +// It will call log.Fatal if there are any errors. +func WriteGoFile(filename, pkg string, b []byte) { + w, err := os.Create(filename) + if err != nil { + log.Fatalf("Could not create file %s: %v", filename, err) + } + defer w.Close() + if _, err = WriteGo(w, pkg, b); err != nil { + log.Fatalf("Error writing file %s: %v", filename, err) + } +} + +// WriteGo prepends a standard file comment and package statement to the given +// bytes, applies gofmt, and writes them to w. +func WriteGo(w io.Writer, pkg string, b []byte) (n int, err error) { + src := []byte(fmt.Sprintf(header, pkg)) + src = append(src, b...) 
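+	// Run the assembled source through gofmt (go/format) before writing it.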
+	formatted, err := format.Source(src)
+	if err != nil {
+		// Print the generated code even in case of an error so that the
+		// returned error can be meaningfully interpreted.
+		n, _ = w.Write(src)
+		return n, err
+	}
+	return w.Write(formatted)
+}
+
+// Repackage rewrites a Go file from belonging to package main to belonging to
+// the given package.
+func Repackage(inFile, outFile, pkg string) {
+	src, err := ioutil.ReadFile(inFile)
+	if err != nil {
+		log.Fatalf("reading %s: %v", inFile, err)
+	}
+	const toDelete = "package main\n\n"
+	i := bytes.Index(src, []byte(toDelete))
+	if i < 0 {
+		log.Fatalf("Could not find %q in %s.", toDelete, inFile)
+	}
+	w := &bytes.Buffer{}
+	w.Write(src[i+len(toDelete):])
+	WriteGoFile(outFile, pkg, w.Bytes())
+}
diff --git a/vendor/golang.org/x/text/internal/triegen/compact.go b/vendor/golang.org/x/text/internal/triegen/compact.go
new file mode 100644
index 0000000..397b975
--- /dev/null
+++ b/vendor/golang.org/x/text/internal/triegen/compact.go
@@ -0,0 +1,58 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package triegen
+
+// This file defines Compacter and its implementations.
+
+import "io"
+
+// A Compacter generates an alternative, more space-efficient way to store a
+// trie value block. A trie value block holds all possible values for the last
+// byte of a UTF-8 encoded rune. Excluding ASCII characters, a trie value block
+// always has 64 values, as a UTF-8 encoding ends with a byte in [0x80, 0xC0).
+type Compacter interface {
+	// Size reports whether the Compacter can encode the given block and,
+	// if so, the size of the encoding. len(v) is always 64.
+	Size(v []uint64) (sz int, ok bool)
+
+	// Store stores the block using the Compacter's compression method.
+	// It returns a handle with which the block can be retrieved.
+	// len(v) is always 64.
+	Store(v []uint64) uint32
+
+	// Print writes the data structures associated with the given store to w.
+	Print(w io.Writer) error
+
+	// Handler returns the name of a function that gets called during trie
+	// lookup for blocks generated by the Compacter. The function should be of
+	// the form func (n uint32, b byte) uint64, where n is the index returned by
+	// the Compacter's Store method and b is the last byte of the UTF-8
+	// encoding, where 0x80 <= b < 0xC0, for which to do the lookup in the
+	// block.
+	Handler() string
+}
+
+// simpleCompacter is the default Compacter used by builder. It implements a
+// normal trie block.
+type simpleCompacter builder
+
+func (b *simpleCompacter) Size([]uint64) (sz int, ok bool) {
+	return blockSize * b.ValueSize, true
+}
+
+func (b *simpleCompacter) Store(v []uint64) uint32 {
+	h := uint32(len(b.ValueBlocks) - blockOffset)
+	b.ValueBlocks = append(b.ValueBlocks, v)
+	return h
+}
+
+func (b *simpleCompacter) Print(io.Writer) error {
+	// Structures are printed in print.go.
+	return nil
+}
+
+func (b *simpleCompacter) Handler() string {
+	panic("Handler should be special-cased for this Compacter")
+}
diff --git a/vendor/golang.org/x/text/internal/triegen/print.go b/vendor/golang.org/x/text/internal/triegen/print.go
new file mode 100644
index 0000000..8d9f120
--- /dev/null
+++ b/vendor/golang.org/x/text/internal/triegen/print.go
@@ -0,0 +1,251 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +package triegen + +import ( + "bytes" + "fmt" + "io" + "strings" + "text/template" +) + +// print writes all the data structures as well as the code necessary to use the +// trie to w. +func (b *builder) print(w io.Writer) error { + b.Stats.NValueEntries = len(b.ValueBlocks) * blockSize + b.Stats.NValueBytes = len(b.ValueBlocks) * blockSize * b.ValueSize + b.Stats.NIndexEntries = len(b.IndexBlocks) * blockSize + b.Stats.NIndexBytes = len(b.IndexBlocks) * blockSize * b.IndexSize + b.Stats.NHandleBytes = len(b.Trie) * 2 * b.IndexSize + + // If we only have one root trie, all starter blocks are at position 0 and + // we can access the arrays directly. + if len(b.Trie) == 1 { + // At this point we cannot refer to the generated tables directly. + b.ASCIIBlock = b.Name + "Values" + b.StarterBlock = b.Name + "Index" + } else { + // Otherwise we need to have explicit starter indexes in the trie + // structure. + b.ASCIIBlock = "t.ascii" + b.StarterBlock = "t.utf8Start" + } + + b.SourceType = "[]byte" + if err := lookupGen.Execute(w, b); err != nil { + return err + } + + b.SourceType = "string" + if err := lookupGen.Execute(w, b); err != nil { + return err + } + + if err := trieGen.Execute(w, b); err != nil { + return err + } + + for _, c := range b.Compactions { + if err := c.c.Print(w); err != nil { + return err + } + } + + return nil +} + +func printValues(n int, values []uint64) string { + w := &bytes.Buffer{} + boff := n * blockSize + fmt.Fprintf(w, "\t// Block %#x, offset %#x", n, boff) + var newline bool + for i, v := range values { + if i%6 == 0 { + newline = true + } + if v != 0 { + if newline { + fmt.Fprintf(w, "\n") + newline = false + } + fmt.Fprintf(w, "\t%#02x:%#04x, ", boff+i, v) + } + } + return w.String() +} + +func printIndex(b *builder, nr int, n *node) string { + w := &bytes.Buffer{} + boff := nr * blockSize + fmt.Fprintf(w, "\t// Block %#x, offset %#x", nr, boff) + var newline bool + for i, c := range n.children { + if i%8 == 0 { + newline = true + } + if c != nil { + v := b.Compactions[c.index.compaction].Offset + uint32(c.index.index) + if v != 0 { + if newline { + fmt.Fprintf(w, "\n") + newline = false + } + fmt.Fprintf(w, "\t%#02x:%#02x, ", boff+i, v) + } + } + } + return w.String() +} + +var ( + trieGen = template.Must(template.New("trie").Funcs(template.FuncMap{ + "printValues": printValues, + "printIndex": printIndex, + "title": strings.Title, + "dec": func(x int) int { return x - 1 }, + "psize": func(n int) string { + return fmt.Sprintf("%d bytes (%.2f KiB)", n, float64(n)/1024) + }, + }).Parse(trieTemplate)) + lookupGen = template.Must(template.New("lookup").Parse(lookupTemplate)) +) + +// TODO: consider the return type of lookup. It could be uint64, even if the +// internal value type is smaller. We will have to verify this with the +// performance of unicode/norm, which is very sensitive to such changes. +const trieTemplate = `{{$b := .}}{{$multi := gt (len .Trie) 1}} +// {{.Name}}Trie. Total size: {{psize .Size}}. Checksum: {{printf "%08x" .Checksum}}. 
+type {{.Name}}Trie struct { {{if $multi}} + ascii []{{.ValueType}} // index for ASCII bytes + utf8Start []{{.IndexType}} // index for UTF-8 bytes >= 0xC0 +{{end}}} + +func new{{title .Name}}Trie(i int) *{{.Name}}Trie { {{if $multi}} + h := {{.Name}}TrieHandles[i] + return &{{.Name}}Trie{ {{.Name}}Values[uint32(h.ascii)<<6:], {{.Name}}Index[uint32(h.multi)<<6:] } +} + +type {{.Name}}TrieHandle struct { + ascii, multi {{.IndexType}} +} + +// {{.Name}}TrieHandles: {{len .Trie}} handles, {{.Stats.NHandleBytes}} bytes +var {{.Name}}TrieHandles = [{{len .Trie}}]{{.Name}}TrieHandle{ +{{range .Trie}} { {{.ASCIIIndex}}, {{.StarterIndex}} }, // {{printf "%08x" .Checksum}}: {{.Name}} +{{end}}}{{else}} + return &{{.Name}}Trie{} +} +{{end}} +// lookupValue determines the type of block n and looks up the value for b. +func (t *{{.Name}}Trie) lookupValue(n uint32, b byte) {{.ValueType}}{{$last := dec (len .Compactions)}} { + switch { {{range $i, $c := .Compactions}} + {{if eq $i $last}}default{{else}}case n < {{$c.Cutoff}}{{end}}:{{if ne $i 0}} + n -= {{$c.Offset}}{{end}} + return {{print $b.ValueType}}({{$c.Handler}}){{end}} + } +} + +// {{.Name}}Values: {{len .ValueBlocks}} blocks, {{.Stats.NValueEntries}} entries, {{.Stats.NValueBytes}} bytes +// The third block is the zero block. +var {{.Name}}Values = [{{.Stats.NValueEntries}}]{{.ValueType}} { +{{range $i, $v := .ValueBlocks}}{{printValues $i $v}} +{{end}}} + +// {{.Name}}Index: {{len .IndexBlocks}} blocks, {{.Stats.NIndexEntries}} entries, {{.Stats.NIndexBytes}} bytes +// Block 0 is the zero block. +var {{.Name}}Index = [{{.Stats.NIndexEntries}}]{{.IndexType}} { +{{range $i, $v := .IndexBlocks}}{{printIndex $b $i $v}} +{{end}}} +` + +// TODO: consider allowing zero-length strings after evaluating performance with +// unicode/norm. +const lookupTemplate = ` +// lookup{{if eq .SourceType "string"}}String{{end}} returns the trie value for the first UTF-8 encoding in s and +// the width in bytes of this encoding. The size will be 0 if s does not +// hold enough bytes to complete the encoding. len(s) must be greater than 0. +func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}(s {{.SourceType}}) (v {{.ValueType}}, sz int) { + c0 := s[0] + switch { + case c0 < 0x80: // is ASCII + return {{.ASCIIBlock}}[c0], 1 + case c0 < 0xC2: + return 0, 1 // Illegal UTF-8: not a starter, not ASCII. + case c0 < 0xE0: // 2-byte UTF-8 + if len(s) < 2 { + return 0, 0 + } + i := {{.StarterBlock}}[c0] + c1 := s[1] + if c1 < 0x80 || 0xC0 <= c1 { + return 0, 1 // Illegal UTF-8: not a continuation byte. + } + return t.lookupValue(uint32(i), c1), 2 + case c0 < 0xF0: // 3-byte UTF-8 + if len(s) < 3 { + return 0, 0 + } + i := {{.StarterBlock}}[c0] + c1 := s[1] + if c1 < 0x80 || 0xC0 <= c1 { + return 0, 1 // Illegal UTF-8: not a continuation byte. + } + o := uint32(i)<<6 + uint32(c1) + i = {{.Name}}Index[o] + c2 := s[2] + if c2 < 0x80 || 0xC0 <= c2 { + return 0, 2 // Illegal UTF-8: not a continuation byte. + } + return t.lookupValue(uint32(i), c2), 3 + case c0 < 0xF8: // 4-byte UTF-8 + if len(s) < 4 { + return 0, 0 + } + i := {{.StarterBlock}}[c0] + c1 := s[1] + if c1 < 0x80 || 0xC0 <= c1 { + return 0, 1 // Illegal UTF-8: not a continuation byte. + } + o := uint32(i)<<6 + uint32(c1) + i = {{.Name}}Index[o] + c2 := s[2] + if c2 < 0x80 || 0xC0 <= c2 { + return 0, 2 // Illegal UTF-8: not a continuation byte. 
+ } + o = uint32(i)<<6 + uint32(c2) + i = {{.Name}}Index[o] + c3 := s[3] + if c3 < 0x80 || 0xC0 <= c3 { + return 0, 3 // Illegal UTF-8: not a continuation byte. + } + return t.lookupValue(uint32(i), c3), 4 + } + // Illegal rune + return 0, 1 +} + +// lookup{{if eq .SourceType "string"}}String{{end}}Unsafe returns the trie value for the first UTF-8 encoding in s. +// s must start with a full and valid UTF-8 encoded rune. +func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}Unsafe(s {{.SourceType}}) {{.ValueType}} { + c0 := s[0] + if c0 < 0x80 { // is ASCII + return {{.ASCIIBlock}}[c0] + } + i := {{.StarterBlock}}[c0] + if c0 < 0xE0 { // 2-byte UTF-8 + return t.lookupValue(uint32(i), s[1]) + } + i = {{.Name}}Index[uint32(i)<<6+uint32(s[1])] + if c0 < 0xF0 { // 3-byte UTF-8 + return t.lookupValue(uint32(i), s[2]) + } + i = {{.Name}}Index[uint32(i)<<6+uint32(s[2])] + if c0 < 0xF8 { // 4-byte UTF-8 + return t.lookupValue(uint32(i), s[3]) + } + return 0 +} +` diff --git a/vendor/golang.org/x/text/internal/triegen/triegen.go b/vendor/golang.org/x/text/internal/triegen/triegen.go new file mode 100644 index 0000000..adb0108 --- /dev/null +++ b/vendor/golang.org/x/text/internal/triegen/triegen.go @@ -0,0 +1,494 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package triegen implements a code generator for a trie for associating +// unsigned integer values with UTF-8 encoded runes. +// +// Many of the go.text packages use tries for storing per-rune information. A +// trie is especially useful if many of the runes have the same value. If this +// is the case, many blocks can be expected to be shared allowing for +// information on many runes to be stored in little space. +// +// As most of the lookups are done directly on []byte slices, the tries use the +// UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to +// runes and contributes a little bit to better performance. It also naturally +// provides a fast path for ASCII. +// +// Space is also an issue. There are many code points defined in Unicode and as +// a result tables can get quite large. So every byte counts. The triegen +// package automatically chooses the smallest integer values to represent the +// tables. Compacters allow further compression of the trie by allowing for +// alternative representations of individual trie blocks. +// +// triegen allows generating multiple tries as a single structure. This is +// useful when, for example, one wants to generate tries for several languages +// that have a lot of values in common. Some existing libraries for +// internationalization store all per-language data as a dynamically loadable +// chunk. The go.text packages are designed with the assumption that the user +// typically wants to compile in support for all supported languages, in line +// with the approach common to Go to create a single standalone binary. The +// multi-root trie approach can give significant storage savings in this +// scenario. +// +// triegen generates both tables and code. The code is optimized to use the +// automatically chosen data types. The following code is generated for a Trie +// or multiple Tries named "foo": +// - type fooTrie +// The trie type. +// +// - func newFooTrie(x int) *fooTrie +// Trie constructor, where x is the index of the trie passed to Gen. 
+//
+// - func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
+// The lookup method, where uintX is automatically chosen.
+//
+// - func lookupString, lookupUnsafe and lookupStringUnsafe
+// Variants of the above.
+//
+// - var fooValues and fooIndex and any tables generated by Compacters.
+// The core trie data.
+//
+// - var fooTrieHandles
+// Indexes of starter blocks in case of multiple trie roots.
+//
+// It is recommended that users test the generated trie by checking the returned
+// value for every rune. Such exhaustive tests are possible as the number of
+// runes in Unicode is limited.
+package triegen // import "golang.org/x/text/internal/triegen"
+
+// TODO: Arguably, the internally optimized data types would not have to be
+// exposed in the generated API. We could also investigate not generating the
+// code, but using it through a package. We would have to investigate the impact
+// on performance of making such a change, though. For packages like
+// unicode/norm, small changes like this could tank performance.
+
+import (
+	"encoding/binary"
+	"fmt"
+	"hash/crc64"
+	"io"
+	"log"
+	"unicode/utf8"
+)
+
+// builder builds a set of tries for associating values with runes. The set of
+// tries can share common index and value blocks.
+type builder struct {
+	Name string
+
+	// ValueType is the type of the trie values looked up.
+	ValueType string
+
+	// ValueSize is the byte size of the ValueType.
+	ValueSize int
+
+	// IndexType is the type of trie index values used for all UTF-8 bytes of
+	// a rune except the last one.
+	IndexType string
+
+	// IndexSize is the byte size of the IndexType.
+	IndexSize int
+
+	// SourceType is used when generating the lookup functions. If the user
+	// requests StringSupport, all lookup functions will be generated for
+	// string input as well.
+	SourceType string
+
+	Trie []*Trie
+
+	IndexBlocks []*node
+	ValueBlocks [][]uint64
+	Compactions []compaction
+	Checksum    uint64
+
+	ASCIIBlock   string
+	StarterBlock string
+
+	indexBlockIdx map[uint64]int
+	valueBlockIdx map[uint64]nodeIndex
+	asciiBlockIdx map[uint64]int
+
+	// Stats are used to fill out the template.
+	Stats struct {
+		NValueEntries int
+		NValueBytes   int
+		NIndexEntries int
+		NIndexBytes   int
+		NHandleBytes  int
+	}
+
+	err error
+}
+
+// A nodeIndex encodes the index of a node, which is defined by the compaction
+// which stores it and an index within the compaction. For internal nodes, the
+// compaction is always 0.
+type nodeIndex struct {
+	compaction int
+	index      int
+}
+
+// compaction keeps track of stats used for the compaction.
+type compaction struct {
+	c         Compacter
+	blocks    []*node
+	maxHandle uint32
+	totalSize int
+
+	// Used by template-based generator and thus exported.
+	Cutoff  uint32
+	Offset  uint32
+	Handler string
+}
+
+func (b *builder) setError(err error) {
+	if b.err == nil {
+		b.err = err
+	}
+}
+
+// An Option can be passed to Gen.
+type Option func(b *builder) error
+
+// Compact configures the trie generator to use the given Compacter.
+func Compact(c Compacter) Option {
+	return func(b *builder) error {
+		b.Compactions = append(b.Compactions, compaction{
+			c:       c,
+			Handler: c.Handler() + "(n, b)"})
+		return nil
+	}
+}
+
+// Gen writes Go code for a shared trie lookup structure to w for the given
+// Tries. The generated trie type will be called nameTrie. newNameTrie(x) will
+// return the *nameTrie for tries[x]. A value can be looked up by using one of
+// the various lookup methods defined on nameTrie. It returns the table size of
+// the generated trie.
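+//
+// A minimal use looks like this sketch (assuming a single trie named "foo"
+// and an io.Writer w):
+//
+//	t := triegen.NewTrie("foo")
+//	t.Insert('€', 1)
+//	sz, err := triegen.Gen(w, "foo", []*triegen.Trie{t})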
+func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) { + // The index contains two dummy blocks, followed by the zero block. The zero + // block is at offset 0x80, so that the offset for the zero block for + // continuation bytes is 0. + b := &builder{ + Name: name, + Trie: tries, + IndexBlocks: []*node{{}, {}, {}}, + Compactions: []compaction{{ + Handler: name + "Values[n<<6+uint32(b)]", + }}, + // The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero + // block. + indexBlockIdx: map[uint64]int{0: 0}, + valueBlockIdx: map[uint64]nodeIndex{0: {}}, + asciiBlockIdx: map[uint64]int{}, + } + b.Compactions[0].c = (*simpleCompacter)(b) + + for _, f := range opts { + if err := f(b); err != nil { + return 0, err + } + } + b.build() + if b.err != nil { + return 0, b.err + } + if err = b.print(w); err != nil { + return 0, err + } + return b.Size(), nil +} + +// A Trie represents a single root node of a trie. A builder may build several +// overlapping tries at once. +type Trie struct { + root *node + + hiddenTrie +} + +// hiddenTrie contains values we want to be visible to the template generator, +// but hidden from the API documentation. +type hiddenTrie struct { + Name string + Checksum uint64 + ASCIIIndex int + StarterIndex int +} + +// NewTrie returns a new trie root. +func NewTrie(name string) *Trie { + return &Trie{ + &node{ + children: make([]*node, blockSize), + values: make([]uint64, utf8.RuneSelf), + }, + hiddenTrie{Name: name}, + } +} + +// Gen is a convenience wrapper around the Gen func passing t as the only trie +// and uses the name passed to NewTrie. It returns the size of the generated +// tables. +func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) { + return Gen(w, t.Name, []*Trie{t}, opts...) +} + +// node is a node of the intermediate trie structure. +type node struct { + // children holds this node's children. It is always of length 64. + // A child node may be nil. + children []*node + + // values contains the values of this node. If it is non-nil, this node is + // either a root or leaf node: + // For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F]. + // For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF]. + values []uint64 + + index nodeIndex +} + +// Insert associates value with the given rune. Insert will panic if a non-zero +// value is passed for an invalid rune. +func (t *Trie) Insert(r rune, value uint64) { + if value == 0 { + return + } + s := string(r) + if []rune(s)[0] != r && value != 0 { + // Note: The UCD tables will always assign what amounts to a zero value + // to a surrogate. Allowing a zero value for an illegal rune allows + // users to iterate over [0..MaxRune] without having to explicitly + // exclude surrogates, which would be tedious. + panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r)) + } + if len(s) == 1 { + // It is a root node value (ASCII). 
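+		// The root's values table covers all 128 single-byte (ASCII) runes
+		// directly, so no child blocks are needed.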
+		t.root.values[s[0]] = value
+		return
+	}
+
+	n := t.root
+	for ; len(s) > 1; s = s[1:] {
+		if n.children == nil {
+			n.children = make([]*node, blockSize)
+		}
+		p := s[0] % blockSize
+		c := n.children[p]
+		if c == nil {
+			c = &node{}
+			n.children[p] = c
+		}
+		if len(s) > 2 && c.values != nil {
+			log.Fatalf("triegen: insert(%U): found internal node with values", r)
+		}
+		n = c
+	}
+	if n.values == nil {
+		n.values = make([]uint64, blockSize)
+	}
+	if n.children != nil {
+		log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r)
+	}
+	n.values[s[0]-0x80] = value
+}
+
+// Size returns the number of bytes the generated trie will take to store. It
+// needs to be exported as it is used in the templates.
+func (b *builder) Size() int {
+	// Index blocks.
+	sz := len(b.IndexBlocks) * blockSize * b.IndexSize
+
+	// Skip the first compaction, which represents the normal value blocks, as
+	// its totalSize does not account for the ASCII blocks, which are managed
+	// separately.
+	sz += len(b.ValueBlocks) * blockSize * b.ValueSize
+	for _, c := range b.Compactions[1:] {
+		sz += c.totalSize
+	}
+
+	// TODO: this computation does not account for the fixed overhead of using
+	// a compaction, either code or data. The typical data overhead, though, is
+	// on the order of bytes (2 bytes for cases). Further, the savings of using
+	// a compaction should anyway be substantial for it to be worth it.
+
+	// For multi-root tries, we also need to account for the handles.
+	if len(b.Trie) > 1 {
+		sz += 2 * b.IndexSize * len(b.Trie)
+	}
+	return sz
+}
+
+func (b *builder) build() {
+	// Compute the sizes of the values.
+	var vmax uint64
+	for _, t := range b.Trie {
+		vmax = maxValue(t.root, vmax)
+	}
+	b.ValueType, b.ValueSize = getIntType(vmax)
+
+	// Compute all block allocations.
+	// TODO: first compute the ASCII blocks for all tries and then the other
+	// nodes. ASCII blocks are more restricted in placement, as they require two
+	// blocks to be placed consecutively. Processing them first may improve
+	// sharing (at least one zero block can be expected to be saved).
+	for _, t := range b.Trie {
+		b.Checksum += b.buildTrie(t)
+	}
+
+	// Compute the offsets for all the Compacters.
+	offset := uint32(0)
+	for i := range b.Compactions {
+		c := &b.Compactions[i]
+		c.Offset = offset
+		offset += c.maxHandle + 1
+		c.Cutoff = offset
+	}
+
+	// Compute the sizes of indexes.
+	// TODO: different byte positions could have different sizes. So far we have
+	// not found a case where this is beneficial.
+	imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff)
+	for _, ib := range b.IndexBlocks {
+		if x := uint64(ib.index.index); x > imax {
+			imax = x
+		}
+	}
+	b.IndexType, b.IndexSize = getIntType(imax)
+}
+
+func maxValue(n *node, max uint64) uint64 {
+	if n == nil {
+		return max
+	}
+	for _, c := range n.children {
+		max = maxValue(c, max)
+	}
+	for _, v := range n.values {
+		if max < v {
+			max = v
+		}
+	}
+	return max
+}
+
+func getIntType(v uint64) (string, int) {
+	switch {
+	case v < 1<<8:
+		return "uint8", 1
+	case v < 1<<16:
+		return "uint16", 2
+	case v < 1<<32:
+		return "uint32", 4
+	}
+	return "uint64", 8
+}
+
+const (
+	blockSize = 64
+
+	// Subtract two blocks to offset 0x80, the first continuation byte.
+	blockOffset = 2
+
+	// Subtract three blocks to offset 0xC0, the first non-ASCII starter.
+	rootBlockOffset = 3
+)
+
+var crcTable = crc64.MakeTable(crc64.ISO)
+
+func (b *builder) buildTrie(t *Trie) uint64 {
+	n := t.root
+
+	// Get the ASCII offset. For the first trie, the ASCII block will be at
+	// position 0.
+	hasher := crc64.New(crcTable)
+	binary.Write(hasher, binary.BigEndian, n.values)
+	hash := hasher.Sum64()
+
+	v, ok := b.asciiBlockIdx[hash]
+	if !ok {
+		v = len(b.ValueBlocks)
+		b.asciiBlockIdx[hash] = v
+
+		b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:])
+		if v == 0 {
+			// Add the zero block at position 2 so that it will be assigned a
+			// zero reference in the lookup blocks.
+			// TODO: always do this? This would allow us to remove a check from
+			// the trie lookup, but at the expense of extra space. Analyze
+			// performance for unicode/norm.
+			b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize))
+		}
+	}
+	t.ASCIIIndex = v
+
+	// Compute remaining offsets.
+	t.Checksum = b.computeOffsets(n, true)
+	// We already subtracted the normal blockOffset from the index. Subtract the
+	// difference for starter bytes.
+	t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset)
+	return t.Checksum
+}
+
+func (b *builder) computeOffsets(n *node, root bool) uint64 {
+	// For the first trie, the root lookup block will be at position 3, which is
+	// the offset for UTF-8 non-ASCII starter bytes.
+	first := len(b.IndexBlocks) == rootBlockOffset
+	if first {
+		b.IndexBlocks = append(b.IndexBlocks, n)
+	}
+
+	// We special-case the cases where all values recursively are 0. This allows
+	// for the use of a zero block to which all such values can be directed.
+	hash := uint64(0)
+	if n.children != nil || n.values != nil {
+		hasher := crc64.New(crcTable)
+		for _, c := range n.children {
+			var v uint64
+			if c != nil {
+				v = b.computeOffsets(c, false)
+			}
+			binary.Write(hasher, binary.BigEndian, v)
+		}
+		binary.Write(hasher, binary.BigEndian, n.values)
+		hash = hasher.Sum64()
+	}
+
+	if first {
+		b.indexBlockIdx[hash] = rootBlockOffset - blockOffset
+	}
+
+	// Compacters don't apply to internal nodes.
+	if n.children != nil {
+		v, ok := b.indexBlockIdx[hash]
+		if !ok {
+			v = len(b.IndexBlocks) - blockOffset
+			b.IndexBlocks = append(b.IndexBlocks, n)
+			b.indexBlockIdx[hash] = v
+		}
+		n.index = nodeIndex{0, v}
+	} else {
+		h, ok := b.valueBlockIdx[hash]
+		if !ok {
+			bestI, bestSize := 0, blockSize*b.ValueSize
+			for i, c := range b.Compactions[1:] {
+				if sz, ok := c.c.Size(n.values); ok && bestSize > sz {
+					bestI, bestSize = i+1, sz
+				}
+			}
+			c := &b.Compactions[bestI]
+			c.totalSize += bestSize
+			v := c.c.Store(n.values)
+			if c.maxHandle < v {
+				c.maxHandle = v
+			}
+			h = nodeIndex{bestI, int(v)}
+			b.valueBlockIdx[hash] = h
+		}
+		n.index = h
+	}
+	return hash
+}
diff --git a/vendor/golang.org/x/text/internal/ucd/ucd.go b/vendor/golang.org/x/text/internal/ucd/ucd.go
new file mode 100644
index 0000000..2b0d1a1
--- /dev/null
+++ b/vendor/golang.org/x/text/internal/ucd/ucd.go
@@ -0,0 +1,363 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package ucd provides a parser for Unicode Character Database files, the
+// format of which is defined in http://www.unicode.org/reports/tr44/. See
+// http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
+//
+// It currently does not support substitutions of missing fields.
+package ucd // import "golang.org/x/text/internal/ucd"
+
+import (
+	"bufio"
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// UnicodeData.txt fields.
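+// These constants index the semicolon-separated columns of a UnicodeData.txt
+// line, e.g. p.Rune(ucd.CodePoint) or p.String(ucd.Name).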
+const (
+	CodePoint = iota
+	Name
+	GeneralCategory
+	CanonicalCombiningClass
+	BidiClass
+	DecompMapping
+	DecimalValue
+	DigitValue
+	NumericValue
+	BidiMirrored
+	Unicode1Name
+	ISOComment
+	SimpleUppercaseMapping
+	SimpleLowercaseMapping
+	SimpleTitlecaseMapping
+)
+
+// Parse calls f for each entry in the given reader of a UCD file. It will close
+// the reader upon return. It will call log.Fatal if any error occurs.
+//
+// This implements the most common usage pattern of using Parser.
+func Parse(r io.ReadCloser, f func(p *Parser)) {
+	defer r.Close()
+
+	p := New(r)
+	for p.Next() {
+		f(p)
+	}
+	if err := p.Err(); err != nil {
+		r.Close() // os.Exit will cause defers not to be called.
+		log.Fatal(err)
+	}
+}
+
+// An Option is used to configure a Parser.
+type Option func(p *Parser)
+
+func keepRanges(p *Parser) {
+	p.keepRanges = true
+}
+
+var (
+	// KeepRanges prevents the expansion of ranges. The raw ranges can be
+	// obtained by calling Range(0) on the parser.
+	KeepRanges Option = keepRanges
+)
+
+// The Part option registers a handler for lines starting with a '@'. The text
+// after a '@' is available as the first field. Comments are handled as usual.
+func Part(f func(p *Parser)) Option {
+	return func(p *Parser) {
+		p.partHandler = f
+	}
+}
+
+// A Parser parses Unicode Character Database (UCD) files.
+type Parser struct {
+	scanner *bufio.Scanner
+
+	keepRanges bool // Don't expand rune ranges in field 0.
+
+	err     error
+	comment []byte
+	field   [][]byte
+	// parsedRange is needed in case Range(0) is called more than once for one
+	// field. In some cases this requires scanning ahead.
+	parsedRange          bool
+	rangeStart, rangeEnd rune
+
+	partHandler func(p *Parser)
+}
+
+func (p *Parser) setError(err error) {
+	if p.err == nil {
+		p.err = err
+	}
+}
+
+func (p *Parser) getField(i int) []byte {
+	if i >= len(p.field) {
+		p.setError(fmt.Errorf("ucd: index of field %d out of bounds", i))
+		return nil
+	}
+	return p.field[i]
+}
+
+// Err returns a non-nil error if any error occurred during parsing.
+func (p *Parser) Err() error {
+	return p.err
+}
+
+// New returns a Parser for the given Reader.
+func New(r io.Reader, o ...Option) *Parser {
+	p := &Parser{
+		scanner: bufio.NewScanner(r),
+	}
+	for _, f := range o {
+		f(p)
+	}
+	return p
+}
+
+// Next parses the next line in the file. It returns true if a line was parsed
+// and false if it reached the end of the file.
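+//
+// Next is typically driven by a scan loop such as this sketch (Parse wraps
+// the same pattern, including the final error check):
+//
+//	p := ucd.New(r)
+//	for p.Next() {
+//		r0 := p.Rune(0)
+//		// ... use r0 and other fields ...
+//	}
+//	if err := p.Err(); err != nil {
+//		log.Fatal(err)
+//	}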
+func (p *Parser) Next() bool { + if !p.keepRanges && p.rangeStart < p.rangeEnd { + p.rangeStart++ + return true + } + p.comment = nil + p.field = p.field[:0] + p.parsedRange = false + + for p.scanner.Scan() { + b := p.scanner.Bytes() + if len(b) == 0 || b[0] == '#' { + continue + } + + // Parse line + if i := bytes.IndexByte(b, '#'); i != -1 { + p.comment = bytes.TrimSpace(b[i+1:]) + b = b[:i] + } + if b[0] == '@' { + if p.partHandler != nil { + p.field = append(p.field, bytes.TrimSpace(b[1:])) + p.partHandler(p) + p.field = p.field[:0] + } + p.comment = nil + continue + } + for { + i := bytes.IndexByte(b, ';') + if i == -1 { + p.field = append(p.field, bytes.TrimSpace(b)) + break + } + p.field = append(p.field, bytes.TrimSpace(b[:i])) + b = b[i+1:] + } + if !p.keepRanges { + p.rangeStart, p.rangeEnd = p.getRange(0) + } + return true + } + p.setError(p.scanner.Err()) + return false +} + +func parseRune(b []byte) (rune, error) { + if len(b) > 2 && b[0] == 'U' && b[1] == '+' { + b = b[2:] + } + x, err := strconv.ParseUint(string(b), 16, 32) + return rune(x), err +} + +func (p *Parser) parseRune(b []byte) rune { + x, err := parseRune(b) + p.setError(err) + return x +} + +// Rune parses and returns field i as a rune. +func (p *Parser) Rune(i int) rune { + if i > 0 || p.keepRanges { + return p.parseRune(p.getField(i)) + } + return p.rangeStart +} + +// Runes interprets and returns field i as a sequence of runes. +func (p *Parser) Runes(i int) (runes []rune) { + add := func(b []byte) { + if b = bytes.TrimSpace(b); len(b) > 0 { + runes = append(runes, p.parseRune(b)) + } + } + for b := p.getField(i); ; { + i := bytes.IndexByte(b, ' ') + if i == -1 { + add(b) + break + } + add(b[:i]) + b = b[i+1:] + } + return +} + +var ( + errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>") + + // reRange matches one line of a legacy rune range. + reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$") +) + +// Range parses and returns field i as a rune range. A range is inclusive at +// both ends. If the field only has one rune, first and last will be identical. +// It supports the legacy format for ranges used in UnicodeData.txt. +func (p *Parser) Range(i int) (first, last rune) { + if !p.keepRanges { + return p.rangeStart, p.rangeStart + } + return p.getRange(i) +} + +func (p *Parser) getRange(i int) (first, last rune) { + b := p.getField(i) + if k := bytes.Index(b, []byte("..")); k != -1 { + return p.parseRune(b[:k]), p.parseRune(b[k+2:]) + } + // The first field may not be a rune, in which case we may ignore any error + // and set the range as 0..0. + x, err := parseRune(b) + if err != nil { + // Disable range parsing henceforth. This ensures that an error will be + // returned if the user subsequently will try to parse this field as + // a Rune. + p.keepRanges = true + } + // Special case for UnicodeData that was retained for backwards compatibility. + if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) { + if p.parsedRange { + return p.rangeStart, p.rangeEnd + } + mf := reRange.FindStringSubmatch(p.scanner.Text()) + if mf == nil || !p.scanner.Scan() { + p.setError(errIncorrectLegacyRange) + return x, x + } + // Using Bytes would be more efficient here, but Text is a lot easier + // and this is not a frequent case. 
+ ml := reRange.FindStringSubmatch(p.scanner.Text()) + if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] { + p.setError(errIncorrectLegacyRange) + return x, x + } + p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])]) + p.parsedRange = true + return p.rangeStart, p.rangeEnd + } + return x, x +} + +// bools recognizes all valid UCD boolean values. +var bools = map[string]bool{ + "": false, + "N": false, + "No": false, + "F": false, + "False": false, + "Y": true, + "Yes": true, + "T": true, + "True": true, +} + +// Bool parses and returns field i as a boolean value. +func (p *Parser) Bool(i int) bool { + b := p.getField(i) + for s, v := range bools { + if bstrEq(b, s) { + return v + } + } + p.setError(strconv.ErrSyntax) + return false +} + +// Int parses and returns field i as an integer value. +func (p *Parser) Int(i int) int { + x, err := strconv.ParseInt(string(p.getField(i)), 10, 64) + p.setError(err) + return int(x) +} + +// Uint parses and returns field i as an unsigned integer value. +func (p *Parser) Uint(i int) uint { + x, err := strconv.ParseUint(string(p.getField(i)), 10, 64) + p.setError(err) + return uint(x) +} + +// Float parses and returns field i as a decimal value. +func (p *Parser) Float(i int) float64 { + x, err := strconv.ParseFloat(string(p.getField(i)), 64) + p.setError(err) + return x +} + +// String parses and returns field i as a string value. +func (p *Parser) String(i int) string { + return string(p.getField(i)) +} + +// Strings parses and returns field i as a space-separated list of strings. +func (p *Parser) Strings(i int) []string { + ss := strings.Split(string(p.getField(i)), " ") + for i, s := range ss { + ss[i] = strings.TrimSpace(s) + } + return ss +} + +// Comment returns the comments for the current line. +func (p *Parser) Comment() string { + return string(p.comment) +} + +var errUndefinedEnum = errors.New("ucd: undefined enum value") + +// Enum interprets and returns field i as a value that must be one of the values +// in enum. +func (p *Parser) Enum(i int, enum ...string) string { + b := p.getField(i) + for _, s := range enum { + if bstrEq(b, s) { + return s + } + } + p.setError(errUndefinedEnum) + return "" +} + +func bstrEq(b []byte, s string) bool { + if len(b) != len(s) { + return false + } + for i, c := range b { + if c != s[i] { + return false + } + } + return true +} -- cgit v1.2.3