// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// Collation table generator.
// Data read from the web.

package main

import (
	"archive/zip"
	"bufio"
	"bytes"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"unicode/utf8"

	"golang.org/x/text/collate"
	"golang.org/x/text/collate/build"
	"golang.org/x/text/internal/colltab"
	"golang.org/x/text/internal/gen"
	"golang.org/x/text/language"
	"golang.org/x/text/unicode/cldr"
)

var (
	test  = flag.Bool("test", false, "test existing tables; can be used to compare web data with package data.")
	short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
	draft = flag.Bool("draft", false, `Use draft versions, when available.`)
	tags  = flag.String("tags", "", "build tags to be included after +build directive")
	pkg   = flag.String("package", "collate", "the name of the package in which the generated file is to be included")

	tables  = flagStringSetAllowAll("tables", "collate", "collate,chars", "comma-separated list of tables to generate.")
	exclude = flagStringSet("exclude", "zh2", "", "comma-separated list of languages to exclude.")
	include = flagStringSet("include", "", "", "comma-separated list of languages to include. Include trumps exclude.")
	// TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons)
	// TODO: Not included: traditional (buggy for Bengali)
	types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "", "comma-separated list of types that should be included.")
)

// stringSet implements an ordered set based on a list. It implements flag.Value
// to allow a set to be specified as a comma-separated list.
type stringSet struct {
	s        []string
	allowed  *stringSet
	dirty    bool // needs compaction if true
	all      bool
	allowAll bool
}

func flagStringSet(name, def, allowed, usage string) *stringSet {
	ss := &stringSet{}
	if allowed != "" {
		usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
		ss.allowed = &stringSet{}
		failOnError(ss.allowed.Set(allowed))
	}
	ss.Set(def)
	flag.Var(ss, name, usage)
	return ss
}

func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
	ss := &stringSet{allowAll: true}
	if allowed == "" {
		flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
	} else {
		ss.allowed = &stringSet{}
		failOnError(ss.allowed.Set(allowed))
		flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
	}
	ss.Set(def)
	return ss
}

func (ss stringSet) Len() int {
	return len(ss.s)
}

func (ss stringSet) String() string {
	return strings.Join(ss.s, ",")
}

func (ss *stringSet) Set(s string) error {
	if ss.allowAll && s == "all" {
		ss.s = nil
		ss.all = true
		return nil
	}
	ss.s = ss.s[:0]
	for _, s := range strings.Split(s, ",") {
		if s := strings.TrimSpace(s); s != "" {
			if ss.allowed != nil && !ss.allowed.contains(s) {
				return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
			}
			ss.add(s)
		}
	}
	ss.compact()
	return nil
}

func (ss *stringSet) add(s string) {
	ss.s = append(ss.s, s)
	ss.dirty = true
}

func (ss *stringSet) values() []string {
	ss.compact()
	return ss.s
}

func (ss *stringSet) contains(s string) bool {
	if ss.all {
		return true
	}
	for _, v := range ss.s {
		if v == s {
			return true
		}
	}
	return false
}

func (ss *stringSet) compact() {
	if !ss.dirty {
		return
	}
	a := ss.s
	sort.Strings(a)
	k := 0
	for i := 1; i < len(a); i++ {
		if a[k] != a[i] {
			a[k+1] = a[i]
			k++
		}
	}
	ss.s = a[:k+1]
	ss.dirty = false
}

func skipLang(l string) bool {
	if include.Len() > 0 {
		return !include.contains(l)
	}
	return exclude.contains(l)
}

// altInclude returns a list of alternatives (for the LDML alt attribute)
// in order of preference. An empty string in this list indicates the
// default entry.
func altInclude() []string {
	l := []string{}
	if *short {
		l = append(l, "short")
	}
	l = append(l, "")
	// TODO: handle draft using cldr.SetDraftLevel
	if *draft {
		l = append(l, "proposed")
	}
	return l
}

func failOnError(e error) {
	if e != nil {
		log.Panic(e)
	}
}

func openArchive() *zip.Reader {
	f := gen.OpenCLDRCoreZip()
	buffer, err := ioutil.ReadAll(f)
	f.Close()
	failOnError(err)
	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
	failOnError(err)
	return archive
}

// parseUCA parses a Default Unicode Collation Element Table of the format
// specified in http://www.unicode.org/reports/tr10/#File_Format.
// The entries are added to the given builder.
func parseUCA(builder *build.Builder) {
	var r io.ReadCloser
	var err error
	for _, f := range openArchive().File {
		if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
			r, err = f.Open()
		}
	}
	if r == nil {
		log.Fatal("File allkeys_CLDR.txt not found in archive.")
	}
	failOnError(err)
	defer r.Close()
	scanner := bufio.NewScanner(r)
	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
	for i := 1; scanner.Scan(); i++ {
		line := scanner.Text()
		if len(line) == 0 || line[0] == '#' {
			continue
		}
		if line[0] == '@' {
			// parse properties
			switch {
			case strings.HasPrefix(line[1:], "version "):
				a := strings.Split(line[1:], " ")
				if a[1] != gen.UnicodeVersion() {
					log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
				}
			case strings.HasPrefix(line[1:], "backwards "):
				log.Fatalf("%d: unsupported option backwards", i)
			default:
				log.Printf("%d: unknown option %s", i, line[1:])
			}
		} else {
			// parse entries
			part := strings.Split(line, " ; ")
			if len(part) != 2 {
				log.Fatalf("%d: production rule without ';': %v", i, line)
			}
			lhs := []rune{}
			for _, v := range strings.Split(part[0], " ") {
				if v == "" {
					continue
				}
				lhs = append(lhs, rune(convHex(i, v)))
			}
			var n int
			var vars []int
			rhs := [][]int{}
			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
				n += len(m[0])
				elem := []int{}
				for _, h := range strings.Split(m[2], ".") {
					elem = append(elem, convHex(i, h))
				}
				if m[1] == "*" {
					vars = append(vars, i)
				}
				rhs = append(rhs, elem)
			}
			if len(part[1]) < n+3 || part[1][n+1] != '#' {
				log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
			}
			if *test {
				testInput.add(string(lhs))
			}
			failOnError(builder.Add(lhs, rhs, vars))
		}
	}
	if scanner.Err() != nil {
		log.Fatal(scanner.Err())
	}
}

func convHex(line int, s string) int {
	r, e := strconv.ParseInt(s, 16, 32)
	if e != nil {
		log.Fatalf("%d: %v", line, e)
	}
	return int(r)
}

var testInput = stringSet{}

var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
var tagRe = regexp.MustCompile(`<([a-z_]*) */>`)

var mainLocales = []string{}

// charSets holds a list of exemplar characters per category.
type charSets map[string][]string

func (p charSets) fprint(w io.Writer) {
	fmt.Fprintln(w, "[exN]string{")
	for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} {
		if set := p[k]; len(set) != 0 {
			fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " "))
		}
	}
	fmt.Fprintln(w, "\t},")
}

var localeChars = make(map[string]charSets)

const exemplarHeader = `
type exemplarType int

const (
	exCharacters exemplarType = iota
	exContractions
	exPunctuation
	exAuxiliary
	exCurrency
	exIndex
	exN
)
`

func printExemplarCharacters(w io.Writer) {
	fmt.Fprintln(w, exemplarHeader)
	fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
	for _, loc := range mainLocales {
		fmt.Fprintf(w, "\t%q: ", loc)
		localeChars[loc].fprint(w)
	}
	fmt.Fprintln(w, "}")
}

func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
	r := gen.OpenCLDRCoreZip()
	data, err := d.DecodeZip(r)
	failOnError(err)
	return data
}

// parseMain parses XML files in the main directory of the CLDR core.zip file.
func parseMain() {
	d := &cldr.Decoder{}
	d.SetDirFilter("main")
	d.SetSectionFilter("characters")
	data := decodeCLDR(d)
	for _, loc := range data.Locales() {
		x := data.RawLDML(loc)
		if skipLang(x.Identity.Language.Type) {
			continue
		}
		if x.Characters != nil {
			x, _ = data.LDML(loc)
			loc = language.Make(loc).String()
			for _, ec := range x.Characters.ExemplarCharacters {
				if ec.Draft != "" {
					continue
				}
				if _, ok := localeChars[loc]; !ok {
					mainLocales = append(mainLocales, loc)
					localeChars[loc] = make(charSets)
				}
				localeChars[loc][ec.Type] = parseCharacters(ec.Data())
			}
		}
	}
}

func parseCharacters(chars string) []string {
	parseSingle := func(s string) (r rune, tail string, escaped bool) {
		if s[0] == '\\' {
			return rune(s[1]), s[2:], true
		}
		r, sz := utf8.DecodeRuneInString(s)
		return r, s[sz:], false
	}
	chars = strings.TrimSpace(chars)
	if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' {
		chars = chars[1:n]
	}
	list := []string{}
	var r, last, end rune
	for len(chars) > 0 {
		if chars[0] == '{' { // character sequence
			buf := []rune{}
			for chars = chars[1:]; len(chars) > 0; {
				r, chars, _ = parseSingle(chars)
				if r == '}' {
					break
				}
				if r == ' ' {
					log.Fatalf("space not supported in sequence %q", chars)
				}
				buf = append(buf, r)
			}
			list = append(list, string(buf))
			last = 0
		} else { // single character
			escaped := false
			r, chars, escaped = parseSingle(chars)
			if r != ' ' {
				if r == '-' && !escaped {
					if last == 0 {
						log.Fatal("'-' should be preceded by a character")
					}
					end, chars, _ = parseSingle(chars)
					for ; last <= end; last++ {
						list = append(list, string(last))
					}
					last = 0
				} else {
					list = append(list, string(r))
					last = r
				}
			}
		}
	}
	return list
}

var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)

// typeMap translates legacy type keys to their BCP47 equivalent.
var typeMap = map[string]string{
	"phonebook":   "phonebk",
	"traditional": "trad",
}

// parseCollation parses XML files in the collation directory of the CLDR core.zip file.
func parseCollation(b *build.Builder) {
	d := &cldr.Decoder{}
	d.SetDirFilter("collation")
	data := decodeCLDR(d)
	for _, loc := range data.Locales() {
		x, err := data.LDML(loc)
		failOnError(err)
		if skipLang(x.Identity.Language.Type) {
			continue
		}
		cs := x.Collations.Collation
		sl := cldr.MakeSlice(&cs)
		if len(types.s) == 0 {
			sl.SelectAnyOf("type", x.Collations.Default())
		} else if !types.all {
			sl.SelectAnyOf("type", types.s...)
		}
		sl.SelectOnePerGroup("alt", altInclude())

		for _, c := range cs {
			id, err := language.Parse(loc)
			if err != nil {
				fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
				continue
			}
			// Support both old- and new-style defaults.
			d := c.Type
			if x.Collations.DefaultCollation == nil {
				d = x.Collations.Default()
			} else {
				d = x.Collations.DefaultCollation.Data()
			}
			// We assume tables are being built either for search or collation,
			// but not both. For search the default is always "search".
			if d != c.Type && c.Type != "search" {
				typ := c.Type
				if len(c.Type) > 8 {
					typ = typeMap[c.Type]
				}
				id, err = id.SetTypeForKey("co", typ)
				failOnError(err)
			}
			t := b.Tailoring(id)
			c.Process(processor{t})
		}
	}
}

type processor struct {
	t *build.Tailoring
}

func (p processor) Reset(anchor string, before int) (err error) {
	if before != 0 {
		err = p.t.SetAnchorBefore(anchor)
	} else {
		err = p.t.SetAnchor(anchor)
	}
	failOnError(err)
	return nil
}

func (p processor) Insert(level int, str, context, extend string) error {
	str = context + str
	if *test {
		testInput.add(str)
	}
	// TODO: mimic bug in old maketables: remove.
	err := p.t.Insert(colltab.Level(level-1), str, context+extend)
	failOnError(err)
	return nil
}

func (p processor) Index(id string) {
}

func testCollator(c *collate.Collator) {
	c0 := collate.New(language.Und)

	// Iterate over the characters of all locales and check that the new
	// collator produces the same keys as the default collator.
	buf := collate.Buffer{}

	// Add all common and not too uncommon runes to the test set.
	for i := rune(0); i < 0x30000; i++ {
		testInput.add(string(i))
	}
	for i := rune(0xE0000); i < 0xF0000; i++ {
		testInput.add(string(i))
	}
	for _, str := range testInput.values() {
		k0 := c0.KeyFromString(&buf, str)
		k := c.KeyFromString(&buf, str)
		if !bytes.Equal(k0, k) {
			failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
		}
		buf.Reset()
	}
	fmt.Println("PASS")
}

func main() {
	gen.Init()
	b := build.NewBuilder()
	parseUCA(b)
	if tables.contains("chars") {
		parseMain()
	}
	parseCollation(b)

	c, err := b.Build()
	failOnError(err)

	if *test {
		testCollator(collate.NewFromTable(c))
	} else {
		w := &bytes.Buffer{}

		gen.WriteUnicodeVersion(w)
		gen.WriteCLDRVersion(w)

		if tables.contains("collate") {
			_, err = b.Print(w)
			failOnError(err)
		}
		if tables.contains("chars") {
			printExemplarCharacters(w)
		}
		gen.WriteGoFile("tables.go", *pkg, w.Bytes())
	}
}
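
// Example invocations, for illustration only: the file name maketables.go is
// an assumption, and the flag values below are merely examples of the flags
// defined at the top of this file.
//
//	// Generate tables.go with both the collation and exemplar character
//	// tables for package "collate".
//	go run maketables.go -tables=collate,chars -package=collate
//
//	// Compare freshly generated data against the existing package tables
//	// instead of writing a file.
//	go run maketables.go -test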