// Copyright 2013 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // +build ignore // Language tag table generator. // Data read from the web. package main import ( "bufio" "flag" "fmt" "io" "io/ioutil" "log" "math" "reflect" "regexp" "sort" "strconv" "strings" "golang.org/x/text/internal/gen" "golang.org/x/text/internal/tag" "golang.org/x/text/unicode/cldr" ) var ( test = flag.Bool("test", false, "test existing tables; can be used to compare web data with package data.") outputFile = flag.String("output", "tables.go", "output file for generated tables") ) var comment = []string{ ` lang holds an alphabetically sorted list of ISO-639 language identifiers. All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag. For 2-byte language identifiers, the two successive bytes have the following meaning: - if the first letter of the 2- and 3-letter ISO codes are the same: the second and third letter of the 3-letter ISO code. - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3. For 3-byte language identifiers the 4th byte is 0.`, ` langNoIndex is a bit vector of all 3-letter language codes that are not used as an index in lookup tables. The language ids for these language codes are derived directly from the letters and are not consecutive.`, ` altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives to 2-letter language codes that cannot be derived using the method described above. Each 3-letter code is followed by its 1-byte langID.`, ` altLangIndex is used to convert indexes in altLangISO3 to langIDs.`, ` langAliasMap maps langIDs to their suggested replacements.`, ` script is an alphabetically sorted list of ISO 15924 codes. 
The index of the script in the string, divided by 4, is the internal scriptID.`, ` isoRegionOffset needs to be added to the index of regionISO to obtain the regionID for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for the UN.M49 codes used for groups.)`, ` regionISO holds a list of alphabetically sorted 2-letter ISO region codes. Each 2-letter codes is followed by two bytes with the following meaning: - [A-Z}{2}: the first letter of the 2-letter code plus these two letters form the 3-letter ISO code. - 0, n: index into altRegionISO3.`, ` regionTypes defines the status of a region for various standards.`, ` m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are codes indicating collections of regions.`, ` m49Index gives indexes into fromM49 based on the three most significant bits of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]] for an entry where the first 7 bits match the 7 lsb of the UN.M49 code. The region code is stored in the 9 lsb of the indexed value.`, ` fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`, ` altRegionISO3 holds a list of 3-letter region codes that cannot be mapped to 2-letter codes using the default algorithm. This is a short list.`, ` altRegionIDs holds a list of regionIDs the positions of which match those of the 3-letter ISO codes in altRegionISO3.`, ` variantNumSpecialized is the number of specialized variants in variants.`, ` suppressScript is an index from langID to the dominant script for that language, if it exists. If a script is given, it should be suppressed from the language tag.`, ` likelyLang is a lookup table, indexed by langID, for the most likely scripts and regions given incomplete information. 
If more entries exist for a given language, region and script are the index and size respectively of the list in likelyLangList.`, ` likelyLangList holds lists info associated with likelyLang.`, ` likelyRegion is a lookup table, indexed by regionID, for the most likely languages and scripts given incomplete information. If more entries exist for a given regionID, lang and script are the index and size respectively of the list in likelyRegionList. TODO: exclude containers and user-definable regions from the list.`, ` likelyRegionList holds lists info associated with likelyRegion.`, ` likelyScript is a lookup table, indexed by scriptID, for the most likely languages and regions given a script.`, ` matchLang holds pairs of langIDs of base languages that are typically mutually intelligible. Each pair is associated with a confidence and whether the intelligibility goes one or both ways.`, ` matchScript holds pairs of scriptIDs where readers of one script can typically also read the other. Each is associated with a confidence.`, ` nRegionGroups is the number of region groups.`, ` regionInclusion maps region identifiers to sets of regions in regionInclusionBits, where each set holds all groupings that are directly connected in a region containment graph.`, ` regionInclusionBits is an array of bit vectors where every vector represents a set of region groupings. These sets are used to compute the distance between two regions for the purpose of language matching.`, ` regionInclusionNext marks, for each entry in regionInclusionBits, the set of all groups that are reachable from the groups set in the respective entry.`, } // TODO: consider changing some of these structures to tries. This can reduce // memory, but may increase the need for memory allocations. This could be // mitigated if we can piggyback on language tags for common cases. 
// failOnError panics (via log.Panic) if e is non-nil. The generator treats
// every error as fatal.
func failOnError(e error) {
	if e != nil {
		log.Panic(e)
	}
}

// setType records how a stringSet is used, so that incompatible uses can be
// detected.
type setType int

const (
	Indexed setType = 1 + iota // all elements must be of same size
	Linear
)

// stringSet is a growable, optionally sorted and deduplicated set of strings
// used to build the various identifier tables.
type stringSet struct {
	s              []string
	sorted, frozen bool

	// We often need to update values after the creation of an index is completed.
	// We include a convenience map for keeping track of this.
	update map[string]string

	typ setType // used for checking.
}

// clone returns a copy of ss with its own backing slice. Note that the update
// map (a reference type) is shared between the clone and the original.
func (ss *stringSet) clone() stringSet {
	c := *ss
	c.s = append([]string(nil), c.s...)
	return c
}

// setType panics if ss was already marked with a different, non-zero type.
// NOTE(review): typ is never assigned in this method or anywhere in the
// visible code, so the check only fires if typ is set elsewhere — confirm.
func (ss *stringSet) setType(t setType) {
	if ss.typ != t && ss.typ != 0 {
		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
	}
}

// parse parses a whitespace-separated string and initializes ss with its
// components.
func (ss *stringSet) parse(s string) {
	scan := bufio.NewScanner(strings.NewReader(s))
	scan.Split(bufio.ScanWords)
	for scan.Scan() {
		ss.add(scan.Text())
	}
}

// assertChangeable panics if ss has been frozen.
func (ss *stringSet) assertChangeable() {
	if ss.frozen {
		log.Panic("attempt to modify a frozen stringSet")
	}
}

// add appends s to the set. Since assertChangeable guarantees frozen is
// false here, the assignment sorted = frozen effectively clears the sorted
// flag, forcing a re-sort on the next compact.
func (ss *stringSet) add(s string) {
	ss.assertChangeable()
	ss.s = append(ss.s, s)
	ss.sorted = ss.frozen
}

// freeze sorts and deduplicates the set and then disallows further mutation.
func (ss *stringSet) freeze() {
	ss.compact()
	ss.frozen = true
}

// compact sorts the backing slice and removes duplicates in place.
// It assumes the set is non-empty (a[:k+1] would panic on an empty slice).
func (ss *stringSet) compact() {
	if ss.sorted {
		return
	}
	a := ss.s
	sort.Strings(a)
	k := 0
	for i := 1; i < len(a); i++ {
		if a[k] != a[i] {
			a[k+1] = a[i]
			k++
		}
	}
	ss.s = a[:k+1]
	ss.sorted = ss.frozen
}

// funcSorter adapts an arbitrary less function to sort.Interface by embedding
// sort.StringSlice for Len and Swap.
type funcSorter struct {
	fn func(a, b string) bool
	sort.StringSlice
}

func (s funcSorter) Less(i, j int) bool {
	return s.fn(s.StringSlice[i], s.StringSlice[j])
}

// sortFunc deduplicates the set and then re-sorts it with the given
// comparison function.
func (ss *stringSet) sortFunc(f func(a, b string) bool) {
	ss.compact()
	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
}

// remove deletes s from the set if present; it is a no-op otherwise.
func (ss *stringSet) remove(s string) {
	ss.assertChangeable()
	if i, ok := ss.find(s); ok {
		copy(ss.s[i:], ss.s[i+1:])
		ss.s = ss.s[:len(ss.s)-1]
	}
}

// replace substitutes nu for ol, which must be present, and clears the
// sorted flag (frozen is presumably false here — index would have paniced
// through find on a mutated frozen set only indirectly; TODO confirm).
func (ss *stringSet) replace(ol, nu string) {
	ss.s[ss.index(ol)] = nu
	ss.sorted = ss.frozen
}

// index returns the position of s in the sorted set and panics, reporting
// the closest match, if s is absent.
func (ss *stringSet) index(s string) int {
	ss.setType(Indexed)
	i, ok := ss.find(s)
	if !ok {
		if i < len(ss.s) {
			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
		}
		log.Panicf("find: item %q is not in list", s)
	}
	return i
}

// find binary-searches for s, returning its insertion point and whether it
// is present. It compacts (sorts) the set first.
func (ss *stringSet) find(s string) (int, bool) {
	ss.compact()
	i := sort.SearchStrings(ss.s, s)
	return i, i != len(ss.s) && ss.s[i] == s
}

// slice returns the sorted, deduplicated contents.
func (ss *stringSet) slice() []string {
	ss.compact()
	return ss.s
}

// updateLater records that v should eventually be associated with key;
// consumers read ss.update after the index has been built.
func (ss *stringSet) updateLater(v, key string) {
	if ss.update == nil {
		ss.update = map[string]string{}
	}
	ss.update[v] = key
}

// join joins the string and ensures that all entries are of the same length.
func (ss *stringSet) join() string {
	ss.setType(Indexed)
	n := len(ss.s[0])
	for _, s := range ss.s {
		if len(s) != n {
			log.Panicf("join: not all entries are of the same length: %q", s)
		}
	}
	// A sentinel of 0xff bytes is appended so lookups past the last real
	// entry compare high.
	ss.s = append(ss.s, strings.Repeat("\xff", n))
	return strings.Join(ss.s, "")
}

// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
type ianaEntry struct {
	typ            string
	description    []string
	scope          string
	added          string
	preferred      string
	deprecated     string
	suppressScript string
	macro          string
	prefix         []string
}

// builder aggregates the parsed CLDR and IANA data and the output writer used
// to emit the generated tables.
type builder struct {
	w    *gen.CodeWriter
	hw   io.Writer // MultiWriter for w and w.Hash
	data *cldr.CLDR
	supp *cldr.SupplementalData

	// indices
	locale      stringSet // common locales
	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
	langNoIndex stringSet // 3-letter ISO codes with no associated data
	script      stringSet // 4-letter ISO codes
	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
	variant     stringSet // 4-8-alphanumeric variant code.

	// Region codes that are groups with their corresponding group IDs.
	groups map[int]index

	// langInfo
	registry map[string]*ianaEntry
}

type index uint

// newBuilder decodes the CLDR core zip and parses the IANA subtag registry,
// returning a builder ready to generate tables into w.
func newBuilder(w *gen.CodeWriter) *builder {
	r := gen.OpenCLDRCoreZip()
	defer r.Close()
	d := &cldr.Decoder{}
	data, err := d.DecodeZip(r)
	failOnError(err)
	b := builder{
		w:    w,
		hw:   io.MultiWriter(w, w.Hash),
		data: data,
		supp: data.Supplemental(),
	}
	b.parseRegistry()
	return &b
}

// parseRegistry reads the IANA language-subtag-registry, a sequence of
// "Key: value" records, word by word, and fills b.registry. A "Subtag:" or
// "Tag:" range (e.g. "qaa..qtz") registers every code in the range.
func (b *builder) parseRegistry() {
	r := gen.OpenIANAFile("assignments/language-subtag-registry")
	defer r.Close()
	b.registry = make(map[string]*ianaEntry)

	scan := bufio.NewScanner(r)
	scan.Split(bufio.ScanWords)
	var record *ianaEntry
	for more := scan.Scan(); more; {
		key := scan.Text()
		more = scan.Scan()
		value := scan.Text()
		switch key {
		case "Type:":
			record = &ianaEntry{typ: value}
		case "Subtag:", "Tag:":
			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
				for a := s[0]; a <= s[1]; a = inc(a) {
					b.addToRegistry(a, record)
				}
			} else {
				b.addToRegistry(value, record)
			}
		case "Suppress-Script:":
			record.suppressScript = value
		case "Added:":
			record.added = value
		case "Deprecated:":
			record.deprecated = value
		case "Macrolanguage:":
			record.macro = value
		case "Preferred-Value:":
			record.preferred = value
		case "Prefix:":
			record.prefix = append(record.prefix, value)
		case "Scope:":
			record.scope = value
		case "Description:":
			// Descriptions span multiple words; keep consuming words until the
			// next key ("Xxx:") or a record separator ("%%") is seen.
			buf := []byte(value)
			for more = scan.Scan(); more; more = scan.Scan() {
				b := scan.Bytes()
				if b[0] == '%' || b[len(b)-1] == ':' {
					break
				}
				buf = append(buf, ' ')
				buf = append(buf, b...)
			}
			record.description = append(record.description, string(buf))
			// The scanner is already positioned on the next key; skip the
			// trailing scan.Scan() below.
			continue
		default:
			continue
		}
		more = scan.Scan()
	}
	if scan.Err() != nil {
		log.Panic(scan.Err())
	}
}

// addToRegistry inserts entry under key, tolerating only the documented
// language/extlang duplication.
func (b *builder) addToRegistry(key string, entry *ianaEntry) {
	if info, ok := b.registry[key]; ok {
		if info.typ != "language" || entry.typ != "extlang" {
			log.Fatalf("parseRegistry: tag %q already exists", key)
		}
	} else {
		b.registry[key] = entry
	}
}

// commentIndex maps a table identifier to its doc comment from the comment
// slice; populated by init.
var commentIndex = make(map[string]string)

func init() {
	for _, s := range comment {
		// The key is the first word of the comment text.
		key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
		commentIndex[key] = s
	}
}

// comment writes the registered doc comment for name, or a blank line if
// none is registered.
func (b *builder) comment(name string) {
	if s := commentIndex[name]; len(s) > 0 {
		b.w.WriteComment(s)
	} else {
		fmt.Fprintln(b.w)
	}
}

// pf writes a formatted line to both the output and the hash writer.
func (b *builder) pf(f string, x ...interface{}) {
	fmt.Fprintf(b.hw, f, x...)
	fmt.Fprint(b.hw, "\n")
}

// p writes its arguments as a line to both the output and the hash writer.
func (b *builder) p(x ...interface{}) {
	fmt.Fprintln(b.hw, x...)
}

// addSize accounts s bytes to the total table size and emits a size comment.
func (b *builder) addSize(s int) {
	b.w.Size += s
	b.pf("// Size: %d bytes", s)
}

// writeConst emits the constant name with value x, preceded by its
// registered comment.
func (b *builder) writeConst(name string, x interface{}) {
	b.comment(name)
	b.w.WriteConst(name, x)
}

// writeConsts computes f(v) for all v in values and writes the results
// as constants named _v to a single constant block.
func (b *builder) writeConsts(f func(string) int, values ...string) {
	b.pf("const (")
	for _, v := range values {
		b.pf("\t_%s = %v", v, f(v))
	}
	b.pf(")")
}

// writeType writes the type of the given value, which must be a struct.
func (b *builder) writeType(value interface{}) { b.comment(reflect.TypeOf(value).Name()) b.w.WriteType(value) } func (b *builder) writeSlice(name string, ss interface{}) { b.writeSliceAddSize(name, 0, ss) } func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) { b.comment(name) b.w.Size += extraSize v := reflect.ValueOf(ss) t := v.Type().Elem() b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len()) fmt.Fprintf(b.w, "var %s = ", name) b.w.WriteArray(ss) b.p() } type fromTo struct { from, to uint16 } func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) { ss.sortFunc(func(a, b string) bool { return index(a) < index(b) }) m := []fromTo{} for _, s := range ss.s { m = append(m, fromTo{index(s), index(ss.update[s])}) } b.writeSlice(name, m) } const base = 'z' - 'a' + 1 func strToInt(s string) uint { v := uint(0) for i := 0; i < len(s); i++ { v *= base v += uint(s[i] - 'a') } return v } // converts the given integer to the original ASCII string passed to strToInt. // len(s) must match the number of characters obtained. func intToStr(v uint, s []byte) { for i := len(s) - 1; i >= 0; i-- { s[i] = byte(v%base) + 'a' v /= base } } func (b *builder) writeBitVector(name string, ss []string) { vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8))) for _, s := range ss { v := strToInt(s) vec[v/8] |= 1 << (v % 8) } b.writeSlice(name, vec) } // TODO: convert this type into a list or two-stage trie. 
// writeMapFunc emits m as a map[string]uint16 literal named name, with the
// values transformed by f and the keys emitted in sorted order for
// deterministic output.
func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
	b.comment(name)
	v := reflect.ValueOf(m)
	sz := v.Len() * (2 + int(v.Type().Key().Size()))
	for _, k := range m {
		sz += len(k)
	}
	b.addSize(sz)
	keys := []string{}
	b.pf(`var %s = map[string]uint16{`, name)
	for k := range m {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	for _, k := range keys {
		b.pf("\t%q: %v,", k, f(m[k]))
	}
	b.p("}")
}

// writeMap emits an arbitrary map as a var declaration. It renders the map
// with %#v, splits the result into entries, and sorts them so the generated
// source is deterministic.
func (b *builder) writeMap(name string, m interface{}) {
	b.comment(name)
	v := reflect.ValueOf(m)
	sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
	b.addSize(sz)
	f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
		return strings.IndexRune("{}, ", r) != -1
	})
	sort.Strings(f[1:])
	b.pf(`var %s = %s{`, name, f[0])
	for _, kv := range f[1:] {
		b.pf("\t%s,", kv)
	}
	b.p("}")
}

// langIndex returns the compact numeric id for language s: 0 for "und",
// the position in the indexed lang set if present, and otherwise the
// base-26 encoding offset past the indexed ids.
func (b *builder) langIndex(s string) uint16 {
	if s == "und" {
		return 0
	}
	if i, ok := b.lang.find(s); ok {
		return uint16(i)
	}
	return uint16(strToInt(s)) + uint16(len(b.lang.s))
}

// inc advances the string to its lexicographical successor.
func inc(s string) string {
	const maxTagLength = 4
	var buf [maxTagLength]byte
	intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
	// Restore the capitalization pattern of the input.
	for i := 0; i < len(s); i++ {
		if s[i] <= 'Z' {
			buf[i] -= 'a' - 'A'
		}
	}
	return string(buf[:len(s)])
}

// parseIndices populates the language, script, region, and variant string
// sets from the IANA registry and CLDR data, emits the numLanguages,
// numScripts, and numRegions constants, and seeds the "unspecified" dummy
// entries at the front of each set.
func (b *builder) parseIndices() {
	meta := b.supp.Metadata

	for k, v := range b.registry {
		var ss *stringSet
		switch v.typ {
		case "language":
			if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
				b.lang.add(k)
				continue
			} else {
				ss = &b.langNoIndex
			}
		case "region":
			ss = &b.region
		case "script":
			ss = &b.script
		case "variant":
			ss = &b.variant
		default:
			continue
		}
		ss.add(k)
	}
	// Include any language for which there is data.
	for _, lang := range b.data.Locales() {
		if x := b.data.RawLDML(lang); false ||
			x.LocaleDisplayNames != nil ||
			x.Characters != nil ||
			x.Delimiters != nil ||
			x.Measurement != nil ||
			x.Dates != nil ||
			x.Numbers != nil ||
			x.Units != nil ||
			x.ListPatterns != nil ||
			x.Collations != nil ||
			x.Segmentations != nil ||
			x.Rbnf != nil ||
			x.Annotations != nil ||
			x.Metadata != nil {
			from := strings.Split(lang, "_")
			if lang := from[0]; lang != "root" {
				b.lang.add(lang)
			}
		}
	}
	// Include locales for plural rules, which uses a different structure.
	for _, plurals := range b.data.Supplemental().Plurals {
		for _, rules := range plurals.PluralRules {
			for _, lang := range strings.Split(rules.Locales, " ") {
				if lang = strings.Split(lang, "_")[0]; lang != "root" {
					b.lang.add(lang)
				}
			}
		}
	}
	// Include languages in likely subtags.
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		b.lang.add(from[0])
	}
	// Include ISO-639 alpha-3 bibliographic entries.
	for _, a := range meta.Alias.LanguageAlias {
		if a.Reason == "bibliographic" {
			b.langNoIndex.add(a.Type)
		}
	}
	// Include regions in territoryAlias (not all are in the IANA registry!)
	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(reg.Type) == 2 {
			b.region.add(reg.Type)
		}
	}

	// 3-letter codes that ended up in the indexed set must not also be in
	// the non-indexed bit vector.
	for _, s := range b.lang.s {
		if len(s) == 3 {
			b.langNoIndex.remove(s)
		}
	}
	b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
	b.writeConst("numScripts", len(b.script.slice()))
	b.writeConst("numRegions", len(b.region.slice()))

	// Add dummy codes at the start of each list to represent "unspecified".
	b.lang.add("---")
	b.script.add("----")
	b.region.add("---")

	// common locales
	b.locale.parse(meta.DefaultContent.Locales)
}

// TODO: region inclusion data will probably not be use used in future matchers.

// computeRegionGroups assigns a small index to every region that acts as a
// group: the leading UN.M49 numeric entries plus CLDR territory containment
// groups (except UN and EZ), emitting the nRegionGroups constant.
func (b *builder) computeRegionGroups() {
	b.groups = make(map[int]index)

	// Create group indices.
	// Numeric (M49) codes sort before 'A', so they occupy the front of the
	// region set, starting after the "---" dummy at index 0.
	for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
		b.groups[i] = index(len(b.groups))
	}
	for _, g := range b.supp.TerritoryContainment.Group {
		// Skip UN and EURO zone as they are flattening the containment
		// relationship.
		if g.Type == "EZ" || g.Type == "UN" {
			continue
		}
		group := b.region.index(g.Type)
		if _, ok := b.groups[group]; !ok {
			b.groups[group] = index(len(b.groups))
		}
	}
	// Group sets are stored as 32-bit vectors elsewhere.
	if len(b.groups) > 32 {
		log.Fatalf("only 32 groups supported, found %d", len(b.groups))
	}
	b.writeConst("nRegionGroups", len(b.groups))
}

// langConsts lists the language codes for which named constants are emitted.
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en",
	"es", "et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy",
	"id", "is", "it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt",
	"lv", "mk", "ml", "mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl",
	"no", "pa", "pl", "pt", "ro", "ru", "sh", "si", "sk", "sl", "sq", "sr",
	"sv", "sw", "ta", "te", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi",
	"zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}

// writeLanguage generates all tables needed for language canonicalization.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes, legacy and grandfathered tags.)
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]langAliasType{}

	// altLangISO3 get the alternative ISO3 names that need to be mapped.
	altLangISO3 := stringSet{}
	// Add dummy start to avoid the use of index 0.
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// TODO: support mapping to tags
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = langMacro
			case "deprecated":
				// handled elsewhere
				continue
			case "bibliographic", "legacy":
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = langLegacy
			default:
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}
	// Manually add the mapping of "nb" (Norwegian) to its macro language.
	// This can be removed if CLDR adopts this change.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = langMacro

	for k, v := range b.registry {
		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = langDeprecated
		}
	}

	// Fix CLDR mappings.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")
	lang.updateLater("bh", "bih")

	// Ensure that each 2-letter code is matched with a 3-letter code.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Complete canonialized language tags.
	lang.freeze()
	for i, v := range lang.s {
		// We can avoid these manual entries by using the IANI registry directly.
		// Seems easier to update the list manually, as changes are rare.
		// The panic in this loop will trigger if we miss an entry.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				add = s[1:]
			} else {
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// space of all valid 3-letter language identifiers.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		if i > 0 {
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex)
	types := make([]langAliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("langAliasTypes", types)
}

// scriptConsts lists the script codes for which named constants are emitted.
var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
	"Zzzz",
}

// writeScript emits the script table, the suppressScript index, and verifies
// the hard-coded assumption about deprecated scripts in CLDR.
func (b *builder) writeScript() {
	b.writeConsts(b.script.index, scriptConsts...)
	b.writeConst("script", tag.Index(b.script.join()))

	supp := make([]uint8, len(b.lang.slice()))
	for i, v := range b.lang.slice()[1:] {
		if sc := b.registry[v].suppressScript; sc != "" {
			supp[i+1] = uint8(b.script.index(sc))
		}
	}
	b.writeSlice("suppressScript", supp)

	// There is only one deprecated script in CLDR. This value is hard-coded.
	// We check here if the code must be updated.
	for _, a := range b.supp.Metadata.Alias.ScriptAlias {
		if a.Type != "Qaai" {
			log.Panicf("unexpected deprecated stript %q", a.Type)
		}
	}
}

// parseM49 parses a UN.M49 numeric region code; an empty string yields 0.
// The 10-bit limit matches the packing used in m49Index/fromM49.
func parseM49(s string) int16 {
	if len(s) == 0 {
		return 0
	}
	v, err := strconv.ParseUint(s, 10, 10)
	failOnError(err)
	return int16(v)
}

// regionConsts lists the region codes for which named constants are emitted.
var regionConsts = []string{
	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}

// writeRegion emits all region-related tables: regionTypes, regionISO with
// its 3-letter alternates, the UN.M49 mappings (m49, m49Index, fromM49), and
// the deprecated-region map.
func (b *builder) writeRegion() {
	b.writeConsts(b.region.index, regionConsts...)

	isoOffset := b.region.index("AA")
	m49map := make([]int16, len(b.region.slice()))
	fromM49map := make(map[int16]int)
	altRegionISO3 := ""
	altRegionIDs := []uint16{}

	b.writeConst("isoRegionOffset", isoOffset)

	// 2-letter region lookup and mapping to numeric codes.
	regionISO := b.region.clone()
	regionISO.s = regionISO.s[isoOffset:]
	regionISO.sorted = false

	regionTypes := make([]byte, len(b.region.s))

	// Is the region valid BCP 47?
	for s, e := range b.registry {
		if len(s) == 2 && s == strings.ToUpper(s) {
			i := b.region.index(s)
			for _, d := range e.description {
				if strings.Contains(d, "Private use") {
					regionTypes[i] = iso3166UserAssgined
				}
			}
			regionTypes[i] |= bcp47Region
		}
	}

	// Is the region a valid ccTLD?
	r := gen.OpenIANAFile("domains/root/db")
	defer r.Close()

	buf, err := ioutil.ReadAll(r)
	failOnError(err)
	re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
	for _, m := range re.FindAllSubmatch(buf, -1) {
		i := b.region.index(strings.ToUpper(string(m[1])))
		regionTypes[i] |= ccTLD
	}

	b.writeSlice("regionTypes", regionTypes)

	// update appends the 3-letter form for iso2 to its regionISO entry: as
	// the trailing two letters when the first letters match, otherwise as a
	// 0 byte plus an index into altRegionISO3.
	iso3Set := make(map[string]int)
	update := func(iso2, iso3 string) {
		i := regionISO.index(iso2)
		if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
			regionISO.s[i] += iso3[1:]
			iso3Set[iso3] = -1
		} else {
			if ok && j >= 0 {
				regionISO.s[i] += string([]byte{0, byte(j)})
			} else {
				iso3Set[iso3] = len(altRegionISO3)
				regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
				altRegionISO3 += iso3
				altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
			}
		}
	}
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		i := regionISO.index(tc.Type) + isoOffset
		if d := m49map[i]; d != 0 {
			log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
		}
		m49 := parseM49(tc.Numeric)
		m49map[i] = m49
		if r := fromM49map[m49]; r == 0 {
			fromM49map[m49] = i
		} else if r != i {
			// Prefer the less-deprecated (or not deprecated) region for the
			// reverse mapping.
			dep := b.registry[regionISO.s[r-isoOffset]].deprecated
			if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
				fromM49map[m49] = i
			}
		}
	}
	for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
			from := parseM49(ta.Type)
			if r := fromM49map[from]; r == 0 {
				fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
			}
		}
	}
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		if len(tc.Alpha3) == 3 {
			update(tc.Type, tc.Alpha3)
		}
	}
	// This entries are not included in territoryCodes. Mostly 3-letter variants
	// of deleted codes and an entry for QU.
	for _, m := range []struct{ iso2, iso3 string }{
		{"CT", "CTE"},
		{"DY", "DHY"},
		{"HV", "HVO"},
		{"JT", "JTN"},
		{"MI", "MID"},
		{"NH", "NHB"},
		{"NQ", "ATN"},
		{"PC", "PCI"},
		{"PU", "PUS"},
		{"PZ", "PCZ"},
		{"RH", "RHO"},
		{"VD", "VDR"},
		{"WK", "WAK"},
		// These three-letter codes are used for others as well.
		{"FQ", "ATF"},
	} {
		update(m.iso2, m.iso3)
	}
	for i, s := range regionISO.s {
		if len(s) != 4 {
			// Pad entries without a 3-letter suffix to the uniform 4-byte
			// width that join() requires.
			// NOTE(review): the collapsed source rendered this literal as a
			// single space, which would leave 3-byte entries and make join()
			// panic; two spaces restores the evident intent — confirm against
			// upstream.
			regionISO.s[i] = s + "  "
		}
	}
	b.writeConst("regionISO", tag.Index(regionISO.join()))
	b.writeConst("altRegionISO3", altRegionISO3)
	b.writeSlice("altRegionIDs", altRegionIDs)

	// Create list of deprecated regions.
	// TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
	// Transitionally-reserved mapping not included.
	regionOldMap := stringSet{}
	// Include regions in territoryAlias (not all are in the IANA registry!)
	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
			regionOldMap.add(reg.Type)
			regionOldMap.updateLater(reg.Type, reg.Replacement)
			i, _ := regionISO.find(reg.Type)
			j, _ := regionISO.find(reg.Replacement)
			if k := m49map[i+isoOffset]; k == 0 {
				m49map[i+isoOffset] = m49map[j+isoOffset]
			}
		}
	}
	// NOTE(review): the source rendered the next argument as mojibake
	// ("®ionOldMap"); restored as &regionOldMap — confirm against upstream.
	b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
		return uint16(b.region.index(s))
	})
	// 3-digit region lookup, groupings.
	for i := 1; i < isoOffset; i++ {
		m := parseM49(b.region.s[i])
		m49map[i] = m
		fromM49map[m] = i
	}
	b.writeSlice("m49", m49map)

	const (
		searchBits = 7
		regionBits = 9
	)
	if len(m49map) >= 1<<regionBits {
		log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
	}
	m49Index := [9]int16{}
	fromM49 := []uint16{}
	m49 := []int{}
	for k, _ := range fromM49map {
		m49 = append(m49, int(k))
	}
	sort.Ints(m49)
	for _, k := range m49[1:] {
		// Pack the low searchBits of the code with the region id in the low
		// regionBits of each fromM49 entry.
		val := (k & (1<<searchBits - 1)) << regionBits
		fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
		m49Index[1:][k>>searchBits] = int16(len(fromM49))
	}
	b.writeSlice("m49Index", m49Index)
	b.writeSlice("fromM49", fromM49)
}

const (
	// TODO: put these lists in regionTypes as user data? Could be used for
	// various optimizations and refinements and could be exposed in the API.
	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
	iso3166Trans  = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
	// DY and RH are actually not deleted, but indeterminately reserved.
	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
)

const (
	iso3166UserAssgined = 1 << iota
	ccTLD
	bcp47Region
)

// find returns the index of s in list, or -1 if absent.
func find(list []string, s string) int {
	for i, t := range list {
		if t == s {
			return i
		}
	}
	return -1
}

// writeVariants generates per-variant information and creates a map from variant
// name to index value. We assign index values such that sorting multiple
// variants by index value will result in the correct order.
// There are two types of variants: specialized and general. Specialized variants
// are only applicable to certain language or language-script pairs. Generalized
// variants apply to any language. Generalized variants always sort after
// specialized variants. We will therefore always assign a higher index value
// to a generalized variant than any other variant. Generalized variants are
// sorted alphabetically among themselves.
// Specialized variants may also sort after other specialized variants. Such
// variants will be ordered after any of the variants they may follow.
// We assume that if a variant x is followed by a variant y, then for any prefix
// p of x, p-x is a prefix of y. This allows us to order tags based on the
// maximum of the length of any of its prefixes.
// TODO: it is possible to define a set of Prefix values on variants such that
// a total order cannot be defined to the point that this algorithm breaks.
// In other words, we cannot guarantee the same order of variants for the
// future using the same algorithm or for non-compliant combinations of
// variants. For this reason, consider using simple alphabetic sorting
// of variants and ignore Prefix restrictions altogether.
func (b *builder) writeVariant() {
	generalized := stringSet{}
	specialized := stringSet{}
	specializedExtend := stringSet{}
	// Collate the variants by type and check assumptions.
	for _, v := range b.variant.slice() {
		e := b.registry[v]
		if len(e.prefix) == 0 {
			generalized.add(v)
			continue
		}
		c := strings.Split(e.prefix[0], "-")
		hasScriptOrRegion := false
		if len(c) > 1 {
			_, hasScriptOrRegion = b.script.find(c[1])
			if !hasScriptOrRegion {
				_, hasScriptOrRegion = b.region.find(c[1])
			}
		}
		if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
			// Variant is preceded by a language.
			specialized.add(v)
			continue
		}
		// Variant is preceded by another variant.
		specializedExtend.add(v)
		prefix := c[0] + "-"
		if hasScriptOrRegion {
			prefix += c[1]
		}
		for _, p := range e.prefix {
			// Verify that the prefix minus the last element is a prefix of the
			// predecessor element.
			i := strings.LastIndex(p, "-")
			pred := b.registry[p[i+1:]]
			if find(pred.prefix, p[:i]) < 0 {
				log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
			}
			// The sorting used below does not work in the general case. It works
			// if we assume that variants that may be followed by others only have
			// prefixes of the same length. Verify this.
			count := strings.Count(p[:i], "-")
			for _, q := range pred.prefix {
				if c := strings.Count(q, "-"); c != count {
					log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
				}
			}
			if !strings.HasPrefix(p, prefix) {
				log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
			}
		}
	}

	// Sort extended variants.
	a := specializedExtend.s
	less := func(v, w string) bool {
		// Sort by the maximum number of elements.
		maxCount := func(s string) (max int) {
			for _, p := range b.registry[s].prefix {
				if c := strings.Count(p, "-"); c > max {
					max = c
				}
			}
			return
		}
		if cv, cw := maxCount(v), maxCount(w); cv != cw {
			return cv < cw
		}
		// Sort by name as tie breaker.
		return v < w
	}
	sort.Sort(funcSorter{less, sort.StringSlice(a)})
	specializedExtend.frozen = true

	// Create index from variant name to index.
	variantIndex := make(map[string]uint8)
	add := func(s []string) {
		for _, v := range s {
			variantIndex[v] = uint8(len(variantIndex))
		}
	}
	add(specialized.slice())
	add(specializedExtend.s)
	numSpecialized := len(variantIndex)
	add(generalized.slice())
	if n := len(variantIndex); n > 255 {
		log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
	}
	b.writeMap("variantIndex", variantIndex)
	b.writeConst("variantNumSpecialized", numSpecialized)
}

// writeLanguageInfo is a placeholder; it currently emits nothing.
func (b *builder) writeLanguageInfo() {
}

// writeLikelyData writes tables that are used both for finding parent relations and for
// language matching. Each entry contains additional bits to indicate the status of the
// data to know when it cannot be used for parent relations.
func (b *builder) writeLikelyData() {
	// Flag bits stored in the flags field of the generated entries:
	// isList marks an entry whose region/lang field is an index into a list
	// (with script holding the list length); scriptInFrom/regionInFrom record
	// which subtag was present in the "from" side of the likely-subtag rule.
	const (
		isList = 1 << iota
		scriptInFrom
		regionInFrom
	)
	type ( // generated types
		likelyScriptRegion struct {
			region uint16
			script uint8
			flags  uint8
		}
		likelyLangScript struct {
			lang   uint16
			script uint8
			flags  uint8
		}
		likelyLangRegion struct {
			lang   uint16
			region uint16
		}
		// likelyTag is used for getting likely tags for group regions, where
		// the likely region might be a region contained in the group.
		likelyTag struct {
			lang   uint16
			region uint16
			script uint8
		}
	)
	var ( // generated variables
		likelyRegionGroup = make([]likelyTag, len(b.groups))
		likelyLang        = make([]likelyScriptRegion, len(b.lang.s))
		likelyRegion      = make([]likelyLangScript, len(b.region.s))
		likelyScript      = make([]likelyLangRegion, len(b.script.s))
		likelyLangList    = []likelyScriptRegion{}
		likelyRegionList  = []likelyLangScript{}
	)
	type fromTo struct {
		from, to []string
	}
	// Bucket the CLDR likelySubtags rules by the key subtag of their "from"
	// side: language-keyed rules, region-keyed rules, script-keyed and
	// group-region rules are filled in directly.
	langToOther := map[int][]fromTo{}
	regionToOther := map[int][]fromTo{}
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		to := strings.Split(m.To, "_")
		// Sanity checks on the CLDR data; "to" is always fully specified.
		if len(to) != 3 {
			log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
		}
		if len(from) > 3 {
			log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
		}
		if from[0] != to[0] && from[0] != "und" {
			log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
		}
		if len(from) == 3 {
			if from[2] != to[2] {
				log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
			}
			if from[0] != "und" {
				log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
			}
		}
		if len(from) == 1 || from[0] != "und" {
			// Rule keyed by language (id 0 is used for a bare "und").
			id := 0
			if from[0] != "und" {
				id = b.lang.index(from[0])
			}
			langToOther[id] = append(langToOther[id], fromTo{from, to})
		} else if len(from) == 2 && len(from[1]) == 4 {
			// "und_Xxxx": rule keyed by script only.
			sid := b.script.index(from[1])
			likelyScript[sid].lang = uint16(b.langIndex(to[0]))
			likelyScript[sid].region = uint16(b.region.index(to[2]))
		} else {
			// Rule keyed by region; group regions go straight into
			// likelyRegionGroup, plain regions are collected for the pass below.
			r := b.region.index(from[len(from)-1])
			if id, ok := b.groups[r]; ok {
				if from[0] != "und" {
					log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
				}
				likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
				likelyRegionGroup[id].script = uint8(b.script.index(to[1]))
				likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
			} else {
				regionToOther[r] = append(regionToOther[r], fromTo{from, to})
			}
		}
	}
	b.writeType(likelyLangRegion{})
	b.writeSlice("likelyScript", likelyScript)

	// Build the language-keyed table: single rules are stored inline; multiple
	// rules for one language are stored in likelyLangList with the inline
	// entry holding the list offset (region field) and length (script field).
	for id := range b.lang.s {
		list := langToOther[id]
		if len(list) == 1 {
			likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
			likelyLang[id].script = uint8(b.script.index(list[0].to[1]))
		} else if len(list) > 1 {
			likelyLang[id].flags = isList
			likelyLang[id].region = uint16(len(likelyLangList))
			likelyLang[id].script = uint8(len(list))
			for _, x := range list {
				flags := uint8(0)
				if len(x.from) > 1 {
					if x.from[1] == x.to[2] {
						flags = regionInFrom
					} else {
						flags = scriptInFrom
					}
				}
				likelyLangList = append(likelyLangList, likelyScriptRegion{
					region: uint16(b.region.index(x.to[2])),
					script: uint8(b.script.index(x.to[1])),
					flags:  flags,
				})
			}
		}
	}
	// TODO: merge suppressScript data with this table.
	b.writeType(likelyScriptRegion{})
	b.writeSlice("likelyLang", likelyLang)
	b.writeSlice("likelyLangList", likelyLangList)

	// Build the region-keyed table analogously, with overflow entries in
	// likelyRegionList.
	for id := range b.region.s {
		list := regionToOther[id]
		if len(list) == 1 {
			likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
			likelyRegion[id].script = uint8(b.script.index(list[0].to[1]))
			if len(list[0].from) > 2 {
				likelyRegion[id].flags = scriptInFrom
			}
		} else if len(list) > 1 {
			likelyRegion[id].flags = isList
			likelyRegion[id].lang = uint16(len(likelyRegionList))
			likelyRegion[id].script = uint8(len(list))
			for i, x := range list {
				// Only the first entry of a list may lack a script subtag.
				if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
					log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
				}
				x := likelyLangScript{
					lang:   uint16(b.langIndex(x.to[0])),
					script: uint8(b.script.index(x.to[1])),
				}
				// NOTE(review): flags are derived from list[0], not the current
				// entry — given the check above this marks the whole list based
				// on whether its first rule carries a script; confirm intended.
				if len(list[0].from) > 2 {
					x.flags = scriptInFrom
				}
				likelyRegionList = append(likelyRegionList, x)
			}
		}
	}
	b.writeType(likelyLangScript{})
	b.writeSlice("likelyRegion", likelyRegion)
	b.writeSlice("likelyRegionList", likelyRegionList)

	b.writeType(likelyTag{})
	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
}

// mutualIntelligibility records that speakers wanting language `want` may be
// served language `have` with confidence `conf`; `oneway` marks asymmetric
// intelligibility.
type mutualIntelligibility struct {
	want, have uint16
	conf       uint8
	oneway     bool
}

// scriptIntelligibility is the script-level analogue of mutualIntelligibility,
// optionally restricted to one language.
type scriptIntelligibility struct {
	lang       uint16 // langID or 0 if *
	want, have uint8
	conf       uint8
}

// sortByConf sorts mutualIntelligibility entries by descending confidence.
type sortByConf []mutualIntelligibility

func (l sortByConf) Less(a, b int) bool {
	return l[a].conf > l[b].conf
}

func (l sortByConf) Swap(a, b int) {
	l[a], l[b] = l[b], l[a]
}

func (l sortByConf) Len() int {
	return len(l)
}

// toConf converts a percentage value [0, 100] to a confidence class.
func toConf(pct uint8) uint8 {
	switch {
	case pct == 100:
		return 3 // Exact
	case pct >= 90:
		return 2 // High
	case pct > 50:
		return 1 // Low
	default:
		return 0 // No
	}
}

// writeMatchData writes tables with languages and scripts for which there is
// mutual intelligibility. The data is based on CLDR's languageMatching data.
// Note that we use a different algorithm than the one defined by CLDR and that
// we slightly modify the data. For example, we convert scores to confidence levels.
// We also drop all region-related data as we use a different algorithm to
// determine region equivalence.
func (b *builder) writeMatchData() {
	b.writeType(mutualIntelligibility{})
	b.writeType(scriptIntelligibility{})
	lm := b.supp.LanguageMatching.LanguageMatches
	cldr.MakeSlice(&lm).SelectAnyOf("type", "written")

	matchLang := []mutualIntelligibility{}
	matchScript := []scriptIntelligibility{}
	// Convert the languageMatch entries in lists keyed by desired language.
	for _, m := range lm[0].LanguageMatch {
		// Different versions of CLDR use different separators.
		desired := strings.Replace(m.Desired, "-", "_", -1)
		supported := strings.Replace(m.Supported, "-", "_", -1)
		d := strings.Split(desired, "_")
		s := strings.Split(supported, "_")
		if len(d) != len(s) || len(d) > 2 {
			// Skip all entries with regions and work around CLDR bug.
			continue
		}
		// NOTE(review): parse error ignored, so a malformed Percent silently
		// becomes 0 — presumably Percent is always numeric in CLDR; confirm.
		pct, _ := strconv.ParseInt(m.Percent, 10, 8)
		if len(d) == 2 && d[0] == s[0] && len(d[1]) == 4 {
			// language-script pair.
			lang := uint16(0)
			if d[0] != "*" {
				lang = uint16(b.langIndex(d[0]))
			}
			matchScript = append(matchScript, scriptIntelligibility{
				lang: lang,
				want: uint8(b.script.index(d[1])),
				have: uint8(b.script.index(s[1])),
				conf: toConf(uint8(pct)),
			})
			// Unless the rule is one-way, also record the reverse direction.
			if m.Oneway != "true" {
				matchScript = append(matchScript, scriptIntelligibility{
					lang: lang,
					want: uint8(b.script.index(s[1])),
					have: uint8(b.script.index(d[1])),
					conf: toConf(uint8(pct)),
				})
			}
		} else if len(d) == 1 && d[0] != "*" {
			// language-only pair.
			if pct == 100 {
				// nb == no is already handled by macro mapping. Check there
				// really is only this case.
				if d[0] != "no" || s[0] != "nb" {
					log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
				}
				continue
			}
			matchLang = append(matchLang, mutualIntelligibility{
				want:   uint16(b.langIndex(d[0])),
				have:   uint16(b.langIndex(s[0])),
				conf:   uint8(pct),
				oneway: m.Oneway == "true",
			})
		} else {
			// TODO: Handle other mappings.
			// Known, intentionally unhandled patterns; warn about anything else.
			a := []string{"*;*", "*_*;*_*", "es_MX;es_419"}
			s := strings.Join([]string{desired, supported}, ";")
			if i := sort.SearchStrings(a, s); i == len(a) || a[i] != s {
				log.Printf("%q not handled", s)
			}
		}
	}
	sort.Stable(sortByConf(matchLang))
	// collapse percentage into confidence classes
	for i, m := range matchLang {
		matchLang[i].conf = toConf(m.conf)
	}
	b.writeSlice("matchLang", matchLang)
	b.writeSlice("matchScript", matchScript)
}

// writeRegionInclusionData computes and writes the region containment tables:
// regionContainment (bitmask of groups transitively contained in each group),
// regionInclusion (per-region index into regionInclusionBits), and the
// regionInclusionBits/regionInclusionNext tables used to widen the search
// radius step by step.
func (b *builder) writeRegionInclusionData() {
	var (
		// mm holds for each group the set of groups with a distance of 1.
		mm = make(map[int][]index)

		// containment holds for each group the transitive closure of
		// containment of other groups.
		containment = make(map[index][]index)
	)
	for _, g := range b.supp.TerritoryContainment.Group {
		// Skip UN and EURO zone as they are flattening the containment
		// relationship.
		if g.Type == "EZ" || g.Type == "UN" {
			continue
		}
		group := b.region.index(g.Type)
		groupIdx := b.groups[group]
		for _, mem := range strings.Split(g.Contains, " ") {
			r := b.region.index(mem)
			mm[r] = append(mm[r], groupIdx)
			if g, ok := b.groups[r]; ok {
				mm[group] = append(mm[group], g)
				containment[groupIdx] = append(containment[groupIdx], g)
			}
		}
	}

	regionContainment := make([]uint32, len(b.groups))
	for _, g := range b.groups {
		l := containment[g]

		// Compute the transitive closure of containment: appended entries are
		// themselves visited as the loop bound re-evaluates len(l).
		for i := 0; i < len(l); i++ {
			l = append(l, containment[l[i]]...)
		}

		// Compute the bitmask; each group contains itself.
		regionContainment[g] = 1 << g
		for _, v := range l {
			regionContainment[g] |= 1 << v
		}
		// log.Printf("%d: %X", g, regionContainment[g])
	}
	b.writeSlice("regionContainment", regionContainment)

	regionInclusion := make([]uint8, len(b.region.s))
	bvs := make(map[uint32]index)
	// Make the first bitvector positions correspond with the groups.
	for r, i := range b.groups {
		bv := uint32(1 << i)
		for _, g := range mm[r] {
			bv |= 1 << g
		}
		bvs[bv] = i
		regionInclusion[r] = uint8(bvs[bv])
	}
	// Assign a shared bit-vector index to every non-group region; region 0 is
	// skipped as it starts at 1.
	for r := 1; r < len(b.region.s); r++ {
		if _, ok := b.groups[r]; !ok {
			bv := uint32(0)
			for _, g := range mm[r] {
				bv |= 1 << g
			}
			if bv == 0 {
				// Pick the world for unspecified regions.
				bv = 1 << b.groups[b.region.index("001")]
			}
			if _, ok := bvs[bv]; !ok {
				bvs[bv] = index(len(bvs))
			}
			regionInclusion[r] = uint8(bvs[bv])
		}
	}
	b.writeSlice("regionInclusion", regionInclusion)

	// Invert bvs into a dense slice indexed by the assigned indices.
	regionInclusionBits := make([]uint32, len(bvs))
	for k, v := range bvs {
		regionInclusionBits[v] = uint32(k)
	}

	// Add bit vectors for increasingly large distances until a fixed point is reached.
	regionInclusionNext := []uint8{}
	for i := 0; i < len(regionInclusionBits); i++ {
		bits := regionInclusionBits[i]
		next := bits
		for i := uint(0); i < uint(len(b.groups)); i++ {
			if bits&(1<<i) != 0 {
				next |= regionInclusionBits[i]
			}
		}
		if _, ok := bvs[next]; !ok {
			bvs[next] = index(len(bvs))
			regionInclusionBits = append(regionInclusionBits, next)
		}
		regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
	}
	b.writeSlice("regionInclusionBits", regionInclusionBits)
	b.writeSlice("regionInclusionNext", regionInclusionNext)
}

// parentRel describes one parent-locale override: tags with language `lang`,
// script `script` (or any script up to `maxScript` when script is 0), and a
// region in `fromRegion` have their parent in `toRegion`.
type parentRel struct {
	lang       uint16
	script     uint8
	maxScript  uint8
	toRegion   uint16
	fromRegion []uint16
}

// writeParents writes the parent-locale override table derived from CLDR's
// parentLocales supplemental data.
func (b *builder) writeParents() {
	b.writeType(parentRel{})

	parents := []parentRel{}

	// Construct parent overrides. n counts the total number of fromRegion
	// entries across all relations.
	n := 0
	for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
		// Skipping non-standard scripts to root is implemented using addTags.
		if p.Parent == "root" {
			continue
		}

		sub := strings.Split(p.Parent, "_")
		parent := parentRel{lang: b.langIndex(sub[0])}
		if len(sub) == 2 {
			// Parent is language_region; no explicit script.
			// TODO: check that all undefined scripts are indeed Latn in these
			// cases.
			parent.maxScript = uint8(b.script.index("Latn"))
			parent.toRegion = uint16(b.region.index(sub[1]))
		} else {
			// Parent is language_script_region.
			parent.script = uint8(b.script.index(sub[1]))
			parent.maxScript = parent.script
			parent.toRegion = uint16(b.region.index(sub[2]))
		}
		// Collect the region of every listed child locale.
		for _, c := range strings.Split(p.Locales, " ") {
			region := b.region.index(c[strings.LastIndex(c, "_")+1:])
			parent.fromRegion = append(parent.fromRegion, uint16(region))
		}
		parents = append(parents, parent)
		n += len(parent.fromRegion)
	}

	// n*2 presumably accounts for the out-of-struct uint16 fromRegion storage
	// in the size tally — confirm against writeSliceAddSize.
	b.writeSliceAddSize("parents", n*2, parents)
}

// main drives the generator: it parses the CLDR indices and writes all tables
// to tables.go in a fixed order (later tables depend on indices computed by
// earlier steps).
func main() {
	gen.Init()

	gen.Repackage("gen_common.go", "common.go", "language")

	w := gen.NewCodeWriter()
	// NOTE(review): output name is hard-coded; the -output flag declared at the
	// top of the file appears unused here — confirm.
	defer w.WriteGoFile("tables.go", "language")

	fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`)

	b := newBuilder(w)
	gen.WriteCLDRVersion(w)

	b.parseIndices()
	b.writeType(fromTo{})
	b.writeLanguage()
	b.writeScript()
	b.writeRegion()
	b.writeVariant()
	// TODO: b.writeLocale()
	b.computeRegionGroups()
	b.writeLikelyData()
	b.writeMatchData()
	b.writeRegionInclusionData()
	b.writeParents()
}