diff options
Diffstat (limited to 'vendor/golang.org/x/text/language/parse.go')
-rw-r--r-- | vendor/golang.org/x/text/language/parse.go | 859 |
1 files changed, 859 insertions, 0 deletions
diff --git a/vendor/golang.org/x/text/language/parse.go b/vendor/golang.org/x/text/language/parse.go new file mode 100644 index 00000000..cfa28f56 --- /dev/null +++ b/vendor/golang.org/x/text/language/parse.go @@ -0,0 +1,859 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package language + +import ( + "bytes" + "errors" + "fmt" + "sort" + "strconv" + "strings" + + "golang.org/x/text/internal/tag" +) + +// isAlpha returns true if the byte is not a digit. +// b must be an ASCII letter or digit. +func isAlpha(b byte) bool { + return b > '9' +} + +// isAlphaNum returns true if the string contains only ASCII letters or digits. +func isAlphaNum(s []byte) bool { + for _, c := range s { + if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') { + return false + } + } + return true +} + +// errSyntax is returned by any of the parsing functions when the +// input is not well-formed, according to BCP 47. +// TODO: return the position at which the syntax error occurred? +var errSyntax = errors.New("language: tag is not well-formed") + +// ValueError is returned by any of the parsing functions when the +// input is well-formed but the respective subtag is not recognized +// as a valid value. +type ValueError struct { + v [8]byte +} + +func mkErrInvalid(s []byte) error { + var e ValueError + copy(e.v[:], s) + return e +} + +func (e ValueError) tag() []byte { + n := bytes.IndexByte(e.v[:], 0) + if n == -1 { + n = 8 + } + return e.v[:n] +} + +// Error implements the error interface. +func (e ValueError) Error() string { + return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag()) +} + +// Subtag returns the subtag for which the error occurred. +func (e ValueError) Subtag() string { + return string(e.tag()) +} + +// scanner is used to scan BCP 47 tokens, which are separated by _ or -. +type scanner struct { + b []byte + bytes [max99thPercentileSize]byte + token []byte + start int // start position of the current token + end int // end position of the current token + next int // next point for scan + err error + done bool +} + +func makeScannerString(s string) scanner { + scan := scanner{} + if len(s) <= len(scan.bytes) { + scan.b = scan.bytes[:copy(scan.bytes[:], s)] + } else { + scan.b = []byte(s) + } + scan.init() + return scan +} + +// makeScanner returns a scanner using b as the input buffer. +// b is not copied and may be modified by the scanner routines. +func makeScanner(b []byte) scanner { + scan := scanner{b: b} + scan.init() + return scan +} + +func (s *scanner) init() { + for i, c := range s.b { + if c == '_' { + s.b[i] = '-' + } + } + s.scan() +} + +// restToLower converts the string between start and end to lower case. +func (s *scanner) toLower(start, end int) { + for i := start; i < end; i++ { + c := s.b[i] + if 'A' <= c && c <= 'Z' { + s.b[i] += 'a' - 'A' + } + } +} + +func (s *scanner) setError(e error) { + if s.err == nil || (e == errSyntax && s.err != errSyntax) { + s.err = e + } +} + +// resizeRange shrinks or grows the array at position oldStart such that +// a new string of size newSize can fit between oldStart and oldEnd. +// Sets the scan point to after the resized range. +func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) { + s.start = oldStart + if end := oldStart + newSize; end != oldEnd { + diff := end - oldEnd + if end < cap(s.b) { + b := make([]byte, len(s.b)+diff) + copy(b, s.b[:oldStart]) + copy(b[end:], s.b[oldEnd:]) + s.b = b + } else { + s.b = append(s.b[end:], s.b[oldEnd:]...) + } + s.next = end + (s.next - s.end) + s.end = end + } +} + +// replace replaces the current token with repl. +func (s *scanner) replace(repl string) { + s.resizeRange(s.start, s.end, len(repl)) + copy(s.b[s.start:], repl) +} + +// gobble removes the current token from the input. +// Caller must call scan after calling gobble. +func (s *scanner) gobble(e error) { + s.setError(e) + if s.start == 0 { + s.b = s.b[:+copy(s.b, s.b[s.next:])] + s.end = 0 + } else { + s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])] + s.end = s.start - 1 + } + s.next = s.start +} + +// deleteRange removes the given range from s.b before the current token. +func (s *scanner) deleteRange(start, end int) { + s.setError(errSyntax) + s.b = s.b[:start+copy(s.b[start:], s.b[end:])] + diff := end - start + s.next -= diff + s.start -= diff + s.end -= diff +} + +// scan parses the next token of a BCP 47 string. Tokens that are larger +// than 8 characters or include non-alphanumeric characters result in an error +// and are gobbled and removed from the output. +// It returns the end position of the last token consumed. +func (s *scanner) scan() (end int) { + end = s.end + s.token = nil + for s.start = s.next; s.next < len(s.b); { + i := bytes.IndexByte(s.b[s.next:], '-') + if i == -1 { + s.end = len(s.b) + s.next = len(s.b) + i = s.end - s.start + } else { + s.end = s.next + i + s.next = s.end + 1 + } + token := s.b[s.start:s.end] + if i < 1 || i > 8 || !isAlphaNum(token) { + s.gobble(errSyntax) + continue + } + s.token = token + return end + } + if n := len(s.b); n > 0 && s.b[n-1] == '-' { + s.setError(errSyntax) + s.b = s.b[:len(s.b)-1] + } + s.done = true + return end +} + +// acceptMinSize parses multiple tokens of the given size or greater. +// It returns the end position of the last token consumed. +func (s *scanner) acceptMinSize(min int) (end int) { + end = s.end + s.scan() + for ; len(s.token) >= min; s.scan() { + end = s.end + } + return end +} + +// Parse parses the given BCP 47 string and returns a valid Tag. If parsing +// failed it returns an error and any part of the tag that could be parsed. +// If parsing succeeded but an unknown value was found, it returns +// ValueError. The Tag returned in this case is just stripped of the unknown +// value. All other values are preserved. It accepts tags in the BCP 47 format +// and extensions to this standard defined in +// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. +// The resulting tag is canonicalized using the default canonicalization type. +func Parse(s string) (t Tag, err error) { + return Default.Parse(s) +} + +// Parse parses the given BCP 47 string and returns a valid Tag. If parsing +// failed it returns an error and any part of the tag that could be parsed. +// If parsing succeeded but an unknown value was found, it returns +// ValueError. The Tag returned in this case is just stripped of the unknown +// value. All other values are preserved. It accepts tags in the BCP 47 format +// and extensions to this standard defined in +// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. +// The resulting tag is canonicalized using the the canonicalization type c. +func (c CanonType) Parse(s string) (t Tag, err error) { + // TODO: consider supporting old-style locale key-value pairs. + if s == "" { + return und, errSyntax + } + if len(s) <= maxAltTaglen { + b := [maxAltTaglen]byte{} + for i, c := range s { + // Generating invalid UTF-8 is okay as it won't match. + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } else if c == '_' { + c = '-' + } + b[i] = byte(c) + } + if t, ok := grandfathered(b); ok { + return t, nil + } + } + scan := makeScannerString(s) + t, err = parse(&scan, s) + t, changed := t.canonicalize(c) + if changed { + t.remakeString() + } + return t, err +} + +func parse(scan *scanner, s string) (t Tag, err error) { + t = und + var end int + if n := len(scan.token); n <= 1 { + scan.toLower(0, len(scan.b)) + if n == 0 || scan.token[0] != 'x' { + return t, errSyntax + } + end = parseExtensions(scan) + } else if n >= 4 { + return und, errSyntax + } else { // the usual case + t, end = parseTag(scan) + if n := len(scan.token); n == 1 { + t.pExt = uint16(end) + end = parseExtensions(scan) + } else if end < len(scan.b) { + scan.setError(errSyntax) + scan.b = scan.b[:end] + } + } + if int(t.pVariant) < len(scan.b) { + if end < len(s) { + s = s[:end] + } + if len(s) > 0 && tag.Compare(s, scan.b) == 0 { + t.str = s + } else { + t.str = string(scan.b) + } + } else { + t.pVariant, t.pExt = 0, 0 + } + return t, scan.err +} + +// parseTag parses language, script, region and variants. +// It returns a Tag and the end position in the input that was parsed. +func parseTag(scan *scanner) (t Tag, end int) { + var e error + // TODO: set an error if an unknown lang, script or region is encountered. + t.lang, e = getLangID(scan.token) + scan.setError(e) + scan.replace(t.lang.String()) + langStart := scan.start + end = scan.scan() + for len(scan.token) == 3 && isAlpha(scan.token[0]) { + // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent + // to a tag of the form <extlang>. + lang, e := getLangID(scan.token) + if lang != 0 { + t.lang = lang + copy(scan.b[langStart:], lang.String()) + scan.b[langStart+3] = '-' + scan.start = langStart + 4 + } + scan.gobble(e) + end = scan.scan() + } + if len(scan.token) == 4 && isAlpha(scan.token[0]) { + t.script, e = getScriptID(script, scan.token) + if t.script == 0 { + scan.gobble(e) + } + end = scan.scan() + } + if n := len(scan.token); n >= 2 && n <= 3 { + t.region, e = getRegionID(scan.token) + if t.region == 0 { + scan.gobble(e) + } else { + scan.replace(t.region.String()) + } + end = scan.scan() + } + scan.toLower(scan.start, len(scan.b)) + t.pVariant = byte(end) + end = parseVariants(scan, end, t) + t.pExt = uint16(end) + return t, end +} + +var separator = []byte{'-'} + +// parseVariants scans tokens as long as each token is a valid variant string. +// Duplicate variants are removed. +func parseVariants(scan *scanner, end int, t Tag) int { + start := scan.start + varIDBuf := [4]uint8{} + variantBuf := [4][]byte{} + varID := varIDBuf[:0] + variant := variantBuf[:0] + last := -1 + needSort := false + for ; len(scan.token) >= 4; scan.scan() { + // TODO: measure the impact of needing this conversion and redesign + // the data structure if there is an issue. + v, ok := variantIndex[string(scan.token)] + if !ok { + // unknown variant + // TODO: allow user-defined variants? + scan.gobble(mkErrInvalid(scan.token)) + continue + } + varID = append(varID, v) + variant = append(variant, scan.token) + if !needSort { + if last < int(v) { + last = int(v) + } else { + needSort = true + // There is no legal combinations of more than 7 variants + // (and this is by no means a useful sequence). + const maxVariants = 8 + if len(varID) > maxVariants { + break + } + } + } + end = scan.end + } + if needSort { + sort.Sort(variantsSort{varID, variant}) + k, l := 0, -1 + for i, v := range varID { + w := int(v) + if l == w { + // Remove duplicates. + continue + } + varID[k] = varID[i] + variant[k] = variant[i] + k++ + l = w + } + if str := bytes.Join(variant[:k], separator); len(str) == 0 { + end = start - 1 + } else { + scan.resizeRange(start, end, len(str)) + copy(scan.b[scan.start:], str) + end = scan.end + } + } + return end +} + +type variantsSort struct { + i []uint8 + v [][]byte +} + +func (s variantsSort) Len() int { + return len(s.i) +} + +func (s variantsSort) Swap(i, j int) { + s.i[i], s.i[j] = s.i[j], s.i[i] + s.v[i], s.v[j] = s.v[j], s.v[i] +} + +func (s variantsSort) Less(i, j int) bool { + return s.i[i] < s.i[j] +} + +type bytesSort [][]byte + +func (b bytesSort) Len() int { + return len(b) +} + +func (b bytesSort) Swap(i, j int) { + b[i], b[j] = b[j], b[i] +} + +func (b bytesSort) Less(i, j int) bool { + return bytes.Compare(b[i], b[j]) == -1 +} + +// parseExtensions parses and normalizes the extensions in the buffer. +// It returns the last position of scan.b that is part of any extension. +// It also trims scan.b to remove excess parts accordingly. +func parseExtensions(scan *scanner) int { + start := scan.start + exts := [][]byte{} + private := []byte{} + end := scan.end + for len(scan.token) == 1 { + extStart := scan.start + ext := scan.token[0] + end = parseExtension(scan) + extension := scan.b[extStart:end] + if len(extension) < 3 || (ext != 'x' && len(extension) < 4) { + scan.setError(errSyntax) + end = extStart + continue + } else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) { + scan.b = scan.b[:end] + return end + } else if ext == 'x' { + private = extension + break + } + exts = append(exts, extension) + } + sort.Sort(bytesSort(exts)) + if len(private) > 0 { + exts = append(exts, private) + } + scan.b = scan.b[:start] + if len(exts) > 0 { + scan.b = append(scan.b, bytes.Join(exts, separator)...) + } else if start > 0 { + // Strip trailing '-'. + scan.b = scan.b[:start-1] + } + return end +} + +// parseExtension parses a single extension and returns the position of +// the extension end. +func parseExtension(scan *scanner) int { + start, end := scan.start, scan.end + switch scan.token[0] { + case 'u': + attrStart := end + scan.scan() + for last := []byte{}; len(scan.token) > 2; scan.scan() { + if bytes.Compare(scan.token, last) != -1 { + // Attributes are unsorted. Start over from scratch. + p := attrStart + 1 + scan.next = p + attrs := [][]byte{} + for scan.scan(); len(scan.token) > 2; scan.scan() { + attrs = append(attrs, scan.token) + end = scan.end + } + sort.Sort(bytesSort(attrs)) + copy(scan.b[p:], bytes.Join(attrs, separator)) + break + } + last = scan.token + end = scan.end + } + var last, key []byte + for attrEnd := end; len(scan.token) == 2; last = key { + key = scan.token + keyEnd := scan.end + end = scan.acceptMinSize(3) + // TODO: check key value validity + if keyEnd == end || bytes.Compare(key, last) != 1 { + // We have an invalid key or the keys are not sorted. + // Start scanning keys from scratch and reorder. + p := attrEnd + 1 + scan.next = p + keys := [][]byte{} + for scan.scan(); len(scan.token) == 2; { + keyStart, keyEnd := scan.start, scan.end + end = scan.acceptMinSize(3) + if keyEnd != end { + keys = append(keys, scan.b[keyStart:end]) + } else { + scan.setError(errSyntax) + end = keyStart + } + } + sort.Sort(bytesSort(keys)) + reordered := bytes.Join(keys, separator) + if e := p + len(reordered); e < end { + scan.deleteRange(e, end) + end = e + } + copy(scan.b[p:], bytes.Join(keys, separator)) + break + } + } + case 't': + scan.scan() + if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) { + _, end = parseTag(scan) + scan.toLower(start, end) + } + for len(scan.token) == 2 && !isAlpha(scan.token[1]) { + end = scan.acceptMinSize(3) + } + case 'x': + end = scan.acceptMinSize(1) + default: + end = scan.acceptMinSize(2) + } + return end +} + +// Compose creates a Tag from individual parts, which may be of type Tag, Base, +// Script, Region, Variant, []Variant, Extension, []Extension or error. If a +// Base, Script or Region or slice of type Variant or Extension is passed more +// than once, the latter will overwrite the former. Variants and Extensions are +// accumulated, but if two extensions of the same type are passed, the latter +// will replace the former. A Tag overwrites all former values and typically +// only makes sense as the first argument. The resulting tag is returned after +// canonicalizing using the Default CanonType. If one or more errors are +// encountered, one of the errors is returned. +func Compose(part ...interface{}) (t Tag, err error) { + return Default.Compose(part...) +} + +// Compose creates a Tag from individual parts, which may be of type Tag, Base, +// Script, Region, Variant, []Variant, Extension, []Extension or error. If a +// Base, Script or Region or slice of type Variant or Extension is passed more +// than once, the latter will overwrite the former. Variants and Extensions are +// accumulated, but if two extensions of the same type are passed, the latter +// will replace the former. A Tag overwrites all former values and typically +// only makes sense as the first argument. The resulting tag is returned after +// canonicalizing using CanonType c. If one or more errors are encountered, +// one of the errors is returned. +func (c CanonType) Compose(part ...interface{}) (t Tag, err error) { + var b builder + if err = b.update(part...); err != nil { + return und, err + } + t, _ = b.tag.canonicalize(c) + + if len(b.ext) > 0 || len(b.variant) > 0 { + sort.Sort(sortVariant(b.variant)) + sort.Strings(b.ext) + if b.private != "" { + b.ext = append(b.ext, b.private) + } + n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...) + buf := make([]byte, n) + p := t.genCoreBytes(buf) + t.pVariant = byte(p) + p += appendTokens(buf[p:], b.variant...) + t.pExt = uint16(p) + p += appendTokens(buf[p:], b.ext...) + t.str = string(buf[:p]) + } else if b.private != "" { + t.str = b.private + t.remakeString() + } + return +} + +type builder struct { + tag Tag + + private string // the x extension + ext []string + variant []string + + err error +} + +func (b *builder) addExt(e string) { + if e == "" { + } else if e[0] == 'x' { + b.private = e + } else { + b.ext = append(b.ext, e) + } +} + +var errInvalidArgument = errors.New("invalid Extension or Variant") + +func (b *builder) update(part ...interface{}) (err error) { + replace := func(l *[]string, s string, eq func(a, b string) bool) bool { + if s == "" { + b.err = errInvalidArgument + return true + } + for i, v := range *l { + if eq(v, s) { + (*l)[i] = s + return true + } + } + return false + } + for _, x := range part { + switch v := x.(type) { + case Tag: + b.tag.lang = v.lang + b.tag.region = v.region + b.tag.script = v.script + if v.str != "" { + b.variant = nil + for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; { + x, s = nextToken(s) + b.variant = append(b.variant, x) + } + b.ext, b.private = nil, "" + for i, e := int(v.pExt), ""; i < len(v.str); { + i, e = getExtension(v.str, i) + b.addExt(e) + } + } + case Base: + b.tag.lang = v.langID + case Script: + b.tag.script = v.scriptID + case Region: + b.tag.region = v.regionID + case Variant: + if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) { + b.variant = append(b.variant, v.variant) + } + case Extension: + if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) { + b.addExt(v.s) + } + case []Variant: + b.variant = nil + for _, x := range v { + b.update(x) + } + case []Extension: + b.ext, b.private = nil, "" + for _, e := range v { + b.update(e) + } + // TODO: support parsing of raw strings based on morphology or just extensions? + case error: + err = v + } + } + return +} + +func tokenLen(token ...string) (n int) { + for _, t := range token { + n += len(t) + 1 + } + return +} + +func appendTokens(b []byte, token ...string) int { + p := 0 + for _, t := range token { + b[p] = '-' + copy(b[p+1:], t) + p += 1 + len(t) + } + return p +} + +type sortVariant []string + +func (s sortVariant) Len() int { + return len(s) +} + +func (s sortVariant) Swap(i, j int) { + s[j], s[i] = s[i], s[j] +} + +func (s sortVariant) Less(i, j int) bool { + return variantIndex[s[i]] < variantIndex[s[j]] +} + +func findExt(list []string, x byte) int { + for i, e := range list { + if e[0] == x { + return i + } + } + return -1 +} + +// getExtension returns the name, body and end position of the extension. +func getExtension(s string, p int) (end int, ext string) { + if s[p] == '-' { + p++ + } + if s[p] == 'x' { + return len(s), s[p:] + } + end = nextExtension(s, p) + return end, s[p:end] +} + +// nextExtension finds the next extension within the string, searching +// for the -<char>- pattern from position p. +// In the fast majority of cases, language tags will have at most +// one extension and extensions tend to be small. +func nextExtension(s string, p int) int { + for n := len(s) - 3; p < n; { + if s[p] == '-' { + if s[p+2] == '-' { + return p + } + p += 3 + } else { + p++ + } + } + return len(s) +} + +var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight") + +// ParseAcceptLanguage parses the contents of a Accept-Language header as +// defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and +// a list of corresponding quality weights. It is more permissive than RFC 2616 +// and may return non-nil slices even if the input is not valid. +// The Tags will be sorted by highest weight first and then by first occurrence. +// Tags with a weight of zero will be dropped. An error will be returned if the +// input could not be parsed. +func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) { + var entry string + for s != "" { + if entry, s = split(s, ','); entry == "" { + continue + } + + entry, weight := split(entry, ';') + + // Scan the language. + t, err := Parse(entry) + if err != nil { + id, ok := acceptFallback[entry] + if !ok { + return nil, nil, err + } + t = Tag{lang: id} + } + + // Scan the optional weight. + w := 1.0 + if weight != "" { + weight = consume(weight, 'q') + weight = consume(weight, '=') + // consume returns the empty string when a token could not be + // consumed, resulting in an error for ParseFloat. + if w, err = strconv.ParseFloat(weight, 32); err != nil { + return nil, nil, errInvalidWeight + } + // Drop tags with a quality weight of 0. + if w <= 0 { + continue + } + } + + tag = append(tag, t) + q = append(q, float32(w)) + } + sortStable(&tagSort{tag, q}) + return tag, q, nil +} + +// consume removes a leading token c from s and returns the result or the empty +// string if there is no such token. +func consume(s string, c byte) string { + if s == "" || s[0] != c { + return "" + } + return strings.TrimSpace(s[1:]) +} + +func split(s string, c byte) (head, tail string) { + if i := strings.IndexByte(s, c); i >= 0 { + return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:]) + } + return strings.TrimSpace(s), "" +} + +// Add hack mapping to deal with a small number of cases that that occur +// in Accept-Language (with reasonable frequency). +var acceptFallback = map[string]langID{ + "english": _en, + "deutsch": _de, + "italian": _it, + "french": _fr, + "*": _mul, // defined in the spec to match all languages. +} + +type tagSort struct { + tag []Tag + q []float32 +} + +func (s *tagSort) Len() int { + return len(s.q) +} + +func (s *tagSort) Less(i, j int) bool { + return s.q[i] > s.q[j] +} + +func (s *tagSort) Swap(i, j int) { + s.tag[i], s.tag[j] = s.tag[j], s.tag[i] + s.q[i], s.q[j] = s.q[j], s.q[i] +} |