diff options
Diffstat (limited to 'vendor/golang.org/x/text/internal/triegen')
-rw-r--r-- | vendor/golang.org/x/text/internal/triegen/LICENSE | 27 | ||||
-rw-r--r-- | vendor/golang.org/x/text/internal/triegen/compact.go | 58 | ||||
-rw-r--r-- | vendor/golang.org/x/text/internal/triegen/print.go | 251 | ||||
-rw-r--r-- | vendor/golang.org/x/text/internal/triegen/triegen.go | 494 |
4 files changed, 0 insertions, 830 deletions
diff --git a/vendor/golang.org/x/text/internal/triegen/LICENSE b/vendor/golang.org/x/text/internal/triegen/LICENSE deleted file mode 100644 index 6a66aea5..00000000 --- a/vendor/golang.org/x/text/internal/triegen/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/golang.org/x/text/internal/triegen/compact.go b/vendor/golang.org/x/text/internal/triegen/compact.go deleted file mode 100644 index 397b975c..00000000 --- a/vendor/golang.org/x/text/internal/triegen/compact.go +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package triegen - -// This file defines Compacter and its implementations. - -import "io" - -// A Compacter generates an alternative, more space-efficient way to store a -// trie value block. A trie value block holds all possible values for the last -// byte of a UTF-8 encoded rune. Excluding ASCII characters, a trie value block -// always has 64 values, as a UTF-8 encoding ends with a byte in [0x80, 0xC0). -type Compacter interface { - // Size returns whether the Compacter could encode the given block as well - // as its size in case it can. len(v) is always 64. - Size(v []uint64) (sz int, ok bool) - - // Store stores the block using the Compacter's compression method. - // It returns a handle with which the block can be retrieved. - // len(v) is always 64. - Store(v []uint64) uint32 - - // Print writes the data structures associated to the given store to w. - Print(w io.Writer) error - - // Handler returns the name of a function that gets called during trie - // lookup for blocks generated by the Compacter. The function should be of - // the form func (n uint32, b byte) uint64, where n is the index returned by - // the Compacter's Store method and b is the last byte of the UTF-8 - // encoding, where 0x80 <= b < 0xC0, for which to do the lookup in the - // block. - Handler() string -} - -// simpleCompacter is the default Compacter used by builder. It implements a -// normal trie block. -type simpleCompacter builder - -func (b *simpleCompacter) Size([]uint64) (sz int, ok bool) { - return blockSize * b.ValueSize, true -} - -func (b *simpleCompacter) Store(v []uint64) uint32 { - h := uint32(len(b.ValueBlocks) - blockOffset) - b.ValueBlocks = append(b.ValueBlocks, v) - return h -} - -func (b *simpleCompacter) Print(io.Writer) error { - // Structures are printed in print.go. - return nil -} - -func (b *simpleCompacter) Handler() string { - panic("Handler should be special-cased for this Compacter") -} diff --git a/vendor/golang.org/x/text/internal/triegen/print.go b/vendor/golang.org/x/text/internal/triegen/print.go deleted file mode 100644 index 8d9f120b..00000000 --- a/vendor/golang.org/x/text/internal/triegen/print.go +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package triegen - -import ( - "bytes" - "fmt" - "io" - "strings" - "text/template" -) - -// print writes all the data structures as well as the code necessary to use the -// trie to w. -func (b *builder) print(w io.Writer) error { - b.Stats.NValueEntries = len(b.ValueBlocks) * blockSize - b.Stats.NValueBytes = len(b.ValueBlocks) * blockSize * b.ValueSize - b.Stats.NIndexEntries = len(b.IndexBlocks) * blockSize - b.Stats.NIndexBytes = len(b.IndexBlocks) * blockSize * b.IndexSize - b.Stats.NHandleBytes = len(b.Trie) * 2 * b.IndexSize - - // If we only have one root trie, all starter blocks are at position 0 and - // we can access the arrays directly. - if len(b.Trie) == 1 { - // At this point we cannot refer to the generated tables directly. - b.ASCIIBlock = b.Name + "Values" - b.StarterBlock = b.Name + "Index" - } else { - // Otherwise we need to have explicit starter indexes in the trie - // structure. - b.ASCIIBlock = "t.ascii" - b.StarterBlock = "t.utf8Start" - } - - b.SourceType = "[]byte" - if err := lookupGen.Execute(w, b); err != nil { - return err - } - - b.SourceType = "string" - if err := lookupGen.Execute(w, b); err != nil { - return err - } - - if err := trieGen.Execute(w, b); err != nil { - return err - } - - for _, c := range b.Compactions { - if err := c.c.Print(w); err != nil { - return err - } - } - - return nil -} - -func printValues(n int, values []uint64) string { - w := &bytes.Buffer{} - boff := n * blockSize - fmt.Fprintf(w, "\t// Block %#x, offset %#x", n, boff) - var newline bool - for i, v := range values { - if i%6 == 0 { - newline = true - } - if v != 0 { - if newline { - fmt.Fprintf(w, "\n") - newline = false - } - fmt.Fprintf(w, "\t%#02x:%#04x, ", boff+i, v) - } - } - return w.String() -} - -func printIndex(b *builder, nr int, n *node) string { - w := &bytes.Buffer{} - boff := nr * blockSize - fmt.Fprintf(w, "\t// Block %#x, offset %#x", nr, boff) - var newline bool - for i, c := range n.children { - if i%8 == 0 { - newline = true - } - if c != nil { - v := b.Compactions[c.index.compaction].Offset + uint32(c.index.index) - if v != 0 { - if newline { - fmt.Fprintf(w, "\n") - newline = false - } - fmt.Fprintf(w, "\t%#02x:%#02x, ", boff+i, v) - } - } - } - return w.String() -} - -var ( - trieGen = template.Must(template.New("trie").Funcs(template.FuncMap{ - "printValues": printValues, - "printIndex": printIndex, - "title": strings.Title, - "dec": func(x int) int { return x - 1 }, - "psize": func(n int) string { - return fmt.Sprintf("%d bytes (%.2f KiB)", n, float64(n)/1024) - }, - }).Parse(trieTemplate)) - lookupGen = template.Must(template.New("lookup").Parse(lookupTemplate)) -) - -// TODO: consider the return type of lookup. It could be uint64, even if the -// internal value type is smaller. We will have to verify this with the -// performance of unicode/norm, which is very sensitive to such changes. -const trieTemplate = `{{$b := .}}{{$multi := gt (len .Trie) 1}} -// {{.Name}}Trie. Total size: {{psize .Size}}. Checksum: {{printf "%08x" .Checksum}}. -type {{.Name}}Trie struct { {{if $multi}} - ascii []{{.ValueType}} // index for ASCII bytes - utf8Start []{{.IndexType}} // index for UTF-8 bytes >= 0xC0 -{{end}}} - -func new{{title .Name}}Trie(i int) *{{.Name}}Trie { {{if $multi}} - h := {{.Name}}TrieHandles[i] - return &{{.Name}}Trie{ {{.Name}}Values[uint32(h.ascii)<<6:], {{.Name}}Index[uint32(h.multi)<<6:] } -} - -type {{.Name}}TrieHandle struct { - ascii, multi {{.IndexType}} -} - -// {{.Name}}TrieHandles: {{len .Trie}} handles, {{.Stats.NHandleBytes}} bytes -var {{.Name}}TrieHandles = [{{len .Trie}}]{{.Name}}TrieHandle{ -{{range .Trie}} { {{.ASCIIIndex}}, {{.StarterIndex}} }, // {{printf "%08x" .Checksum}}: {{.Name}} -{{end}}}{{else}} - return &{{.Name}}Trie{} -} -{{end}} -// lookupValue determines the type of block n and looks up the value for b. -func (t *{{.Name}}Trie) lookupValue(n uint32, b byte) {{.ValueType}}{{$last := dec (len .Compactions)}} { - switch { {{range $i, $c := .Compactions}} - {{if eq $i $last}}default{{else}}case n < {{$c.Cutoff}}{{end}}:{{if ne $i 0}} - n -= {{$c.Offset}}{{end}} - return {{print $b.ValueType}}({{$c.Handler}}){{end}} - } -} - -// {{.Name}}Values: {{len .ValueBlocks}} blocks, {{.Stats.NValueEntries}} entries, {{.Stats.NValueBytes}} bytes -// The third block is the zero block. -var {{.Name}}Values = [{{.Stats.NValueEntries}}]{{.ValueType}} { -{{range $i, $v := .ValueBlocks}}{{printValues $i $v}} -{{end}}} - -// {{.Name}}Index: {{len .IndexBlocks}} blocks, {{.Stats.NIndexEntries}} entries, {{.Stats.NIndexBytes}} bytes -// Block 0 is the zero block. -var {{.Name}}Index = [{{.Stats.NIndexEntries}}]{{.IndexType}} { -{{range $i, $v := .IndexBlocks}}{{printIndex $b $i $v}} -{{end}}} -` - -// TODO: consider allowing zero-length strings after evaluating performance with -// unicode/norm. -const lookupTemplate = ` -// lookup{{if eq .SourceType "string"}}String{{end}} returns the trie value for the first UTF-8 encoding in s and -// the width in bytes of this encoding. The size will be 0 if s does not -// hold enough bytes to complete the encoding. len(s) must be greater than 0. -func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}(s {{.SourceType}}) (v {{.ValueType}}, sz int) { - c0 := s[0] - switch { - case c0 < 0x80: // is ASCII - return {{.ASCIIBlock}}[c0], 1 - case c0 < 0xC2: - return 0, 1 // Illegal UTF-8: not a starter, not ASCII. - case c0 < 0xE0: // 2-byte UTF-8 - if len(s) < 2 { - return 0, 0 - } - i := {{.StarterBlock}}[c0] - c1 := s[1] - if c1 < 0x80 || 0xC0 <= c1 { - return 0, 1 // Illegal UTF-8: not a continuation byte. - } - return t.lookupValue(uint32(i), c1), 2 - case c0 < 0xF0: // 3-byte UTF-8 - if len(s) < 3 { - return 0, 0 - } - i := {{.StarterBlock}}[c0] - c1 := s[1] - if c1 < 0x80 || 0xC0 <= c1 { - return 0, 1 // Illegal UTF-8: not a continuation byte. - } - o := uint32(i)<<6 + uint32(c1) - i = {{.Name}}Index[o] - c2 := s[2] - if c2 < 0x80 || 0xC0 <= c2 { - return 0, 2 // Illegal UTF-8: not a continuation byte. - } - return t.lookupValue(uint32(i), c2), 3 - case c0 < 0xF8: // 4-byte UTF-8 - if len(s) < 4 { - return 0, 0 - } - i := {{.StarterBlock}}[c0] - c1 := s[1] - if c1 < 0x80 || 0xC0 <= c1 { - return 0, 1 // Illegal UTF-8: not a continuation byte. - } - o := uint32(i)<<6 + uint32(c1) - i = {{.Name}}Index[o] - c2 := s[2] - if c2 < 0x80 || 0xC0 <= c2 { - return 0, 2 // Illegal UTF-8: not a continuation byte. - } - o = uint32(i)<<6 + uint32(c2) - i = {{.Name}}Index[o] - c3 := s[3] - if c3 < 0x80 || 0xC0 <= c3 { - return 0, 3 // Illegal UTF-8: not a continuation byte. - } - return t.lookupValue(uint32(i), c3), 4 - } - // Illegal rune - return 0, 1 -} - -// lookup{{if eq .SourceType "string"}}String{{end}}Unsafe returns the trie value for the first UTF-8 encoding in s. -// s must start with a full and valid UTF-8 encoded rune. -func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}Unsafe(s {{.SourceType}}) {{.ValueType}} { - c0 := s[0] - if c0 < 0x80 { // is ASCII - return {{.ASCIIBlock}}[c0] - } - i := {{.StarterBlock}}[c0] - if c0 < 0xE0 { // 2-byte UTF-8 - return t.lookupValue(uint32(i), s[1]) - } - i = {{.Name}}Index[uint32(i)<<6+uint32(s[1])] - if c0 < 0xF0 { // 3-byte UTF-8 - return t.lookupValue(uint32(i), s[2]) - } - i = {{.Name}}Index[uint32(i)<<6+uint32(s[2])] - if c0 < 0xF8 { // 4-byte UTF-8 - return t.lookupValue(uint32(i), s[3]) - } - return 0 -} -` diff --git a/vendor/golang.org/x/text/internal/triegen/triegen.go b/vendor/golang.org/x/text/internal/triegen/triegen.go deleted file mode 100644 index adb01081..00000000 --- a/vendor/golang.org/x/text/internal/triegen/triegen.go +++ /dev/null @@ -1,494 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package triegen implements a code generator for a trie for associating -// unsigned integer values with UTF-8 encoded runes. -// -// Many of the go.text packages use tries for storing per-rune information. A -// trie is especially useful if many of the runes have the same value. If this -// is the case, many blocks can be expected to be shared allowing for -// information on many runes to be stored in little space. -// -// As most of the lookups are done directly on []byte slices, the tries use the -// UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to -// runes and contributes a little bit to better performance. It also naturally -// provides a fast path for ASCII. -// -// Space is also an issue. There are many code points defined in Unicode and as -// a result tables can get quite large. So every byte counts. The triegen -// package automatically chooses the smallest integer values to represent the -// tables. Compacters allow further compression of the trie by allowing for -// alternative representations of individual trie blocks. -// -// triegen allows generating multiple tries as a single structure. This is -// useful when, for example, one wants to generate tries for several languages -// that have a lot of values in common. Some existing libraries for -// internationalization store all per-language data as a dynamically loadable -// chunk. The go.text packages are designed with the assumption that the user -// typically wants to compile in support for all supported languages, in line -// with the approach common to Go to create a single standalone binary. The -// multi-root trie approach can give significant storage savings in this -// scenario. -// -// triegen generates both tables and code. The code is optimized to use the -// automatically chosen data types. The following code is generated for a Trie -// or multiple Tries named "foo": -// - type fooTrie -// The trie type. -// -// - func newFooTrie(x int) *fooTrie -// Trie constructor, where x is the index of the trie passed to Gen. -// -// - func (t *fooTrie) lookup(s []byte) (v uintX, sz int) -// The lookup method, where uintX is automatically chosen. -// -// - func lookupString, lookupUnsafe and lookupStringUnsafe -// Variants of the above. -// -// - var fooValues and fooIndex and any tables generated by Compacters. -// The core trie data. -// -// - var fooTrieHandles -// Indexes of starter blocks in case of multiple trie roots. -// -// It is recommended that users test the generated trie by checking the returned -// value for every rune. Such exhaustive tests are possible as the the number of -// runes in Unicode is limited. -package triegen // import "golang.org/x/text/internal/triegen" - -// TODO: Arguably, the internally optimized data types would not have to be -// exposed in the generated API. We could also investigate not generating the -// code, but using it through a package. We would have to investigate the impact -// on performance of making such change, though. For packages like unicode/norm, -// small changes like this could tank performance. - -import ( - "encoding/binary" - "fmt" - "hash/crc64" - "io" - "log" - "unicode/utf8" -) - -// builder builds a set of tries for associating values with runes. The set of -// tries can share common index and value blocks. -type builder struct { - Name string - - // ValueType is the type of the trie values looked up. - ValueType string - - // ValueSize is the byte size of the ValueType. - ValueSize int - - // IndexType is the type of trie index values used for all UTF-8 bytes of - // a rune except the last one. - IndexType string - - // IndexSize is the byte size of the IndexType. - IndexSize int - - // SourceType is used when generating the lookup functions. If the user - // requests StringSupport, all lookup functions will be generated for - // string input as well. - SourceType string - - Trie []*Trie - - IndexBlocks []*node - ValueBlocks [][]uint64 - Compactions []compaction - Checksum uint64 - - ASCIIBlock string - StarterBlock string - - indexBlockIdx map[uint64]int - valueBlockIdx map[uint64]nodeIndex - asciiBlockIdx map[uint64]int - - // Stats are used to fill out the template. - Stats struct { - NValueEntries int - NValueBytes int - NIndexEntries int - NIndexBytes int - NHandleBytes int - } - - err error -} - -// A nodeIndex encodes the index of a node, which is defined by the compaction -// which stores it and an index within the compaction. For internal nodes, the -// compaction is always 0. -type nodeIndex struct { - compaction int - index int -} - -// compaction keeps track of stats used for the compaction. -type compaction struct { - c Compacter - blocks []*node - maxHandle uint32 - totalSize int - - // Used by template-based generator and thus exported. - Cutoff uint32 - Offset uint32 - Handler string -} - -func (b *builder) setError(err error) { - if b.err == nil { - b.err = err - } -} - -// An Option can be passed to Gen. -type Option func(b *builder) error - -// Compact configures the trie generator to use the given Compacter. -func Compact(c Compacter) Option { - return func(b *builder) error { - b.Compactions = append(b.Compactions, compaction{ - c: c, - Handler: c.Handler() + "(n, b)"}) - return nil - } -} - -// Gen writes Go code for a shared trie lookup structure to w for the given -// Tries. The generated trie type will be called nameTrie. newNameTrie(x) will -// return the *nameTrie for tries[x]. A value can be looked up by using one of -// the various lookup methods defined on nameTrie. It returns the table size of -// the generated trie. -func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) { - // The index contains two dummy blocks, followed by the zero block. The zero - // block is at offset 0x80, so that the offset for the zero block for - // continuation bytes is 0. - b := &builder{ - Name: name, - Trie: tries, - IndexBlocks: []*node{{}, {}, {}}, - Compactions: []compaction{{ - Handler: name + "Values[n<<6+uint32(b)]", - }}, - // The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero - // block. - indexBlockIdx: map[uint64]int{0: 0}, - valueBlockIdx: map[uint64]nodeIndex{0: {}}, - asciiBlockIdx: map[uint64]int{}, - } - b.Compactions[0].c = (*simpleCompacter)(b) - - for _, f := range opts { - if err := f(b); err != nil { - return 0, err - } - } - b.build() - if b.err != nil { - return 0, b.err - } - if err = b.print(w); err != nil { - return 0, err - } - return b.Size(), nil -} - -// A Trie represents a single root node of a trie. A builder may build several -// overlapping tries at once. -type Trie struct { - root *node - - hiddenTrie -} - -// hiddenTrie contains values we want to be visible to the template generator, -// but hidden from the API documentation. -type hiddenTrie struct { - Name string - Checksum uint64 - ASCIIIndex int - StarterIndex int -} - -// NewTrie returns a new trie root. -func NewTrie(name string) *Trie { - return &Trie{ - &node{ - children: make([]*node, blockSize), - values: make([]uint64, utf8.RuneSelf), - }, - hiddenTrie{Name: name}, - } -} - -// Gen is a convenience wrapper around the Gen func passing t as the only trie -// and uses the name passed to NewTrie. It returns the size of the generated -// tables. -func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) { - return Gen(w, t.Name, []*Trie{t}, opts...) -} - -// node is a node of the intermediate trie structure. -type node struct { - // children holds this node's children. It is always of length 64. - // A child node may be nil. - children []*node - - // values contains the values of this node. If it is non-nil, this node is - // either a root or leaf node: - // For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F]. - // For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF]. - values []uint64 - - index nodeIndex -} - -// Insert associates value with the given rune. Insert will panic if a non-zero -// value is passed for an invalid rune. -func (t *Trie) Insert(r rune, value uint64) { - if value == 0 { - return - } - s := string(r) - if []rune(s)[0] != r && value != 0 { - // Note: The UCD tables will always assign what amounts to a zero value - // to a surrogate. Allowing a zero value for an illegal rune allows - // users to iterate over [0..MaxRune] without having to explicitly - // exclude surrogates, which would be tedious. - panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r)) - } - if len(s) == 1 { - // It is a root node value (ASCII). - t.root.values[s[0]] = value - return - } - - n := t.root - for ; len(s) > 1; s = s[1:] { - if n.children == nil { - n.children = make([]*node, blockSize) - } - p := s[0] % blockSize - c := n.children[p] - if c == nil { - c = &node{} - n.children[p] = c - } - if len(s) > 2 && c.values != nil { - log.Fatalf("triegen: insert(%U): found internal node with values", r) - } - n = c - } - if n.values == nil { - n.values = make([]uint64, blockSize) - } - if n.children != nil { - log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r) - } - n.values[s[0]-0x80] = value -} - -// Size returns the number of bytes the generated trie will take to store. It -// needs to be exported as it is used in the templates. -func (b *builder) Size() int { - // Index blocks. - sz := len(b.IndexBlocks) * blockSize * b.IndexSize - - // Skip the first compaction, which represents the normal value blocks, as - // its totalSize does not account for the ASCII blocks, which are managed - // separately. - sz += len(b.ValueBlocks) * blockSize * b.ValueSize - for _, c := range b.Compactions[1:] { - sz += c.totalSize - } - - // TODO: this computation does not account for the fixed overhead of a using - // a compaction, either code or data. As for data, though, the typical - // overhead of data is in the order of bytes (2 bytes for cases). Further, - // the savings of using a compaction should anyway be substantial for it to - // be worth it. - - // For multi-root tries, we also need to account for the handles. - if len(b.Trie) > 1 { - sz += 2 * b.IndexSize * len(b.Trie) - } - return sz -} - -func (b *builder) build() { - // Compute the sizes of the values. - var vmax uint64 - for _, t := range b.Trie { - vmax = maxValue(t.root, vmax) - } - b.ValueType, b.ValueSize = getIntType(vmax) - - // Compute all block allocations. - // TODO: first compute the ASCII blocks for all tries and then the other - // nodes. ASCII blocks are more restricted in placement, as they require two - // blocks to be placed consecutively. Processing them first may improve - // sharing (at least one zero block can be expected to be saved.) - for _, t := range b.Trie { - b.Checksum += b.buildTrie(t) - } - - // Compute the offsets for all the Compacters. - offset := uint32(0) - for i := range b.Compactions { - c := &b.Compactions[i] - c.Offset = offset - offset += c.maxHandle + 1 - c.Cutoff = offset - } - - // Compute the sizes of indexes. - // TODO: different byte positions could have different sizes. So far we have - // not found a case where this is beneficial. - imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff) - for _, ib := range b.IndexBlocks { - if x := uint64(ib.index.index); x > imax { - imax = x - } - } - b.IndexType, b.IndexSize = getIntType(imax) -} - -func maxValue(n *node, max uint64) uint64 { - if n == nil { - return max - } - for _, c := range n.children { - max = maxValue(c, max) - } - for _, v := range n.values { - if max < v { - max = v - } - } - return max -} - -func getIntType(v uint64) (string, int) { - switch { - case v < 1<<8: - return "uint8", 1 - case v < 1<<16: - return "uint16", 2 - case v < 1<<32: - return "uint32", 4 - } - return "uint64", 8 -} - -const ( - blockSize = 64 - - // Subtract two blocks to offset 0x80, the first continuation byte. - blockOffset = 2 - - // Subtract three blocks to offset 0xC0, the first non-ASCII starter. - rootBlockOffset = 3 -) - -var crcTable = crc64.MakeTable(crc64.ISO) - -func (b *builder) buildTrie(t *Trie) uint64 { - n := t.root - - // Get the ASCII offset. For the first trie, the ASCII block will be at - // position 0. - hasher := crc64.New(crcTable) - binary.Write(hasher, binary.BigEndian, n.values) - hash := hasher.Sum64() - - v, ok := b.asciiBlockIdx[hash] - if !ok { - v = len(b.ValueBlocks) - b.asciiBlockIdx[hash] = v - - b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:]) - if v == 0 { - // Add the zero block at position 2 so that it will be assigned a - // zero reference in the lookup blocks. - // TODO: always do this? This would allow us to remove a check from - // the trie lookup, but at the expense of extra space. Analyze - // performance for unicode/norm. - b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize)) - } - } - t.ASCIIIndex = v - - // Compute remaining offsets. - t.Checksum = b.computeOffsets(n, true) - // We already subtracted the normal blockOffset from the index. Subtract the - // difference for starter bytes. - t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset) - return t.Checksum -} - -func (b *builder) computeOffsets(n *node, root bool) uint64 { - // For the first trie, the root lookup block will be at position 3, which is - // the offset for UTF-8 non-ASCII starter bytes. - first := len(b.IndexBlocks) == rootBlockOffset - if first { - b.IndexBlocks = append(b.IndexBlocks, n) - } - - // We special-case the cases where all values recursively are 0. This allows - // for the use of a zero block to which all such values can be directed. - hash := uint64(0) - if n.children != nil || n.values != nil { - hasher := crc64.New(crcTable) - for _, c := range n.children { - var v uint64 - if c != nil { - v = b.computeOffsets(c, false) - } - binary.Write(hasher, binary.BigEndian, v) - } - binary.Write(hasher, binary.BigEndian, n.values) - hash = hasher.Sum64() - } - - if first { - b.indexBlockIdx[hash] = rootBlockOffset - blockOffset - } - - // Compacters don't apply to internal nodes. - if n.children != nil { - v, ok := b.indexBlockIdx[hash] - if !ok { - v = len(b.IndexBlocks) - blockOffset - b.IndexBlocks = append(b.IndexBlocks, n) - b.indexBlockIdx[hash] = v - } - n.index = nodeIndex{0, v} - } else { - h, ok := b.valueBlockIdx[hash] - if !ok { - bestI, bestSize := 0, blockSize*b.ValueSize - for i, c := range b.Compactions[1:] { - if sz, ok := c.c.Size(n.values); ok && bestSize > sz { - bestI, bestSize = i+1, sz - } - } - c := &b.Compactions[bestI] - c.totalSize += bestSize - v := c.c.Store(n.values) - if c.maxHandle < v { - c.maxHandle = v - } - h = nodeIndex{bestI, int(v)} - b.valueBlockIdx[hash] = h - } - n.index = h - } - return hash -} |