summary refs log tree commit diff stats
path: root/vendor/github.com/klauspost/compress/huff0
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/huff0')
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/autogen.go 5
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/bitreader.go 126
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/compress.go 9
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/decompress.go 509
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s 488
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in 197
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/decompress_amd64.go 181
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/decompress_amd64.s 506
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in 195
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/decompress_generic.go 193
-rw-r--r-- vendor/github.com/klauspost/compress/huff0/huff0.go 2
11 files changed, 1992 insertions, 419 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/autogen.go b/vendor/github.com/klauspost/compress/huff0/autogen.go
new file mode 100644
index 00000000..ff2c69d6
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/autogen.go
@@ -0,0 +1,5 @@
+package huff0
+
+//go:generate go run generate.go
+//go:generate asmfmt -w decompress_amd64.s
+//go:generate asmfmt -w decompress_8b_amd64.s
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index a4979e88..451160ed 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -8,118 +8,13 @@ package huff0
import (
"encoding/binary"
"errors"
+ "fmt"
"io"
)
// bitReader reads a bitstream in reverse.
// The last set bit indicates the start of the stream and is used
// for aligning the input.
-type bitReader struct {
- in []byte
- off uint // next byte to read is at in[off - 1]
- value uint64
- bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReader) init(in []byte) error {
- if len(in) < 1 {
- return errors.New("corrupt stream: too short")
- }
- b.in = in
- b.off = uint(len(in))
- // The highest bit of the last byte indicates where to start
- v := in[len(in)-1]
- if v == 0 {
- return errors.New("corrupt stream, did not find end of stream")
- }
- b.bitsRead = 64
- b.value = 0
- if len(in) >= 8 {
- b.fillFastStart()
- } else {
- b.fill()
- b.fill()
- }
- b.bitsRead += 8 - uint8(highBit32(uint32(v)))
- return nil
-}
-
-// peekBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) peekBitsFast(n uint8) uint16 {
- const regMask = 64 - 1
- v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
- return v
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReader) fillFast() {
- if b.bitsRead < 32 {
- return
- }
-
- // 2 bounds checks.
- v := b.in[b.off-4 : b.off]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- b.value = (b.value << 32) | uint64(low)
- b.bitsRead -= 32
- b.off -= 4
-}
-
-func (b *bitReader) advance(n uint8) {
- b.bitsRead += n
-}
-
-// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
-func (b *bitReader) fillFastStart() {
- // Do single re-slice to avoid bounds checks.
- b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
- b.bitsRead = 0
- b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReader) fill() {
- if b.bitsRead < 32 {
- return
- }
- if b.off > 4 {
- v := b.in[b.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- b.value = (b.value << 32) | uint64(low)
- b.bitsRead -= 32
- b.off -= 4
- return
- }
- for b.off > 0 {
- b.value = (b.value << 8) | uint64(b.in[b.off-1])
- b.bitsRead -= 8
- b.off--
- }
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReader) finished() bool {
- return b.off == 0 && b.bitsRead >= 64
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReader) close() error {
- // Release reference.
- b.in = nil
- if b.bitsRead > 64 {
- return io.ErrUnexpectedEOF
- }
- return nil
-}
-
-// bitReader reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
type bitReaderBytes struct {
in []byte
off uint // next byte to read is at in[off - 1]
@@ -213,10 +108,17 @@ func (b *bitReaderBytes) finished() bool {
return b.off == 0 && b.bitsRead >= 64
}
+func (b *bitReaderBytes) remaining() uint {
+ return b.off*8 + uint(64-b.bitsRead)
+}
+
// close the bitstream and returns an error if out-of-buffer reads occurred.
func (b *bitReaderBytes) close() error {
// Release reference.
b.in = nil
+ if b.remaining() > 0 {
+ return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+ }
if b.bitsRead > 64 {
return io.ErrUnexpectedEOF
}
@@ -263,6 +165,11 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
return uint16(b.value >> ((64 - n) & 63))
}
+// peekTopBits(n) is equivalent to peekBitsFast(64 - n)
+func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
+ return uint16(b.value >> n)
+}
+
func (b *bitReaderShifted) advance(n uint8) {
b.bitsRead += n
b.value <<= n & 63
@@ -318,10 +225,17 @@ func (b *bitReaderShifted) finished() bool {
return b.off == 0 && b.bitsRead >= 64
}
+func (b *bitReaderShifted) remaining() uint {
+ return b.off*8 + uint(64-b.bitsRead)
+}
+
// close the bitstream and returns an error if out-of-buffer reads occurred.
func (b *bitReaderShifted) close() error {
// Release reference.
b.in = nil
+ if b.remaining() > 0 {
+ return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+ }
if b.bitsRead > 64 {
return io.ErrUnexpectedEOF
}
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 8323dc05..bc95ac62 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -2,6 +2,7 @@ package huff0
import (
"fmt"
+ "math"
"runtime"
"sync"
)
@@ -289,6 +290,10 @@ func (s *Scratch) compress4X(src []byte) ([]byte, error) {
if err != nil {
return nil, err
}
+ if len(s.Out)-idx > math.MaxUint16 {
+ // We cannot store the size in the jump table
+ return nil, ErrIncompressible
+ }
// Write compressed length as little endian before block.
if i < 3 {
// Last length is not written.
@@ -332,6 +337,10 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
return nil, errs[i]
}
o := s.tmpOut[i]
+ if len(o) > math.MaxUint16 {
+ // We cannot store the size in the jump table
+ return nil, ErrIncompressible
+ }
// Write compressed length as little endian before block.
if i < 3 {
// Last length is not written.
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 2a06bd1a..04f65299 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"io"
+ "sync"
"github.com/klauspost/compress/fse"
)
@@ -216,6 +217,7 @@ func (s *Scratch) Decoder() *Decoder {
return &Decoder{
dt: s.dt,
actualTableLog: s.actualTableLog,
+ bufs: &s.decPool,
}
}
@@ -223,6 +225,15 @@ func (s *Scratch) Decoder() *Decoder {
type Decoder struct {
dt dTable
actualTableLog uint8
+ bufs *sync.Pool
+}
+
+func (d *Decoder) buffer() *[4][256]byte {
+ buf, ok := d.bufs.Get().(*[4][256]byte)
+ if ok {
+ return buf
+ }
+ return &[4][256]byte{}
}
// Decompress1X will decompress a 1X encoded stream.
@@ -249,7 +260,8 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
dt := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
+ bufs := d.buffer()
+ buf := &bufs[0]
var off uint8
for br.off >= 8 {
@@ -277,6 +289,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
if off == 0 {
if len(dst)+256 > maxDecodedSize {
br.close()
+ d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
@@ -284,6 +297,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
}
if len(dst)+int(off) > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -310,6 +324,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
}
}
if len(dst) >= maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -319,6 +334,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
bitsLeft -= nBits
dst = append(dst, uint8(v.entry>>8))
}
+ d.bufs.Put(bufs)
return dst, br.close()
}
@@ -341,7 +357,8 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
dt := d.dt.single[:256]
// Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
+ bufs := d.buffer()
+ buf := &bufs[0]
var off uint8
switch d.actualTableLog {
@@ -369,6 +386,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
if off == 0 {
if len(dst)+256 > maxDecodedSize {
br.close()
+ d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
@@ -398,6 +416,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
if off == 0 {
if len(dst)+256 > maxDecodedSize {
br.close()
+ d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
@@ -426,6 +445,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -455,6 +475,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -484,6 +505,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -513,6 +535,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -542,6 +565,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -571,6 +595,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -578,10 +603,12 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
}
}
default:
+ d.bufs.Put(bufs)
return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog)
}
if len(dst)+int(off) > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -601,6 +628,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
}
if len(dst) >= maxDecodedSize {
br.close()
+ d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
v := dt[br.peekByteFast()>>shift]
@@ -609,6 +637,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
bitsLeft -= int8(nBits)
dst = append(dst, uint8(v.entry>>8))
}
+ d.bufs.Put(bufs)
return dst, br.close()
}
@@ -628,7 +657,8 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
dt := d.dt.single[:256]
// Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
+ bufs := d.buffer()
+ buf := &bufs[0]
var off uint8
const shift = 56
@@ -655,6 +685,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -663,6 +694,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
}
if len(dst)+int(off) > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -679,6 +711,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
}
}
if len(dst) >= maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -688,6 +721,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
bitsLeft -= int8(nBits)
dst = append(dst, uint8(v.entry>>8))
}
+ d.bufs.Put(bufs)
return dst, br.close()
}
@@ -695,192 +729,6 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
- if len(d.dt.single) == 0 {
- return nil, errors.New("no table loaded")
- }
- if len(src) < 6+(4*1) {
- return nil, errors.New("input too small")
- }
- if use8BitTables && d.actualTableLog <= 8 {
- return d.decompress4X8bit(dst, src)
- }
-
- var br [4]bitReaderShifted
- start := 6
- for i := 0; i < 3; i++ {
- length := int(src[i*2]) | (int(src[i*2+1]) << 8)
- if start+length >= len(src) {
- return nil, errors.New("truncated input (or invalid offset)")
- }
- err := br[i].init(src[start : start+length])
- if err != nil {
- return nil, err
- }
- start += length
- }
- err := br[3].init(src[start:])
- if err != nil {
- return nil, err
- }
-
- // destination, offset to match first output
- dstSize := cap(dst)
- dst = dst[:dstSize]
- out := dst
- dstEvery := (dstSize + 3) / 4
-
- const tlSize = 1 << tableLogMax
- const tlMask = tlSize - 1
- single := d.dt.single[:tlSize]
-
- // Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
- var off uint8
- var decoded int
-
- // Decode 2 values from each decoder/loop.
- const bufoff = 256 / 4
- for {
- if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
- break
- }
-
- {
- const stream = 0
- const stream2 = 1
- br[stream].fillFast()
- br[stream2].fillFast()
-
- val := br[stream].peekBitsFast(d.actualTableLog)
- val2 := br[stream2].peekBitsFast(d.actualTableLog)
- v := single[val&tlMask]
- v2 := single[val2&tlMask]
- br[stream].advance(uint8(v.entry))
- br[stream2].advance(uint8(v2.entry))
- buf[off+bufoff*stream] = uint8(v.entry >> 8)
- buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
- val = br[stream].peekBitsFast(d.actualTableLog)
- val2 = br[stream2].peekBitsFast(d.actualTableLog)
- v = single[val&tlMask]
- v2 = single[val2&tlMask]
- br[stream].advance(uint8(v.entry))
- br[stream2].advance(uint8(v2.entry))
- buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
- buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
- }
-
- {
- const stream = 2
- const stream2 = 3
- br[stream].fillFast()
- br[stream2].fillFast()
-
- val := br[stream].peekBitsFast(d.actualTableLog)
- val2 := br[stream2].peekBitsFast(d.actualTableLog)
- v := single[val&tlMask]
- v2 := single[val2&tlMask]
- br[stream].advance(uint8(v.entry))
- br[stream2].advance(uint8(v2.entry))
- buf[off+bufoff*stream] = uint8(v.entry >> 8)
- buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
- val = br[stream].peekBitsFast(d.actualTableLog)
- val2 = br[stream2].peekBitsFast(d.actualTableLog)
- v = single[val&tlMask]
- v2 = single[val2&tlMask]
- br[stream].advance(uint8(v.entry))
- br[stream2].advance(uint8(v2.entry))
- buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
- buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
- }
-
- off += 2
-
- if off == bufoff {
- if bufoff > dstEvery {
- return nil, errors.New("corruption detected: stream overrun 1")
- }
- copy(out, buf[:bufoff])
- copy(out[dstEvery:], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
- off = 0
- out = out[bufoff:]
- decoded += 256
- // There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
- return nil, errors.New("corruption detected: stream overrun 2")
- }
- }
- }
- if off > 0 {
- ioff := int(off)
- if len(out) < dstEvery*3+ioff {
- return nil, errors.New("corruption detected: stream overrun 3")
- }
- copy(out, buf[:off])
- copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
- decoded += int(off) * 4
- out = out[off:]
- }
-
- // Decode remaining.
- for i := range br {
- offset := dstEvery * i
- br := &br[i]
- bitsLeft := br.off*8 + uint(64-br.bitsRead)
- for bitsLeft > 0 {
- br.fill()
- if false && br.bitsRead >= 32 {
- if br.off >= 4 {
- v := br.in[br.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- br.value = (br.value << 32) | uint64(low)
- br.bitsRead -= 32
- br.off -= 4
- } else {
- for br.off > 0 {
- br.value = (br.value << 8) | uint64(br.in[br.off-1])
- br.bitsRead -= 8
- br.off--
- }
- }
- }
- // end inline...
- if offset >= len(out) {
- return nil, errors.New("corruption detected: stream overrun 4")
- }
-
- // Read value and increment offset.
- val := br.peekBitsFast(d.actualTableLog)
- v := single[val&tlMask].entry
- nBits := uint8(v)
- br.advance(nBits)
- bitsLeft -= uint(nBits)
- out[offset] = uint8(v >> 8)
- offset++
- }
- decoded += offset - dstEvery*i
- err = br.close()
- if err != nil {
- return nil, err
- }
- }
- if dstSize != decoded {
- return nil, errors.New("corruption detected: short output block")
- }
- return dst, nil
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
if d.actualTableLog == 8 {
return d.decompress4X8bitExactly(dst, src)
@@ -916,12 +764,12 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
+ buf := d.buffer()
var off uint8
var decoded int
// Decode 4 values from each decoder/loop.
- const bufoff = 256 / 4
+ const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
@@ -942,8 +790,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
- buf[off+bufoff*stream] = uint8(v >> 8)
- buf[off+bufoff*stream2] = uint8(v2 >> 8)
+ buf[stream][off] = uint8(v >> 8)
+ buf[stream2][off] = uint8(v2 >> 8)
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
@@ -951,8 +799,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
- buf[off+bufoff*stream+1] = uint8(v >> 8)
- buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+ buf[stream][off+1] = uint8(v >> 8)
+ buf[stream2][off+1] = uint8(v2 >> 8)
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
@@ -960,8 +808,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
- buf[off+bufoff*stream+2] = uint8(v >> 8)
- buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+ buf[stream][off+2] = uint8(v >> 8)
+ buf[stream2][off+2] = uint8(v2 >> 8)
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
@@ -969,8 +817,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
- buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
- buf[off+bufoff*stream+3] = uint8(v >> 8)
+ buf[stream][off+3] = uint8(v >> 8)
+ buf[stream2][off+3] = uint8(v2 >> 8)
}
{
@@ -987,8 +835,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
- buf[off+bufoff*stream] = uint8(v >> 8)
- buf[off+bufoff*stream2] = uint8(v2 >> 8)
+ buf[stream][off] = uint8(v >> 8)
+ buf[stream2][off] = uint8(v2 >> 8)
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
@@ -996,8 +844,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
- buf[off+bufoff*stream+1] = uint8(v >> 8)
- buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+ buf[stream][off+1] = uint8(v >> 8)
+ buf[stream2][off+1] = uint8(v2 >> 8)
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
@@ -1005,8 +853,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
- buf[off+bufoff*stream+2] = uint8(v >> 8)
- buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+ buf[stream][off+2] = uint8(v >> 8)
+ buf[stream2][off+2] = uint8(v2 >> 8)
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
@@ -1014,25 +862,26 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
- buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
- buf[off+bufoff*stream+3] = uint8(v >> 8)
+ buf[stream][off+3] = uint8(v >> 8)
+ buf[stream2][off+3] = uint8(v2 >> 8)
}
off += 4
- if off == bufoff {
+ if off == 0 {
if bufoff > dstEvery {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
- copy(out, buf[:bufoff])
- copy(out[dstEvery:], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
- off = 0
+ copy(out, buf[0][:])
+ copy(out[dstEvery:], buf[1][:])
+ copy(out[dstEvery*2:], buf[2][:])
+ copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
- decoded += 256
+ decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
@@ -1040,23 +889,31 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
- copy(out, buf[:off])
- copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+ copy(out, buf[0][:off])
+ copy(out[dstEvery:], buf[1][:off])
+ copy(out[dstEvery*2:], buf[2][:off])
+ copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}
// Decode remaining.
+ // Decode remaining.
+ remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
+ endsAt := offset + remainBytes
+ if endsAt > len(out) {
+ endsAt = len(out)
+ }
br := &br[i]
- bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+ bitsLeft := br.remaining()
for bitsLeft > 0 {
if br.finished() {
+ d.bufs.Put(buf)
return nil, io.ErrUnexpectedEOF
}
if br.bitsRead >= 56 {
@@ -1076,7 +933,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
}
}
// end inline...
- if offset >= len(out) {
+ if offset >= endsAt {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
@@ -1084,16 +942,22 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
v := single[uint8(br.value>>shift)].entry
nBits := uint8(v)
br.advance(nBits)
- bitsLeft -= int(nBits)
+ bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
+ if offset != endsAt {
+ d.bufs.Put(buf)
+ return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+ }
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
+ d.bufs.Put(buf)
return nil, err
}
}
+ d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
@@ -1135,12 +999,12 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
+ buf := d.buffer()
var off uint8
var decoded int
// Decode 4 values from each decoder/loop.
- const bufoff = 256 / 4
+ const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
@@ -1150,104 +1014,109 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
// Interleave 2 decodes.
const stream = 0
const stream2 = 1
- br[stream].fillFast()
- br[stream2].fillFast()
-
- v := single[uint8(br[stream].value>>shift)].entry
- v2 := single[uint8(br[stream2].value>>shift)].entry
- br[stream].bitsRead += uint8(v)
- br[stream].value <<= v & 63
- br[stream2].bitsRead += uint8(v2)
- br[stream2].value <<= v2 & 63
- buf[off+bufoff*stream] = uint8(v >> 8)
- buf[off+bufoff*stream2] = uint8(v2 >> 8)
-
- v = single[uint8(br[stream].value>>shift)].entry
- v2 = single[uint8(br[stream2].value>>shift)].entry
- br[stream].bitsRead += uint8(v)
- br[stream].value <<= v & 63
- br[stream2].bitsRead += uint8(v2)
- br[stream2].value <<= v2 & 63
- buf[off+bufoff*stream+1] = uint8(v >> 8)
- buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-
- v = single[uint8(br[stream].value>>shift)].entry
- v2 = single[uint8(br[stream2].value>>shift)].entry
- br[stream].bitsRead += uint8(v)
- br[stream].value <<= v & 63
- br[stream2].bitsRead += uint8(v2)
- br[stream2].value <<= v2 & 63
- buf[off+bufoff*stream+2] = uint8(v >> 8)
- buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-
- v = single[uint8(br[stream].value>>shift)].entry
- v2 = single[uint8(br[stream2].value>>shift)].entry
- br[stream].bitsRead += uint8(v)
- br[stream].value <<= v & 63
- br[stream2].bitsRead += uint8(v2)
- br[stream2].value <<= v2 & 63
- buf[off+bufoff*stream+3] = uint8(v >> 8)
- buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+ br1 := &br[stream]
+ br2 := &br[stream2]
+ br1.fillFast()
+ br2.fillFast()
+
+ v := single[uint8(br1.value>>shift)].entry
+ v2 := single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off] = uint8(v >> 8)
+ buf[stream2][off] = uint8(v2 >> 8)
+
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+1] = uint8(v >> 8)
+ buf[stream2][off+1] = uint8(v2 >> 8)
+
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+2] = uint8(v >> 8)
+ buf[stream2][off+2] = uint8(v2 >> 8)
+
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+3] = uint8(v >> 8)
+ buf[stream2][off+3] = uint8(v2 >> 8)
}
{
const stream = 2
const stream2 = 3
- br[stream].fillFast()
- br[stream2].fillFast()
-
- v := single[uint8(br[stream].value>>shift)].entry
- v2 := single[uint8(br[stream2].value>>shift)].entry
- br[stream].bitsRead += uint8(v)
- br[stream].value <<= v & 63
- br[stream2].bitsRead += uint8(v2)
- br[stream2].value <<= v2 & 63
- buf[off+bufoff*stream] = uint8(v >> 8)
- buf[off+bufoff*stream2] = uint8(v2 >> 8)
-
- v = single[uint8(br[stream].value>>shift)].entry
- v2 = single[uint8(br[stream2].value>>shift)].entry
- br[stream].bitsRead += uint8(v)
- br[stream].value <<= v & 63
- br[stream2].bitsRead += uint8(v2)
- br[stream2].value <<= v2 & 63
- buf[off+bufoff*stream+1] = uint8(v >> 8)
- buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-
- v = single[uint8(br[stream].value>>shift)].entry
- v2 = single[uint8(br[stream2].value>>shift)].entry
- br[stream].bitsRead += uint8(v)
- br[stream].value <<= v & 63
- br[stream2].bitsRead += uint8(v2)
- br[stream2].value <<= v2 & 63
- buf[off+bufoff*stream+2] = uint8(v >> 8)
- buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-
- v = single[uint8(br[stream].value>>shift)].entry
- v2 = single[uint8(br[stream2].value>>shift)].entry
- br[stream].bitsRead += uint8(v)
- br[stream].value <<= v & 63
- br[stream2].bitsRead += uint8(v2)
- br[stream2].value <<= v2 & 63
- buf[off+bufoff*stream+3] = uint8(v >> 8)
- buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+ br1 := &br[stream]
+ br2 := &br[stream2]
+ br1.fillFast()
+ br2.fillFast()
+
+ v := single[uint8(br1.value>>shift)].entry
+ v2 := single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off] = uint8(v >> 8)
+ buf[stream2][off] = uint8(v2 >> 8)
+
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+1] = uint8(v >> 8)
+ buf[stream2][off+1] = uint8(v2 >> 8)
+
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+2] = uint8(v >> 8)
+ buf[stream2][off+2] = uint8(v2 >> 8)
+
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+3] = uint8(v >> 8)
+ buf[stream2][off+3] = uint8(v2 >> 8)
}
off += 4
- if off == bufoff {
+ if off == 0 {
if bufoff > dstEvery {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
- copy(out, buf[:bufoff])
- copy(out[dstEvery:], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
- off = 0
+ copy(out, buf[0][:])
+ copy(out[dstEvery:], buf[1][:])
+ copy(out[dstEvery*2:], buf[2][:])
+ copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
- decoded += 256
+ decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
@@ -1257,21 +1126,27 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
if len(out) < dstEvery*3+ioff {
return nil, errors.New("corruption detected: stream overrun 3")
}
- copy(out, buf[:off])
- copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+ copy(out, buf[0][:off])
+ copy(out[dstEvery:], buf[1][:off])
+ copy(out[dstEvery*2:], buf[2][:off])
+ copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}
// Decode remaining.
+ remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
+ endsAt := offset + remainBytes
+ if endsAt > len(out) {
+ endsAt = len(out)
+ }
br := &br[i]
- bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+ bitsLeft := br.remaining()
for bitsLeft > 0 {
if br.finished() {
+ d.bufs.Put(buf)
return nil, io.ErrUnexpectedEOF
}
if br.bitsRead >= 56 {
@@ -1291,7 +1166,8 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
}
}
// end inline...
- if offset >= len(out) {
+ if offset >= endsAt {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
@@ -1299,16 +1175,23 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
v := single[br.peekByteFast()].entry
nBits := uint8(v)
br.advance(nBits)
- bitsLeft -= int(nBits)
+ bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
+ if offset != endsAt {
+ d.bufs.Put(buf)
+ return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+ }
+
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
+ d.bufs.Put(buf)
return nil, err
}
}
+ d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
new file mode 100644
index 00000000..0d6cb1a9
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
@@ -0,0 +1,488 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#define bufoff 256 // see decompress.go, we're using [4][256]byte table
+
+// func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+// peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
+#define off R8
+#define buffer DI
+#define table SI
+
+#define br_bits_read R9
+#define br_value R10
+#define br_offset R11
+#define peek_bits R12
+#define exhausted DX
+
+#define br0 R13
+#define br1 R14
+#define br2 R15
+#define br3 BP
+
+ MOVQ BP, 0(SP)
+
+ XORQ exhausted, exhausted // exhausted = false
+ XORQ off, off // off = 0
+
+ MOVBQZX peekBits+32(FP), peek_bits
+ MOVQ buf+40(FP), buffer
+ MOVQ tbl+48(FP), table
+
+ MOVQ pbr0+0(FP), br0
+ MOVQ pbr1+8(FP), br1
+ MOVQ pbr2+16(FP), br2
+ MOVQ pbr3+24(FP), br3
+
+main_loop:
+
+ // const stream = 0
+ // br0.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
+ MOVQ bitReaderShifted_value(br0), br_value
+ MOVQ bitReaderShifted_off(br0), br_offset
+
+ // if b.bitsRead >= 32 {
+ CMPQ br_bits_read, $32
+ JB skip_fill0
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br0), AX
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br0.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill0:
+
+ // val0 := br0.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br0.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br0.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 0(buffer)(off*1)
+
+ // SECOND PART:
+ // val2 := br0.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v2 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br0.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val3 := br0.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v3 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br0.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off+2] = uint8(v2.entry >> 8)
+ // buf[stream][off+3] = uint8(v3.entry >> 8)
+ MOVW BX, 0+2(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
+ MOVQ br_value, bitReaderShifted_value(br0)
+ MOVQ br_offset, bitReaderShifted_off(br0)
+
+ // const stream = 1
+ // br1.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
+ MOVQ bitReaderShifted_value(br1), br_value
+ MOVQ bitReaderShifted_off(br1), br_offset
+
+ // if b.bitsRead >= 32 {
+ CMPQ br_bits_read, $32
+ JB skip_fill1
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br1), AX
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br1.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill1:
+
+ // val0 := br1.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br1.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br1.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 256(buffer)(off*1)
+
+ // SECOND PART:
+ // val2 := br1.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v2 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br1.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val3 := br1.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v3 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br1.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off+2] = uint8(v2.entry >> 8)
+ // buf[stream][off+3] = uint8(v3.entry >> 8)
+ MOVW BX, 256+2(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
+ MOVQ br_value, bitReaderShifted_value(br1)
+ MOVQ br_offset, bitReaderShifted_off(br1)
+
+ // const stream = 2
+ // br2.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
+ MOVQ bitReaderShifted_value(br2), br_value
+ MOVQ bitReaderShifted_off(br2), br_offset
+
+ // if b.bitsRead >= 32 {
+ CMPQ br_bits_read, $32
+ JB skip_fill2
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br2), AX
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br2.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill2:
+
+ // val0 := br2.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br2.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br2.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 512(buffer)(off*1)
+
+ // SECOND PART:
+ // val2 := br2.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v2 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br2.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val3 := br2.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v3 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br2.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off+2] = uint8(v2.entry >> 8)
+ // buf[stream][off+3] = uint8(v3.entry >> 8)
+ MOVW BX, 512+2(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
+ MOVQ br_value, bitReaderShifted_value(br2)
+ MOVQ br_offset, bitReaderShifted_off(br2)
+
+ // const stream = 3
+ // br3.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
+ MOVQ bitReaderShifted_value(br3), br_value
+ MOVQ bitReaderShifted_off(br3), br_offset
+
+ // if b.bitsRead >= 32 {
+ CMPQ br_bits_read, $32
+ JB skip_fill3
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br3), AX
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br3.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill3:
+
+ // val0 := br3.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br3.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br3.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 768(buffer)(off*1)
+
+ // SECOND PART:
+ // val2 := br3.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v2 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br3.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val3 := br3.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v3 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br3.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off+2] = uint8(v2.entry >> 8)
+ // buf[stream][off+3] = uint8(v3.entry >> 8)
+ MOVW BX, 768+2(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
+ MOVQ br_value, bitReaderShifted_value(br3)
+ MOVQ br_offset, bitReaderShifted_off(br3)
+
+ ADDQ $4, off // off += 4
+
+ TESTB DH, DH // any br[i].ofs < 4?
+ JNZ end
+
+ CMPQ off, $bufoff
+ JL main_loop
+
+end:
+ MOVQ 0(SP), BP
+
+ MOVB off, ret+56(FP)
+ RET
+
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
new file mode 100644
index 00000000..6d477a2c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
@@ -0,0 +1,197 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+
+#define bufoff 256 // see decompress.go, we're using [4][256]byte table
+
+// func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+// peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+//
+// Every pass of main_loop decodes 4 symbols from each of the four bit
+// readers and stores them at buf[stream*256+off]. The loop ends once off
+// reaches bufoff, or as soon as a refill leaves any reader with off < 4.
+// The low byte of off is returned, so 0 means all 256 bytes per stream
+// were produced.
+TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
+#define off R8
+#define buffer DI
+#define table SI
+
+#define br_bits_read R9
+#define br_value R10
+#define br_offset R11
+#define peek_bits R12
+#define exhausted DX
+
+#define br0 R13
+#define br1 R14
+#define br2 R15
+#define br3 BP
+
+ // BP doubles as br3, so it is saved in the 8 byte frame and restored at end.
+ MOVQ BP, 0(SP)
+
+ XORQ exhausted, exhausted // exhausted = false
+ XORQ off, off // off = 0
+
+ MOVBQZX peekBits+32(FP), peek_bits
+ MOVQ buf+40(FP), buffer
+ MOVQ tbl+48(FP), table
+
+ MOVQ pbr0+0(FP), br0
+ MOVQ pbr1+8(FP), br1
+ MOVQ pbr2+16(FP), br2
+ MOVQ pbr3+24(FP), br3
+
+main_loop:
+{{ define "decode_2_values_x86" }}
+ // const stream = {{ var "id" }}
+ // br{{ var "id"}}.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
+ MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
+ MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
+
+ // if b.bitsRead >= 32 {
+ CMPQ br_bits_read, $32
+ JB skip_fill{{ var "id" }}
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br{{ var "id"}}.off < 4)
+ // DL is scratch for the comparison; the sticky flag is kept in DH.
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+ // }
+skip_fill{{ var "id" }}:
+
+ // val0 := br{{ var "id"}}.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br{{ var "id"}}.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val1 := br{{ var "id"}}.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br{{ var "id"}}.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
+
+ // SECOND PART:
+ // val2 := br{{ var "id"}}.peekTopBits(peekBits)
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v2 := table[val2&mask]
+ MOVW 0(table)(AX*2), AX // AX - v2
+
+ // br{{ var "id"}}.advance(uint8(v2.entry))
+ MOVB AH, BL // BL = uint8(v2.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // val3 := br{{ var "id"}}.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+ // v3 := table[val3&mask]
+ MOVW 0(table)(AX*2), AX // AX - v3
+
+ // br{{ var "id"}}.advance(uint8(v3.entry))
+ MOVB AH, BH // BH = uint8(v3.entry >> 8)
+ MOVBQZX AL, CX
+ SHLQ CX, br_value // value <<= n
+ ADDQ CX, br_bits_read // bits_read += n
+
+
+ // these two writes get coalesced
+ // buf[stream][off+2] = uint8(v2.entry >> 8)
+ // buf[stream][off+3] = uint8(v3.entry >> 8)
+ MOVW BX, {{ var "bufofs" }}+2(buffer)(off*1)
+
+ // write the updated state back to the bit reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
+ MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
+ MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
+{{ end }}
+
+ {{ set "id" "0" }}
+ {{ set "ofs" "0" }}
+ {{ set "bufofs" "0" }} {{/* id * bufoff */}}
+ {{ template "decode_2_values_x86" . }}
+
+ {{ set "id" "1" }}
+ {{ set "ofs" "8" }}
+ {{ set "bufofs" "256" }}
+ {{ template "decode_2_values_x86" . }}
+
+ {{ set "id" "2" }}
+ {{ set "ofs" "16" }}
+ {{ set "bufofs" "512" }}
+ {{ template "decode_2_values_x86" . }}
+
+ {{ set "id" "3" }}
+ {{ set "ofs" "24" }}
+ {{ set "bufofs" "768" }}
+ {{ template "decode_2_values_x86" . }}
+
+ ADDQ $4, off // off += 4
+
+ TESTB DH, DH // any br[i].ofs < 4?
+ JNZ end
+
+ CMPQ off, $bufoff
+ JL main_loop
+end:
+ // restore the caller's BP saved on entry
+ MOVQ 0(SP), BP
+
+ // only the low byte of off is returned; a full 256 byte pass yields 0
+ MOVB off, ret+56(FP)
+ RET
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
new file mode 100644
index 00000000..d47f6644
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -0,0 +1,181 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// This file contains the specialisation of Decoder.Decompress4X
+// that uses an asm implementation of its main loop.
+package huff0
+
+import (
+ "errors"
+ "fmt"
+)
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress4X when tablelog > 8.
+//
+// It decodes 2 symbols per bit reader per pass into buf, which must point
+// at the first byte of a [4][256]byte scratch table (one 256 byte row per
+// stream). The result is the number of bytes written to each row,
+// truncated to uint8, so 0 means all 256 bytes of every row were filled.
+// The loop stops early when a refill leaves any reader with off < 4.
+//
+// The directive below must be written without a space after "//";
+// "// go:noescape" is an ordinary comment and has no effect.
+//
+//go:noescape
+func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+
+// decompress4x_8b_loop_x86 is an x86 assembler implementation
+// of Decompress4X when tablelog <= 8 which decodes 4 entries
+// per loop.
+//
+// buf must point at the first byte of a [4][256]byte scratch table (one
+// 256 byte row per stream). The result is the number of bytes written to
+// each row, truncated to uint8, so 0 means all 256 bytes of every row
+// were filled. The loop stops early when a refill leaves any reader with
+// off < 4.
+//
+// The directive below must be written without a space after "//";
+// "// go:noescape" is an ordinary comment and has no effect.
+//
+//go:noescape
+func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+
+// fallback8BitSize is the size where using Go version is faster.
+const fallback8BitSize = 800
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+//
+// src starts with a 6 byte jump table holding the little-endian 16 bit
+// lengths of the first three streams; the fourth stream is whatever
+// remains. Each stream decodes to dstEvery = (cap(dst)+3)/4 bytes (the
+// last one may be shorter) and is written at dst[dstEvery*i:].
+//
+// Small outputs with a table log <= 8 are delegated to the pure Go
+// decompress4X8bit. Otherwise the bulk of the data is decoded by the
+// assembly main loops in chunks of up to 256 bytes per stream, and the
+// tail of every stream is finished in Go.
+//
+// On success dst resliced to its full capacity is returned. On any error
+// the result is nil and the scratch buffer is handed back to d.bufs.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+
+	use8BitTables := d.actualTableLog <= 8
+	if cap(dst) < fallback8BitSize && use8BitTables {
+		return d.decompress4X8bit(dst, src)
+	}
+	var br [4]bitReaderShifted
+	// Decode "jump table"
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	// From here on every return path must hand buf back via d.bufs.Put.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	const debug = false
+
+	// see: bitReaderShifted.peekBitsFast()
+	peekBits := uint8((64 - d.actualTableLog) & 63)
+
+	// Bulk decode: each assembly call fills up to bufoff bytes per stream.
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		if use8BitTables {
+			off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+		} else {
+			off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+		}
+		if debug {
+			fmt.Print("DEBUG: ")
+			fmt.Printf("off=%d,", off)
+			for i := 0; i < 4; i++ {
+				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
+					i, br[i].bitsRead, br[i].value, br[i].off)
+			}
+			fmt.Println("")
+		}
+
+		// off is a uint8 that wraps to 0 after a complete 256 byte pass;
+		// any other value means the loop stopped early with a partial buffer.
+		if off != 0 {
+			break
+		}
+
+		if bufoff > dstEvery {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 1")
+		}
+		copy(out, buf[0][:])
+		copy(out[dstEvery:], buf[1][:])
+		copy(out[dstEvery*2:], buf[2][:])
+		copy(out[dstEvery*3:], buf[3][:])
+		out = out[bufoff:]
+		decoded += bufoff * 4
+		// There must at least be 3 buffers left.
+		if len(out) < dstEvery*3 {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 2")
+		}
+	}
+	// Flush a partially filled buffer left over from an early loop exit.
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			// Release the scratch buffer here too, like every other error path.
+			d.bufs.Put(buf)
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
new file mode 100644
index 00000000..2edad3ea
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -0,0 +1,506 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#ifdef GOAMD64_v4
+#ifndef GOAMD64_v3
+#define GOAMD64_v3
+#endif
+#endif
+
+#define bufoff 256 // see decompress.go, we're using [4][256]byte table
+
+// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+// peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off R8
+#define buffer DI
+#define table SI
+
+#define br_bits_read R9
+#define br_value R10
+#define br_offset R11
+#define peek_bits R12
+#define exhausted DX
+
+#define br0 R13
+#define br1 R14
+#define br2 R15
+#define br3 BP
+
+ MOVQ BP, 0(SP)
+
+ XORQ exhausted, exhausted // exhausted = false
+ XORQ off, off // off = 0
+
+ MOVBQZX peekBits+32(FP), peek_bits
+ MOVQ buf+40(FP), buffer
+ MOVQ tbl+48(FP), table
+
+ MOVQ pbr0+0(FP), br0
+ MOVQ pbr1+8(FP), br1
+ MOVQ pbr2+16(FP), br2
+ MOVQ pbr3+24(FP), br3
+
+main_loop:
+
+ // const stream = 0
+ // br0.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
+ MOVQ bitReaderShifted_value(br0), br_value
+ MOVQ bitReaderShifted_off(br0), br_offset
+
+ // We must have at least 2 * max tablelog left
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill0
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br0), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br0.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill0:
+
+ // val0 := br0.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br0.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br0.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 0(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
+ MOVQ br_value, bitReaderShifted_value(br0)
+ MOVQ br_offset, bitReaderShifted_off(br0)
+
+ // const stream = 1
+ // br1.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
+ MOVQ bitReaderShifted_value(br1), br_value
+ MOVQ bitReaderShifted_off(br1), br_offset
+
+ // We must have at least 2 * max tablelog left
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill1
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br1), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br1.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill1:
+
+ // val0 := br1.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br1.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br1.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 256(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
+ MOVQ br_value, bitReaderShifted_value(br1)
+ MOVQ br_offset, bitReaderShifted_off(br1)
+
+ // const stream = 2
+ // br2.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
+ MOVQ bitReaderShifted_value(br2), br_value
+ MOVQ bitReaderShifted_off(br2), br_offset
+
+ // We must have at least 2 * max tablelog left
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill2
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br2), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br2.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill2:
+
+ // val0 := br2.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br2.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br2.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 512(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
+ MOVQ br_value, bitReaderShifted_value(br2)
+ MOVQ br_offset, bitReaderShifted_off(br2)
+
+ // const stream = 3
+ // br3.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
+ MOVQ bitReaderShifted_value(br3), br_value
+ MOVQ bitReaderShifted_off(br3), br_offset
+
+ // We must have at least 2 * max tablelog left
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill3
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br3), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br3.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill3:
+
+ // val0 := br3.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br3.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br3.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 768(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
+ MOVQ br_value, bitReaderShifted_value(br3)
+ MOVQ br_offset, bitReaderShifted_off(br3)
+
+ ADDQ $2, off // off += 2
+
+ TESTB DH, DH // any br[i].ofs < 4?
+ JNZ end
+
+ CMPQ off, $bufoff
+ JL main_loop
+
+end:
+ MOVQ 0(SP), BP
+
+ MOVB off, ret+56(FP)
+ RET
+
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
new file mode 100644
index 00000000..330d86ae
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
@@ -0,0 +1,195 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#ifdef GOAMD64_v4
+#ifndef GOAMD64_v3
+#define GOAMD64_v3
+#endif
+#endif
+
+#define bufoff 256 // see decompress.go, we're using [4][256]byte table
+
+// decompress4x_main_loop_x86 decodes two symbols from each of the four bit
+// readers per iteration and stores them at buf+0, buf+256, buf+512 and
+// buf+768 (one bufoff-sized region per stream), indexed by off.
+// The loop stops once off reaches bufoff, or as soon as a refill leaves any
+// reader with off < 4. Only the low byte of off is stored as the result, so
+// a completely filled buffer (off == 256) is reported as 0.
+//
+// NOTE(review): the signature comment below advertises (int, bool), but the
+// code stores a single byte at ret+56(FP) and never writes a bool - confirm
+// against the Go declaration.
+//
+//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off R8
+#define buffer DI
+#define table SI
+
+#define br_bits_read R9
+#define br_value R10
+#define br_offset R11
+#define peek_bits R12
+#define exhausted DX
+
+#define br0 R13
+#define br1 R14
+#define br2 R15
+#define br3 BP
+
+ // BP doubles as br3, so its original value is kept in the 8 byte frame
+ // and restored before returning.
+ MOVQ BP, 0(SP)
+
+ // DL receives each per-stream "off < 4" result, DH accumulates them.
+ XORQ exhausted, exhausted // exhausted = false
+ XORQ off, off // off = 0
+
+ MOVBQZX peekBits+32(FP), peek_bits
+ MOVQ buf+40(FP), buffer
+ MOVQ tbl+48(FP), table
+
+ MOVQ pbr0+0(FP), br0
+ MOVQ pbr1+8(FP), br1
+ MOVQ pbr2+16(FP), br2
+ MOVQ pbr3+24(FP), br3
+
+main_loop:
+{{ define "decode_2_values_x86" }}
+ // const stream = {{ var "id" }}
+ // br{{ var "id"}}.fillFast()
+ // Load the reader state into scratch registers; it is written back at
+ // the end of this template.
+ MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
+ MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
+ MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
+
+ // We must have at least 2 * max tablelog left
+ // (refill only when more than 64-22 bits have already been consumed).
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill{{ var "id" }}
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ // (b.off has already been decremented above, so the load address is in+off.)
+ MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ // NOTE(review): SHLXQ takes a quadword memory operand here, whereas the
+ // non-v3 path loads only 4 bytes with MOVL - confirm the extra high bytes
+ // are harmless and always in bounds.
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br{{ var "id"}}.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+ // }
+skip_fill{{ var "id" }}:
+
+ // val0 := br{{ var "id"}}.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+#endif
+
+ // v0 := table[val0&mask]
+ // Table entries are 2 bytes wide: AL is used as the bit count to advance,
+ // AH as the decoded byte.
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br{{ var "id"}}.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+#else
+ // val1 := br{{ var "id"}}.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br{{ var "id"}}.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ // (BL and BH are the low and high byte of BX, hence a single 16 bit store.)
+ MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
+
+ // update the bit reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
+ MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
+ MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
+{{ end }}
+
+ // Instantiate the template once per stream; "bufofs" is id * bufoff.
+ {{ set "id" "0" }}
+ {{ set "ofs" "0" }}
+ {{ set "bufofs" "0" }} {{/* id * bufoff */}}
+ {{ template "decode_2_values_x86" . }}
+
+ {{ set "id" "1" }}
+ {{ set "ofs" "8" }}
+ {{ set "bufofs" "256" }}
+ {{ template "decode_2_values_x86" . }}
+
+ {{ set "id" "2" }}
+ {{ set "ofs" "16" }}
+ {{ set "bufofs" "512" }}
+ {{ template "decode_2_values_x86" . }}
+
+ {{ set "id" "3" }}
+ {{ set "ofs" "24" }}
+ {{ set "bufofs" "768" }}
+ {{ template "decode_2_values_x86" . }}
+
+ ADDQ $2, off // off += 2
+
+ TESTB DH, DH // any br[i].ofs < 4?
+ JNZ end
+
+ // Keep going until the per-stream buffer region is full.
+ CMPQ off, $bufoff
+ JL main_loop
+end:
+ // Restore the caller's BP (it was used as br3).
+ MOVQ 0(SP), BP
+
+ // Only the low byte of off is returned.
+ MOVB off, ret+56(FP)
+ RET
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
new file mode 100644
index 00000000..126b4d68
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -0,0 +1,193 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+// This file contains a generic implementation of Decoder.Decompress4X.
+package huff0
+
+import (
+ "errors"
+ "fmt"
+)
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+//
+// The input starts with a 6 byte jump table holding the little-endian
+// 16 bit lengths of the first three streams; the fourth stream is whatever
+// remains of src. Stream i is decoded into dst starting at offset
+// i*((cap(dst)+3)/4).
+//
+// An error is returned if no table is loaded, if the input is too small or
+// truncated, or if the streams do not decode to exactly cap(dst) bytes.
+// On error the returned slice is nil.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	// 6 bytes of jump table plus at least one byte for each of the 4 streams.
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table"
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		// ">=" keeps at least one byte available for the following stream.
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	// From here on every return path must hand buf back via d.bufs.Put.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	// Decode 2 values from each decoder/loop.
+	const bufoff = 256
+	for {
+		// The fast path needs at least 4 unread input bytes in every stream.
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		{
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		off += 2
+
+		// off is a uint8, so wrapping to 0 means all 256 slots are filled.
+		if off == 0 {
+			if bufoff > dstEvery {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
+			out = out[bufoff:]
+			decoded += bufoff * 4
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	// Flush whatever the fast loop left in the temporary buffers.
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining.
+	// Each stream is finished one symbol at a time, directly into out.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		// The last stream may be shorter than dstEvery; clamp to the output.
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			// Return the scratch buffer here too, like every other error path.
+			d.bufs.Put(buf)
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index 3ee00ecb..e8ad17ad 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -8,6 +8,7 @@ import (
"fmt"
"math"
"math/bits"
+ "sync"
"github.com/klauspost/compress/fse"
)
@@ -116,6 +117,7 @@ type Scratch struct {
nodes []nodeElt
tmpOut [4][]byte
fse *fse.Scratch
+ decPool sync.Pool // *[4][256]byte buffers.
huffWeight [maxSymbolValue + 1]byte
}