summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/huff0
diff options
context:
space:
mode:
authorWim <wim@42.be>2022-06-25 00:36:16 +0200
committerGitHub <noreply@github.com>2022-06-25 00:36:16 +0200
commit4649876956ab944d2a9ea8fc98c75dc8a9f5ef08 (patch)
tree0f32da8e492cabd8eca53de319e287e6c4791c5e /vendor/github.com/klauspost/compress/huff0
parent5604d140e3bbf5c8f6c414b1427145a0c4e36bb4 (diff)
downloadmatterbridge-msglm-4649876956ab944d2a9ea8fc98c75dc8a9f5ef08.tar.gz
matterbridge-msglm-4649876956ab944d2a9ea8fc98c75dc8a9f5ef08.tar.bz2
matterbridge-msglm-4649876956ab944d2a9ea8fc98c75dc8a9f5ef08.zip
Update dependencies (#1851)
Diffstat (limited to 'vendor/github.com/klauspost/compress/huff0')
-rw-r--r--vendor/github.com/klauspost/compress/huff0/autogen.go5
-rw-r--r--vendor/github.com/klauspost/compress/huff0/bitreader.go10
-rw-r--r--vendor/github.com/klauspost/compress/huff0/bitwriter.go115
-rw-r--r--vendor/github.com/klauspost/compress/huff0/bytereader.go10
-rw-r--r--vendor/github.com/klauspost/compress/huff0/compress.go1
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress.go113
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s488
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in197
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.go181
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s1179
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in195
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_generic.go102
12 files changed, 987 insertions, 1609 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/autogen.go b/vendor/github.com/klauspost/compress/huff0/autogen.go
deleted file mode 100644
index ff2c69d6..00000000
--- a/vendor/github.com/klauspost/compress/huff0/autogen.go
+++ /dev/null
@@ -1,5 +0,0 @@
-package huff0
-
-//go:generate go run generate.go
-//go:generate asmfmt -w decompress_amd64.s
-//go:generate asmfmt -w decompress_8b_amd64.s
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index 451160ed..504a7be9 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -165,11 +165,6 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
return uint16(b.value >> ((64 - n) & 63))
}
-// peekTopBits(n) is equvialent to peekBitFast(64 - n)
-func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
- return uint16(b.value >> n)
-}
-
func (b *bitReaderShifted) advance(n uint8) {
b.bitsRead += n
b.value <<= n & 63
@@ -220,11 +215,6 @@ func (b *bitReaderShifted) fill() {
}
}
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReaderShifted) finished() bool {
- return b.off == 0 && b.bitsRead >= 64
-}
-
func (b *bitReaderShifted) remaining() uint {
return b.off*8 + uint(64-b.bitsRead)
}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
index 6bce4e87..ec71f7a3 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -5,8 +5,6 @@
package huff0
-import "fmt"
-
// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
@@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF} /* up to 16 bits */
-// addBits16NC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
- b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
- b.nBits += bits
-}
-
// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits
}
-// addBits16ZeroNC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-// This is fastest if bits can be zero.
-func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
- if bits == 0 {
- return
- }
- value <<= (16 - bits) & 15
- value >>= (16 - bits) & 15
- b.bitContainer |= uint64(value) << (b.nBits & 63)
- b.nBits += bits
-}
-
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
- v := b.nBits >> 3
- switch v {
- case 0:
- return
- case 1:
- b.out = append(b.out,
- byte(b.bitContainer),
- )
- b.bitContainer >>= 1 << 3
- case 2:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- )
- b.bitContainer >>= 2 << 3
- case 3:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- )
- b.bitContainer >>= 3 << 3
- case 4:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- )
- b.bitContainer >>= 4 << 3
- case 5:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- )
- b.bitContainer >>= 5 << 3
- case 6:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- )
- b.bitContainer >>= 6 << 3
- case 7:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- )
- b.bitContainer >>= 7 << 3
- case 8:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- byte(b.bitContainer>>56),
- )
- b.bitContainer = 0
- b.nBits = 0
- return
- default:
- panic(fmt.Errorf("bits (%d) > 64", b.nBits))
- }
- b.nBits &= 7
-}
-
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
@@ -201,10 +93,3 @@ func (b *bitWriter) close() error {
b.flushAlign()
return nil
}
-
-// reset and continue writing by appending to out.
-func (b *bitWriter) reset(out []byte) {
- b.bitContainer = 0
- b.nBits = 0
- b.out = out
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/bytereader.go b/vendor/github.com/klauspost/compress/huff0/bytereader.go
index 50bcdf6e..4dcab8d2 100644
--- a/vendor/github.com/klauspost/compress/huff0/bytereader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bytereader.go
@@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) {
b.off = 0
}
-// advance the stream b n bytes.
-func (b *byteReader) advance(n uint) {
- b.off += int(n)
-}
-
// Int32 returns a little endian int32 starting at current offset.
func (b byteReader) Int32() int32 {
v3 := int32(b.b[b.off+3])
@@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 {
return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
}
-// unread returns the unread portion of the input.
-func (b byteReader) unread() []byte {
- return b.b[b.off:]
-}
-
// remain will return the number of bytes remaining.
func (b byteReader) remain() int {
return len(b.b) - b.off
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index bc95ac62..4d14542f 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -404,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool {
return true
}
+//lint:ignore U1000 used for debugging
func (s *Scratch) validateTable(c cTable) bool {
if len(c) < int(s.symbolLen) {
return false
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 04f65299..c0c48bd7 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -11,7 +11,6 @@ import (
type dTable struct {
single []dEntrySingle
- double []dEntryDouble
}
// single-symbols decoding
@@ -19,13 +18,6 @@ type dEntrySingle struct {
entry uint16
}
-// double-symbols decoding
-type dEntryDouble struct {
- seq [4]byte
- nBits uint8
- len uint8
-}
-
// Uses special code for all tables that are < 8 bits.
const use8BitTables = true
@@ -35,7 +27,7 @@ const use8BitTables = true
// If no Scratch is provided a new one is allocated.
// The returned Scratch can be used for encoding or decoding input using this table.
func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
- s, err = s.prepare(in)
+ s, err = s.prepare(nil)
if err != nil {
return s, nil, err
}
@@ -236,108 +228,6 @@ func (d *Decoder) buffer() *[4][256]byte {
return &[4][256]byte{}
}
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
- if len(d.dt.single) == 0 {
- return nil, errors.New("no table loaded")
- }
- if use8BitTables && d.actualTableLog <= 8 {
- return d.decompress1X8Bit(dst, src)
- }
- var br bitReaderShifted
- err := br.init(src)
- if err != nil {
- return dst, err
- }
- maxDecodedSize := cap(dst)
- dst = dst[:0]
-
- // Avoid bounds check by always having full sized table.
- const tlSize = 1 << tableLogMax
- const tlMask = tlSize - 1
- dt := d.dt.single[:tlSize]
-
- // Use temp table to avoid bound checks/append penalty.
- bufs := d.buffer()
- buf := &bufs[0]
- var off uint8
-
- for br.off >= 8 {
- br.fillFast()
- v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+0] = uint8(v.entry >> 8)
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+1] = uint8(v.entry >> 8)
-
- // Refill
- br.fillFast()
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+2] = uint8(v.entry >> 8)
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+3] = uint8(v.entry >> 8)
-
- off += 4
- if off == 0 {
- if len(dst)+256 > maxDecodedSize {
- br.close()
- d.bufs.Put(bufs)
- return nil, ErrMaxDecodedSizeExceeded
- }
- dst = append(dst, buf[:]...)
- }
- }
-
- if len(dst)+int(off) > maxDecodedSize {
- d.bufs.Put(bufs)
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- dst = append(dst, buf[:off]...)
-
- // br < 8, so uint8 is fine
- bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
- for bitsLeft > 0 {
- br.fill()
- if false && br.bitsRead >= 32 {
- if br.off >= 4 {
- v := br.in[br.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- br.value = (br.value << 32) | uint64(low)
- br.bitsRead -= 32
- br.off -= 4
- } else {
- for br.off > 0 {
- br.value = (br.value << 8) | uint64(br.in[br.off-1])
- br.bitsRead -= 8
- br.off--
- }
- }
- }
- if len(dst) >= maxDecodedSize {
- d.bufs.Put(bufs)
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
- nBits := uint8(v.entry)
- br.advance(nBits)
- bitsLeft -= nBits
- dst = append(dst, uint8(v.entry>>8))
- }
- d.bufs.Put(bufs)
- return dst, br.close()
-}
-
// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
@@ -995,7 +885,6 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
const shift = 56
const tlSize = 1 << 8
- const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
deleted file mode 100644
index 0d6cb1a9..00000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
+++ /dev/null
@@ -1,488 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
-main_loop:
-
- // const stream = 0
- // br0.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
- MOVQ bitReaderShifted_value(br0), br_value
- MOVQ bitReaderShifted_off(br0), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill0
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br0), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br0.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill0:
-
- // val0 := br0.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 0(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br0.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 0+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
- MOVQ br_value, bitReaderShifted_value(br0)
- MOVQ br_offset, bitReaderShifted_off(br0)
-
- // const stream = 1
- // br1.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
- MOVQ bitReaderShifted_value(br1), br_value
- MOVQ bitReaderShifted_off(br1), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill1
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br1), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br1.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill1:
-
- // val0 := br1.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 256(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br1.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 256+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
- MOVQ br_value, bitReaderShifted_value(br1)
- MOVQ br_offset, bitReaderShifted_off(br1)
-
- // const stream = 2
- // br2.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
- MOVQ bitReaderShifted_value(br2), br_value
- MOVQ bitReaderShifted_off(br2), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill2
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br2), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br2.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill2:
-
- // val0 := br2.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 512(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br2.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 512+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
- MOVQ br_value, bitReaderShifted_value(br2)
- MOVQ br_offset, bitReaderShifted_off(br2)
-
- // const stream = 3
- // br3.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
- MOVQ bitReaderShifted_value(br3), br_value
- MOVQ bitReaderShifted_off(br3), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill3
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br3), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br3.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill3:
-
- // val0 := br3.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 768(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br3.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 768+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
- MOVQ br_value, bitReaderShifted_value(br3)
- MOVQ br_offset, bitReaderShifted_off(br3)
-
- ADDQ $4, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
- RET
-
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
deleted file mode 100644
index 6d477a2c..00000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
+++ /dev/null
@@ -1,197 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
- // const stream = {{ var "id" }}
- // br{{ var "id"}}.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
- MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
- MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill{{ var "id" }}
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br{{ var "id"}}.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
- // }
-skip_fill{{ var "id" }}:
-
- // val0 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br{{ var "id"}}.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br{{ var "id"}}.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br{{ var "id"}}.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br{{ var "id"}}.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, {{ var "bufofs" }}+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
- MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
- MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
- {{ set "id" "0" }}
- {{ set "ofs" "0" }}
- {{ set "bufofs" "0" }} {{/* id * bufoff */}}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "1" }}
- {{ set "ofs" "8" }}
- {{ set "bufofs" "256" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "2" }}
- {{ set "ofs" "16" }}
- {{ set "bufofs" "512" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "3" }}
- {{ set "ofs" "24" }}
- {{ set "bufofs" "768" }}
- {{ template "decode_2_values_x86" . }}
-
- ADDQ $4, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
- RET
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
index d47f6644..671e630a 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -2,30 +2,43 @@
// +build amd64,!appengine,!noasm,gc
// This file contains the specialisation of Decoder.Decompress4X
-// that uses an asm implementation of its main loop.
+// and Decoder.Decompress1X that use an asm implementation of thir main loops.
package huff0
import (
"errors"
"fmt"
+
+ "github.com/klauspost/compress/internal/cpuinfo"
)
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
-// go:noescape
-func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
- peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+//go:noescape
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
-// go:noescape
-func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
- peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+//go:noescape
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
// fallback8BitSize is the size where using Go version is faster.
const fallback8BitSize = 800
+type decompress4xContext struct {
+ pbr0 *bitReaderShifted
+ pbr1 *bitReaderShifted
+ pbr2 *bitReaderShifted
+ pbr3 *bitReaderShifted
+ peekBits uint8
+ out *byte
+ dstEvery int
+ tbl *dEntrySingle
+ decoded int
+ limit *byte
+}
+
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
@@ -42,6 +55,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if cap(dst) < fallback8BitSize && use8BitTables {
return d.decompress4X8bit(dst, src)
}
+
var br [4]bitReaderShifted
// Decode "jump table"
start := 6
@@ -71,70 +85,28 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
- // Use temp table to avoid bound checks/append penalty.
- buf := d.buffer()
- var off uint8
var decoded int
- const debug = false
-
- // see: bitReaderShifted.peekBitsFast()
- peekBits := uint8((64 - d.actualTableLog) & 63)
-
- // Decode 2 values from each decoder/loop.
- const bufoff = 256
- for {
- if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
- break
+ if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+ ctx := decompress4xContext{
+ pbr0: &br[0],
+ pbr1: &br[1],
+ pbr2: &br[2],
+ pbr3: &br[3],
+ peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+ out: &out[0],
+ dstEvery: dstEvery,
+ tbl: &single[0],
+ limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
}
-
if use8BitTables {
- off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+ decompress4x_8b_main_loop_amd64(&ctx)
} else {
- off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
- }
- if debug {
- fmt.Print("DEBUG: ")
- fmt.Printf("off=%d,", off)
- for i := 0; i < 4; i++ {
- fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
- i, br[i].bitsRead, br[i].value, br[i].off)
- }
- fmt.Println("")
- }
-
- if off != 0 {
- break
+ decompress4x_main_loop_amd64(&ctx)
}
- if bufoff > dstEvery {
- d.bufs.Put(buf)
- return nil, errors.New("corruption detected: stream overrun 1")
- }
- copy(out, buf[0][:])
- copy(out[dstEvery:], buf[1][:])
- copy(out[dstEvery*2:], buf[2][:])
- copy(out[dstEvery*3:], buf[3][:])
- out = out[bufoff:]
- decoded += bufoff * 4
- // There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
- d.bufs.Put(buf)
- return nil, errors.New("corruption detected: stream overrun 2")
- }
- }
- if off > 0 {
- ioff := int(off)
- if len(out) < dstEvery*3+ioff {
- d.bufs.Put(buf)
- return nil, errors.New("corruption detected: stream overrun 3")
- }
- copy(out, buf[0][:off])
- copy(out[dstEvery:], buf[1][:off])
- copy(out[dstEvery*2:], buf[2][:off])
- copy(out[dstEvery*3:], buf[3][:off])
- decoded += int(off) * 4
- out = out[off:]
+ decoded = ctx.decoded
+ out = out[decoded/4:]
}
// Decode remaining.
@@ -150,7 +122,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
- d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
@@ -164,7 +135,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
offset++
}
if offset != endsAt {
- d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
@@ -173,9 +143,86 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
return nil, err
}
}
- d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+
+// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+
+type decompress1xContext struct {
+ pbr *bitReaderShifted
+ peekBits uint8
+ out *byte
+ outCap int
+ tbl *dEntrySingle
+ decoded int
+}
+
+// Error reported by asm implementations
+const error_max_decoded_size_exeeded = -1
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ var br bitReaderShifted
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:maxDecodedSize]
+
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+
+ if maxDecodedSize >= 4 {
+ ctx := decompress1xContext{
+ pbr: &br,
+ out: &dst[0],
+ outCap: maxDecodedSize,
+ peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+ tbl: &d.dt.single[0],
+ }
+
+ if cpuinfo.HasBMI2() {
+ decompress1x_main_loop_bmi2(&ctx)
+ } else {
+ decompress1x_main_loop_amd64(&ctx)
+ }
+ if ctx.decoded == error_max_decoded_size_exeeded {
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+
+ dst = dst[:ctx.decoded]
+ }
+
+ // br < 8, so uint8 is fine
+ bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+ for bitsLeft > 0 {
+ br.fill()
+ if len(dst) >= maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= nBits
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ return dst, br.close()
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index 2edad3ea..6c65c6e2 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -1,506 +1,865 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $8-8
+ XORQ DX, DX
+
+ // Preload values
+ MOVQ ctx+0(FP), AX
+ MOVBQZX 32(AX), SI
+ MOVQ 40(AX), DI
+ MOVQ DI, BX
+ MOVQ 72(AX), CX
+ MOVQ CX, (SP)
+ MOVQ 48(AX), R8
+ MOVQ 56(AX), R9
+ MOVQ (AX), R10
+ MOVQ 8(AX), R11
+ MOVQ 16(AX), R12
+ MOVQ 24(AX), R13
+
+ // Main loop
main_loop:
-
- // const stream = 0
- // br0.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
- MOVQ bitReaderShifted_value(br0), br_value
- MOVQ bitReaderShifted_off(br0), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill0
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br0), AX
+ MOVQ BX, DI
+ CMPQ DI, (SP)
+ SETGE DL
+
+ // br0.fillFast32()
+ MOVQ 32(R10), R14
+ MOVBQZX 40(R10), R15
+ CMPQ R15, $0x20
+ JBE skip_fill0
+ MOVQ 24(R10), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R10), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R10)
+ ORQ BP, R14
// exhausted = exhausted || (br0.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill0:
-
// val0 := br0.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ MOVW (R9)(BP*2), CX
-#endif
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R9)(BP*2), CX
// br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 0(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
// update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
- MOVQ br_value, bitReaderShifted_value(br0)
- MOVQ br_offset, bitReaderShifted_off(br0)
-
- // const stream = 1
- // br1.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
- MOVQ bitReaderShifted_value(br1), br_value
- MOVQ bitReaderShifted_off(br1), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill1
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br1), AX
+ MOVQ R14, 32(R10)
+ MOVB R15, 40(R10)
+ ADDQ R8, DI
+
+ // br1.fillFast32()
+ MOVQ 32(R11), R14
+ MOVBQZX 40(R11), R15
+ CMPQ R15, $0x20
+ JBE skip_fill1
+ MOVQ 24(R11), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R11), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R11)
+ ORQ BP, R14
// exhausted = exhausted || (br1.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill1:
-
// val0 := br1.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ MOVW (R9)(BP*2), CX
-#endif
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R9)(BP*2), CX
// br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 256(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
// update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
- MOVQ br_value, bitReaderShifted_value(br1)
- MOVQ br_offset, bitReaderShifted_off(br1)
-
- // const stream = 2
- // br2.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
- MOVQ bitReaderShifted_value(br2), br_value
- MOVQ bitReaderShifted_off(br2), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill2
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br2), AX
+ MOVQ R14, 32(R11)
+ MOVB R15, 40(R11)
+ ADDQ R8, DI
+
+ // br2.fillFast32()
+ MOVQ 32(R12), R14
+ MOVBQZX 40(R12), R15
+ CMPQ R15, $0x20
+ JBE skip_fill2
+ MOVQ 24(R12), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R12), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R12)
+ ORQ BP, R14
// exhausted = exhausted || (br2.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill2:
-
// val0 := br2.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ MOVW (R9)(BP*2), CX
-#endif
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R9)(BP*2), CX
// br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 512(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
// update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
- MOVQ br_value, bitReaderShifted_value(br2)
- MOVQ br_offset, bitReaderShifted_off(br2)
-
- // const stream = 3
- // br3.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
- MOVQ bitReaderShifted_value(br3), br_value
- MOVQ bitReaderShifted_off(br3), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill3
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br3), AX
+ MOVQ R14, 32(R12)
+ MOVB R15, 40(R12)
+ ADDQ R8, DI
+
+ // br3.fillFast32()
+ MOVQ 32(R13), R14
+ MOVBQZX 40(R13), R15
+ CMPQ R15, $0x20
+ JBE skip_fill3
+ MOVQ 24(R13), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R13), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R13)
+ ORQ BP, R14
// exhausted = exhausted || (br3.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill3:
-
// val0 := br3.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
+ // v0 := table[val0&mask]
+ MOVW (R9)(BP*2), CX
-#endif
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ // v1 := table[val1&mask]
+ MOVW (R9)(BP*2), CX
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
+ // br3.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
-#endif
+ // update the bitrader reader structure
+ MOVQ R14, 32(R13)
+ MOVB R15, 40(R13)
+ ADDQ $0x02, BX
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ MOVQ 40(AX), CX
+ MOVQ BX, DX
+ SUBQ CX, DX
+ SHLQ $0x02, DX
+ MOVQ DX, 64(AX)
+ RET
- ADDQ CX, br_bits_read // bits_read += n
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
+ XORQ DX, DX
+
+ // Preload values
+ MOVQ ctx+0(FP), CX
+ MOVBQZX 32(CX), BX
+ MOVQ 40(CX), SI
+ MOVQ SI, (SP)
+ MOVQ 72(CX), DX
+ MOVQ DX, 8(SP)
+ MOVQ 48(CX), DI
+ MOVQ 56(CX), R8
+ MOVQ (CX), R9
+ MOVQ 8(CX), R10
+ MOVQ 16(CX), R11
+ MOVQ 24(CX), R12
+
+ // Main loop
+main_loop:
+ MOVQ (SP), SI
+ CMPQ SI, 8(SP)
+ SETGE DL
+
+ // br1000.fillFast32()
+ MOVQ 32(R9), R13
+ MOVBQZX 40(R9), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1000
+ MOVQ 24(R9), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R9), BP
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R9)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1000.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1000:
+ // val0 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
-#else
- // val1 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
-#endif
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br0.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br0.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br0.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R9)
+ MOVB R14, 40(R9)
+ ADDQ DI, SI
+
+ // br1001.fillFast32()
+ MOVQ 32(R10), R13
+ MOVBQZX 40(R10), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1001
+ MOVQ 24(R10), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R10), BP
- // br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R10)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1001.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1001:
+ // val0 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
-#endif
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br1.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br1.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br1.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R10)
+ MOVB R14, 40(R10)
+ ADDQ DI, SI
+
+ // br1002.fillFast32()
+ MOVQ 32(R11), R13
+ MOVBQZX 40(R11), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1002
+ MOVQ 24(R11), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R11), BP
- ADDQ CX, br_bits_read // bits_read += n
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R11)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1002.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1002:
+ // val0 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 768(buffer)(off*1)
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
- MOVQ br_value, bitReaderShifted_value(br3)
- MOVQ br_offset, bitReaderShifted_off(br3)
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
- ADDQ $2, off // off += 2
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br2.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br2.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br2.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R11)
+ MOVB R14, 40(R11)
+ ADDQ DI, SI
+
+ // br1003.fillFast32()
+ MOVQ 32(R12), R13
+ MOVBQZX 40(R12), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1003
+ MOVQ 24(R12), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R12), BP
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R12)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1003.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1003:
+ // val0 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
- CMPQ off, $bufoff
- JL main_loop
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
-end:
- MOVQ 0(SP), BP
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
- MOVB off, ret+56(FP)
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br3.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br3.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br3.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R12)
+ MOVB R14, 40(R12)
+ ADDQ $0x04, (SP)
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ MOVQ 40(AX), CX
+ MOVQ (SP), DX
+ SUBQ CX, DX
+ SHLQ $0x02, DX
+ MOVQ DX, 64(AX)
RET
-#undef off
-#undef buffer
-#undef table
+// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+TEXT ·decompress1x_main_loop_amd64(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_1_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_2_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, CX
+ MOVQ 16(AX), DX
+ SUBQ DX, CX
+ MOVQ CX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
-#undef br0
-#undef br1
-#undef br2
-#undef br3
+ // Report error
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
+
+// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+// Requires: BMI2
+TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_1_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_2_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, CX
+ MOVQ 16(AX), DX
+ SUBQ DX, CX
+ MOVQ CX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
deleted file mode 100644
index 330d86ae..00000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
+++ /dev/null
@@ -1,195 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
- // const stream = {{ var "id" }}
- // br{{ var "id"}}.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
- MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
- MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill{{ var "id" }}
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-#endif
-
- ORQ AX, br_value
-
- // exhausted = exhausted || (br{{ var "id"}}.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
- // }
-skip_fill{{ var "id" }}:
-
- // val0 := br{{ var "id"}}.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-#endif
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br{{ var "id"}}.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-#else
- // val1 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-#endif
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br{{ var "id"}}.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
- MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
- MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
- {{ set "id" "0" }}
- {{ set "ofs" "0" }}
- {{ set "bufofs" "0" }} {{/* id * bufoff */}}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "1" }}
- {{ set "ofs" "8" }}
- {{ set "bufofs" "256" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "2" }}
- {{ set "ofs" "16" }}
- {{ set "bufofs" "512" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "3" }}
- {{ set "ofs" "24" }}
- {{ set "bufofs" "768" }}
- {{ template "decode_2_values_x86" . }}
-
- ADDQ $2, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
- RET
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
index 126b4d68..4f6f37cb 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -191,3 +191,105 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
}
return dst, nil
}
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ if use8BitTables && d.actualTableLog <= 8 {
+ return d.decompress1X8Bit(dst, src)
+ }
+ var br bitReaderShifted
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:0]
+
+ // Avoid bounds check by always having full sized table.
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+ dt := d.dt.single[:tlSize]
+
+ // Use temp table to avoid bound checks/append penalty.
+ bufs := d.buffer()
+ buf := &bufs[0]
+ var off uint8
+
+ for br.off >= 8 {
+ br.fillFast()
+ v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ // Refill
+ br.fillFast()
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ br.close()
+ d.bufs.Put(bufs)
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+
+ if len(dst)+int(off) > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:off]...)
+
+ // br < 8, so uint8 is fine
+ bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+ for bitsLeft > 0 {
+ br.fill()
+ if false && br.bitsRead >= 32 {
+ if br.off >= 4 {
+ v := br.in[br.off-4:]
+ v = v[:4]
+ low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ br.value = (br.value << 32) | uint64(low)
+ br.bitsRead -= 32
+ br.off -= 4
+ } else {
+ for br.off > 0 {
+ br.value = (br.value << 8) | uint64(br.in[br.off-1])
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ }
+ if len(dst) >= maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= nBits
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ d.bufs.Put(bufs)
+ return dst, br.close()
+}