diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd')
29 files changed, 1072 insertions, 795 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md index beb7fa87..65b38abe 100644 --- a/vendor/github.com/klauspost/compress/zstd/README.md +++ b/vendor/github.com/klauspost/compress/zstd/README.md @@ -12,6 +12,8 @@ The `zstd` package is provided as open source software using a Go standard licen Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors. +For seekable zstd streams, see [this excellent package](https://github.com/SaveTheRbtz/zstd-seekable-format-go). + ## Installation Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`. diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go index 7eed729b..2445bb4f 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockdec.go +++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go @@ -10,7 +10,6 @@ import ( "errors" "fmt" "io" - "io/ioutil" "os" "path/filepath" "sync" @@ -83,8 +82,9 @@ type blockDec struct { err error - // Check against this crc - checkCRC []byte + // Check against this crc, if hasCRC is true. + checkCRC uint32 + hasCRC bool // Frame to use for singlethreaded decoding. // Should not be used by the decoder itself since parent may be another frame. @@ -192,16 +192,14 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { } // Read block data. - if cap(b.dataStorage) < cSize { + if _, ok := br.(*byteBuf); !ok && cap(b.dataStorage) < cSize { + // byteBuf doesn't need a destination buffer. if b.lowMem || cSize > maxCompressedBlockSize { b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc) } else { b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc) } } - if cap(b.dst) <= maxSize { - b.dst = make([]byte, 0, maxSize+1) - } b.data, err = br.readBig(cSize, b.dataStorage) if err != nil { if debugDecoder { @@ -210,6 +208,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { } return err } + if cap(b.dst) <= maxSize { + b.dst = make([]byte, 0, maxSize+1) + } return nil } @@ -233,7 +234,7 @@ func (b *blockDec) decodeBuf(hist *history) error { if b.lowMem { b.dst = make([]byte, b.RLESize) } else { - b.dst = make([]byte, maxBlockSize) + b.dst = make([]byte, maxCompressedBlockSize) } } b.dst = b.dst[:b.RLESize] @@ -651,7 +652,7 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) { fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse)) fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse)) buf.Write(in) - ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm) + os.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm) } return nil diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go index 4493baa7..176788f2 100644 --- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go +++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go @@ -7,7 +7,6 @@ package zstd import ( "fmt" "io" - "io/ioutil" ) type byteBuffer interface { @@ -23,7 +22,7 @@ type byteBuffer interface { readByte() (byte, error) // Skip n bytes. - skipN(n int) error + skipN(n int64) error } // in-memory buffer @@ -62,9 +61,12 @@ func (b *byteBuf) readByte() (byte, error) { return r, nil } -func (b *byteBuf) skipN(n int) error { +func (b *byteBuf) skipN(n int64) error { bb := *b - if len(bb) < n { + if n < 0 { + return fmt.Errorf("negative skip (%d) requested", n) + } + if int64(len(bb)) < n { return io.ErrUnexpectedEOF } *b = bb[n:] @@ -120,9 +122,9 @@ func (r *readerWrapper) readByte() (byte, error) { return r.tmp[0], nil } -func (r *readerWrapper) skipN(n int) error { - n2, err := io.CopyN(ioutil.Discard, r.r, int64(n)) - if n2 != int64(n) { +func (r *readerWrapper) skipN(n int64) error { + n2, err := io.CopyN(io.Discard, r.r, n) + if n2 != n { err = io.ErrUnexpectedEOF } return err diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go index 5022e71c..f6a24097 100644 --- a/vendor/github.com/klauspost/compress/zstd/decodeheader.go +++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go @@ -4,7 +4,6 @@ package zstd import ( - "bytes" "encoding/binary" "errors" "io" @@ -102,8 +101,8 @@ func (h *Header) Decode(in []byte) error { } h.HeaderSize += 4 b, in := in[:4], in[4:] - if !bytes.Equal(b, frameMagic) { - if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 { + if string(b) != frameMagic { + if string(b[1:4]) != skippableFrameMagic || b[0]&0xf0 != 0x50 { return ErrMagicMismatch } if len(in) < 4 { @@ -153,7 +152,7 @@ func (h *Header) Decode(in []byte) error { } b, in = in[:size], in[size:] h.HeaderSize += int(size) - switch size { + switch len(b) { case 1: h.DictionaryID = uint32(b[0]) case 2: @@ -183,7 +182,7 @@ func (h *Header) Decode(in []byte) error { } b, in = in[:fcsSize], in[fcsSize:] h.HeaderSize += int(fcsSize) - switch fcsSize { + switch len(b) { case 1: h.FrameContentSize = uint64(b[0]) case 2: diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go index 286c8f9d..7113e69e 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder.go @@ -5,7 +5,6 @@ package zstd import ( - "bytes" "context" "encoding/binary" "io" @@ -35,13 +34,13 @@ type Decoder struct { br readerWrapper enabled bool inFrame bool + dstBuf []byte } frame *frameDec // Custom dictionaries. - // Always uses copies. - dicts map[uint32]dict + dicts map[uint32]*dict // streamWg is the waitgroup for all streams streamWg sync.WaitGroup @@ -103,7 +102,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) { } // Transfer option dicts. - d.dicts = make(map[uint32]dict, len(d.o.dicts)) + d.dicts = make(map[uint32]*dict, len(d.o.dicts)) for _, dc := range d.o.dicts { d.dicts[dc.id] = dc } @@ -187,21 +186,23 @@ func (d *Decoder) Reset(r io.Reader) error { } // If bytes buffer and < 5MB, do sync decoding anyway. - if bb, ok := r.(byter); ok && bb.Len() < 5<<20 { + if bb, ok := r.(byter); ok && bb.Len() < d.o.decodeBufsBelow && !d.o.limitToCap { bb2 := bb if debugDecoder { println("*bytes.Buffer detected, doing sync decode, len:", bb.Len()) } b := bb2.Bytes() var dst []byte - if cap(d.current.b) > 0 { - dst = d.current.b + if cap(d.syncStream.dstBuf) > 0 { + dst = d.syncStream.dstBuf[:0] } - dst, err := d.DecodeAll(b, dst[:0]) + dst, err := d.DecodeAll(b, dst) if err == nil { err = io.EOF } + // Save output buffer + d.syncStream.dstBuf = dst d.current.b = dst d.current.err = err d.current.flushed = true @@ -216,6 +217,7 @@ func (d *Decoder) Reset(r io.Reader) error { d.current.err = nil d.current.flushed = false d.current.d = nil + d.syncStream.dstBuf = nil // Ensure no-one else is still running... d.streamWg.Wait() @@ -312,6 +314,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { // Grab a block decoder and frame decoder. block := <-d.decoders frame := block.localFrame + initialSize := len(dst) defer func() { if debugDecoder { printf("re-adding decoder: %p", block) @@ -337,21 +340,26 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { } return dst, err } - if frame.DictionaryID != nil { - dict, ok := d.dicts[*frame.DictionaryID] - if !ok { - return nil, ErrUnknownDictionary - } - if debugDecoder { - println("setting dict", frame.DictionaryID) - } - frame.history.setDict(&dict) + if err = d.setDict(frame); err != nil { + return nil, err } if frame.WindowSize > d.o.maxWindowSize { + if debugDecoder { + println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize) + } return dst, ErrWindowSizeExceeded } if frame.FrameContentSize != fcsUnknown { - if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) { + if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)-initialSize) { + if debugDecoder { + println("decoder size exceeded; fcs:", frame.FrameContentSize, "> mcs:", d.o.maxDecodedSize-uint64(len(dst)-initialSize), "len:", len(dst)) + } + return dst, ErrDecoderSizeExceeded + } + if d.o.limitToCap && frame.FrameContentSize > uint64(cap(dst)-len(dst)) { + if debugDecoder { + println("decoder size exceeded; fcs:", frame.FrameContentSize, "> (cap-len)", cap(dst)-len(dst)) + } return dst, ErrDecoderSizeExceeded } if cap(dst)-len(dst) < int(frame.FrameContentSize) { @@ -361,7 +369,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { } } - if cap(dst) == 0 { + if cap(dst) == 0 && !d.o.limitToCap { // Allocate len(input) * 2 by default if nothing is provided // and we didn't get frame content size. size := len(input) * 2 @@ -379,6 +387,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { if err != nil { return dst, err } + if uint64(len(dst)-initialSize) > d.o.maxDecodedSize { + return dst, ErrDecoderSizeExceeded + } if len(frame.bBuf) == 0 { if debugDecoder { println("frame dbuf empty") @@ -439,7 +450,11 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) { println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp) } - if !d.o.ignoreChecksum && len(next.b) > 0 { + if d.o.ignoreChecksum { + return true + } + + if len(next.b) > 0 { n, err := d.current.crc.Write(next.b) if err == nil { if n != len(next.b) { @@ -447,18 +462,16 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) { } } } - if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 { - got := d.current.crc.Sum64() - var tmp [4]byte - binary.LittleEndian.PutUint32(tmp[:], uint32(got)) - if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) { + if next.err == nil && next.d != nil && next.d.hasCRC { + got := uint32(d.current.crc.Sum64()) + if got != next.d.checkCRC { if debugDecoder { - println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)") + printf("CRC Check Failed: %08x (got) != %08x (on stream)\n", got, next.d.checkCRC) } d.current.err = ErrCRCMismatch } else { if debugDecoder { - println("CRC ok", tmp[:]) + printf("CRC ok %08x\n", got) } } } @@ -474,18 +487,12 @@ func (d *Decoder) nextBlockSync() (ok bool) { if !d.syncStream.inFrame { d.frame.history.reset() d.current.err = d.frame.reset(&d.syncStream.br) + if d.current.err == nil { + d.current.err = d.setDict(d.frame) + } if d.current.err != nil { return false } - if d.frame.DictionaryID != nil { - dict, ok := d.dicts[*d.frame.DictionaryID] - if !ok { - d.current.err = ErrUnknownDictionary - return false - } else { - d.frame.history.setDict(&dict) - } - } if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize { d.current.err = ErrDecoderSizeExceeded return false @@ -664,6 +671,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch if debugDecoder { println("Async 1: new history, recent:", block.async.newHist.recentOffsets) } + hist.reset() hist.decoders = block.async.newHist.decoders hist.recentOffsets = block.async.newHist.recentOffsets hist.windowSize = block.async.newHist.windowSize @@ -695,6 +703,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch seqExecute <- block } close(seqExecute) + hist.reset() }() var wg sync.WaitGroup @@ -718,6 +727,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch if debugDecoder { println("Async 2: new history") } + hist.reset() hist.windowSize = block.async.newHist.windowSize hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer if block.async.newHist.dict != nil { @@ -747,7 +757,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch if block.lowMem { block.dst = make([]byte, block.RLESize) } else { - block.dst = make([]byte, maxBlockSize) + block.dst = make([]byte, maxCompressedBlockSize) } } block.dst = block.dst[:block.RLESize] @@ -799,13 +809,14 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch if debugDecoder { println("decoder goroutines finished") } + hist.reset() }() + var hist history decodeStream: for { - var hist history var hasErr bool - + hist.reset() decodeBlock := func(block *blockDec) { if hasErr { if block != nil { @@ -840,15 +851,14 @@ decodeStream: if debugDecoder && err != nil { println("Frame decoder returned", err) } - if err == nil && frame.DictionaryID != nil { - dict, ok := d.dicts[*frame.DictionaryID] - if !ok { - err = ErrUnknownDictionary - } else { - frame.history.setDict(&dict) - } + if err == nil { + err = d.setDict(frame) } if err == nil && d.frame.WindowSize > d.o.maxWindowSize { + if debugDecoder { + println("decoder size exceeded, fws:", d.frame.WindowSize, "> mws:", d.o.maxWindowSize) + } + err = ErrDecoderSizeExceeded } if err != nil { @@ -890,18 +900,22 @@ decodeStream: println("next block returned error:", err) } dec.err = err - dec.checkCRC = nil + dec.hasCRC = false if dec.Last && frame.HasCheckSum && err == nil { crc, err := frame.rawInput.readSmall(4) - if err != nil { + if len(crc) < 4 { + if err == nil { + err = io.ErrUnexpectedEOF + + } println("CRC missing?", err) dec.err = err - } - var tmp [4]byte - copy(tmp[:], crc) - dec.checkCRC = tmp[:] - if debugDecoder { - println("found crc to check:", dec.checkCRC) + } else { + dec.checkCRC = binary.LittleEndian.Uint32(crc) + dec.hasCRC = true + if debugDecoder { + printf("found crc to check: %08x\n", dec.checkCRC) + } } } err = dec.err @@ -917,5 +931,23 @@ decodeStream: } close(seqDecode) wg.Wait() + hist.reset() d.frame.history.b = frameHistCache } + +func (d *Decoder) setDict(frame *frameDec) (err error) { + dict, ok := d.dicts[frame.DictionaryID] + if ok { + if debugDecoder { + println("setting dict", frame.DictionaryID) + } + frame.history.setDict(dict) + } else if frame.DictionaryID != 0 { + // A zero or missing dictionary id is ambiguous: + // either dictionary zero, or no dictionary. In particular, + // zstd --patch-from uses this id for the source file, + // so only return an error if the dictionary id is not zero. + err = ErrUnknownDictionary + } + return err +} diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go index c70e6fa0..07a90dd7 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go @@ -6,6 +6,8 @@ package zstd import ( "errors" + "fmt" + "math/bits" "runtime" ) @@ -14,20 +16,23 @@ type DOption func(*decoderOptions) error // options retains accumulated state of multiple options. type decoderOptions struct { - lowMem bool - concurrent int - maxDecodedSize uint64 - maxWindowSize uint64 - dicts []dict - ignoreChecksum bool + lowMem bool + concurrent int + maxDecodedSize uint64 + maxWindowSize uint64 + dicts []*dict + ignoreChecksum bool + limitToCap bool + decodeBufsBelow int } func (o *decoderOptions) setDefault() { *o = decoderOptions{ // use less ram: true for now, but may change. - lowMem: true, - concurrent: runtime.GOMAXPROCS(0), - maxWindowSize: MaxWindowSize, + lowMem: true, + concurrent: runtime.GOMAXPROCS(0), + maxWindowSize: MaxWindowSize, + decodeBufsBelow: 128 << 10, } if o.concurrent > 4 { o.concurrent = 4 @@ -82,7 +87,13 @@ func WithDecoderMaxMemory(n uint64) DOption { } // WithDecoderDicts allows to register one or more dictionaries for the decoder. -// If several dictionaries with the same ID is provided the last one will be used. +// +// Each slice in dict must be in the [dictionary format] produced by +// "zstd --train" from the Zstandard reference implementation. +// +// If several dictionaries with the same ID are provided, the last one will be used. +// +// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format func WithDecoderDicts(dicts ...[]byte) DOption { return func(o *decoderOptions) error { for _, b := range dicts { @@ -90,12 +101,24 @@ func WithDecoderDicts(dicts ...[]byte) DOption { if err != nil { return err } - o.dicts = append(o.dicts, *d) + o.dicts = append(o.dicts, d) } return nil } } +// WithEncoderDictRaw registers a dictionary that may be used by the decoder. +// The slice content can be arbitrary data. +func WithDecoderDictRaw(id uint32, content []byte) DOption { + return func(o *decoderOptions) error { + if bits.UintSize > 32 && uint(len(content)) > dictMaxLength { + return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content)) + } + o.dicts = append(o.dicts, &dict{id: id, content: content, offsets: [3]int{1, 4, 8}}) + return nil + } +} + // WithDecoderMaxWindow allows to set a maximum window size for decodes. // This allows rejecting packets that will cause big memory usage. // The Decoder will likely allocate more memory based on the WithDecoderLowmem setting. @@ -114,6 +137,29 @@ func WithDecoderMaxWindow(size uint64) DOption { } } +// WithDecodeAllCapLimit will limit DecodeAll to decoding cap(dst)-len(dst) bytes, +// or any size set in WithDecoderMaxMemory. +// This can be used to limit decoding to a specific maximum output size. +// Disabled by default. +func WithDecodeAllCapLimit(b bool) DOption { + return func(o *decoderOptions) error { + o.limitToCap = b + return nil + } +} + +// WithDecodeBuffersBelow will fully decode readers that have a +// `Bytes() []byte` and `Len() int` interface similar to bytes.Buffer. +// This typically uses less allocations but will have the full decompressed object in memory. +// Note that DecodeAllCapLimit will disable this, as well as giving a size of 0 or less. +// Default is 128KiB. +func WithDecodeBuffersBelow(size int) DOption { + return func(o *decoderOptions) error { + o.decodeBufsBelow = size + return nil + } +} + // IgnoreChecksum allows to forcibly ignore checksum checking. func IgnoreChecksum(b bool) DOption { return func(o *decoderOptions) error { diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go index a36ae83e..ca095145 100644 --- a/vendor/github.com/klauspost/compress/zstd/dict.go +++ b/vendor/github.com/klauspost/compress/zstd/dict.go @@ -1,7 +1,6 @@ package zstd import ( - "bytes" "encoding/binary" "errors" "fmt" @@ -20,7 +19,10 @@ type dict struct { content []byte } -var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec} +const dictMagic = "\x37\xa4\x30\xec" + +// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB. +const dictMaxLength = 1 << 31 // ID returns the dictionary id or 0 if d is nil. func (d *dict) ID() uint32 { @@ -30,14 +32,38 @@ func (d *dict) ID() uint32 { return d.id } -// DictContentSize returns the dictionary content size or 0 if d is nil. -func (d *dict) DictContentSize() int { +// ContentSize returns the dictionary content size or 0 if d is nil. +func (d *dict) ContentSize() int { if d == nil { return 0 } return len(d.content) } +// Content returns the dictionary content. +func (d *dict) Content() []byte { + if d == nil { + return nil + } + return d.content +} + +// Offsets returns the initial offsets. +func (d *dict) Offsets() [3]int { + if d == nil { + return [3]int{} + } + return d.offsets +} + +// LitEncoder returns the literal encoder. +func (d *dict) LitEncoder() *huff0.Scratch { + if d == nil { + return nil + } + return d.litEnc +} + // Load a dictionary as described in // https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format func loadDict(b []byte) (*dict, error) { @@ -50,7 +76,7 @@ func loadDict(b []byte) (*dict, error) { ofDec: sequenceDec{fse: &fseDecoder{}}, mlDec: sequenceDec{fse: &fseDecoder{}}, } - if !bytes.Equal(b[:4], dictMagic[:]) { + if string(b[:4]) != dictMagic { return nil, ErrMagicMismatch } d.id = binary.LittleEndian.Uint32(b[4:8]) @@ -62,7 +88,7 @@ func loadDict(b []byte) (*dict, error) { var err error d.litEnc, b, err = huff0.ReadTable(b[8:], nil) if err != nil { - return nil, err + return nil, fmt.Errorf("loading literal table: %w", err) } d.litEnc.Reuse = huff0.ReusePolicyMust @@ -120,3 +146,16 @@ func loadDict(b []byte) (*dict, error) { return &d, nil } + +// InspectDictionary loads a zstd dictionary and provides functions to inspect the content. +func InspectDictionary(b []byte) (interface { + ID() uint32 + ContentSize() int + Content() []byte + Offsets() [3]int + LitEncoder() *huff0.Scratch +}, error) { + initPredefined() + d, err := loadDict(b) + return d, err +} diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go index 15ae8ee8..e008b992 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_base.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go @@ -16,6 +16,7 @@ type fastBase struct { cur int32 // maximum offset. Should be at least 2x block size. maxMatchOff int32 + bufferReset int32 hist []byte crc *xxhash.Digest tmp [8]byte @@ -56,8 +57,8 @@ func (e *fastBase) Block() *blockEnc { } func (e *fastBase) addBlock(src []byte) int32 { - if debugAsserts && e.cur > bufferReset { - panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset)) + if debugAsserts && e.cur > e.bufferReset { + panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, e.bufferReset)) } // check if we have space already if len(e.hist)+len(src) > cap(e.hist) { @@ -126,24 +127,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 { panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize)) } } - a := src[s:] - b := src[t:] - b = b[:len(a)] - end := int32((len(a) >> 3) << 3) - for i := int32(0); i < end; i += 8 { - if diff := load6432(a, i) ^ load6432(b, i); diff != 0 { - return i + int32(bits.TrailingZeros64(diff)>>3) - } - } - - a = a[end:] - b = b[end:] - for i := range a { - if a[i] != b[i] { - return int32(i) + end - } - } - return int32(len(a)) + end + return int32(matchLen(src[s:], src[t:])) } // Reset the encoding table. @@ -165,13 +149,13 @@ func (e *fastBase) resetBase(d *dict, singleBlock bool) { if singleBlock { e.lowMem = true } - e.ensureHist(d.DictContentSize() + maxCompressedBlockSize) + e.ensureHist(d.ContentSize() + maxCompressedBlockSize) e.lowMem = low } // We offset current position so everything will be out of reach. // If above reset line, history will be purged. - if e.cur < bufferReset { + if e.cur < e.bufferReset { e.cur += e.maxMatchOff + int32(len(e.hist)) } e.hist = e.hist[:0] diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go index 96028ecd..830f5ba7 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_best.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go @@ -32,6 +32,7 @@ type match struct { length int32 rep int32 est int32 + _ [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes } const highScore = 25000 @@ -84,14 +85,10 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { - for i := range e.table[:] { - e.table[i] = prevEntry{} - } - for i := range e.longTable[:] { - e.longTable[i] = prevEntry{} - } + e.table = [bestShortTableSize]prevEntry{} + e.longTable = [bestLongTableSize]prevEntry{} e.cur = e.maxMatchOff break } @@ -192,8 +189,8 @@ encodeLoop: panic("offset0 was 0") } - bestOf := func(a, b match) match { - if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 { + bestOf := func(a, b *match) *match { + if a.est-b.est+(a.s-b.s)*bitsPerByte>>10 < 0 { return a } return b @@ -219,22 +216,26 @@ encodeLoop: return m } - best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)) - best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)) - best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)) + m1 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1) + m2 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1) + m3 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1) + m4 := matchAt(candidateS.prev-e.cur, s, uint32(cv), -1) + best := bestOf(bestOf(&m1, &m2), bestOf(&m3, &m4)) if canRepeat && best.length < goodEnough { cv32 := uint32(cv >> 8) spp := s + 1 - best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1)) - best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2)) - best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3)) + m1 := matchAt(spp-offset1, spp, cv32, 1) + m2 := matchAt(spp-offset2, spp, cv32, 2) + m3 := matchAt(spp-offset3, spp, cv32, 3) + best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3)) if best.length > 0 { cv32 = uint32(cv >> 24) spp += 2 - best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1)) - best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2)) - best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3)) + m1 := matchAt(spp-offset1, spp, cv32, 1) + m2 := matchAt(spp-offset2, spp, cv32, 2) + m3 := matchAt(spp-offset3, spp, cv32, 3) + best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3)) } } // Load next and check... @@ -261,26 +262,33 @@ encodeLoop: candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)] // Short at s+1 - best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)) + m1 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1) // Long at s+1, s+2 - best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)) - best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)) - best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)) - best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)) + m2 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1) + m3 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1) + m4 := matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1) + m5 := matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1) + best = bestOf(bestOf(bestOf(best, &m1), &m2), bestOf(bestOf(&m3, &m4), &m5)) if false { // Short at s+3. // Too often worse... - best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)) + m := matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1) + best = bestOf(best, &m) } // See if we can find a better match by checking where the current best ends. // Use that offset to see if we can find a better full match. if sAt := best.s + best.length; sAt < sLimit { nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen) candidateEnd := e.longTable[nextHashL] - if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 { - bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1)) - if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 { - bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1)) + // Start check at a fixed offset to allow for a few mismatches. + // For this compression level 2 yields the best results. + const skipBeginning = 2 + if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 { + m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1) + bestEnd := bestOf(best, &m) + if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 { + m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1) + bestEnd = bestOf(bestEnd, &m) } best = bestEnd } diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go index c769f694..8582f31a 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_better.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go @@ -62,14 +62,10 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { - for i := range e.table[:] { - e.table[i] = tableEntry{} - } - for i := range e.longTable[:] { - e.longTable[i] = prevEntry{} - } + e.table = [betterShortTableSize]tableEntry{} + e.longTable = [betterLongTableSize]prevEntry{} e.cur = e.maxMatchOff break } @@ -416,15 +412,23 @@ encodeLoop: // Try to find a better match by searching for a long match at the end of the current best match if s+matched < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is around 3 bytes, but depends on input. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 3 + nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen) - cv := load3232(src, s) + s2 := s + skipBeginning + cv := load3232(src, s2) candidateL := e.longTable[nextHashL] - coffsetL := candidateL.offset - e.cur - matched - if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + coffsetL := candidateL.offset - e.cur - matched + skipBeginning + if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { // Found a long match, at least 4 bytes. - matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4 + matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4 if matchedNext > matched { t = coffsetL + s = s2 matched = matchedNext if debugMatches { println("long match at end-of-match") @@ -434,12 +438,13 @@ encodeLoop: // Check prev long... if true { - coffsetL = candidateL.prev - e.cur - matched - if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + coffsetL = candidateL.prev - e.cur - matched + skipBeginning + if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { // Found a long match, at least 4 bytes. - matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4 + matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4 if matchedNext > matched { t = coffsetL + s = s2 matched = matchedNext if debugMatches { println("prev long match at end-of-match") @@ -578,7 +583,7 @@ func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { for i := range e.table[:] { e.table[i] = tableEntry{} diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go index 7ff0c64f..7d425109 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go @@ -44,14 +44,10 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { - for i := range e.table[:] { - e.table[i] = tableEntry{} - } - for i := range e.longTable[:] { - e.longTable[i] = tableEntry{} - } + e.table = [dFastShortTableSize]tableEntry{} + e.longTable = [dFastLongTableSize]tableEntry{} e.cur = e.maxMatchOff break } @@ -388,7 +384,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - if e.cur >= bufferReset { + if e.cur >= e.bufferReset { for i := range e.table[:] { e.table[i] = tableEntry{} } @@ -685,7 +681,7 @@ encodeLoop: } // We do not store history, so we must offset e.cur to avoid false matches for next user. - if e.cur < bufferReset { + if e.cur < e.bufferReset { e.cur += int32(len(src)) } } @@ -700,7 +696,7 @@ func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { for i := range e.table[:] { e.table[i] = tableEntry{} @@ -1103,7 +1099,8 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) { } if allDirty || dirtyShardCnt > dLongTableShardCnt/2 { - copy(e.longTable[:], e.dictLongTable) + //copy(e.longTable[:], e.dictLongTable) + e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable) for i := range e.longTableShardDirty { e.longTableShardDirty[i] = false } @@ -1114,7 +1111,9 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) { continue } - copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize]) + // copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize]) + *(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:]) + e.longTableShardDirty[i] = false } } diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go index f51ab529..315b1a8f 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go @@ -43,7 +43,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { for i := range e.table[:] { e.table[i] = tableEntry{} @@ -304,13 +304,13 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { minNonLiteralBlockSize = 1 + 1 + inputMargin ) if debugEncoder { - if len(src) > maxBlockSize { + if len(src) > maxCompressedBlockSize { panic("src too big") } } // Protect against e.cur wraparound. - if e.cur >= bufferReset { + if e.cur >= e.bufferReset { for i := range e.table[:] { e.table[i] = tableEntry{} } @@ -538,7 +538,7 @@ encodeLoop: println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) } // We do not store history, so we must offset e.cur to avoid false matches for next user. - if e.cur < bufferReset { + if e.cur < e.bufferReset { e.cur += int32(len(src)) } } @@ -555,11 +555,9 @@ func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) { return } // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { - for i := range e.table[:] { - e.table[i] = tableEntry{} - } + e.table = [tableSize]tableEntry{} e.cur = e.maxMatchOff break } @@ -871,7 +869,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) { const shardCnt = tableShardCnt const shardSize = tableShardSize if e.allDirty || dirtyShardCnt > shardCnt*4/6 { - copy(e.table[:], e.dictTable) + //copy(e.table[:], e.dictTable) + e.table = *(*[tableSize]tableEntry)(e.dictTable) for i := range e.tableShardDirty { e.tableShardDirty[i] = false } @@ -883,7 +882,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) { continue } - copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize]) + //copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize]) + *(*[shardSize]tableEntry)(e.table[i*shardSize:]) = *(*[shardSize]tableEntry)(e.dictTable[i*shardSize:]) e.tableShardDirty[i] = false } e.allDirty = false diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go index e6b1d01c..65c6c36d 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder.go @@ -8,6 +8,7 @@ import ( "crypto/rand" "fmt" "io" + "math" rdebug "runtime/debug" "sync" @@ -528,8 +529,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { // If a non-single block is needed the encoder will reset again. e.encoders <- enc }() - // Use single segments when above minimum window and below 1MB. - single := len(src) < 1<<20 && len(src) > MinWindowSize + // Use single segments when above minimum window and below window size. + single := len(src) <= e.o.windowSize && len(src) > MinWindowSize if e.o.single != nil { single = *e.o.single } @@ -639,3 +640,37 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { } return dst } + +// MaxEncodedSize returns the expected maximum +// size of an encoded block or stream. +func (e *Encoder) MaxEncodedSize(size int) int { + frameHeader := 4 + 2 // magic + frame header & window descriptor + if e.o.dict != nil { + frameHeader += 4 + } + // Frame content size: + if size < 256 { + frameHeader++ + } else if size < 65536+256 { + frameHeader += 2 + } else if size < math.MaxInt32 { + frameHeader += 4 + } else { + frameHeader += 8 + } + // Final crc + if e.o.crc { + frameHeader += 4 + } + + // Max overhead is 3 bytes/block. + // There cannot be 0 blocks. + blocks := (size + e.o.blockSize) / e.o.blockSize + + // Combine, add padding. + maxSz := frameHeader + 3*blocks + size + if e.o.pad > 1 { + maxSz += calcSkippableFrame(int64(maxSz), int64(e.o.pad)) + } + return maxSz +} diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go index 44d8dbd1..8e15be2f 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go @@ -3,6 +3,8 @@ package zstd import ( "errors" "fmt" + "math" + "math/bits" "runtime" "strings" ) @@ -47,22 +49,22 @@ func (o encoderOptions) encoder() encoder { switch o.level { case SpeedFastest: if o.dict != nil { - return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}} + return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}} } - return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}} + return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}} case SpeedDefault: if o.dict != nil { - return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}} + return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}} } - return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}} + return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}} case SpeedBetterCompression: if o.dict != nil { - return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}} + return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}} } - return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}} + return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}} case SpeedBestCompression: - return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}} + return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}} } panic("unknown compression level") } @@ -283,7 +285,7 @@ func WithNoEntropyCompression(b bool) EOption { // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range. // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB. // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations. -// If this is not specified, block encodes will automatically choose this based on the input size. +// If this is not specified, block encodes will automatically choose this based on the input size and the window size. // This setting has no effect on streamed encodes. func WithSingleSegment(b bool) EOption { return func(o *encoderOptions) error { @@ -304,7 +306,13 @@ func WithLowerEncoderMem(b bool) EOption { } // WithEncoderDict allows to register a dictionary that will be used for the encode. +// +// The slice dict must be in the [dictionary format] produced by +// "zstd --train" from the Zstandard reference implementation. +// // The encoder *may* choose to use no dictionary instead for certain payloads. +// +// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format func WithEncoderDict(dict []byte) EOption { return func(o *encoderOptions) error { d, err := loadDict(dict) @@ -315,3 +323,17 @@ func WithEncoderDict(dict []byte) EOption { return nil } } + +// WithEncoderDictRaw registers a dictionary that may be used by the encoder. +// +// The slice content may contain arbitrary data. It will be used as an initial +// history. +func WithEncoderDictRaw(id uint32, content []byte) EOption { + return func(o *encoderOptions) error { + if bits.UintSize > 32 && uint(len(content)) > dictMaxLength { + return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content)) + } + o.dict = &dict{id: id, content: content, offsets: [3]int{1, 4, 8}} + return nil + } +} diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go index fa0a633f..d8e8a05b 100644 --- a/vendor/github.com/klauspost/compress/zstd/framedec.go +++ b/vendor/github.com/klauspost/compress/zstd/framedec.go @@ -5,7 +5,7 @@ package zstd import ( - "bytes" + "encoding/binary" "encoding/hex" "errors" "io" @@ -29,7 +29,7 @@ type frameDec struct { FrameContentSize uint64 - DictionaryID *uint32 + DictionaryID uint32 HasCheckSum bool SingleSegment bool } @@ -43,9 +43,9 @@ const ( MaxWindowSize = 1 << 29 ) -var ( - frameMagic = []byte{0x28, 0xb5, 0x2f, 0xfd} - skippableFrameMagic = []byte{0x2a, 0x4d, 0x18} +const ( + frameMagic = "\x28\xb5\x2f\xfd" + skippableFrameMagic = "\x2a\x4d\x18" ) func newFrameDec(o decoderOptions) *frameDec { @@ -89,9 +89,9 @@ func (d *frameDec) reset(br byteBuffer) error { copy(signature[1:], b) } - if !bytes.Equal(signature[1:4], skippableFrameMagic) || signature[0]&0xf0 != 0x50 { + if string(signature[1:4]) != skippableFrameMagic || signature[0]&0xf0 != 0x50 { if debugDecoder { - println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString(skippableFrameMagic)) + println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString([]byte(skippableFrameMagic))) } // Break if not skippable frame. break @@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error { } n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24) println("Skipping frame with", n, "bytes.") - err = br.skipN(int(n)) + err = br.skipN(int64(n)) if err != nil { if debugDecoder { println("Reading discarded frame", err) @@ -114,9 +114,9 @@ func (d *frameDec) reset(br byteBuffer) error { return err } } - if !bytes.Equal(signature[:], frameMagic) { + if string(signature[:]) != frameMagic { if debugDecoder { - println("Got magic numbers: ", signature, "want:", frameMagic) + println("Got magic numbers: ", signature, "want:", []byte(frameMagic)) } return ErrMagicMismatch } @@ -155,7 +155,7 @@ func (d *frameDec) reset(br byteBuffer) error { // Read Dictionary_ID // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id - d.DictionaryID = nil + d.DictionaryID = 0 if size := fhd & 3; size != 0 { if size == 3 { size = 4 @@ -167,7 +167,7 @@ func (d *frameDec) reset(br byteBuffer) error { return err } var id uint32 - switch size { + switch len(b) { case 1: id = uint32(b[0]) case 2: @@ -178,11 +178,7 @@ func (d *frameDec) reset(br byteBuffer) error { if debugDecoder { println("Dict size", size, "ID:", id) } - if id > 0 { - // ID 0 means "sorry, no dictionary anyway". - // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format - d.DictionaryID = &id - } + d.DictionaryID = id } // Read Frame_Content_Size @@ -204,7 +200,7 @@ func (d *frameDec) reset(br byteBuffer) error { println("Reading Frame content", err) return err } - switch fcsSize { + switch len(b) { case 1: d.FrameContentSize = uint64(b[0]) case 2: @@ -231,20 +227,27 @@ func (d *frameDec) reset(br byteBuffer) error { d.crc.Reset() } + if d.WindowSize > d.o.maxWindowSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrWindowSizeExceeded + } + if d.WindowSize == 0 && d.SingleSegment { // We may not need window in this case. d.WindowSize = d.FrameContentSize if d.WindowSize < MinWindowSize { d.WindowSize = MinWindowSize } - } - - if d.WindowSize > uint64(d.o.maxWindowSize) { - if debugDecoder { - printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + if d.WindowSize > d.o.maxDecodedSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrDecoderSizeExceeded } - return ErrWindowSizeExceeded } + // The minimum Window_Size is 1 KB. if d.WindowSize < MinWindowSize { if debugDecoder { @@ -254,11 +257,16 @@ func (d *frameDec) reset(br byteBuffer) error { } d.history.windowSize = int(d.WindowSize) if !d.o.lowMem || d.history.windowSize < maxBlockSize { - // Alloc 2x window size if not low-mem, or very small window size. + // Alloc 2x window size if not low-mem, or window size below 2MB. d.history.allocFrameBuffer = d.history.windowSize * 2 } else { - // Alloc with one additional block - d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize + if d.o.lowMem { + // Alloc with 1MB extra. + d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize/2 + } else { + // Alloc with 2MB extra. + d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize + } } if debugDecoder { @@ -293,7 +301,7 @@ func (d *frameDec) checkCRC() error { } // We can overwrite upper tmp now - want, err := d.rawInput.readSmall(4) + buf, err := d.rawInput.readSmall(4) if err != nil { println("CRC missing?", err) return err @@ -303,22 +311,17 @@ func (d *frameDec) checkCRC() error { return nil } - var tmp [4]byte - got := d.crc.Sum64() - // Flip to match file order. - tmp[0] = byte(got >> 0) - tmp[1] = byte(got >> 8) - tmp[2] = byte(got >> 16) - tmp[3] = byte(got >> 24) + want := binary.LittleEndian.Uint32(buf[:4]) + got := uint32(d.crc.Sum64()) - if !bytes.Equal(tmp[:], want) { + if got != want { if debugDecoder { - println("CRC Check Failed:", tmp[:], "!=", want) + printf("CRC check failed: got %08x, want %08x\n", got, want) } return ErrCRCMismatch } if debugDecoder { - println("CRC ok", tmp[:]) + printf("CRC ok %08x\n", got) } return nil } @@ -336,7 +339,7 @@ func (d *frameDec) consumeCRC() error { return nil } -// runDecoder will create a sync decoder that will decode a block of data. +// runDecoder will run the decoder for the remainder of the frame. func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) { saved := d.history.b @@ -346,12 +349,23 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) { // Store input length, so we only check new data. crcStart := len(dst) d.history.decoders.maxSyncLen = 0 + if d.o.limitToCap { + d.history.decoders.maxSyncLen = uint64(cap(dst) - len(dst)) + } if d.FrameContentSize != fcsUnknown { - d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst)) + if !d.o.limitToCap || d.FrameContentSize+uint64(len(dst)) < d.history.decoders.maxSyncLen { + d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst)) + } if d.history.decoders.maxSyncLen > d.o.maxDecodedSize { + if debugDecoder { + println("maxSyncLen:", d.history.decoders.maxSyncLen, "> maxDecodedSize:", d.o.maxDecodedSize) + } return dst, ErrDecoderSizeExceeded } - if uint64(cap(dst)) < d.history.decoders.maxSyncLen { + if debugDecoder { + println("maxSyncLen:", d.history.decoders.maxSyncLen) + } + if !d.o.limitToCap && uint64(cap(dst)) < d.history.decoders.maxSyncLen { // Alloc for output dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc) copy(dst2, dst) @@ -371,7 +385,13 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) { if err != nil { break } - if uint64(len(d.history.b)) > d.o.maxDecodedSize { + if uint64(len(d.history.b)-crcStart) > d.o.maxDecodedSize { + println("runDecoder: maxDecodedSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.o.maxDecodedSize) + err = ErrDecoderSizeExceeded + break + } + if d.o.limitToCap && len(d.history.b) > cap(dst) { + println("runDecoder: cap exceeded", uint64(len(d.history.b)), ">", cap(dst)) err = ErrDecoderSizeExceeded break } diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go index e74df436..d04a829b 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go @@ -21,7 +21,8 @@ type buildDtableAsmContext struct { // buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable. // Function returns non-zero exit code on error. -// go:noescape +// +//go:noescape func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int // please keep in sync with _generate/gen_fse.go @@ -34,8 +35,8 @@ const ( // buildDtable will build the decoding table. func (s *fseDecoder) buildDtable() error { ctx := buildDtableAsmContext{ - stateTable: (*uint16)(&s.stateTable[0]), - norm: (*int16)(&s.norm[0]), + stateTable: &s.stateTable[0], + norm: &s.norm[0], dt: (*uint64)(&s.dt[0]), } code := buildDtable_asm(s, &ctx) diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s index da32b442..bcde3986 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s @@ -1,7 +1,6 @@ // Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm -// +build !appengine,!noasm,gc,!noasm // func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int TEXT ·buildDtable_asm(SB), $0-24 diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go index 28b40153..09164856 100644 --- a/vendor/github.com/klauspost/compress/zstd/history.go +++ b/vendor/github.com/klauspost/compress/zstd/history.go @@ -37,24 +37,21 @@ func (h *history) reset() { h.ignoreBuffer = 0 h.error = false h.recentOffsets = [3]int{1, 4, 8} - if f := h.decoders.litLengths.fse; f != nil && !f.preDefined { - fseDecoderPool.Put(f) - } - if f := h.decoders.offsets.fse; f != nil && !f.preDefined { - fseDecoderPool.Put(f) - } - if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined { - fseDecoderPool.Put(f) - } + h.decoders.freeDecoders() h.decoders = sequenceDecs{br: h.decoders.br} + h.freeHuffDecoder() + h.huffTree = nil + h.dict = nil + //printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b)) +} + +func (h *history) freeHuffDecoder() { if h.huffTree != nil { if h.dict == nil || h.dict.litEnc != h.huffTree { huffDecoderPool.Put(h.huffTree) + h.huffTree = nil } } - h.huffTree = nil - h.dict = nil - //printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b)) } func (h *history) setDict(dict *dict) { diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md index 69aa3bb5..777290d4 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md @@ -2,12 +2,7 @@ VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package. - -[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash) -[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash) - -xxhash is a Go implementation of the 64-bit -[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a +xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a high-quality hashing algorithm that is much faster than anything in the Go standard library. @@ -28,31 +23,49 @@ func (*Digest) WriteString(string) (int, error) func (*Digest) Sum64() uint64 ``` -This implementation provides a fast pure-Go implementation and an even faster -assembly implementation for amd64. +The package is written with optimized pure Go and also contains even faster +assembly implementations for amd64 and arm64. If desired, the `purego` build tag +opts into using the Go code even on those architectures. + +[xxHash]: http://cyan4973.github.io/xxHash/ + +## Compatibility + +This package is in a module and the latest code is in version 2 of the module. +You need a version of Go with at least "minimal module compatibility" to use +github.com/cespare/xxhash/v2: + +* 1.9.7+ for Go 1.9 +* 1.10.3+ for Go 1.10 +* Go 1.11 or later + +I recommend using the latest release of Go. ## Benchmarks Here are some quick benchmarks comparing the pure-Go and assembly implementations of Sum64. -| input size | purego | asm | -| --- | --- | --- | -| 5 B | 979.66 MB/s | 1291.17 MB/s | -| 100 B | 7475.26 MB/s | 7973.40 MB/s | -| 4 KB | 17573.46 MB/s | 17602.65 MB/s | -| 10 MB | 17131.46 MB/s | 17142.16 MB/s | +| input size | purego | asm | +| ---------- | --------- | --------- | +| 4 B | 1.3 GB/s | 1.2 GB/s | +| 16 B | 2.9 GB/s | 3.5 GB/s | +| 100 B | 6.9 GB/s | 8.1 GB/s | +| 4 KB | 11.7 GB/s | 16.7 GB/s | +| 10 MB | 12.0 GB/s | 17.3 GB/s | -These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using -the following commands under Go 1.11.2: +These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C +CPU using the following commands under Go 1.19.2: ``` -$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes' -$ go test -benchtime 10s -bench '/xxhash,direct,bytes' +benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$') +benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$') ``` ## Projects using this package - [InfluxDB](https://github.com/influxdata/influxdb) - [Prometheus](https://github.com/prometheus/prometheus) +- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) - [FreeCache](https://github.com/coocood/freecache) +- [FastCache](https://github.com/VictoriaMetrics/fastcache) diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go index 2c112a0a..fc40c820 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go @@ -18,19 +18,11 @@ const ( prime5 uint64 = 2870177450012600261 ) -// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where -// possible in the Go code is worth a small (but measurable) performance boost -// by avoiding some MOVQs. Vars are needed for the asm and also are useful for -// convenience in the Go code in a few places where we need to intentionally -// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the -// result overflows a uint64). -var ( - prime1v = prime1 - prime2v = prime2 - prime3v = prime3 - prime4v = prime4 - prime5v = prime5 -) +// Store the primes in an array as well. +// +// The consts are used when possible in Go code to avoid MOVs but we need a +// contiguous array of the assembly code. +var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5} // Digest implements hash.Hash64. type Digest struct { @@ -52,10 +44,10 @@ func New() *Digest { // Reset clears the Digest's state so that it can be reused. func (d *Digest) Reset() { - d.v1 = prime1v + prime2 + d.v1 = primes[0] + prime2 d.v2 = prime2 d.v3 = 0 - d.v4 = -prime1v + d.v4 = -primes[0] d.total = 0 d.n = 0 } @@ -71,21 +63,23 @@ func (d *Digest) Write(b []byte) (n int, err error) { n = len(b) d.total += uint64(n) + memleft := d.mem[d.n&(len(d.mem)-1):] + if d.n+n < 32 { // This new data doesn't even fill the current block. - copy(d.mem[d.n:], b) + copy(memleft, b) d.n += n return } if d.n > 0 { // Finish off the partial block. - copy(d.mem[d.n:], b) + c := copy(memleft, b) d.v1 = round(d.v1, u64(d.mem[0:8])) d.v2 = round(d.v2, u64(d.mem[8:16])) d.v3 = round(d.v3, u64(d.mem[16:24])) d.v4 = round(d.v4, u64(d.mem[24:32])) - b = b[32-d.n:] + b = b[c:] d.n = 0 } @@ -135,21 +129,20 @@ func (d *Digest) Sum64() uint64 { h += d.total - i, end := 0, d.n - for ; i+8 <= end; i += 8 { - k1 := round(0, u64(d.mem[i:i+8])) + b := d.mem[:d.n&(len(d.mem)-1)] + for ; len(b) >= 8; b = b[8:] { + k1 := round(0, u64(b[:8])) h ^= k1 h = rol27(h)*prime1 + prime4 } - if i+4 <= end { - h ^= uint64(u32(d.mem[i:i+4])) * prime1 + if len(b) >= 4 { + h ^= uint64(u32(b[:4])) * prime1 h = rol23(h)*prime2 + prime3 - i += 4 + b = b[4:] } - for i < end { - h ^= uint64(d.mem[i]) * prime5 + for ; len(b) > 0; b = b[1:] { + h ^= uint64(b[0]) * prime5 h = rol11(h) * prime1 - i++ } h ^= h >> 33 diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s index cea17856..ddb63aa9 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s @@ -1,3 +1,4 @@ +//go:build !appengine && gc && !purego && !noasm // +build !appengine // +build gc // +build !purego @@ -5,212 +6,205 @@ #include "textflag.h" -// Register allocation: -// AX h -// SI pointer to advance through b -// DX n -// BX loop end -// R8 v1, k1 -// R9 v2 -// R10 v3 -// R11 v4 -// R12 tmp -// R13 prime1v -// R14 prime2v -// DI prime4v - -// round reads from and advances the buffer pointer in SI. -// It assumes that R13 has prime1v and R14 has prime2v. -#define round(r) \ - MOVQ (SI), R12 \ - ADDQ $8, SI \ - IMULQ R14, R12 \ - ADDQ R12, r \ - ROLQ $31, r \ - IMULQ R13, r - -// mergeRound applies a merge round on the two registers acc and val. -// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v. -#define mergeRound(acc, val) \ - IMULQ R14, val \ - ROLQ $31, val \ - IMULQ R13, val \ - XORQ val, acc \ - IMULQ R13, acc \ - ADDQ DI, acc +// Registers: +#define h AX +#define d AX +#define p SI // pointer to advance through b +#define n DX +#define end BX // loop end +#define v1 R8 +#define v2 R9 +#define v3 R10 +#define v4 R11 +#define x R12 +#define prime1 R13 +#define prime2 R14 +#define prime4 DI + +#define round(acc, x) \ + IMULQ prime2, x \ + ADDQ x, acc \ + ROLQ $31, acc \ + IMULQ prime1, acc + +// round0 performs the operation x = round(0, x). +#define round0(x) \ + IMULQ prime2, x \ + ROLQ $31, x \ + IMULQ prime1, x + +// mergeRound applies a merge round on the two registers acc and x. +// It assumes that prime1, prime2, and prime4 have been loaded. +#define mergeRound(acc, x) \ + round0(x) \ + XORQ x, acc \ + IMULQ prime1, acc \ + ADDQ prime4, acc + +// blockLoop processes as many 32-byte blocks as possible, +// updating v1, v2, v3, and v4. It assumes that there is at least one block +// to process. +#define blockLoop() \ +loop: \ + MOVQ +0(p), x \ + round(v1, x) \ + MOVQ +8(p), x \ + round(v2, x) \ + MOVQ +16(p), x \ + round(v3, x) \ + MOVQ +24(p), x \ + round(v4, x) \ + ADDQ $32, p \ + CMPQ p, end \ + JLE loop // func Sum64(b []byte) uint64 -TEXT ·Sum64(SB), NOSPLIT, $0-32 +TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 // Load fixed primes. - MOVQ ·prime1v(SB), R13 - MOVQ ·prime2v(SB), R14 - MOVQ ·prime4v(SB), DI + MOVQ ·primes+0(SB), prime1 + MOVQ ·primes+8(SB), prime2 + MOVQ ·primes+24(SB), prime4 // Load slice. - MOVQ b_base+0(FP), SI - MOVQ b_len+8(FP), DX - LEAQ (SI)(DX*1), BX + MOVQ b_base+0(FP), p + MOVQ b_len+8(FP), n + LEAQ (p)(n*1), end // The first loop limit will be len(b)-32. - SUBQ $32, BX + SUBQ $32, end // Check whether we have at least one block. - CMPQ DX, $32 + CMPQ n, $32 JLT noBlocks // Set up initial state (v1, v2, v3, v4). - MOVQ R13, R8 - ADDQ R14, R8 - MOVQ R14, R9 - XORQ R10, R10 - XORQ R11, R11 - SUBQ R13, R11 - - // Loop until SI > BX. -blockLoop: - round(R8) - round(R9) - round(R10) - round(R11) - - CMPQ SI, BX - JLE blockLoop - - MOVQ R8, AX - ROLQ $1, AX - MOVQ R9, R12 - ROLQ $7, R12 - ADDQ R12, AX - MOVQ R10, R12 - ROLQ $12, R12 - ADDQ R12, AX - MOVQ R11, R12 - ROLQ $18, R12 - ADDQ R12, AX - - mergeRound(AX, R8) - mergeRound(AX, R9) - mergeRound(AX, R10) - mergeRound(AX, R11) + MOVQ prime1, v1 + ADDQ prime2, v1 + MOVQ prime2, v2 + XORQ v3, v3 + XORQ v4, v4 + SUBQ prime1, v4 + + blockLoop() + + MOVQ v1, h + ROLQ $1, h + MOVQ v2, x + ROLQ $7, x + ADDQ x, h + MOVQ v3, x + ROLQ $12, x + ADDQ x, h + MOVQ v4, x + ROLQ $18, x + ADDQ x, h + + mergeRound(h, v1) + mergeRound(h, v2) + mergeRound(h, v3) + mergeRound(h, v4) JMP afterBlocks noBlocks: - MOVQ ·prime5v(SB), AX + MOVQ ·primes+32(SB), h afterBlocks: - ADDQ DX, AX - - // Right now BX has len(b)-32, and we want to loop until SI > len(b)-8. - ADDQ $24, BX - - CMPQ SI, BX - JG fourByte - -wordLoop: - // Calculate k1. - MOVQ (SI), R8 - ADDQ $8, SI - IMULQ R14, R8 - ROLQ $31, R8 - IMULQ R13, R8 - - XORQ R8, AX - ROLQ $27, AX - IMULQ R13, AX - ADDQ DI, AX - - CMPQ SI, BX - JLE wordLoop - -fourByte: - ADDQ $4, BX - CMPQ SI, BX - JG singles - - MOVL (SI), R8 - ADDQ $4, SI - IMULQ R13, R8 - XORQ R8, AX - - ROLQ $23, AX - IMULQ R14, AX - ADDQ ·prime3v(SB), AX - -singles: - ADDQ $4, BX - CMPQ SI, BX + ADDQ n, h + + ADDQ $24, end + CMPQ p, end + JG try4 + +loop8: + MOVQ (p), x + ADDQ $8, p + round0(x) + XORQ x, h + ROLQ $27, h + IMULQ prime1, h + ADDQ prime4, h + + CMPQ p, end + JLE loop8 + +try4: + ADDQ $4, end + CMPQ p, end + JG try1 + + MOVL (p), x + ADDQ $4, p + IMULQ prime1, x + XORQ x, h + + ROLQ $23, h + IMULQ prime2, h + ADDQ ·primes+16(SB), h + +try1: + ADDQ $4, end + CMPQ p, end JGE finalize -singlesLoop: - MOVBQZX (SI), R12 - ADDQ $1, SI - IMULQ ·prime5v(SB), R12 - XORQ R12, AX +loop1: + MOVBQZX (p), x + ADDQ $1, p + IMULQ ·primes+32(SB), x + XORQ x, h + ROLQ $11, h + IMULQ prime1, h - ROLQ $11, AX - IMULQ R13, AX - - CMPQ SI, BX - JL singlesLoop + CMPQ p, end + JL loop1 finalize: - MOVQ AX, R12 - SHRQ $33, R12 - XORQ R12, AX - IMULQ R14, AX - MOVQ AX, R12 - SHRQ $29, R12 - XORQ R12, AX - IMULQ ·prime3v(SB), AX - MOVQ AX, R12 - SHRQ $32, R12 - XORQ R12, AX - - MOVQ AX, ret+24(FP) + MOVQ h, x + SHRQ $33, x + XORQ x, h + IMULQ prime2, h + MOVQ h, x + SHRQ $29, x + XORQ x, h + IMULQ ·primes+16(SB), h + MOVQ h, x + SHRQ $32, x + XORQ x, h + + MOVQ h, ret+24(FP) RET -// writeBlocks uses the same registers as above except that it uses AX to store -// the d pointer. - // func writeBlocks(d *Digest, b []byte) int -TEXT ·writeBlocks(SB), NOSPLIT, $0-40 +TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 // Load fixed primes needed for round. - MOVQ ·prime1v(SB), R13 - MOVQ ·prime2v(SB), R14 + MOVQ ·primes+0(SB), prime1 + MOVQ ·primes+8(SB), prime2 // Load slice. - MOVQ b_base+8(FP), SI - MOVQ b_len+16(FP), DX - LEAQ (SI)(DX*1), BX - SUBQ $32, BX + MOVQ b_base+8(FP), p + MOVQ b_len+16(FP), n + LEAQ (p)(n*1), end + SUBQ $32, end // Load vN from d. - MOVQ d+0(FP), AX - MOVQ 0(AX), R8 // v1 - MOVQ 8(AX), R9 // v2 - MOVQ 16(AX), R10 // v3 - MOVQ 24(AX), R11 // v4 + MOVQ s+0(FP), d + MOVQ 0(d), v1 + MOVQ 8(d), v2 + MOVQ 16(d), v3 + MOVQ 24(d), v4 // We don't need to check the loop condition here; this function is // always called with at least one block of data to process. -blockLoop: - round(R8) - round(R9) - round(R10) - round(R11) - - CMPQ SI, BX - JLE blockLoop + blockLoop() // Copy vN back to d. - MOVQ R8, 0(AX) - MOVQ R9, 8(AX) - MOVQ R10, 16(AX) - MOVQ R11, 24(AX) - - // The number of bytes written is SI minus the old base pointer. - SUBQ b_base+8(FP), SI - MOVQ SI, ret+32(FP) + MOVQ v1, 0(d) + MOVQ v2, 8(d) + MOVQ v3, 16(d) + MOVQ v4, 24(d) + + // The number of bytes written is p minus the old base pointer. + SUBQ b_base+8(FP), p + MOVQ p, ret+32(FP) RET diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s index 4d64a17d..17901e08 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s @@ -1,13 +1,17 @@ -// +build gc,!purego,!noasm +//go:build !appengine && gc && !purego && !noasm +// +build !appengine +// +build gc +// +build !purego +// +build !noasm #include "textflag.h" -// Register allocation. +// Registers: #define digest R1 -#define h R2 // Return value. -#define p R3 // Input pointer. -#define len R4 -#define nblocks R5 // len / 32. +#define h R2 // return value +#define p R3 // input pointer +#define n R4 // input length +#define nblocks R5 // n / 32 #define prime1 R7 #define prime2 R8 #define prime3 R9 @@ -25,60 +29,52 @@ #define round(acc, x) \ MADD prime2, acc, x, acc \ ROR $64-31, acc \ - MUL prime1, acc \ + MUL prime1, acc -// x = round(0, x). +// round0 performs the operation x = round(0, x). #define round0(x) \ MUL prime2, x \ ROR $64-31, x \ - MUL prime1, x \ - -#define mergeRound(x) \ - round0(x) \ - EOR x, h \ - MADD h, prime4, prime1, h \ - -// Update v[1-4] with 32-byte blocks. Assumes len >= 32. -#define blocksLoop() \ - LSR $5, len, nblocks \ - PCALIGN $16 \ - loop: \ - LDP.P 32(p), (x1, x2) \ - round(v1, x1) \ - LDP -16(p), (x3, x4) \ - round(v2, x2) \ - SUB $1, nblocks \ - round(v3, x3) \ - round(v4, x4) \ - CBNZ nblocks, loop \ - -// The primes are repeated here to ensure that they're stored -// in a contiguous array, so we can load them with LDP. -DATA primes<> +0(SB)/8, $11400714785074694791 -DATA primes<> +8(SB)/8, $14029467366897019727 -DATA primes<>+16(SB)/8, $1609587929392839161 -DATA primes<>+24(SB)/8, $9650029242287828579 -DATA primes<>+32(SB)/8, $2870177450012600261 -GLOBL primes<>(SB), NOPTR+RODATA, $40 + MUL prime1, x + +#define mergeRound(acc, x) \ + round0(x) \ + EOR x, acc \ + MADD acc, prime4, prime1, acc + +// blockLoop processes as many 32-byte blocks as possible, +// updating v1, v2, v3, and v4. It assumes that n >= 32. +#define blockLoop() \ + LSR $5, n, nblocks \ + PCALIGN $16 \ + loop: \ + LDP.P 16(p), (x1, x2) \ + LDP.P 16(p), (x3, x4) \ + round(v1, x1) \ + round(v2, x2) \ + round(v3, x3) \ + round(v4, x4) \ + SUB $1, nblocks \ + CBNZ nblocks, loop // func Sum64(b []byte) uint64 -TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 - LDP b_base+0(FP), (p, len) +TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 + LDP b_base+0(FP), (p, n) - LDP primes<> +0(SB), (prime1, prime2) - LDP primes<>+16(SB), (prime3, prime4) - MOVD primes<>+32(SB), prime5 + LDP ·primes+0(SB), (prime1, prime2) + LDP ·primes+16(SB), (prime3, prime4) + MOVD ·primes+32(SB), prime5 - CMP $32, len - CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 } - BLO afterLoop + CMP $32, n + CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 } + BLT afterLoop ADD prime1, prime2, v1 MOVD prime2, v2 MOVD $0, v3 NEG prime1, v4 - blocksLoop() + blockLoop() ROR $64-1, v1, x1 ROR $64-7, v2, x2 @@ -88,71 +84,75 @@ TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 ADD x3, x4 ADD x2, x4, h - mergeRound(v1) - mergeRound(v2) - mergeRound(v3) - mergeRound(v4) + mergeRound(h, v1) + mergeRound(h, v2) + mergeRound(h, v3) + mergeRound(h, v4) afterLoop: - ADD len, h + ADD n, h - TBZ $4, len, try8 + TBZ $4, n, try8 LDP.P 16(p), (x1, x2) round0(x1) + + // NOTE: here and below, sequencing the EOR after the ROR (using a + // rotated register) is worth a small but measurable speedup for small + // inputs. ROR $64-27, h EOR x1 @> 64-27, h, h MADD h, prime4, prime1, h round0(x2) ROR $64-27, h - EOR x2 @> 64-27, h + EOR x2 @> 64-27, h, h MADD h, prime4, prime1, h try8: - TBZ $3, len, try4 + TBZ $3, n, try4 MOVD.P 8(p), x1 round0(x1) ROR $64-27, h - EOR x1 @> 64-27, h + EOR x1 @> 64-27, h, h MADD h, prime4, prime1, h try4: - TBZ $2, len, try2 + TBZ $2, n, try2 MOVWU.P 4(p), x2 MUL prime1, x2 ROR $64-23, h - EOR x2 @> 64-23, h + EOR x2 @> 64-23, h, h MADD h, prime3, prime2, h try2: - TBZ $1, len, try1 + TBZ $1, n, try1 MOVHU.P 2(p), x3 AND $255, x3, x1 LSR $8, x3, x2 MUL prime5, x1 ROR $64-11, h - EOR x1 @> 64-11, h + EOR x1 @> 64-11, h, h MUL prime1, h MUL prime5, x2 ROR $64-11, h - EOR x2 @> 64-11, h + EOR x2 @> 64-11, h, h MUL prime1, h try1: - TBZ $0, len, end + TBZ $0, n, finalize MOVBU (p), x4 MUL prime5, x4 ROR $64-11, h - EOR x4 @> 64-11, h + EOR x4 @> 64-11, h, h MUL prime1, h -end: +finalize: EOR h >> 33, h MUL prime2, h EOR h >> 29, h @@ -163,24 +163,22 @@ end: RET // func writeBlocks(d *Digest, b []byte) int -// -// Assumes len(b) >= 32. -TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 - LDP primes<>(SB), (prime1, prime2) +TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 + LDP ·primes+0(SB), (prime1, prime2) // Load state. Assume v[1-4] are stored contiguously. MOVD d+0(FP), digest LDP 0(digest), (v1, v2) LDP 16(digest), (v3, v4) - LDP b_base+8(FP), (p, len) + LDP b_base+8(FP), (p, n) - blocksLoop() + blockLoop() // Store updated state. STP (v1, v2), 0(digest) STP (v3, v4), 16(digest) - BIC $31, len - MOVD len, ret+32(FP) + BIC $31, n + MOVD n, ret+32(FP) RET diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go index 1a1fac9c..d4221edf 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go @@ -13,4 +13,4 @@ package xxhash func Sum64(b []byte) uint64 //go:noescape -func writeBlocks(d *Digest, b []byte) int +func writeBlocks(s *Digest, b []byte) int diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go index 209cb4a9..0be16cef 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go @@ -15,10 +15,10 @@ func Sum64(b []byte) uint64 { var h uint64 if n >= 32 { - v1 := prime1v + prime2 + v1 := primes[0] + prime2 v2 := prime2 v3 := uint64(0) - v4 := -prime1v + v4 := -primes[0] for len(b) >= 32 { v1 = round(v1, u64(b[0:8:len(b)])) v2 = round(v2, u64(b[8:16:len(b)])) @@ -37,19 +37,18 @@ func Sum64(b []byte) uint64 { h += uint64(n) - i, end := 0, len(b) - for ; i+8 <= end; i += 8 { - k1 := round(0, u64(b[i:i+8:len(b)])) + for ; len(b) >= 8; b = b[8:] { + k1 := round(0, u64(b[:8])) h ^= k1 h = rol27(h)*prime1 + prime4 } - if i+4 <= end { - h ^= uint64(u32(b[i:i+4:len(b)])) * prime1 + if len(b) >= 4 { + h ^= uint64(u32(b[:4])) * prime1 h = rol23(h)*prime2 + prime3 - i += 4 + b = b[4:] } - for ; i < end; i++ { - h ^= uint64(b[i]) * prime5 + for ; len(b) > 0; b = b[1:] { + h ^= uint64(b[0]) * prime5 h = rol11(h) * prime1 } diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go index df044720..f833d154 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go @@ -99,6 +99,21 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro return nil } +func (s *sequenceDecs) freeDecoders() { + if f := s.litLengths.fse; f != nil && !f.preDefined { + fseDecoderPool.Put(f) + s.litLengths.fse = nil + } + if f := s.offsets.fse; f != nil && !f.preDefined { + fseDecoderPool.Put(f) + s.offsets.fse = nil + } + if f := s.matchLengths.fse; f != nil && !f.preDefined { + fseDecoderPool.Put(f) + s.matchLengths.fse = nil + } +} + // execute will execute the decoded sequence with the provided history. // The sequence must be evaluated before being sent. func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error { @@ -299,7 +314,10 @@ func (s *sequenceDecs) decodeSync(hist []byte) error { } size := ll + ml + len(out) if size-startSize > maxBlockSize { - return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize) + if size-startSize == 424242 { + panic("here") + } + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) } if size > cap(out) { // Not enough size, which can happen under high volume block streaming conditions @@ -411,7 +429,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error { // Check if space for literals if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize { - return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize) + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) } // Add final literals diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go index 847b322a..191384ad 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go @@ -32,18 +32,22 @@ type decodeSyncAsmContext struct { // sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm. // // Please refer to seqdec_generic.go for the reference implementation. +// //go:noescape func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int // sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions. +// //go:noescape func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int // sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer. +// //go:noescape func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int // sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer. +// //go:noescape func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int @@ -55,16 +59,22 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize { return false, nil } - useSafe := false - if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc { - useSafe = true - } - if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) { - useSafe = true - } - if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { - useSafe = true - } + + // FIXME: Using unsafe memory copies leads to rare, random crashes + // with fuzz testing. It is therefore disabled for now. + const useSafe = true + /* + useSafe := false + if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc { + useSafe = true + } + if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) { + useSafe = true + } + if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { + useSafe = true + } + */ br := s.br @@ -129,7 +139,7 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { if debugDecoder { println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize) } - return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize) + return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) default: return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode) @@ -137,7 +147,8 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { s.seqSize += ctx.litRemain if s.seqSize > maxBlockSize { - return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize) + return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } err := br.close() if err != nil { @@ -195,20 +206,24 @@ const errorNotEnoughSpace = 5 // sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm. // // Please refer to seqdec_generic.go for the reference implementation. +// //go:noescape func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm. // // Please refer to seqdec_generic.go for the reference implementation. +// //go:noescape func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions. +// //go:noescape func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions. +// //go:noescape func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int @@ -275,7 +290,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error { s.seqSize += ctx.litRemain if s.seqSize > maxBlockSize { - return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize) + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) } err := br.close() if err != nil { @@ -302,10 +317,12 @@ type executeAsmContext struct { // Returns false if a match offset is too big. // // Please refer to seqdec_generic.go for the reference implementation. +// //go:noescape func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool // Same as above, but with safe memcopies +// //go:noescape func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s index 71e64e06..b94993a0 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -1,7 +1,6 @@ // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm -// +build !appengine,!noasm,gc,!noasm // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: CMOV @@ -52,34 +51,46 @@ sequenceDecs_decode_amd64_fill_byte_by_byte: sequenceDecs_decode_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 16(R10) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_of_update_zero: + MOVQ AX, 16(R10) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 8(R10) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ml_update_zero: + MOVQ AX, 8(R10) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 @@ -107,19 +118,25 @@ sequenceDecs_decode_amd64_fill_2_byte_by_byte: sequenceDecs_decode_amd64_fill_2_end: // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, (R10) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ll_update_zero: + MOVQ AX, (R10) // Fill bitreader for state updates MOVQ R14, (SP) @@ -198,7 +215,7 @@ sequenceDecs_decode_amd64_skip_update: MOVQ R12, R13 MOVQ R11, R12 MOVQ CX, R11 - JMP sequenceDecs_decode_amd64_adjust_end + JMP sequenceDecs_decode_amd64_after_adjust sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: CMPQ (R10), $0x00000000 @@ -210,7 +227,7 @@ sequenceDecs_decode_amd64_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero MOVQ R11, CX - JMP sequenceDecs_decode_amd64_adjust_end + JMP sequenceDecs_decode_amd64_after_adjust sequenceDecs_decode_amd64_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -247,7 +264,7 @@ sequenceDecs_decode_amd64_adjust_temp_valid: MOVQ AX, R11 MOVQ AX, CX -sequenceDecs_decode_amd64_adjust_end: +sequenceDecs_decode_amd64_after_adjust: MOVQ CX, 16(R10) // Check values @@ -303,10 +320,6 @@ error_not_enough_literals: MOVQ $0x00000004, ret+24(FP) RET - // Return with not enough output space error - MOVQ $0x00000005, ret+24(FP) - RET - // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: CMOV TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 @@ -356,49 +369,67 @@ sequenceDecs_decode_56_amd64_fill_byte_by_byte: sequenceDecs_decode_56_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 16(R10) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_of_update_zero: + MOVQ AX, 16(R10) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 8(R10) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ml_update_zero: + MOVQ AX, 8(R10) // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, (R10) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ll_update_zero: + MOVQ AX, (R10) // Fill bitreader for state updates MOVQ R14, (SP) @@ -477,7 +508,7 @@ sequenceDecs_decode_56_amd64_skip_update: MOVQ R12, R13 MOVQ R11, R12 MOVQ CX, R11 - JMP sequenceDecs_decode_56_amd64_adjust_end + JMP sequenceDecs_decode_56_amd64_after_adjust sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0: CMPQ (R10), $0x00000000 @@ -489,7 +520,7 @@ sequenceDecs_decode_56_amd64_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero MOVQ R11, CX - JMP sequenceDecs_decode_56_amd64_adjust_end + JMP sequenceDecs_decode_56_amd64_after_adjust sequenceDecs_decode_56_amd64_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -526,7 +557,7 @@ sequenceDecs_decode_56_amd64_adjust_temp_valid: MOVQ AX, R11 MOVQ AX, CX -sequenceDecs_decode_56_amd64_adjust_end: +sequenceDecs_decode_56_amd64_after_adjust: MOVQ CX, 16(R10) // Check values @@ -582,10 +613,6 @@ error_not_enough_literals: MOVQ $0x00000004, ret+24(FP) RET - // Return with not enough output space error - MOVQ $0x00000005, ret+24(FP) - RET - // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: BMI, BMI2, CMOV TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 @@ -757,7 +784,7 @@ sequenceDecs_decode_bmi2_skip_update: MOVQ R11, R12 MOVQ R10, R11 MOVQ CX, R10 - JMP sequenceDecs_decode_bmi2_adjust_end + JMP sequenceDecs_decode_bmi2_after_adjust sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: CMPQ (R9), $0x00000000 @@ -769,7 +796,7 @@ sequenceDecs_decode_bmi2_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero MOVQ R10, CX - JMP sequenceDecs_decode_bmi2_adjust_end + JMP sequenceDecs_decode_bmi2_after_adjust sequenceDecs_decode_bmi2_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -806,7 +833,7 @@ sequenceDecs_decode_bmi2_adjust_temp_valid: MOVQ R13, R10 MOVQ R13, CX -sequenceDecs_decode_bmi2_adjust_end: +sequenceDecs_decode_bmi2_after_adjust: MOVQ CX, 16(R9) // Check values @@ -862,10 +889,6 @@ error_not_enough_literals: MOVQ $0x00000004, ret+24(FP) RET - // Return with not enough output space error - MOVQ $0x00000005, ret+24(FP) - RET - // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: BMI, BMI2, CMOV TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 @@ -1012,7 +1035,7 @@ sequenceDecs_decode_56_bmi2_skip_update: MOVQ R11, R12 MOVQ R10, R11 MOVQ CX, R10 - JMP sequenceDecs_decode_56_bmi2_adjust_end + JMP sequenceDecs_decode_56_bmi2_after_adjust sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0: CMPQ (R9), $0x00000000 @@ -1024,7 +1047,7 @@ sequenceDecs_decode_56_bmi2_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero MOVQ R10, CX - JMP sequenceDecs_decode_56_bmi2_adjust_end + JMP sequenceDecs_decode_56_bmi2_after_adjust sequenceDecs_decode_56_bmi2_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -1061,7 +1084,7 @@ sequenceDecs_decode_56_bmi2_adjust_temp_valid: MOVQ R13, R10 MOVQ R13, CX -sequenceDecs_decode_56_bmi2_adjust_end: +sequenceDecs_decode_56_bmi2_after_adjust: MOVQ CX, 16(R9) // Check values @@ -1117,10 +1140,6 @@ error_not_enough_literals: MOVQ $0x00000004, ret+24(FP) RET - // Return with not enough output space error - MOVQ $0x00000005, ret+24(FP) - RET - // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool // Requires: SSE TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 @@ -1354,8 +1373,7 @@ loop_finished: MOVQ ctx+0(FP), AX MOVQ DX, 24(AX) MOVQ DI, 104(AX) - MOVQ 80(AX), CX - SUBQ CX, SI + SUBQ 80(AX), SI MOVQ SI, 112(AX) RET @@ -1367,8 +1385,7 @@ error_match_off_too_big: MOVQ ctx+0(FP), AX MOVQ DX, 24(AX) MOVQ DI, 104(AX) - MOVQ 80(AX), CX - SUBQ CX, SI + SUBQ 80(AX), SI MOVQ SI, 112(AX) RET @@ -1712,8 +1729,7 @@ loop_finished: MOVQ ctx+0(FP), AX MOVQ DX, 24(AX) MOVQ DI, 104(AX) - MOVQ 80(AX), CX - SUBQ CX, SI + SUBQ 80(AX), SI MOVQ SI, 112(AX) RET @@ -1725,8 +1741,7 @@ error_match_off_too_big: MOVQ ctx+0(FP), AX MOVQ DX, 24(AX) MOVQ DI, 104(AX) - MOVQ 80(AX), CX - SUBQ CX, SI + SUBQ 80(AX), SI MOVQ SI, 112(AX) RET @@ -1749,6 +1764,10 @@ TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 MOVQ 72(AX), DI MOVQ 80(AX), R8 MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 24(SP) MOVQ 112(AX), R10 MOVQ 128(AX), CX MOVQ CX, 32(SP) @@ -1798,34 +1817,46 @@ sequenceDecs_decodeSync_amd64_fill_byte_by_byte: sequenceDecs_decodeSync_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 8(SP) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_of_update_zero: + MOVQ AX, 8(SP) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 16(SP) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ml_update_zero: + MOVQ AX, 16(SP) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 @@ -1853,19 +1884,25 @@ sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: sequenceDecs_decodeSync_amd64_fill_2_end: // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 24(SP) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ll_update_zero: + MOVQ AX, 24(SP) // Fill bitreader for state updates MOVQ R13, (SP) @@ -1945,7 +1982,7 @@ sequenceDecs_decodeSync_amd64_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_amd64_adjust_end + JMP sequenceDecs_decodeSync_amd64_after_adjust sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -1957,7 +1994,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_amd64_adjust_end + JMP sequenceDecs_decodeSync_amd64_after_adjust sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: MOVQ R13, AX @@ -1966,8 +2003,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, AX CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(AX*8), R14 + ADDQ 144(CX)(AX*8), R14 JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid MOVQ $0x00000001, R14 @@ -1983,7 +2019,7 @@ sequenceDecs_decodeSync_amd64_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_amd64_adjust_end: +sequenceDecs_decodeSync_amd64_after_adjust: MOVQ R13, 8(SP) // Check values @@ -2280,6 +2316,10 @@ TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 MOVQ 72(CX), SI MOVQ 80(CX), DI MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) MOVQ 112(CX), R9 MOVQ 128(CX), R10 MOVQ R10, 32(SP) @@ -2452,7 +2492,7 @@ sequenceDecs_decodeSync_bmi2_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_bmi2_adjust_end + JMP sequenceDecs_decodeSync_bmi2_after_adjust sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -2464,7 +2504,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_bmi2_adjust_end + JMP sequenceDecs_decodeSync_bmi2_after_adjust sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: MOVQ R13, R12 @@ -2473,8 +2513,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, R12 CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(R12*8), R14 + ADDQ 144(CX)(R12*8), R14 JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid MOVQ $0x00000001, R14 @@ -2490,7 +2529,7 @@ sequenceDecs_decodeSync_bmi2_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_bmi2_adjust_end: +sequenceDecs_decodeSync_bmi2_after_adjust: MOVQ R13, 8(SP) // Check values @@ -2787,6 +2826,10 @@ TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 MOVQ 72(AX), DI MOVQ 80(AX), R8 MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 24(SP) MOVQ 112(AX), R10 MOVQ 128(AX), CX MOVQ CX, 32(SP) @@ -2836,34 +2879,46 @@ sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte: sequenceDecs_decodeSync_safe_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 8(SP) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_of_update_zero: + MOVQ AX, 8(SP) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 16(SP) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ml_update_zero: + MOVQ AX, 16(SP) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 @@ -2891,19 +2946,25 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte: sequenceDecs_decodeSync_safe_amd64_fill_2_end: // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 24(SP) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ll_update_zero: + MOVQ AX, 24(SP) // Fill bitreader for state updates MOVQ R13, (SP) @@ -2983,7 +3044,7 @@ sequenceDecs_decodeSync_safe_amd64_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_safe_amd64_adjust_end + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -2995,7 +3056,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_safe_amd64_adjust_end + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: MOVQ R13, AX @@ -3004,8 +3065,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, AX CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(AX*8), R14 + ADDQ 144(CX)(AX*8), R14 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid MOVQ $0x00000001, R14 @@ -3021,7 +3081,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_safe_amd64_adjust_end: +sequenceDecs_decodeSync_safe_amd64_after_adjust: MOVQ R13, 8(SP) // Check values @@ -3420,6 +3480,10 @@ TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 MOVQ 72(CX), SI MOVQ 80(CX), DI MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) MOVQ 112(CX), R9 MOVQ 128(CX), R10 MOVQ R10, 32(SP) @@ -3592,7 +3656,7 @@ sequenceDecs_decodeSync_safe_bmi2_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -3604,7 +3668,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: MOVQ R13, R12 @@ -3613,8 +3677,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, R12 CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(R12*8), R14 + ADDQ 144(CX)(R12*8), R14 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid MOVQ $0x00000001, R14 @@ -3630,7 +3693,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_safe_bmi2_adjust_end: +sequenceDecs_decodeSync_safe_bmi2_after_adjust: MOVQ R13, 8(SP) // Check values diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go index c3452bc3..ac2a80d2 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go @@ -111,7 +111,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error { } s.seqSize += ll + ml if s.seqSize > maxBlockSize { - return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize) + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) } litRemain -= ll if litRemain < 0 { @@ -149,7 +149,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error { } s.seqSize += litRemain if s.seqSize > maxBlockSize { - return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize) + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) } err := br.close() if err != nil { diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go index 3eb3f1c8..5ffa82f5 100644 --- a/vendor/github.com/klauspost/compress/zstd/zstd.go +++ b/vendor/github.com/klauspost/compress/zstd/zstd.go @@ -36,9 +36,6 @@ const forcePreDef = false // zstdMinMatch is the minimum zstd match length. const zstdMinMatch = 3 -// Reset the buffer offset when reaching this. -const bufferReset = math.MaxInt32 - MaxWindowSize - // fcsUnknown is used for unknown frame content size. const fcsUnknown = math.MaxUint64 @@ -75,7 +72,6 @@ var ( ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit") // ErrUnknownDictionary is returned if the dictionary ID is unknown. - // For the time being dictionaries are not supported. ErrUnknownDictionary = errors.New("unknown dictionary") // ErrFrameSizeExceeded is returned if the stated frame size is exceeded. @@ -110,26 +106,25 @@ func printf(format string, a ...interface{}) { } } -// matchLen returns the maximum length. +// matchLen returns the maximum common prefix length of a and b. // a must be the shortest of the two. -// The function also returns whether all bytes matched. -func matchLen(a, b []byte) int { - b = b[:len(a)] - for i := 0; i < len(a)-7; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - return i + (bits.TrailingZeros64(diff) >> 3) +func matchLen(a, b []byte) (n int) { + for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] { + diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b) + if diff != 0 { + return n + bits.TrailingZeros64(diff)>>3 } + n += 8 } - checked := (len(a) >> 3) << 3 - a = a[checked:] - b = b[checked:] for i := range a { if a[i] != b[i] { - return i + checked + break } + n++ } - return len(a) + checked + return n + } func load3232(b []byte, i int32) uint32 { @@ -140,10 +135,6 @@ func load6432(b []byte, i int32) uint64 { return binary.LittleEndian.Uint64(b[i:]) } -func load64(b []byte, i int) uint64 { - return binary.LittleEndian.Uint64(b[i:]) -} - type byter interface { Bytes() []byte Len() int |