diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2')
8 files changed, 1867 insertions, 451 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md index 11979345..73c0c462 100644 --- a/vendor/github.com/klauspost/compress/s2/README.md +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -19,6 +19,7 @@ This is important, so you don't have to worry about spending CPU cycles on alrea * Adjustable compression (3 levels) * Concurrent stream compression * Faster decompression, even for Snappy compatible content +* Concurrent Snappy/S2 stream decompression * Ability to quickly skip forward in compressed stream * Random seeking with indexes * Compatible with reading Snappy compressed content @@ -415,6 +416,25 @@ Without assembly decompression is also very fast; single goroutine decompression Even though S2 typically compresses better than Snappy, decompression speed is always better. +### Concurrent Stream Decompression + +For full stream decompression S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent) +that will decode a full stream using multiple goroutines. + +Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3 <input>`, best of 3: + +| Input | `-cpu=1` | `-cpu=2` | `-cpu=4` | `-cpu=8` | `-cpu=16` | +|-------------------------------------------|------------|------------|------------|------------|-------------| +| enwik10.snappy | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s | +| enwik10.s2 | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s | +| sofia-air-quality-dataset.tar.snappy | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s | +| sofia-air-quality-dataset.tar.s2 | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s | +| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s | + +Scaling can be expected to be pretty linear until memory bandwidth is saturated. 
+ +For now the DecodeConcurrent can only be used for full streams without seeking or combining with regular reads. + ## Block compression @@ -873,7 +893,7 @@ for each entry { } // Uncompressed uses previous offset and adds EstBlockSize - entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff } @@ -901,6 +921,14 @@ for each entry { } ``` +To decode from any given uncompressed offset `(wantOffset)`: + +* Iterate entries until `entry[n].UncompressedOffset > wantOffset`. +* Start decoding from `entry[n-1].CompressedOffset`. +* Discard `wantOffset - entry[n-1].UncompressedOffset` bytes from the decoded stream. + +See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface. + # Format Extensions * Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`. diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go index 9e7fce88..2aba9e27 100644 --- a/vendor/github.com/klauspost/compress/s2/decode.go +++ b/vendor/github.com/klauspost/compress/s2/decode.go @@ -11,6 +11,8 @@ import ( "fmt" "io" "io/ioutil" + "runtime" + "sync" ) var ( @@ -169,6 +171,14 @@ func ReaderSkippableCB(id uint8, fn func(r io.Reader) error) ReaderOption { } } +// ReaderIgnoreCRC will make the reader skip CRC calculation and checks. +func ReaderIgnoreCRC() ReaderOption { + return func(r *Reader) error { + r.ignoreCRC = true + return nil + } +} + // Reader is an io.Reader that can read Snappy-compressed bytes. type Reader struct { r io.Reader @@ -191,18 +201,19 @@ type Reader struct { paramsOK bool snappyFrame bool ignoreStreamID bool + ignoreCRC bool } // ensureBufferSize will ensure that the buffer can take at least n bytes. 
// If false is returned the buffer exceeds maximum allowed size. func (r *Reader) ensureBufferSize(n int) bool { - if len(r.buf) >= n { - return true - } if n > r.maxBufSize { r.err = ErrCorrupt return false } + if cap(r.buf) >= n { + return true + } // Realloc buffer. r.buf = make([]byte, n) return true @@ -220,6 +231,7 @@ func (r *Reader) Reset(reader io.Reader) { r.err = nil r.i = 0 r.j = 0 + r.blockStart = 0 r.readHeader = r.ignoreStreamID } @@ -344,7 +356,7 @@ func (r *Reader) Read(p []byte) (int, error) { r.err = err return 0, r.err } - if crc(r.decoded[:n]) != checksum { + if !r.ignoreCRC && crc(r.decoded[:n]) != checksum { r.err = ErrCRC return 0, r.err } @@ -385,7 +397,7 @@ func (r *Reader) Read(p []byte) (int, error) { if !r.readFull(r.decoded[:n], false) { return 0, r.err } - if crc(r.decoded[:n]) != checksum { + if !r.ignoreCRC && crc(r.decoded[:n]) != checksum { r.err = ErrCRC return 0, r.err } @@ -435,6 +447,259 @@ func (r *Reader) Read(p []byte) (int, error) { } } +// DecodeConcurrent will decode the full stream to w. +// This function should not be combined with reading, seeking or other operations. +// Up to 'concurrent' goroutines will be used. +// If <= 0, runtime.NumCPU will be used. +// On success the number of bytes decompressed and a nil error is returned. +// This is mainly intended for bigger streams. 
+func (r *Reader) DecodeConcurrent(w io.Writer, concurrent int) (written int64, err error) { + if r.i > 0 || r.j > 0 || r.blockStart > 0 { + return 0, errors.New("DecodeConcurrent called after ") + } + if concurrent <= 0 { + concurrent = runtime.NumCPU() + } + + // Write to output + var errMu sync.Mutex + var aErr error + setErr := func(e error) (ok bool) { + errMu.Lock() + defer errMu.Unlock() + if e == nil { + return aErr == nil + } + if aErr == nil { + aErr = e + } + return false + } + hasErr := func() (ok bool) { + errMu.Lock() + v := aErr != nil + errMu.Unlock() + return v + } + + var aWritten int64 + toRead := make(chan []byte, concurrent) + writtenBlocks := make(chan []byte, concurrent) + queue := make(chan chan []byte, concurrent) + reUse := make(chan chan []byte, concurrent) + for i := 0; i < concurrent; i++ { + toRead <- make([]byte, 0, r.maxBufSize) + writtenBlocks <- make([]byte, 0, r.maxBufSize) + reUse <- make(chan []byte, 1) + } + // Writer + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for toWrite := range queue { + entry := <-toWrite + reUse <- toWrite + if hasErr() { + writtenBlocks <- entry + continue + } + n, err := w.Write(entry) + want := len(entry) + writtenBlocks <- entry + if err != nil { + setErr(err) + continue + } + if n != want { + setErr(io.ErrShortWrite) + continue + } + aWritten += int64(n) + } + }() + + // Reader + defer func() { + close(queue) + if r.err != nil { + err = r.err + setErr(r.err) + } + wg.Wait() + if err == nil { + err = aErr + } + written = aWritten + }() + + for !hasErr() { + if !r.readFull(r.buf[:4], true) { + if r.err == io.EOF { + r.err = nil + } + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // 
https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + orgBuf := <-toRead + buf := orgBuf[:chunkLen] + + if !r.readFull(buf, false) { + return 0, r.err + } + + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + wg.Add(1) + + decoded := <-writtenBlocks + entry := <-reUse + queue <- entry + go func() { + defer wg.Done() + decoded = decoded[:n] + _, err := Decode(decoded, buf) + toRead <- orgBuf + if err != nil { + writtenBlocks <- decoded + setErr(err) + return + } + if !r.ignoreCRC && crc(decoded) != checksum { + writtenBlocks <- decoded + setErr(ErrCRC) + return + } + entry <- decoded + }() + continue + + case chunkTypeUncompressedData: + + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + // Grab write buffer + orgBuf := <-writtenBlocks + buf := orgBuf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read content. 
+ n := chunkLen - checksumSize + + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + // Read uncompressed + buf = orgBuf[:n] + if !r.readFull(buf, false) { + return 0, r.err + } + + if !r.ignoreCRC && crc(buf) != checksum { + r.err = ErrCRC + return 0, r.err + } + entry := <-reUse + queue <- entry + entry <- buf + continue + + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return 0, r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return 0, r.err + } + if string(r.buf[:len(magicBody)]) != magicBody { + if string(r.buf[:len(magicBody)]) != magicBodySnappy { + r.err = ErrCorrupt + return 0, r.err + } else { + r.snappyFrame = true + } + } else { + r.snappyFrame = false + } + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + // fmt.Printf("ERR chunktype: 0x%x\n", chunkType) + r.err = ErrUnsupported + return 0, r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if chunkLen > maxChunkSize { + // fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen) + r.err = ErrUnsupported + return 0, r.err + } + + // fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen) + if !r.skippable(r.buf, chunkLen, false, chunkType) { + return 0, r.err + } + } + return 0, r.err +} + // Skip will skip n bytes forward in the decompressed output. // For larger skips this consumes less CPU and is faster than reading output and discarding it. // CRC is not checked on skipped blocks. 
@@ -699,8 +964,16 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { case io.SeekCurrent: offset += r.blockStart + int64(r.i) case io.SeekEnd: - offset = -offset + if offset > 0 { + return 0, errors.New("seek after end of file") + } + offset = r.index.TotalUncompressed + offset + } + + if offset < 0 { + return 0, errors.New("seek before start of file") } + c, u, err := r.index.Find(offset) if err != nil { return r.blockStart + int64(r.i), err @@ -712,10 +985,6 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { return 0, err } - if offset < 0 { - offset = r.index.TotalUncompressed + offset - } - r.i = r.j // Remove rest of current block. if u < offset { // Forward inside block diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go index 59f992ca..1aefabf3 100644 --- a/vendor/github.com/klauspost/compress/s2/encode.go +++ b/vendor/github.com/klauspost/compress/s2/encode.go @@ -1119,12 +1119,6 @@ func (w *Writer) closeIndex(idx bool) ([]byte, error) { if w.appendIndex { w.written += int64(len(index)) } - if true { - _, err := w.index.Load(index) - if err != nil { - panic(err) - } - } } if w.pad > 1 { diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go index 44803477..4bc80bc6 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_best.go +++ b/vendor/github.com/klauspost/compress/s2/encode_best.go @@ -370,7 +370,7 @@ func encodeBlockBestSnappy(dst, src []byte) (d int) { } offset := m.s - m.offset - return score - emitCopySize(offset, m.length) + return score - emitCopyNoRepeatSize(offset, m.length) } matchAt := func(offset, s int, first uint32) match { @@ -567,6 +567,10 @@ func emitCopySize(offset, length int) int { // Offset no more than 2 bytes. if length > 64 { + if offset < 2048 { + // Emit 8 bytes, then rest as repeats... 
+ return 2 + emitRepeatSize(offset, length-8) + } // Emit remaining as repeats, at least 4 bytes remain. return 3 + emitRepeatSize(offset, length-60) } @@ -577,6 +581,28 @@ func emitCopySize(offset, length int) int { return 2 } +// emitCopyNoRepeatSize returns the size to encode the offset+length +// +// It assumes that: +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopyNoRepeatSize(offset, length int) int { + if offset >= 65536 { + return 5 + 5*(length/64) + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + 3*(length/60) + } + if length >= 12 || offset >= 2048 { + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + return 2 +} + // emitRepeatSize returns the number of bytes required to encode a repeat. // Length must be at least 4 and < 1<<24 func emitRepeatSize(offset, length int) int { diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go index 43d43534..94784b82 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_go.go +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -180,14 +180,23 @@ func emitCopy(dst []byte, offset, length int) int { // Offset no more than 2 bytes. if length > 64 { - // Emit a length 60 copy, encoded as 3 bytes. - // Emit remaining as repeat value (minimum 4 bytes). - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = 59<<2 | tagCopy2 - length -= 60 + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } // Emit remaining as repeats, at least 4 bytes remain. 
- return 3 + emitRepeat(dst[3:], offset, length) + return off + emitRepeat(dst[off:], offset, length) } if length >= 12 || offset >= 2048 { // Emit the remaining copy, encoded as 3 bytes. diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go index d9312e5b..88f27c09 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -5,6 +5,8 @@ package s2 +func _dummy_() + // encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 4294967295 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s index 729dbf53..36915d94 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -5,6 +5,15 @@ #include "textflag.h" +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#ifndef GOAMD64_v3 +#define GOAMD64_v3 +#endif +#endif + RET + // func encodeBlockAsm(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeBlockAsm(SB), $65560-56 @@ -253,17 +262,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -476,6 +474,90 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm: two_byte_offset_repeat_as_copy_encodeBlockAsm: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + MOVL DI, R9 + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + 
ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + LEAL -16842747(SI), SI + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX 
+ JMP repeat_end_emit_encodeBlockAsm + +long_offset_short_repeat_as_copy_encodeBlockAsm: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -791,17 +873,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -944,6 +1015,90 @@ four_bytes_remain_match_nolit_encodeBlockAsm: two_byte_offset_match_nolit_encodeBlockAsm: CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + MOVL SI, R8 + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL R10, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b + LEAL -16842747(R10), R10 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 
4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +long_offset_short_match_nolit_encodeBlockAsm: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(R10), R10 @@ -1134,17 +1289,36 @@ memmove_emit_remainder_encodeBlockAsm: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: + MOVW 
(CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: @@ -1466,17 +1640,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm4MB: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -1667,6 +1830,77 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + 
+repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +long_offset_short_repeat_as_copy_encodeBlockAsm4MB: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -1963,17 +2197,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm4MB: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -2105,6 +2328,77 @@ four_bytes_remain_match_nolit_encodeBlockAsm4MB: two_byte_offset_match_nolit_encodeBlockAsm4MB: CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm4MB + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x00000800 + JLT 
repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +long_offset_short_match_nolit_encodeBlockAsm4MB: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(R10), R10 @@ -2276,17 +2570,36 @@ memmove_emit_remainder_encodeBlockAsm4MB: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 CMPQ BX, $0x20 JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: @@ -2597,17 +2910,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -2706,6 +3008,65 @@ repeat_as_copy_encodeBlockAsm12B: two_byte_offset_repeat_as_copy_encodeBlockAsm12B: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + CMPL DI, $0x00000800 + JLT 
repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +long_offset_short_repeat_as_copy_encodeBlockAsm12B: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -2979,17 +3340,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -3041,6 +3391,65 @@ match_nolit_end_encodeBlockAsm12B: two_byte_offset_match_nolit_encodeBlockAsm12B: CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm12B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + CMPL DI, 
$0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +long_offset_short_match_nolit_encodeBlockAsm12B: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(R10), R10 @@ -3189,17 +3598,36 @@ memmove_emit_remainder_encodeBlockAsm12B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) 
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: @@ -3510,17 +3938,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -3619,6 +4036,65 @@ repeat_as_copy_encodeBlockAsm10B: two_byte_offset_repeat_as_copy_encodeBlockAsm10B: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + LEAL 
-256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +long_offset_short_repeat_as_copy_encodeBlockAsm10B: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -3892,17 +4368,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -3954,6 +4419,65 @@ match_nolit_end_encodeBlockAsm10B: two_byte_offset_match_nolit_encodeBlockAsm10B: CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm10B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + 
+cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +long_offset_short_match_nolit_encodeBlockAsm10B: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(R10), R10 @@ -4102,17 +4626,36 @@ memmove_emit_remainder_encodeBlockAsm10B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP 
memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: @@ -4423,17 +4966,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -4528,6 +5060,61 @@ repeat_as_copy_encodeBlockAsm8B: two_byte_offset_repeat_as_copy_encodeBlockAsm8B: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP 
repeat_end_emit_encodeBlockAsm8B + +repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +long_offset_short_repeat_as_copy_encodeBlockAsm8B: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -4795,17 +5382,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -4857,6 +5433,61 @@ match_nolit_end_encodeBlockAsm8B: two_byte_offset_match_nolit_encodeBlockAsm8B: CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm8B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + MOVL R10, SI + LEAL -4(R10), R10 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + 
+repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +long_offset_short_match_nolit_encodeBlockAsm8B: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(R10), R10 @@ -4999,17 +5630,36 @@ memmove_emit_remainder_encodeBlockAsm8B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: @@ -5225,17 +5875,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm: 
#ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -5541,6 +6180,90 @@ four_bytes_remain_match_nolit_encodeBetterBlockAsm: two_byte_offset_match_nolit_encodeBetterBlockAsm: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + MOVL R8, R9 + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL R12, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + LEAL -16842747(R12), R12 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + 
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +long_offset_short_match_nolit_encodeBetterBlockAsm: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -5979,8 +6702,9 @@ memmove_emit_remainder_encodeBetterBlockAsm: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 @@ -5989,9 +6713,18 @@ memmove_emit_remainder_encodeBetterBlockAsm: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + 
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: @@ -6214,17 +6947,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -6511,6 +7233,77 @@ four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + LEAL 
-256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +long_offset_short_match_nolit_encodeBetterBlockAsm4MB: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -6911,8 +7704,9 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 @@ -6921,9 +7715,18 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + 
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: @@ -7138,17 +7941,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -7335,6 +8127,65 @@ emit_literal_done_match_emit_encodeBetterBlockAsm12B: two_byte_offset_match_nolit_encodeBetterBlockAsm12B: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + 
+repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +long_offset_short_match_nolit_encodeBetterBlockAsm12B: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -7689,8 +8540,9 @@ memmove_emit_remainder_encodeBetterBlockAsm12B: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 @@ -7699,9 +8551,18 @@ memmove_emit_remainder_encodeBetterBlockAsm12B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: @@ -7916,17 +8777,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ 
R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -8113,6 +8963,65 @@ emit_literal_done_match_emit_encodeBetterBlockAsm10B: two_byte_offset_match_nolit_encodeBetterBlockAsm10B: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ 
$0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +long_offset_short_match_nolit_encodeBetterBlockAsm10B: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -8467,8 +9376,9 @@ memmove_emit_remainder_encodeBetterBlockAsm10B: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 @@ -8477,9 +9387,18 @@ memmove_emit_remainder_encodeBetterBlockAsm10B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: @@ -8694,17 +9613,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -8891,6 +9799,61 @@ emit_literal_done_match_emit_encodeBetterBlockAsm8B: two_byte_offset_match_nolit_encodeBetterBlockAsm8B: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B + CMPL 
R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +long_offset_short_match_nolit_encodeBetterBlockAsm8B: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -9235,8 +10198,9 @@ memmove_emit_remainder_encodeBetterBlockAsm8B: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 JB 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 @@ -9245,9 +10209,18 @@ memmove_emit_remainder_encodeBetterBlockAsm8B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: @@ -9584,17 +10557,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -9918,17 +10880,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -10127,17 +11078,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: @@ -10448,17 +11418,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -10739,17 +11698,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -10905,17 +11853,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm64K: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 CMPQ BX, $0x08 - JLE 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: @@ -11226,17 +12193,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -11517,17 +12473,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -11683,17 +12628,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm12B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: @@ -12004,17 +12968,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -12295,17 +13248,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED 
#else BSFQ R9, R9 @@ -12461,17 +13403,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm10B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: @@ -12782,17 +13743,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R10, R10 @@ -13071,17 +14021,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 -#define 
TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R9, R9 @@ -13235,17 +14174,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm8B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: @@ -13461,17 +14419,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ 
R11, R11 @@ -13850,17 +14797,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: @@ -14068,17 +15034,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -14386,17 +15341,36 @@ 
memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: @@ -14604,17 +15578,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -14922,17 +15885,36 @@ 
memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: @@ -15140,17 +16122,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -15458,17 +16429,36 @@ 
memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: @@ -15676,17 +16666,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ R11, R11 @@ -15992,17 +16971,36 @@ 
memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: @@ -16443,6 +17441,97 @@ four_bytes_remain_standalone: two_byte_offset_standalone: CMPL DX, $0x40 JLE two_byte_offset_short_standalone + CMPL CX, $0x00000800 + JAE long_offset_short_standalone + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB CL, 1(AX) + MOVL CX, DI + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + SUBL $0x08, DX 
+ + // emitRepeat + LEAL -4(DX), DX + JMP cant_repeat_two_offset_standalone_emit_copy_short_2b + +emit_repeat_again_standalone_emit_copy_short_2b: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JLE repeat_two_standalone_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_standalone_emit_copy_short_2b + CMPL CX, $0x00000800 + JLT repeat_two_offset_standalone_emit_copy_short_2b + +cant_repeat_two_offset_standalone_emit_copy_short_2b: + CMPL DX, $0x00000104 + JLT repeat_three_standalone_emit_copy_short_2b + CMPL DX, $0x00010100 + JLT repeat_four_standalone_emit_copy_short_2b + CMPL DX, $0x0100ffff + JLT repeat_five_standalone_emit_copy_short_2b + LEAL -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy_short_2b + +repeat_five_standalone_emit_copy_short_2b: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy_short_2b: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy_short_2b: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy_short_2b: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +long_offset_short_standalone: MOVB $0xee, (AX) MOVW CX, 1(AX) LEAL -60(DX), DX @@ -16644,17 +17733,6 @@ matchlen_loopback_standalone: #ifdef GOAMD64_v3 TZCNTQ BX, BX -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 - TZCNTQ BX, BX - 
-#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED #else BSFQ BX, BX diff --git a/vendor/github.com/klauspost/compress/s2/index.go b/vendor/github.com/klauspost/compress/s2/index.go index fd857682..7b24a006 100644 --- a/vendor/github.com/klauspost/compress/s2/index.go +++ b/vendor/github.com/klauspost/compress/s2/index.go @@ -10,6 +10,7 @@ import ( "encoding/json" "fmt" "io" + "sort" ) const ( @@ -100,6 +101,15 @@ func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err er if offset > i.TotalUncompressed { return 0, 0, io.ErrUnexpectedEOF } + if len(i.info) > 200 { + n := sort.Search(len(i.info), func(n int) bool { + return i.info[n].uncompressedOffset > offset + }) + if n == 0 { + n = 1 + } + return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil + } for _, info := range i.info { if info.uncompressedOffset > offset { break |