summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/s2
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2')
-rw-r--r--vendor/github.com/klauspost/compress/s2/README.md30
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode.go289
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode.go6
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_best.go28
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_go.go23
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go2
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s1930
-rw-r--r--vendor/github.com/klauspost/compress/s2/index.go10
8 files changed, 1867 insertions, 451 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md
index 11979345..73c0c462 100644
--- a/vendor/github.com/klauspost/compress/s2/README.md
+++ b/vendor/github.com/klauspost/compress/s2/README.md
@@ -19,6 +19,7 @@ This is important, so you don't have to worry about spending CPU cycles on alrea
* Adjustable compression (3 levels)
* Concurrent stream compression
* Faster decompression, even for Snappy compatible content
+* Concurrent Snappy/S2 stream decompression
* Ability to quickly skip forward in compressed stream
* Random seeking with indexes
* Compatible with reading Snappy compressed content
@@ -415,6 +416,25 @@ Without assembly decompression is also very fast; single goroutine decompression
Even though S2 typically compresses better than Snappy, decompression speed is always better.
+### Concurrent Stream Decompression
+
+For full stream decompression S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent)
+that will decode a full stream using multiple goroutines.
+
+Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3 <input>`, best of 3:
+
+| Input | `-cpu=1` | `-cpu=2` | `-cpu=4` | `-cpu=8` | `-cpu=16` |
+|-------------------------------------------|------------|------------|------------|------------|-------------|
+| enwik10.snappy | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s |
+| enwik10.s2 | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s |
+| sofia-air-quality-dataset.tar.snappy | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s |
+| sofia-air-quality-dataset.tar.s2 | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s |
+| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s |
+
+Scaling can be expected to be pretty linear until memory bandwidth is saturated.
+
+For now the DecodeConcurrent can only be used for full streams without seeking or combining with regular reads.
+
## Block compression
@@ -873,7 +893,7 @@ for each entry {
}
// Uncompressed uses previous offset and adds EstBlockSize
- entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize
+ entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff
}
@@ -901,6 +921,14 @@ for each entry {
}
```
+To decode from any given uncompressed offset `(wantOffset)`:
+
+* Iterate entries until `entry[n].UncompressedOffset > wantOffset`.
+* Start decoding from `entry[n-1].CompressedOffset`.
+* Discard `entry[n-1].UncompressedOffset - wantOffset` bytes from the decoded stream.
+
+See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface.
+
# Format Extensions
* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go
index 9e7fce88..2aba9e27 100644
--- a/vendor/github.com/klauspost/compress/s2/decode.go
+++ b/vendor/github.com/klauspost/compress/s2/decode.go
@@ -11,6 +11,8 @@ import (
"fmt"
"io"
"io/ioutil"
+ "runtime"
+ "sync"
)
var (
@@ -169,6 +171,14 @@ func ReaderSkippableCB(id uint8, fn func(r io.Reader) error) ReaderOption {
}
}
+// ReaderIgnoreCRC will make the reader skip CRC calculation and checks.
+func ReaderIgnoreCRC() ReaderOption {
+ return func(r *Reader) error {
+ r.ignoreCRC = true
+ return nil
+ }
+}
+
// Reader is an io.Reader that can read Snappy-compressed bytes.
type Reader struct {
r io.Reader
@@ -191,18 +201,19 @@ type Reader struct {
paramsOK bool
snappyFrame bool
ignoreStreamID bool
+ ignoreCRC bool
}
// ensureBufferSize will ensure that the buffer can take at least n bytes.
// If false is returned the buffer exceeds maximum allowed size.
func (r *Reader) ensureBufferSize(n int) bool {
- if len(r.buf) >= n {
- return true
- }
if n > r.maxBufSize {
r.err = ErrCorrupt
return false
}
+ if cap(r.buf) >= n {
+ return true
+ }
// Realloc buffer.
r.buf = make([]byte, n)
return true
@@ -220,6 +231,7 @@ func (r *Reader) Reset(reader io.Reader) {
r.err = nil
r.i = 0
r.j = 0
+ r.blockStart = 0
r.readHeader = r.ignoreStreamID
}
@@ -344,7 +356,7 @@ func (r *Reader) Read(p []byte) (int, error) {
r.err = err
return 0, r.err
}
- if crc(r.decoded[:n]) != checksum {
+ if !r.ignoreCRC && crc(r.decoded[:n]) != checksum {
r.err = ErrCRC
return 0, r.err
}
@@ -385,7 +397,7 @@ func (r *Reader) Read(p []byte) (int, error) {
if !r.readFull(r.decoded[:n], false) {
return 0, r.err
}
- if crc(r.decoded[:n]) != checksum {
+ if !r.ignoreCRC && crc(r.decoded[:n]) != checksum {
r.err = ErrCRC
return 0, r.err
}
@@ -435,6 +447,259 @@ func (r *Reader) Read(p []byte) (int, error) {
}
}
+// DecodeConcurrent will decode the full stream to w.
+// This function should not be combined with reading, seeking or other operations.
+// Up to 'concurrent' goroutines will be used.
+// If <= 0, runtime.NumCPU will be used.
+// On success the number of bytes decompressed nil and is returned.
+// This is mainly intended for bigger streams.
+func (r *Reader) DecodeConcurrent(w io.Writer, concurrent int) (written int64, err error) {
+ if r.i > 0 || r.j > 0 || r.blockStart > 0 {
+ return 0, errors.New("DecodeConcurrent called after ")
+ }
+ if concurrent <= 0 {
+ concurrent = runtime.NumCPU()
+ }
+
+ // Write to output
+ var errMu sync.Mutex
+ var aErr error
+ setErr := func(e error) (ok bool) {
+ errMu.Lock()
+ defer errMu.Unlock()
+ if e == nil {
+ return aErr == nil
+ }
+ if aErr == nil {
+ aErr = e
+ }
+ return false
+ }
+ hasErr := func() (ok bool) {
+ errMu.Lock()
+ v := aErr != nil
+ errMu.Unlock()
+ return v
+ }
+
+ var aWritten int64
+ toRead := make(chan []byte, concurrent)
+ writtenBlocks := make(chan []byte, concurrent)
+ queue := make(chan chan []byte, concurrent)
+ reUse := make(chan chan []byte, concurrent)
+ for i := 0; i < concurrent; i++ {
+ toRead <- make([]byte, 0, r.maxBufSize)
+ writtenBlocks <- make([]byte, 0, r.maxBufSize)
+ reUse <- make(chan []byte, 1)
+ }
+ // Writer
+ var wg sync.WaitGroup
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ for toWrite := range queue {
+ entry := <-toWrite
+ reUse <- toWrite
+ if hasErr() {
+ writtenBlocks <- entry
+ continue
+ }
+ n, err := w.Write(entry)
+ want := len(entry)
+ writtenBlocks <- entry
+ if err != nil {
+ setErr(err)
+ continue
+ }
+ if n != want {
+ setErr(io.ErrShortWrite)
+ continue
+ }
+ aWritten += int64(n)
+ }
+ }()
+
+ // Reader
+ defer func() {
+ close(queue)
+ if r.err != nil {
+ err = r.err
+ setErr(r.err)
+ }
+ wg.Wait()
+ if err == nil {
+ err = aErr
+ }
+ written = aWritten
+ }()
+
+ for !hasErr() {
+ if !r.readFull(r.buf[:4], true) {
+ if r.err == io.EOF {
+ r.err = nil
+ }
+ return 0, r.err
+ }
+ chunkType := r.buf[0]
+ if !r.readHeader {
+ if chunkType != chunkTypeStreamIdentifier {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ r.readHeader = true
+ }
+ chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
+
+ // The chunk types are specified at
+ // https://github.com/google/snappy/blob/master/framing_format.txt
+ switch chunkType {
+ case chunkTypeCompressedData:
+ r.blockStart += int64(r.j)
+ // Section 4.2. Compressed data (chunk type 0x00).
+ if chunkLen < checksumSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if chunkLen > r.maxBufSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ orgBuf := <-toRead
+ buf := orgBuf[:chunkLen]
+
+ if !r.readFull(buf, false) {
+ return 0, r.err
+ }
+
+ checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+ buf = buf[checksumSize:]
+
+ n, err := DecodedLen(buf)
+ if err != nil {
+ r.err = err
+ return 0, r.err
+ }
+ if r.snappyFrame && n > maxSnappyBlockSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+
+ if n > r.maxBlock {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ wg.Add(1)
+
+ decoded := <-writtenBlocks
+ entry := <-reUse
+ queue <- entry
+ go func() {
+ defer wg.Done()
+ decoded = decoded[:n]
+ _, err := Decode(decoded, buf)
+ toRead <- orgBuf
+ if err != nil {
+ writtenBlocks <- decoded
+ setErr(err)
+ return
+ }
+ if !r.ignoreCRC && crc(decoded) != checksum {
+ writtenBlocks <- decoded
+ setErr(ErrCRC)
+ return
+ }
+ entry <- decoded
+ }()
+ continue
+
+ case chunkTypeUncompressedData:
+
+ // Section 4.3. Uncompressed data (chunk type 0x01).
+ if chunkLen < checksumSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if chunkLen > r.maxBufSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ // Grab write buffer
+ orgBuf := <-writtenBlocks
+ buf := orgBuf[:checksumSize]
+ if !r.readFull(buf, false) {
+ return 0, r.err
+ }
+ checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+ // Read content.
+ n := chunkLen - checksumSize
+
+ if r.snappyFrame && n > maxSnappyBlockSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if n > r.maxBlock {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ // Read uncompressed
+ buf = orgBuf[:n]
+ if !r.readFull(buf, false) {
+ return 0, r.err
+ }
+
+ if !r.ignoreCRC && crc(buf) != checksum {
+ r.err = ErrCRC
+ return 0, r.err
+ }
+ entry := <-reUse
+ queue <- entry
+ entry <- buf
+ continue
+
+ case chunkTypeStreamIdentifier:
+ // Section 4.1. Stream identifier (chunk type 0xff).
+ if chunkLen != len(magicBody) {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if !r.readFull(r.buf[:len(magicBody)], false) {
+ return 0, r.err
+ }
+ if string(r.buf[:len(magicBody)]) != magicBody {
+ if string(r.buf[:len(magicBody)]) != magicBodySnappy {
+ r.err = ErrCorrupt
+ return 0, r.err
+ } else {
+ r.snappyFrame = true
+ }
+ } else {
+ r.snappyFrame = false
+ }
+ continue
+ }
+
+ if chunkType <= 0x7f {
+ // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+ // fmt.Printf("ERR chunktype: 0x%x\n", chunkType)
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+ // Section 4.4 Padding (chunk type 0xfe).
+ // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+ if chunkLen > maxChunkSize {
+ // fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen)
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+
+ // fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen)
+ if !r.skippable(r.buf, chunkLen, false, chunkType) {
+ return 0, r.err
+ }
+ }
+ return 0, r.err
+}
+
// Skip will skip n bytes forward in the decompressed output.
// For larger skips this consumes less CPU and is faster than reading output and discarding it.
// CRC is not checked on skipped blocks.
@@ -699,8 +964,16 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {
case io.SeekCurrent:
offset += r.blockStart + int64(r.i)
case io.SeekEnd:
- offset = -offset
+ if offset > 0 {
+ return 0, errors.New("seek after end of file")
+ }
+ offset = r.index.TotalUncompressed + offset
+ }
+
+ if offset < 0 {
+ return 0, errors.New("seek before start of file")
}
+
c, u, err := r.index.Find(offset)
if err != nil {
return r.blockStart + int64(r.i), err
@@ -712,10 +985,6 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {
return 0, err
}
- if offset < 0 {
- offset = r.index.TotalUncompressed + offset
- }
-
r.i = r.j // Remove rest of current block.
if u < offset {
// Forward inside block
diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go
index 59f992ca..1aefabf3 100644
--- a/vendor/github.com/klauspost/compress/s2/encode.go
+++ b/vendor/github.com/klauspost/compress/s2/encode.go
@@ -1119,12 +1119,6 @@ func (w *Writer) closeIndex(idx bool) ([]byte, error) {
if w.appendIndex {
w.written += int64(len(index))
}
- if true {
- _, err := w.index.Load(index)
- if err != nil {
- panic(err)
- }
- }
}
if w.pad > 1 {
diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go
index 44803477..4bc80bc6 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_best.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_best.go
@@ -370,7 +370,7 @@ func encodeBlockBestSnappy(dst, src []byte) (d int) {
}
offset := m.s - m.offset
- return score - emitCopySize(offset, m.length)
+ return score - emitCopyNoRepeatSize(offset, m.length)
}
matchAt := func(offset, s int, first uint32) match {
@@ -567,6 +567,10 @@ func emitCopySize(offset, length int) int {
// Offset no more than 2 bytes.
if length > 64 {
+ if offset < 2048 {
+ // Emit 8 bytes, then rest as repeats...
+ return 2 + emitRepeatSize(offset, length-8)
+ }
// Emit remaining as repeats, at least 4 bytes remain.
return 3 + emitRepeatSize(offset, length-60)
}
@@ -577,6 +581,28 @@ func emitCopySize(offset, length int) int {
return 2
}
+// emitCopyNoRepeatSize returns the size to encode the offset+length
+//
+// It assumes that:
+// 1 <= offset && offset <= math.MaxUint32
+// 4 <= length && length <= 1 << 24
+func emitCopyNoRepeatSize(offset, length int) int {
+ if offset >= 65536 {
+ return 5 + 5*(length/64)
+ }
+
+ // Offset no more than 2 bytes.
+ if length > 64 {
+ // Emit remaining as repeats, at least 4 bytes remain.
+ return 3 + 3*(length/60)
+ }
+ if length >= 12 || offset >= 2048 {
+ return 3
+ }
+ // Emit the remaining copy, encoded as 2 bytes.
+ return 2
+}
+
// emitRepeatSize returns the number of bytes required to encode a repeat.
// Length must be at least 4 and < 1<<24
func emitRepeatSize(offset, length int) int {
diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go
index 43d43534..94784b82 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_go.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_go.go
@@ -180,14 +180,23 @@ func emitCopy(dst []byte, offset, length int) int {
// Offset no more than 2 bytes.
if length > 64 {
- // Emit a length 60 copy, encoded as 3 bytes.
- // Emit remaining as repeat value (minimum 4 bytes).
- dst[2] = uint8(offset >> 8)
- dst[1] = uint8(offset)
- dst[0] = 59<<2 | tagCopy2
- length -= 60
+ off := 3
+ if offset < 2048 {
+ // emit 8 bytes as tagCopy1, rest as repeats.
+ dst[1] = uint8(offset)
+ dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
+ length -= 8
+ off = 2
+ } else {
+ // Emit a length 60 copy, encoded as 3 bytes.
+ // Emit remaining as repeat value (minimum 4 bytes).
+ dst[2] = uint8(offset >> 8)
+ dst[1] = uint8(offset)
+ dst[0] = 59<<2 | tagCopy2
+ length -= 60
+ }
// Emit remaining as repeats, at least 4 bytes remain.
- return 3 + emitRepeat(dst[3:], offset, length)
+ return off + emitRepeat(dst[off:], offset, length)
}
if length >= 12 || offset >= 2048 {
// Emit the remaining copy, encoded as 3 bytes.
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
index d9312e5b..88f27c09 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
@@ -5,6 +5,8 @@
package s2
+func _dummy_()
+
// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
// Maximum input 4294967295 bytes.
// It assumes that the varint-encoded length of the decompressed bytes has already been written.
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
index 729dbf53..36915d94 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -5,6 +5,15 @@
#include "textflag.h"
+// func _dummy_()
+TEXT ·_dummy_(SB), $0
+#ifdef GOAMD64_v4
+#ifndef GOAMD64_v3
+#define GOAMD64_v3
+#endif
+#endif
+ RET
+
// func encodeBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeBlockAsm(SB), $65560-56
@@ -253,17 +262,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -476,6 +474,90 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm:
two_byte_offset_repeat_as_copy_encodeBlockAsm:
CMPL SI, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
+ CMPL DI, $0x00000800
+ JAE long_offset_short_repeat_as_copy_encodeBlockAsm
+ MOVL $0x00000001, R8
+ LEAL 16(R8), R8
+ MOVB DI, 1(AX)
+ MOVL DI, R9
+ SHRL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, SI
+
+ // emitRepeat
+ LEAL -4(SI), SI
+ JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
+
+emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
+ CMPL SI, $0x0100ffff
+ JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
+ LEAL -16842747(SI), SI
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
+
+repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
+ LEAL -65536(SI), SI
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+long_offset_short_repeat_as_copy_encodeBlockAsm:
MOVB $0xee, (AX)
MOVW DI, 1(AX)
LEAL -60(SI), SI
@@ -791,17 +873,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -944,6 +1015,90 @@ four_bytes_remain_match_nolit_encodeBlockAsm:
two_byte_offset_match_nolit_encodeBlockAsm:
CMPL R10, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm
+ CMPL SI, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBlockAsm
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ MOVL SI, R8
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, DI
+ MOVB DI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R10
+
+ // emitRepeat
+ LEAL -4(R10), R10
+ JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
+
+emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
+ LEAL -16842747(R10), R10
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b
+
+repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
+ LEAL -65536(R10), R10
+ MOVL R10, SI
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+long_offset_short_match_nolit_encodeBlockAsm:
MOVB $0xee, (AX)
MOVW SI, 1(AX)
LEAL -60(R10), R10
@@ -1134,17 +1289,36 @@ memmove_emit_remainder_encodeBlockAsm:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeBlockAsm
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
@@ -1466,17 +1640,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -1667,6 +1830,77 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
CMPL SI, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
+ CMPL DI, $0x00000800
+ JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB
+ MOVL $0x00000001, R8
+ LEAL 16(R8), R8
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, SI
+
+ // emitRepeat
+ LEAL -4(SI), SI
+ JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
+ LEAL -65536(SI), SI
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
MOVB $0xee, (AX)
MOVW DI, 1(AX)
LEAL -60(SI), SI
@@ -1963,17 +2197,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -2105,6 +2328,77 @@ four_bytes_remain_match_nolit_encodeBlockAsm4MB:
two_byte_offset_match_nolit_encodeBlockAsm4MB:
CMPL R10, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
+ CMPL SI, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBlockAsm4MB
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R10
+
+ // emitRepeat
+ LEAL -4(R10), R10
+ JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
+ LEAL -65536(R10), R10
+ MOVL R10, SI
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+long_offset_short_match_nolit_encodeBlockAsm4MB:
MOVB $0xee, (AX)
MOVW SI, 1(AX)
LEAL -60(R10), R10
@@ -2276,17 +2570,36 @@ memmove_emit_remainder_encodeBlockAsm4MB:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
@@ -2597,17 +2910,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm12B:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -2706,6 +3008,65 @@ repeat_as_copy_encodeBlockAsm12B:
two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
CMPL SI, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
+ CMPL DI, $0x00000800
+ JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B
+ MOVL $0x00000001, R8
+ LEAL 16(R8), R8
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, SI
+
+ // emitRepeat
+ LEAL -4(SI), SI
+ JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+long_offset_short_repeat_as_copy_encodeBlockAsm12B:
MOVB $0xee, (AX)
MOVW DI, 1(AX)
LEAL -60(SI), SI
@@ -2979,17 +3340,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm12B:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -3041,6 +3391,65 @@ match_nolit_end_encodeBlockAsm12B:
two_byte_offset_match_nolit_encodeBlockAsm12B:
CMPL R10, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B
+ CMPL SI, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBlockAsm12B
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R10
+
+ // emitRepeat
+ LEAL -4(R10), R10
+ JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+long_offset_short_match_nolit_encodeBlockAsm12B:
MOVB $0xee, (AX)
MOVW SI, 1(AX)
LEAL -60(R10), R10
@@ -3189,17 +3598,36 @@ memmove_emit_remainder_encodeBlockAsm12B:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
@@ -3510,17 +3938,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm10B:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -3619,6 +4036,65 @@ repeat_as_copy_encodeBlockAsm10B:
two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
CMPL SI, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
+ CMPL DI, $0x00000800
+ JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B
+ MOVL $0x00000001, R8
+ LEAL 16(R8), R8
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, SI
+
+ // emitRepeat
+ LEAL -4(SI), SI
+ JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+long_offset_short_repeat_as_copy_encodeBlockAsm10B:
MOVB $0xee, (AX)
MOVW DI, 1(AX)
LEAL -60(SI), SI
@@ -3892,17 +4368,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm10B:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -3954,6 +4419,65 @@ match_nolit_end_encodeBlockAsm10B:
two_byte_offset_match_nolit_encodeBlockAsm10B:
CMPL R10, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B
+ CMPL SI, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBlockAsm10B
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R10
+
+ // emitRepeat
+ LEAL -4(R10), R10
+ JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+long_offset_short_match_nolit_encodeBlockAsm10B:
MOVB $0xee, (AX)
MOVW SI, 1(AX)
LEAL -60(R10), R10
@@ -4102,17 +4626,36 @@ memmove_emit_remainder_encodeBlockAsm10B:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
@@ -4423,17 +4966,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm8B:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -4528,6 +5060,61 @@ repeat_as_copy_encodeBlockAsm8B:
two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
CMPL SI, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
+ CMPL DI, $0x00000800
+ JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B
+ MOVL $0x00000001, R8
+ LEAL 16(R8), R8
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, SI
+
+ // emitRepeat
+ LEAL -4(SI), SI
+ JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
+ MOVL SI, DI
+ LEAL -4(SI), SI
+ CMPL DI, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+long_offset_short_repeat_as_copy_encodeBlockAsm8B:
MOVB $0xee, (AX)
MOVW DI, 1(AX)
LEAL -60(SI), SI
@@ -4795,17 +5382,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm8B:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -4857,6 +5433,61 @@ match_nolit_end_encodeBlockAsm8B:
two_byte_offset_match_nolit_encodeBlockAsm8B:
CMPL R10, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B
+ CMPL SI, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBlockAsm8B
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R10
+
+ // emitRepeat
+ LEAL -4(R10), R10
+ JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
+ MOVL R10, SI
+ LEAL -4(R10), R10
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+long_offset_short_match_nolit_encodeBlockAsm8B:
MOVB $0xee, (AX)
MOVW SI, 1(AX)
LEAL -60(R10), R10
@@ -4999,17 +5630,36 @@ memmove_emit_remainder_encodeBlockAsm8B:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
@@ -5225,17 +5875,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -5541,6 +6180,90 @@ four_bytes_remain_match_nolit_encodeBetterBlockAsm:
two_byte_offset_match_nolit_encodeBetterBlockAsm:
CMPL R12, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
+ CMPL R8, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBetterBlockAsm
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB R8, 1(AX)
+ MOVL R8, R9
+ SHRL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R12
+
+ // emitRepeat
+ LEAL -4(R12), R12
+ JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+ CMPL R12, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+ LEAL -16842747(R12), R12
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+long_offset_short_match_nolit_encodeBetterBlockAsm:
MOVB $0xee, (AX)
MOVW R8, 1(AX)
LEAL -60(R12), R12
@@ -5979,8 +6702,9 @@ memmove_emit_remainder_encodeBetterBlockAsm:
MOVL SI, BX
// genMemMoveShort
- CMPQ BX, $0x04
- JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
CMPQ BX, $0x08
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
CMPQ BX, $0x10
@@ -5989,9 +6713,18 @@ memmove_emit_remainder_encodeBetterBlockAsm:
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4:
- MOVL (CX), SI
- MOVL SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
@@ -6214,17 +6947,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -6511,6 +7233,77 @@ four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
CMPL R12, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
+ CMPL R8, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R12
+
+ // emitRepeat
+ LEAL -4(R12), R12
+ JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
MOVB $0xee, (AX)
MOVW R8, 1(AX)
LEAL -60(R12), R12
@@ -6911,8 +7704,9 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB:
MOVL SI, BX
// genMemMoveShort
- CMPQ BX, $0x04
- JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
CMPQ BX, $0x08
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
CMPQ BX, $0x10
@@ -6921,9 +7715,18 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB:
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4:
- MOVL (CX), SI
- MOVL SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
@@ -7138,17 +7941,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -7335,6 +8127,65 @@ emit_literal_done_match_emit_encodeBetterBlockAsm12B:
two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
CMPL R12, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
+ CMPL R8, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R12
+
+ // emitRepeat
+ LEAL -4(R12), R12
+ JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+long_offset_short_match_nolit_encodeBetterBlockAsm12B:
MOVB $0xee, (AX)
MOVW R8, 1(AX)
LEAL -60(R12), R12
@@ -7689,8 +8540,9 @@ memmove_emit_remainder_encodeBetterBlockAsm12B:
MOVL SI, BX
// genMemMoveShort
- CMPQ BX, $0x04
- JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3
CMPQ BX, $0x08
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
CMPQ BX, $0x10
@@ -7699,9 +8551,18 @@ memmove_emit_remainder_encodeBetterBlockAsm12B:
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4:
- MOVL (CX), SI
- MOVL SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
@@ -7916,17 +8777,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -8113,6 +8963,65 @@ emit_literal_done_match_emit_encodeBetterBlockAsm10B:
two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
CMPL R12, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
+ CMPL R8, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R12
+
+ // emitRepeat
+ LEAL -4(R12), R12
+ JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+long_offset_short_match_nolit_encodeBetterBlockAsm10B:
MOVB $0xee, (AX)
MOVW R8, 1(AX)
LEAL -60(R12), R12
@@ -8467,8 +9376,9 @@ memmove_emit_remainder_encodeBetterBlockAsm10B:
MOVL SI, BX
// genMemMoveShort
- CMPQ BX, $0x04
- JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
CMPQ BX, $0x08
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
CMPQ BX, $0x10
@@ -8477,9 +9387,18 @@ memmove_emit_remainder_encodeBetterBlockAsm10B:
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4:
- MOVL (CX), SI
- MOVL SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
@@ -8694,17 +9613,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -8891,6 +9799,61 @@ emit_literal_done_match_emit_encodeBetterBlockAsm8B:
two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
CMPL R12, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
+ CMPL R8, $0x00000800
+ JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R12
+
+ // emitRepeat
+ LEAL -4(R12), R12
+ JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+long_offset_short_match_nolit_encodeBetterBlockAsm8B:
MOVB $0xee, (AX)
MOVW R8, 1(AX)
LEAL -60(R12), R12
@@ -9235,8 +10198,9 @@ memmove_emit_remainder_encodeBetterBlockAsm8B:
MOVL SI, BX
// genMemMoveShort
- CMPQ BX, $0x04
- JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
CMPQ BX, $0x08
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
CMPQ BX, $0x10
@@ -9245,9 +10209,18 @@ memmove_emit_remainder_encodeBetterBlockAsm8B:
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4:
- MOVL (CX), SI
- MOVL SI, (AX)
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
@@ -9584,17 +10557,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
TZCNTQ R10, R10
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R10, R10
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R10, R10
@@ -9918,17 +10880,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -10127,17 +11078,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
@@ -10448,17 +11418,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
#ifdef GOAMD64_v3
TZCNTQ R10, R10
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R10, R10
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R10, R10
@@ -10739,17 +11698,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -10905,17 +11853,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm64K:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
@@ -11226,17 +12193,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
#ifdef GOAMD64_v3
TZCNTQ R10, R10
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R10, R10
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R10, R10
@@ -11517,17 +12473,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -11683,17 +12628,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm12B:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
@@ -12004,17 +12968,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
#ifdef GOAMD64_v3
TZCNTQ R10, R10
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R10, R10
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R10, R10
@@ -12295,17 +13248,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -12461,17 +13403,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm10B:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
@@ -12782,17 +13743,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
#ifdef GOAMD64_v3
TZCNTQ R10, R10
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R10, R10
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R10, R10
@@ -13071,17 +14021,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
#ifdef GOAMD64_v3
TZCNTQ R9, R9
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R9, R9
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R9, R9
@@ -13235,17 +14174,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm8B:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
@@ -13461,17 +14419,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -13850,17 +14797,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
@@ -14068,17 +15034,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -14386,17 +15341,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
@@ -14604,17 +15578,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -14922,17 +15885,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
@@ -15140,17 +16122,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -15458,17 +16429,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
@@ -15676,17 +16666,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
#ifdef GOAMD64_v3
TZCNTQ R11, R11
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ R11, R11
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ R11, R11
@@ -15992,17 +16971,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
MOVL SI, BX
// genMemMoveShort
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3
CMPQ BX, $0x08
- JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8
+ JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7
CMPQ BX, $0x10
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
CMPQ BX, $0x20
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8:
- MOVQ (CX), SI
- MOVQ SI, (AX)
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
@@ -16443,6 +17441,97 @@ four_bytes_remain_standalone:
two_byte_offset_standalone:
CMPL DX, $0x40
JLE two_byte_offset_short_standalone
+ CMPL CX, $0x00000800
+ JAE long_offset_short_standalone
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB CL, 1(AX)
+ MOVL CX, DI
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ SUBL $0x08, DX
+
+ // emitRepeat
+ LEAL -4(DX), DX
+ JMP cant_repeat_two_offset_standalone_emit_copy_short_2b
+
+emit_repeat_again_standalone_emit_copy_short_2b:
+ MOVL DX, SI
+ LEAL -4(DX), DX
+ CMPL SI, $0x08
+ JLE repeat_two_standalone_emit_copy_short_2b
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_standalone_emit_copy_short_2b
+ CMPL CX, $0x00000800
+ JLT repeat_two_offset_standalone_emit_copy_short_2b
+
+cant_repeat_two_offset_standalone_emit_copy_short_2b:
+ CMPL DX, $0x00000104
+ JLT repeat_three_standalone_emit_copy_short_2b
+ CMPL DX, $0x00010100
+ JLT repeat_four_standalone_emit_copy_short_2b
+ CMPL DX, $0x0100ffff
+ JLT repeat_five_standalone_emit_copy_short_2b
+ LEAL -16842747(DX), DX
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ ADDQ $0x05, BX
+ JMP emit_repeat_again_standalone_emit_copy_short_2b
+
+repeat_five_standalone_emit_copy_short_2b:
+ LEAL -65536(DX), DX
+ MOVL DX, CX
+ MOVW $0x001d, (AX)
+ MOVW DX, 2(AX)
+ SARL $0x10, CX
+ MOVB CL, 4(AX)
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_copy_end
+
+repeat_four_standalone_emit_copy_short_2b:
+ LEAL -256(DX), DX
+ MOVW $0x0019, (AX)
+ MOVW DX, 2(AX)
+ ADDQ $0x04, BX
+ ADDQ $0x04, AX
+ JMP gen_emit_copy_end
+
+repeat_three_standalone_emit_copy_short_2b:
+ LEAL -4(DX), DX
+ MOVW $0x0015, (AX)
+ MOVB DL, 2(AX)
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+ JMP gen_emit_copy_end
+
+repeat_two_standalone_emit_copy_short_2b:
+ SHLL $0x02, DX
+ ORL $0x01, DX
+ MOVW DX, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+
+repeat_two_offset_standalone_emit_copy_short_2b:
+ XORQ SI, SI
+ LEAL 1(SI)(DX*4), DX
+ MOVB CL, 1(AX)
+ SARL $0x08, CX
+ SHLL $0x05, CX
+ ORL CX, DX
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+
+long_offset_short_standalone:
MOVB $0xee, (AX)
MOVW CX, 1(AX)
LEAL -60(DX), DX
@@ -16644,17 +17733,6 @@ matchlen_loopback_standalone:
#ifdef GOAMD64_v3
TZCNTQ BX, BX
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef GOAMD64_v4
- TZCNTQ BX, BX
-
-#define TZCNTQ_EMITTED 1
-#endif
-
-#ifdef TZCNTQ_EMITTED
-#undef TZCNTQ_EMITTED
#else
BSFQ BX, BX
diff --git a/vendor/github.com/klauspost/compress/s2/index.go b/vendor/github.com/klauspost/compress/s2/index.go
index fd857682..7b24a006 100644
--- a/vendor/github.com/klauspost/compress/s2/index.go
+++ b/vendor/github.com/klauspost/compress/s2/index.go
@@ -10,6 +10,7 @@ import (
"encoding/json"
"fmt"
"io"
+ "sort"
)
const (
@@ -100,6 +101,15 @@ func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err er
if offset > i.TotalUncompressed {
return 0, 0, io.ErrUnexpectedEOF
}
+ if len(i.info) > 200 {
+ n := sort.Search(len(i.info), func(n int) bool {
+ return i.info[n].uncompressedOffset > offset
+ })
+ if n == 0 {
+ n = 1
+ }
+ return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil
+ }
for _, info := range i.info {
if info.uncompressedOffset > offset {
break