summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost')
-rw-r--r--vendor/github.com/klauspost/compress/.goreleaser.yml2
-rw-r--r--vendor/github.com/klauspost/compress/README.md64
-rw-r--r--vendor/github.com/klauspost/compress/fse/compress.go31
-rw-r--r--vendor/github.com/klauspost/compress/huff0/bitreader.go8
-rw-r--r--vendor/github.com/klauspost/compress/huff0/compress.go114
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress.go38
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.go4
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s585
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_generic.go18
-rw-r--r--vendor/github.com/klauspost/compress/internal/snapref/encode_other.go28
-rw-r--r--vendor/github.com/klauspost/compress/s2/README.md469
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode.go500
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_other.go47
-rw-r--r--vendor/github.com/klauspost/compress/s2/dict.go331
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode.go54
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_all.go595
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_amd64.go14
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_best.go225
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_better.go723
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_go.go414
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go47
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s15989
-rw-r--r--vendor/github.com/klauspost/compress/s2/lz4convert.go585
-rw-r--r--vendor/github.com/klauspost/compress/zstd/README.md2
-rw-r--r--vendor/github.com/klauspost/compress/zstd/blockdec.go19
-rw-r--r--vendor/github.com/klauspost/compress/zstd/bytebuf.go16
-rw-r--r--vendor/github.com/klauspost/compress/zstd/decodeheader.go9
-rw-r--r--vendor/github.com/klauspost/compress/zstd/decoder.go140
-rw-r--r--vendor/github.com/klauspost/compress/zstd/decoder_options.go68
-rw-r--r--vendor/github.com/klauspost/compress/zstd/dict.go51
-rw-r--r--vendor/github.com/klauspost/compress/zstd/enc_base.go28
-rw-r--r--vendor/github.com/klauspost/compress/zstd/enc_best.go64
-rw-r--r--vendor/github.com/klauspost/compress/zstd/enc_better.go35
-rw-r--r--vendor/github.com/klauspost/compress/zstd/enc_dfast.go23
-rw-r--r--vendor/github.com/klauspost/compress/zstd/enc_fast.go20
-rw-r--r--vendor/github.com/klauspost/compress/zstd/encoder.go39
-rw-r--r--vendor/github.com/klauspost/compress/zstd/encoder_options.go38
-rw-r--r--vendor/github.com/klauspost/compress/zstd/framedec.go104
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go7
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s1
-rw-r--r--vendor/github.com/klauspost/compress/zstd/history.go21
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md49
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go47
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s336
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s140
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go2
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go19
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec.go22
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go43
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s489
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_generic.go4
-rw-r--r--vendor/github.com/klauspost/compress/zstd/zstd.go31
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/README.md350
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/cpuid.go361
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/detect_x86.go2
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/featureid_string.go324
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go112
57 files changed, 15136 insertions, 8765 deletions
diff --git a/vendor/github.com/klauspost/compress/.goreleaser.yml b/vendor/github.com/klauspost/compress/.goreleaser.yml
index 0af08e65..7a008a4d 100644
--- a/vendor/github.com/klauspost/compress/.goreleaser.yml
+++ b/vendor/github.com/klauspost/compress/.goreleaser.yml
@@ -3,7 +3,7 @@
before:
hooks:
- ./gen.sh
- - go install mvdan.cc/garble@latest
+ - go install mvdan.cc/garble@v0.9.3
builds:
-
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index c7cf1a20..958666ed 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -9,7 +9,6 @@ This package provides various compression algorithms.
* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.
* [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp) Provides client and server wrappers for handling gzipped requests efficiently.
* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.
-* [fuzz package](https://github.com/klauspost/compress-fuzz) for fuzz testing all compressors/decompressors here.
[![Go Reference](https://pkg.go.dev/badge/klauspost/compress.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)
[![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml)
@@ -17,6 +16,62 @@ This package provides various compression algorithms.
# changelog
+* Jan 21st, 2023 (v1.15.15)
+ * deflate: Improve level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/739
+ * zstd: Add delta encoding support by @greatroar in https://github.com/klauspost/compress/pull/728
+ * zstd: Various speed improvements by @greatroar https://github.com/klauspost/compress/pull/741 https://github.com/klauspost/compress/pull/734 https://github.com/klauspost/compress/pull/736 https://github.com/klauspost/compress/pull/744 https://github.com/klauspost/compress/pull/743 https://github.com/klauspost/compress/pull/745
+ * gzhttp: Add SuffixETag() and DropETag() options to prevent ETag collisions on compressed responses by @willbicks in https://github.com/klauspost/compress/pull/740
+
+* Jan 3rd, 2023 (v1.15.14)
+
+ * flate: Improve speed in big stateless blocks https://github.com/klauspost/compress/pull/718
+ * zstd: Minor speed tweaks by @greatroar in https://github.com/klauspost/compress/pull/716 https://github.com/klauspost/compress/pull/720
+ * export NoGzipResponseWriter for custom ResponseWriter wrappers by @harshavardhana in https://github.com/klauspost/compress/pull/722
+ * s2: Add example for indexing an existing stream https://github.com/klauspost/compress/pull/723
+
+* Dec 11, 2022 (v1.15.13)
+ * zstd: Add [MaxEncodedSize](https://pkg.go.dev/github.com/klauspost/compress@v1.15.13/zstd#Encoder.MaxEncodedSize) to encoder https://github.com/klauspost/compress/pull/691
+ * zstd: Various tweaks and improvements https://github.com/klauspost/compress/pull/693 https://github.com/klauspost/compress/pull/695 https://github.com/klauspost/compress/pull/696 https://github.com/klauspost/compress/pull/701 https://github.com/klauspost/compress/pull/702 https://github.com/klauspost/compress/pull/703 https://github.com/klauspost/compress/pull/704 https://github.com/klauspost/compress/pull/705 https://github.com/klauspost/compress/pull/706 https://github.com/klauspost/compress/pull/707 https://github.com/klauspost/compress/pull/708
+
+* Oct 26, 2022 (v1.15.12)
+
+ * zstd: Tweak decoder allocs. https://github.com/klauspost/compress/pull/680
+ * gzhttp: Always delete `HeaderNoCompression` https://github.com/klauspost/compress/pull/683
+
+* Sept 26, 2022 (v1.15.11)
+
+ * flate: Improve level 1-3 compression https://github.com/klauspost/compress/pull/678
+ * zstd: Improve "best" compression by @nightwolfz in https://github.com/klauspost/compress/pull/677
+ * zstd: Fix+reduce decompression allocations https://github.com/klauspost/compress/pull/668
+ * zstd: Fix non-effective noescape tag https://github.com/klauspost/compress/pull/667
+
+* Sept 16, 2022 (v1.15.10)
+
+ * zstd: Add [WithDecodeAllCapLimit](https://pkg.go.dev/github.com/klauspost/compress@v1.15.10/zstd#WithDecodeAllCapLimit) https://github.com/klauspost/compress/pull/649
+ * Add Go 1.19 - deprecate Go 1.16 https://github.com/klauspost/compress/pull/651
+ * flate: Improve level 5+6 compression https://github.com/klauspost/compress/pull/656
+ * zstd: Improve "better" compression https://github.com/klauspost/compress/pull/657
+ * s2: Improve "best" compression https://github.com/klauspost/compress/pull/658
+ * s2: Improve "better" compression. https://github.com/klauspost/compress/pull/635
+ * s2: Slightly faster non-assembly decompression https://github.com/klauspost/compress/pull/646
+ * Use arrays for constant size copies https://github.com/klauspost/compress/pull/659
+
+* July 21, 2022 (v1.15.9)
+
+ * zstd: Fix decoder crash on amd64 (no BMI) on invalid input https://github.com/klauspost/compress/pull/645
+ * zstd: Disable decoder extended memory copies (amd64) due to possible crashes https://github.com/klauspost/compress/pull/644
+ * zstd: Allow single segments up to "max decoded size" by @klauspost in https://github.com/klauspost/compress/pull/643
+
+* July 13, 2022 (v1.15.8)
+
+ * gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
+ * s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
+ * zstd: Optimize seqdec amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
+ * zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
+ * huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
+ * zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
+ * gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
+
* June 29, 2022 (v1.15.7)
* s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633
@@ -81,15 +136,15 @@ This package provides various compression algorithms.
* gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400)
* gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510)
-<details>
- <summary>See Details</summary>
Both compression and decompression now support "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.
Asynchronous stream decompression is now faster, since the goroutine allocation splits the workload much more effectively. Typical streams will fully use 2 cores for decompression. When a stream has finished decoding no goroutines will be left over, so decoders can now safely be pooled and still be garbage collected.
While the release has been extensively tested, it is recommended to test when upgrading.
-</details>
+<details>
+ <summary>See changes to v1.14.x</summary>
+
* Feb 22, 2022 (v1.14.4)
* flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)
* zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)
@@ -115,6 +170,7 @@ While the release has been extensively tested, it is recommended to testing when
* zstd: Performance improvement in [#420]( https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468)
* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
+</details>
<details>
<summary>See changes to v1.13.x</summary>
diff --git a/vendor/github.com/klauspost/compress/fse/compress.go b/vendor/github.com/klauspost/compress/fse/compress.go
index 6f341914..dac97e58 100644
--- a/vendor/github.com/klauspost/compress/fse/compress.go
+++ b/vendor/github.com/klauspost/compress/fse/compress.go
@@ -146,54 +146,51 @@ func (s *Scratch) compress(src []byte) error {
c1.encodeZero(tt[src[ip-2]])
ip -= 2
}
+ src = src[:ip]
// Main compression loop.
switch {
case !s.zeroBits && s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush.
// We do not need to check if any output is 0 bits.
- for ip >= 4 {
+ for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
- v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+ v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0])
c1.encode(tt[v1])
c2.encode(tt[v2])
c1.encode(tt[v3])
- ip -= 4
}
case !s.zeroBits:
// We do not need to check if any output is 0 bits.
- for ip >= 4 {
+ for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
- v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+ v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0])
c1.encode(tt[v1])
s.bw.flush32()
c2.encode(tt[v2])
c1.encode(tt[v3])
- ip -= 4
}
case s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush
- for ip >= 4 {
+ for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
- v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+ v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1])
c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3])
- ip -= 4
}
default:
- for ip >= 4 {
+ for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
- v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+ v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1])
s.bw.flush32()
c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3])
- ip -= 4
}
}
@@ -459,15 +456,17 @@ func (s *Scratch) countSimple(in []byte) (max int) {
for _, v := range in {
s.count[v]++
}
- m := uint32(0)
+ m, symlen := uint32(0), s.symbolLen
for i, v := range s.count[:] {
+ if v == 0 {
+ continue
+ }
if v > m {
m = v
}
- if v > 0 {
- s.symbolLen = uint16(i) + 1
- }
+ symlen = uint16(i) + 1
}
+ s.symbolLen = symlen
return int(m)
}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index 504a7be9..e36d9742 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -67,7 +67,6 @@ func (b *bitReaderBytes) fillFast() {
// 2 bounds checks.
v := b.in[b.off-4 : b.off]
- v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32
@@ -88,8 +87,7 @@ func (b *bitReaderBytes) fill() {
return
}
if b.off > 4 {
- v := b.in[b.off-4:]
- v = v[:4]
+ v := b.in[b.off-4 : b.off]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32
@@ -179,7 +177,6 @@ func (b *bitReaderShifted) fillFast() {
// 2 bounds checks.
v := b.in[b.off-4 : b.off]
- v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32
@@ -200,8 +197,7 @@ func (b *bitReaderShifted) fill() {
return
}
if b.off > 4 {
- v := b.in[b.off-4:]
- v = v[:4]
+ v := b.in[b.off-4 : b.off]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 4d14542f..cdc94856 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -365,29 +365,29 @@ func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
m := uint32(0)
if len(s.prevTable) > 0 {
for i, v := range s.count[:] {
+ if v == 0 {
+ continue
+ }
if v > m {
m = v
}
- if v > 0 {
- s.symbolLen = uint16(i) + 1
- if i >= len(s.prevTable) {
- reuse = false
- } else {
- if s.prevTable[i].nBits == 0 {
- reuse = false
- }
- }
+ s.symbolLen = uint16(i) + 1
+ if i >= len(s.prevTable) {
+ reuse = false
+ } else if s.prevTable[i].nBits == 0 {
+ reuse = false
}
}
return int(m), reuse
}
for i, v := range s.count[:] {
+ if v == 0 {
+ continue
+ }
if v > m {
m = v
}
- if v > 0 {
- s.symbolLen = uint16(i) + 1
- }
+ s.symbolLen = uint16(i) + 1
}
return int(m), false
}
@@ -484,34 +484,35 @@ func (s *Scratch) buildCTable() error {
// Different from reference implementation.
huffNode0 := s.nodes[0 : huffNodesLen+1]
- for huffNode[nonNullRank].count == 0 {
+ for huffNode[nonNullRank].count() == 0 {
nonNullRank--
}
lowS := int16(nonNullRank)
nodeRoot := nodeNb + lowS - 1
lowN := nodeNb
- huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count
- huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb)
+ huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count())
+ huffNode[lowS].setParent(nodeNb)
+ huffNode[lowS-1].setParent(nodeNb)
nodeNb++
lowS -= 2
for n := nodeNb; n <= nodeRoot; n++ {
- huffNode[n].count = 1 << 30
+ huffNode[n].setCount(1 << 30)
}
// fake entry, strong barrier
- huffNode0[0].count = 1 << 31
+ huffNode0[0].setCount(1 << 31)
// create parents
for nodeNb <= nodeRoot {
var n1, n2 int16
- if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
+ if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n1 = lowS
lowS--
} else {
n1 = lowN
lowN++
}
- if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
+ if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n2 = lowS
lowS--
} else {
@@ -519,18 +520,19 @@ func (s *Scratch) buildCTable() error {
lowN++
}
- huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count
- huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb)
+ huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count())
+ huffNode0[n1+1].setParent(nodeNb)
+ huffNode0[n2+1].setParent(nodeNb)
nodeNb++
}
// distribute weights (unlimited tree height)
- huffNode[nodeRoot].nbBits = 0
+ huffNode[nodeRoot].setNbBits(0)
for n := nodeRoot - 1; n >= startNode; n-- {
- huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
+ huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
for n := uint16(0); n <= nonNullRank; n++ {
- huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
+ huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
s.actualTableLog = s.setMaxHeight(int(nonNullRank))
maxNbBits := s.actualTableLog
@@ -542,7 +544,7 @@ func (s *Scratch) buildCTable() error {
var nbPerRank [tableLogMax + 1]uint16
var valPerRank [16]uint16
for _, v := range huffNode[:nonNullRank+1] {
- nbPerRank[v.nbBits]++
+ nbPerRank[v.nbBits()]++
}
// determine starting value per rank
{
@@ -557,7 +559,7 @@ func (s *Scratch) buildCTable() error {
// push nbBits per symbol, symbol order
for _, v := range huffNode[:nonNullRank+1] {
- s.cTable[v.symbol].nBits = v.nbBits
+ s.cTable[v.symbol()].nBits = v.nbBits()
}
// assign value within rank, symbol order
@@ -603,12 +605,12 @@ func (s *Scratch) huffSort() {
pos := rank[r].current
rank[r].current++
prev := nodes[(pos-1)&huffNodesMask]
- for pos > rank[r].base && c > prev.count {
+ for pos > rank[r].base && c > prev.count() {
nodes[pos&huffNodesMask] = prev
pos--
prev = nodes[(pos-1)&huffNodesMask]
}
- nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
+ nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n))
}
}
@@ -617,7 +619,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
huffNode := s.nodes[1 : huffNodesLen+1]
//huffNode = huffNode[: huffNodesLen]
- largestBits := huffNode[lastNonNull].nbBits
+ largestBits := huffNode[lastNonNull].nbBits()
// early exit : no elt > maxNbBits
if largestBits <= maxNbBits {
@@ -627,14 +629,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
baseCost := int(1) << (largestBits - maxNbBits)
n := uint32(lastNonNull)
- for huffNode[n].nbBits > maxNbBits {
- totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits))
- huffNode[n].nbBits = maxNbBits
+ for huffNode[n].nbBits() > maxNbBits {
+ totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits()))
+ huffNode[n].setNbBits(maxNbBits)
n--
}
// n stops at huffNode[n].nbBits <= maxNbBits
- for huffNode[n].nbBits == maxNbBits {
+ for huffNode[n].nbBits() == maxNbBits {
n--
}
// n end at index of smallest symbol using < maxNbBits
@@ -655,10 +657,10 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
{
currentNbBits := maxNbBits
for pos := int(n); pos >= 0; pos-- {
- if huffNode[pos].nbBits >= currentNbBits {
+ if huffNode[pos].nbBits() >= currentNbBits {
continue
}
- currentNbBits = huffNode[pos].nbBits // < maxNbBits
+ currentNbBits = huffNode[pos].nbBits() // < maxNbBits
rankLast[maxNbBits-currentNbBits] = uint32(pos)
}
}
@@ -675,8 +677,8 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
if lowPos == noSymbol {
break
}
- highTotal := huffNode[highPos].count
- lowTotal := 2 * huffNode[lowPos].count
+ highTotal := huffNode[highPos].count()
+ lowTotal := 2 * huffNode[lowPos].count()
if highTotal <= lowTotal {
break
}
@@ -692,13 +694,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
// this rank is no longer empty
rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
}
- huffNode[rankLast[nBitsToDecrease]].nbBits++
+ huffNode[rankLast[nBitsToDecrease]].setNbBits(1 +
+ huffNode[rankLast[nBitsToDecrease]].nbBits())
if rankLast[nBitsToDecrease] == 0 {
/* special case, reached largest symbol */
rankLast[nBitsToDecrease] = noSymbol
} else {
rankLast[nBitsToDecrease]--
- if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease {
+ if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease {
rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
}
}
@@ -706,15 +709,15 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
for totalCost < 0 { /* Sometimes, cost correction overshoot */
if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
- for huffNode[n].nbBits == maxNbBits {
+ for huffNode[n].nbBits() == maxNbBits {
n--
}
- huffNode[n+1].nbBits--
+ huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1)
rankLast[1] = n + 1
totalCost++
continue
}
- huffNode[rankLast[1]+1].nbBits--
+ huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1)
rankLast[1]++
totalCost++
}
@@ -722,9 +725,26 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
return maxNbBits
}
-type nodeElt struct {
- count uint32
- parent uint16
- symbol byte
- nbBits uint8
+// A nodeElt is the fields
+//
+// count uint32
+// parent uint16
+// symbol byte
+// nbBits uint8
+//
+// in some order, all squashed into an integer so that the compiler
+// always loads and stores entire nodeElts instead of separate fields.
+type nodeElt uint64
+
+func makeNodeElt(count uint32, symbol byte) nodeElt {
+ return nodeElt(count) | nodeElt(symbol)<<48
}
+
+func (e *nodeElt) count() uint32 { return uint32(*e) }
+func (e *nodeElt) parent() uint16 { return uint16(*e >> 32) }
+func (e *nodeElt) symbol() byte { return byte(*e >> 48) }
+func (e *nodeElt) nbBits() uint8 { return uint8(*e >> 56) }
+
+func (e *nodeElt) setCount(c uint32) { *e = (*e)&0xffffffff00000000 | nodeElt(c) }
+func (e *nodeElt) setParent(p int16) { *e = (*e)&0xffff0000ffffffff | nodeElt(uint16(p))<<32 }
+func (e *nodeElt) setNbBits(n uint8) { *e = (*e)&0x00ffffffffffffff | nodeElt(n)<<56 }
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index c0c48bd7..3c0b398c 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -61,7 +61,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
b, err := fse.Decompress(in[:iSize], s.fse)
s.fse.Out = nil
if err != nil {
- return s, nil, err
+ return s, nil, fmt.Errorf("fse decompress returned: %w", err)
}
if len(b) > 255 {
return s, nil, errors.New("corrupt input: output table too large")
@@ -763,17 +763,20 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
- copy(out, buf[0][:])
- copy(out[dstEvery:], buf[1][:])
- copy(out[dstEvery*2:], buf[2][:])
- copy(out[dstEvery*3:], buf[3][:])
- out = out[bufoff:]
- decoded += bufoff * 4
// There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
+ if len(out)-bufoff < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
+ //copy(out, buf[0][:])
+ //copy(out[dstEvery:], buf[1][:])
+ //copy(out[dstEvery*2:], buf[2][:])
+ *(*[bufoff]byte)(out) = buf[0]
+ *(*[bufoff]byte)(out[dstEvery:]) = buf[1]
+ *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
+ *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
+ out = out[bufoff:]
+ decoded += bufoff * 4
}
}
if off > 0 {
@@ -997,17 +1000,22 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
- copy(out, buf[0][:])
- copy(out[dstEvery:], buf[1][:])
- copy(out[dstEvery*2:], buf[2][:])
- copy(out[dstEvery*3:], buf[3][:])
- out = out[bufoff:]
- decoded += bufoff * 4
// There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
+ if len(out)-bufoff < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
+
+ //copy(out, buf[0][:])
+ //copy(out[dstEvery:], buf[1][:])
+ //copy(out[dstEvery*2:], buf[2][:])
+ // copy(out[dstEvery*3:], buf[3][:])
+ *(*[bufoff]byte)(out) = buf[0]
+ *(*[bufoff]byte)(out[dstEvery:]) = buf[1]
+ *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
+ *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
+ out = out[bufoff:]
+ decoded += bufoff * 4
}
}
if off > 0 {
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
index 9f3e9f79..ba7e8e6b 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -14,12 +14,14 @@ import (
// decompress4x_main_loop_amd64 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
+//
//go:noescape
func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_main_loop_amd64 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
+//
//go:noescape
func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
@@ -145,11 +147,13 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
// decompress1x_main_loop_amd64 is an x86 assembler implementation
// of Decompress1X when tablelog > 8.
+//
//go:noescape
func decompress1x_main_loop_amd64(ctx *decompress1xContext)
// decompress1x_main_loop_bmi2 is an x86 with BMI2 assembler implementation
// of Decompress1X when tablelog > 8.
+//
//go:noescape
func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index dd1a5aec..c4c7ab2d 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -1,364 +1,352 @@
// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
//go:build amd64 && !appengine && !noasm && gc
-// +build amd64,!appengine,!noasm,gc
// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_main_loop_amd64(SB), $0-8
- XORQ DX, DX
-
// Preload values
MOVQ ctx+0(FP), AX
MOVBQZX 8(AX), DI
- MOVQ 16(AX), SI
- MOVQ 48(AX), BX
- MOVQ 24(AX), R9
- MOVQ 32(AX), R10
- MOVQ (AX), R11
+ MOVQ 16(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 24(AX), R8
+ MOVQ 32(AX), R9
+ MOVQ (AX), R10
// Main loop
main_loop:
- MOVQ SI, R8
- CMPQ R8, BX
+ XORL DX, DX
+ CMPQ BX, SI
SETGE DL
// br0.fillFast32()
- MOVQ 32(R11), R12
- MOVBQZX 40(R11), R13
- CMPQ R13, $0x20
+ MOVQ 32(R10), R11
+ MOVBQZX 40(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill0
- MOVQ 24(R11), AX
- SUBQ $0x20, R13
+ MOVQ 24(R10), AX
+ SUBQ $0x20, R12
SUBQ $0x04, AX
- MOVQ (R11), R14
+ MOVQ (R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(R14*1), R14
- MOVQ R13, CX
- SHLQ CL, R14
- MOVQ AX, 24(R11)
- ORQ R14, R12
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 24(R10)
+ ORQ R13, R11
- // exhausted = exhausted || (br0.off < 4)
- CMPQ AX, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br0.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br0.peekTopBits(peekBits)
MOVQ DI, CX
- MOVQ R12, R14
- SHRQ CL, R14
+ MOVQ R11, R13
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (R8)
+ MOVW AX, (BX)
// update the bitreader structure
- MOVQ R12, 32(R11)
- MOVB R13, 40(R11)
- ADDQ R9, R8
+ MOVQ R11, 32(R10)
+ MOVB R12, 40(R10)
// br1.fillFast32()
- MOVQ 80(R11), R12
- MOVBQZX 88(R11), R13
- CMPQ R13, $0x20
+ MOVQ 80(R10), R11
+ MOVBQZX 88(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill1
- MOVQ 72(R11), AX
- SUBQ $0x20, R13
+ MOVQ 72(R10), AX
+ SUBQ $0x20, R12
SUBQ $0x04, AX
- MOVQ 48(R11), R14
+ MOVQ 48(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(R14*1), R14
- MOVQ R13, CX
- SHLQ CL, R14
- MOVQ AX, 72(R11)
- ORQ R14, R12
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 72(R10)
+ ORQ R13, R11
- // exhausted = exhausted || (br1.off < 4)
- CMPQ AX, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br1.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br1.peekTopBits(peekBits)
MOVQ DI, CX
- MOVQ R12, R14
- SHRQ CL, R14
+ MOVQ R11, R13
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (R8)
+ MOVW AX, (BX)(R8*1)
// update the bitreader structure
- MOVQ R12, 80(R11)
- MOVB R13, 88(R11)
- ADDQ R9, R8
+ MOVQ R11, 80(R10)
+ MOVB R12, 88(R10)
// br2.fillFast32()
- MOVQ 128(R11), R12
- MOVBQZX 136(R11), R13
- CMPQ R13, $0x20
+ MOVQ 128(R10), R11
+ MOVBQZX 136(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill2
- MOVQ 120(R11), AX
- SUBQ $0x20, R13
+ MOVQ 120(R10), AX
+ SUBQ $0x20, R12
SUBQ $0x04, AX
- MOVQ 96(R11), R14
+ MOVQ 96(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(R14*1), R14
- MOVQ R13, CX
- SHLQ CL, R14
- MOVQ AX, 120(R11)
- ORQ R14, R12
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 120(R10)
+ ORQ R13, R11
- // exhausted = exhausted || (br2.off < 4)
- CMPQ AX, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br2.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br2.peekTopBits(peekBits)
MOVQ DI, CX
- MOVQ R12, R14
- SHRQ CL, R14
+ MOVQ R11, R13
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (R8)
+ MOVW AX, (BX)(R8*2)
// update the bitreader structure
- MOVQ R12, 128(R11)
- MOVB R13, 136(R11)
- ADDQ R9, R8
+ MOVQ R11, 128(R10)
+ MOVB R12, 136(R10)
// br3.fillFast32()
- MOVQ 176(R11), R12
- MOVBQZX 184(R11), R13
- CMPQ R13, $0x20
+ MOVQ 176(R10), R11
+ MOVBQZX 184(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill3
- MOVQ 168(R11), AX
- SUBQ $0x20, R13
+ MOVQ 168(R10), AX
+ SUBQ $0x20, R12
SUBQ $0x04, AX
- MOVQ 144(R11), R14
+ MOVQ 144(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(R14*1), R14
- MOVQ R13, CX
- SHLQ CL, R14
- MOVQ AX, 168(R11)
- ORQ R14, R12
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 168(R10)
+ ORQ R13, R11
- // exhausted = exhausted || (br3.off < 4)
- CMPQ AX, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br3.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br3.peekTopBits(peekBits)
MOVQ DI, CX
- MOVQ R12, R14
- SHRQ CL, R14
+ MOVQ R11, R13
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (R8)
+ LEAQ (R8)(R8*2), CX
+ MOVW AX, (BX)(CX*1)
// update the bitreader structure
- MOVQ R12, 176(R11)
- MOVB R13, 184(R11)
- ADDQ $0x02, SI
+ MOVQ R11, 176(R10)
+ MOVB R12, 184(R10)
+ ADDQ $0x02, BX
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
- SUBQ 16(AX), SI
- SHLQ $0x02, SI
- MOVQ SI, 40(AX)
+ SUBQ 16(AX), BX
+ SHLQ $0x02, BX
+ MOVQ BX, 40(AX)
RET
// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
- XORQ DX, DX
-
// Preload values
MOVQ ctx+0(FP), CX
MOVBQZX 8(CX), DI
MOVQ 16(CX), BX
MOVQ 48(CX), SI
- MOVQ 24(CX), R9
- MOVQ 32(CX), R10
- MOVQ (CX), R11
+ MOVQ 24(CX), R8
+ MOVQ 32(CX), R9
+ MOVQ (CX), R10
// Main loop
main_loop:
- MOVQ BX, R8
- CMPQ R8, SI
+ XORL DX, DX
+ CMPQ BX, SI
SETGE DL
// br0.fillFast32()
- MOVQ 32(R11), R12
- MOVBQZX 40(R11), R13
- CMPQ R13, $0x20
+ MOVQ 32(R10), R11
+ MOVBQZX 40(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill0
- MOVQ 24(R11), R14
- SUBQ $0x20, R13
- SUBQ $0x04, R14
- MOVQ (R11), R15
+ MOVQ 24(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ (R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R14)(R15*1), R15
- MOVQ R13, CX
- SHLQ CL, R15
- MOVQ R14, 24(R11)
- ORQ R15, R12
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 24(R10)
+ ORQ R14, R11
- // exhausted = exhausted || (br0.off < 4)
- CMPQ R14, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br0.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// val2 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v2 := table[val2&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v2.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val3 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v3 := table[val3&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v3.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@@ -366,88 +354,86 @@ skip_fill0:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
- MOVL AX, (R8)
+ MOVL AX, (BX)
// update the bitreader structure
- MOVQ R12, 32(R11)
- MOVB R13, 40(R11)
- ADDQ R9, R8
+ MOVQ R11, 32(R10)
+ MOVB R12, 40(R10)
// br1.fillFast32()
- MOVQ 80(R11), R12
- MOVBQZX 88(R11), R13
- CMPQ R13, $0x20
+ MOVQ 80(R10), R11
+ MOVBQZX 88(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill1
- MOVQ 72(R11), R14
- SUBQ $0x20, R13
- SUBQ $0x04, R14
- MOVQ 48(R11), R15
+ MOVQ 72(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ 48(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R14)(R15*1), R15
- MOVQ R13, CX
- SHLQ CL, R15
- MOVQ R14, 72(R11)
- ORQ R15, R12
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 72(R10)
+ ORQ R14, R11
- // exhausted = exhausted || (br1.off < 4)
- CMPQ R14, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br1.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// val2 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v2 := table[val2&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v2.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val3 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v3 := table[val3&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v3.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@@ -455,88 +441,86 @@ skip_fill1:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
- MOVL AX, (R8)
+ MOVL AX, (BX)(R8*1)
// update the bitreader structure
- MOVQ R12, 80(R11)
- MOVB R13, 88(R11)
- ADDQ R9, R8
+ MOVQ R11, 80(R10)
+ MOVB R12, 88(R10)
// br2.fillFast32()
- MOVQ 128(R11), R12
- MOVBQZX 136(R11), R13
- CMPQ R13, $0x20
+ MOVQ 128(R10), R11
+ MOVBQZX 136(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill2
- MOVQ 120(R11), R14
- SUBQ $0x20, R13
- SUBQ $0x04, R14
- MOVQ 96(R11), R15
+ MOVQ 120(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ 96(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R14)(R15*1), R15
- MOVQ R13, CX
- SHLQ CL, R15
- MOVQ R14, 120(R11)
- ORQ R15, R12
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 120(R10)
+ ORQ R14, R11
- // exhausted = exhausted || (br2.off < 4)
- CMPQ R14, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br2.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// val2 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v2 := table[val2&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v2.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val3 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v3 := table[val3&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v3.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@@ -544,88 +528,86 @@ skip_fill2:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
- MOVL AX, (R8)
+ MOVL AX, (BX)(R8*2)
// update the bitreader structure
- MOVQ R12, 128(R11)
- MOVB R13, 136(R11)
- ADDQ R9, R8
+ MOVQ R11, 128(R10)
+ MOVB R12, 136(R10)
// br3.fillFast32()
- MOVQ 176(R11), R12
- MOVBQZX 184(R11), R13
- CMPQ R13, $0x20
+ MOVQ 176(R10), R11
+ MOVBQZX 184(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill3
- MOVQ 168(R11), R14
- SUBQ $0x20, R13
- SUBQ $0x04, R14
- MOVQ 144(R11), R15
+ MOVQ 168(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ 144(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R14)(R15*1), R15
- MOVQ R13, CX
- SHLQ CL, R15
- MOVQ R14, 168(R11)
- ORQ R15, R12
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 168(R10)
+ ORQ R14, R11
- // exhausted = exhausted || (br3.off < 4)
- CMPQ R14, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br3.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// val2 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v2 := table[val2&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v2.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val3 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v3 := table[val3&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v3.entry))
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@@ -633,11 +615,12 @@ skip_fill3:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
- MOVL AX, (R8)
+ LEAQ (R8)(R8*2), CX
+ MOVL AX, (BX)(CX*1)
// update the bitreader structure
- MOVQ R12, 176(R11)
- MOVB R13, 184(R11)
+ MOVQ R11, 176(R10)
+ MOVB R12, 184(R10)
ADDQ $0x04, BX
TESTB DL, DL
JZ main_loop
@@ -653,7 +636,7 @@ TEXT ·decompress1x_main_loop_amd64(SB), $0-8
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
- JB error_max_decoded_size_exeeded
+ JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
@@ -668,7 +651,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
- JGE error_max_decoded_size_exeeded
+ JGE error_max_decoded_size_exceeded
// Decode 4 values
CMPQ R11, $0x20
@@ -745,7 +728,7 @@ loop_condition:
RET
// Report error
-error_max_decoded_size_exeeded:
+error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)
@@ -758,7 +741,7 @@ TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
- JB error_max_decoded_size_exeeded
+ JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
@@ -773,7 +756,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
- JGE error_max_decoded_size_exeeded
+ JGE error_max_decoded_size_exceeded
// Decode 4 values
CMPQ R11, $0x20
@@ -840,7 +823,7 @@ loop_condition:
RET
// Report error
-error_max_decoded_size_exeeded:
+error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
index 4f6f37cb..908c17de 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -122,17 +122,21 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
- copy(out, buf[0][:])
- copy(out[dstEvery:], buf[1][:])
- copy(out[dstEvery*2:], buf[2][:])
- copy(out[dstEvery*3:], buf[3][:])
- out = out[bufoff:]
- decoded += bufoff * 4
// There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
+ if len(out)-bufoff < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
+ //copy(out, buf[0][:])
+ //copy(out[dstEvery:], buf[1][:])
+ //copy(out[dstEvery*2:], buf[2][:])
+ //copy(out[dstEvery*3:], buf[3][:])
+ *(*[bufoff]byte)(out) = buf[0]
+ *(*[bufoff]byte)(out[dstEvery:]) = buf[1]
+ *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
+ *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
+ out = out[bufoff:]
+ decoded += bufoff * 4
}
}
if off > 0 {
diff --git a/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
index 511bba65..05db94d3 100644
--- a/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
+++ b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
@@ -18,6 +18,7 @@ func load64(b []byte, i int) uint64 {
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
+//
// dst is long enough to hold the encoded bytes
// 1 <= len(lit) && len(lit) <= 65536
func emitLiteral(dst, lit []byte) int {
@@ -42,6 +43,7 @@ func emitLiteral(dst, lit []byte) int {
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
+//
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= 65535
// 4 <= length && length <= 65535
@@ -89,6 +91,7 @@ func emitCopy(dst []byte, offset, length int) int {
// src[i:i+k-j] and src[j:k] have the same contents.
//
// It assumes that:
+//
// 0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
@@ -100,13 +103,36 @@ func hash(u, shift uint32) uint32 {
return (u * 0x1e35a7bd) >> shift
}
+// EncodeBlockInto exposes encodeBlock but checks dst size.
+func EncodeBlockInto(dst, src []byte) (d int) {
+ if MaxEncodedLen(len(src)) > len(dst) {
+ return 0
+ }
+
+ // encodeBlock breaks on too big blocks, so split.
+ for len(src) > 0 {
+ p := src
+ src = nil
+ if len(p) > maxBlockSize {
+ p, src = p[:maxBlockSize], p[maxBlockSize:]
+ }
+ if len(p) < minNonLiteralBlockSize {
+ d += emitLiteral(dst[d:], p)
+ } else {
+ d += encodeBlock(dst[d:], p)
+ }
+ }
+ return d
+}
+
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
// The table element type is uint16, as s < sLimit and sLimit < len(src)
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md
index 73c0c462..8284bb08 100644
--- a/vendor/github.com/klauspost/compress/s2/README.md
+++ b/vendor/github.com/klauspost/compress/s2/README.md
@@ -20,11 +20,12 @@ This is important, so you don't have to worry about spending CPU cycles on alrea
* Concurrent stream compression
* Faster decompression, even for Snappy compatible content
* Concurrent Snappy/S2 stream decompression
-* Ability to quickly skip forward in compressed stream
+* Skip forward in compressed stream
* Random seeking with indexes
* Compatible with reading Snappy compressed content
* Smaller block size overhead on incompressible blocks
* Block concatenation
+* Block Dictionary support
* Uncompressed stream mode
* Automatic stream size padding
* Snappy compatible block compression
@@ -325,35 +326,35 @@ The content compressed in this mode is fully compatible with the standard decode
Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU):
-| File | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
-|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
-| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 12.70x | 10556 MB/s | 7.35% | 4.15x | 3455 MB/s | 12.79% |
-| (1 CPU) | 1.14x | 948 MB/s | - | 0.42x | 349 MB/s | - |
-| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x | 14484 MB/s | 31.60% | 10.09x | 8533 MB/s | 37.71% |
-| (1 CPU) | 1.33x | 1127 MB/s | - | 0.70x | 589 MB/s | - |
-| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12000 MB/s | -5.79% | 6.59x | 5223 MB/s | 5.80% |
-| (1 CPU) | 1.11x | 877 MB/s | - | 0.47x | 370 MB/s | - |
-| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 14.62x | 12116 MB/s | 15.90% | 5.35x | 4430 MB/s | 16.08% |
-| (1 CPU) | 1.38x | 1146 MB/s | - | 0.38x | 312 MB/s | - |
-| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 8.83x | 17579 MB/s | 43.86% | 6.54x | 13011 MB/s | 47.23% |
-| (1 CPU) | 1.14x | 2259 MB/s | - | 0.74x | 1475 MB/s | - |
-| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 16.72x | 14019 MB/s | 24.02% | 10.11x | 8477 MB/s | 30.48% |
-| (1 CPU) | 1.24x | 1043 MB/s | - | 0.70x | 586 MB/s | - |
-| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9254 MB/s | 1.84% | 6.75x | 4686 MB/s | 6.72% |
-| (1 CPU) | 0.97x | 672 MB/s | - | 0.53x | 366 MB/s | - |
-| sharnd.out.2gb | 2.11x | 12639 MB/s | 0.01% | 1.98x | 11833 MB/s | 0.01% |
-| (1 CPU) | 0.93x | 5594 MB/s | - | 1.34x | 8030 MB/s | - |
-| [enwik9](http://mattmahoney.net/dc/textdata.html) | 19.34x | 8220 MB/s | 3.98% | 7.87x | 3345 MB/s | 15.82% |
-| (1 CPU) | 1.06x | 452 MB/s | - | 0.50x | 213 MB/s | - |
-| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 10.48x | 6124 MB/s | 5.67% | 3.76x | 2197 MB/s | 12.60% |
-| (1 CPU) | 0.97x | 568 MB/s | - | 0.46x | 271 MB/s | - |
-| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 21.07x | 9020 MB/s | 6.36% | 6.91x | 2959 MB/s | 16.95% |
-| (1 CPU) | 1.07x | 460 MB/s | - | 0.51x | 220 MB/s | - |
+| File | S2 Speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
+|---------------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
+| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 16.33x | 10556 MB/s | 8.0% | 6.04x | 5252 MB/s | 14.7% |
+| (1 CPU) | 1.08x | 940 MB/s | - | 0.46x | 400 MB/s | - |
+| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 16.51x | 15224 MB/s | 31.70% | 9.47x | 8734 MB/s | 37.71% |
+| (1 CPU) | 1.26x | 1157 MB/s | - | 0.60x | 556 MB/s | - |
+| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12598 MB/s | -5.76% | 6.23x | 5675 MB/s | 3.62% |
+| (1 CPU) | 1.02x | 932 MB/s | - | 0.47x | 432 MB/s | - |
+| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 11.21x | 12116 MB/s | 15.95% | 3.24x | 3500 MB/s | 18.00% |
+| (1 CPU) | 1.05x | 1135 MB/s | - | 0.27x | 292 MB/s | - |
+| [apache.log](https://files.klauspost.com/compress/apache.log.zst) | 8.55x | 16673 MB/s | 20.54% | 5.85x | 11420 MB/s | 24.97% |
+| (1 CPU) | 1.91x | 1771 MB/s | - | 0.53x | 1041 MB/s | - |
+| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 15.76x | 14357 MB/s | 24.01% | 8.67x | 7891 MB/s | 33.68% |
+| (1 CPU) | 1.17x | 1064 MB/s | - | 0.65x | 595 MB/s | - |
+| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9835 MB/s | 2.34% | 6.85x | 4863 MB/s | 9.96% |
+| (1 CPU) | 0.97x | 689 MB/s | - | 0.55x | 387 MB/s | - |
+| sharnd.out.2gb | 9.11x | 13213 MB/s | 0.01% | 1.49x | 9184 MB/s | 0.01% |
+| (1 CPU) | 0.88x | 5418 MB/s | - | 0.77x | 5417 MB/s | - |
+| [sofia-air-quality-dataset csv](https://files.klauspost.com/compress/sofia-air-quality-dataset.tar.zst) | 22.00x | 11477 MB/s | 18.73% | 11.15x | 5817 MB/s | 27.88% |
+| (1 CPU) | 1.23x | 642 MB/s | - | 0.71x | 642 MB/s | - |
+| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 11.23x | 6520 MB/s | 5.9% | 5.35x | 3109 MB/s | 15.88% |
+| (1 CPU) | 1.05x | 607 MB/s | - | 0.52x | 304 MB/s | - |
+| [enwik9](https://files.klauspost.com/compress/enwik9.zst) | 19.28x | 8440 MB/s | 4.04% | 9.31x | 4076 MB/s | 18.04% |
+| (1 CPU) | 1.12x | 488 MB/s | - | 0.57x | 250 MB/s | - |
### Legend
-* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
-* `S2 throughput`: Throughput of S2 in MB/s.
+* `S2 Speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
+* `S2 Throughput`: Throughput of S2 in MB/s.
* `S2 % smaller`: How many percent smaller the S2 output is compared to the Snappy output.
* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy.
* `"better" throughput`: Throughput of S2 in MB/s when "better" compression mode is enabled.
@@ -361,7 +362,7 @@ Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all th
There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads.
-Machine generated data gets by far the biggest compression boost, with size being being reduced by up to 45% of Snappy size.
+Machine generated data gets by far the biggest compression boost, with size being reduced by up to 35% of Snappy size.
The "better" compression mode sees a good improvement in all cases, but usually at a performance cost.
@@ -404,15 +405,15 @@ The "better" compression mode will actively look for shorter matches, which is w
Without assembly decompression is also very fast; single goroutine decompression speed. No assembly:
| File | S2 Speed | S2 Throughput |
-|--------------------------------|--------------|---------------|
-| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s |
-| 10gb.tar.s2 | 1.30x | 867.07 MB/s |
-| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s |
-| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s |
-| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s |
-| enwik9.s2 | 1.67x | 681.53 MB/s |
-| adresser.json.s2 | 3.41x | 4230.53 MB/s |
-| silesia.tar.s2 | 1.52x | 811.58 |
+|--------------------------------|---------------|---------------|
+| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s |
+| 10gb.tar.s2 | 1.30x | 867.07 MB/s |
+| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s |
+| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s |
+| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s |
+| enwik9.s2 | 1.67x | 681.53 MB/s |
+| adresser.json.s2 | 3.41x | 4230.53 MB/s |
+| silesia.tar.s2 | 1.52x | 811.58 MB/s |
Even though S2 typically compresses better than Snappy, decompression speed is always better.
@@ -450,14 +451,14 @@ The most reliable is a wide dataset.
For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
53927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
-| * | Input | Output | Reduction | MB/s |
-|-------------------|------------|------------|-----------|--------|
-| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** |
-| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 |
-| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 |
-| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 |
-| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 |
-| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 |
+| * | Input | Output | Reduction | MB/s |
+|-------------------|------------|------------|------------|------------|
+| S2 | 4014735833 | 1059723369 | 73.60% | **936.73** |
+| S2 Better | 4014735833 | 961580539 | 76.05% | 451.10 |
+| S2 Best | 4014735833 | 899182886 | **77.60%** | 46.84 |
+| Snappy | 4014735833 | 1128706759 | 71.89% | 790.15 |
+| S2, Snappy Output | 4014735833 | 1093823291 | 72.75% | 936.60 |
+| LZ4 | 4014735833 | 1063768713 | 73.50% | 452.02 |
S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".
"Better" mode provides the same compression speed as LZ4 with better compression ratio.
@@ -489,42 +490,23 @@ AMD64 assembly is use for both S2 and Snappy.
| Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec |
|-----------------------|-------------|---------|--------------|-------------|-------------|-------------|
-| html | 22843 | 21111 | 16246 MB/s | 17438 MB/s | 40972 MB/s | 49263 MB/s |
-| urls.10K | 335492 | 287326 | 7943 MB/s | 9693 MB/s | 22523 MB/s | 26484 MB/s |
-| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 273889 MB/s | 718321 MB/s | 827552 MB/s |
-| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 17773 MB/s | 33691 MB/s | 52421 MB/s |
-| paper-100k.pdf | 85304 | 84459 | 167546 MB/s | 101263 MB/s | 326905 MB/s | 291944 MB/s |
-| html_x_4 | 92234 | 21113 | 15194 MB/s | 50670 MB/s | 30843 MB/s | 32217 MB/s |
-| alice29.txt | 88034 | 85975 | 5936 MB/s | 6139 MB/s | 12882 MB/s | 20044 MB/s |
-| asyoulik.txt | 77503 | 79650 | 5517 MB/s | 6366 MB/s | 12735 MB/s | 22806 MB/s |
-| lcet10.txt | 234661 | 220670 | 6235 MB/s | 6067 MB/s | 14519 MB/s | 18697 MB/s |
-| plrabn12.txt | 319267 | 317985 | 5159 MB/s | 5726 MB/s | 11923 MB/s | 19901 MB/s |
-| geo.protodata | 23335 | 18690 | 21220 MB/s | 26529 MB/s | 56271 MB/s | 62540 MB/s |
-| kppkn.gtb | 69526 | 65312 | 9732 MB/s | 8559 MB/s | 18491 MB/s | 18969 MB/s |
-| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 15489 MB/s | 31883 MB/s | 38874 MB/s |
-| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13000 MB/s | 48056 MB/s | 52341 MB/s |
-| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12806 MB/s | 32378 MB/s | 46322 MB/s |
-| alice29.txt (20000B) | 12686 | 13574 | 7733 MB/s | 11210 MB/s | 30566 MB/s | 58969 MB/s |
-
-
-| Relative Perf | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed |
-|-----------------------|-------------|------------------|----------|--------------|
-| html | 22.31% | 7.58% | 1.07x | 1.20x |
-| urls.10K | 47.78% | 14.36% | 1.22x | 1.18x |
-| fireworks.jpeg | 99.95% | -0.05% | 0.78x | 1.15x |
-| fireworks.jpeg (200B) | 73.00% | -6.16% | 2.00x | 1.56x |
-| paper-100k.pdf | 83.30% | 0.99% | 0.60x | 0.89x |
-| html_x_4 | 22.52% | 77.11% | 3.33x | 1.04x |
-| alice29.txt | 57.88% | 2.34% | 1.03x | 1.56x |
-| asyoulik.txt | 61.91% | -2.77% | 1.15x | 1.79x |
-| lcet10.txt | 54.99% | 5.96% | 0.97x | 1.29x |
-| plrabn12.txt | 66.26% | 0.40% | 1.11x | 1.67x |
-| geo.protodata | 19.68% | 19.91% | 1.25x | 1.11x |
-| kppkn.gtb | 37.72% | 6.06% | 0.88x | 1.03x |
-| alice29.txt (128B) | 62.50% | -2.50% | 2.31x | 1.22x |
-| alice29.txt (1000B) | 77.40% | 0.00% | 1.07x | 1.09x |
-| alice29.txt (10000B) | 66.48% | -4.29% | 1.27x | 1.43x |
-| alice29.txt (20000B) | 63.43% | -7.00% | 1.45x | 1.93x |
+| html | 22843 | 20868 | 16246 MB/s | 18617 MB/s | 40972 MB/s | 49263 MB/s |
+| urls.10K | 335492 | 286541 | 7943 MB/s | 10201 MB/s | 22523 MB/s | 26484 MB/s |
+| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 303228 MB/s | 718321 MB/s | 827552 MB/s |
+| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 20180 MB/s | 33691 MB/s | 52421 MB/s |
+| paper-100k.pdf | 85304 | 84202 | 167546 MB/s | 112988 MB/s | 326905 MB/s | 291944 MB/s |
+| html_x_4 | 92234 | 20870 | 15194 MB/s | 54457 MB/s | 30843 MB/s | 32217 MB/s |
+| alice29.txt | 88034 | 85934 | 5936 MB/s | 6540 MB/s | 12882 MB/s | 20044 MB/s |
+| asyoulik.txt | 77503 | 79575 | 5517 MB/s | 6657 MB/s | 12735 MB/s | 22806 MB/s |
+| lcet10.txt | 234661 | 220383 | 6235 MB/s | 6303 MB/s | 14519 MB/s | 18697 MB/s |
+| plrabn12.txt | 319267 | 318196 | 5159 MB/s | 6074 MB/s | 11923 MB/s | 19901 MB/s |
+| geo.protodata | 23335 | 18606 | 21220 MB/s | 25432 MB/s | 56271 MB/s | 62540 MB/s |
+| kppkn.gtb | 69526 | 65019 | 9732 MB/s | 8905 MB/s | 18491 MB/s | 18969 MB/s |
+| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 17179 MB/s | 31883 MB/s | 38874 MB/s |
+| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13273 MB/s | 48056 MB/s | 52341 MB/s |
+| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12824 MB/s | 32378 MB/s | 46322 MB/s |
+| alice29.txt (20000B) | 12686 | 13516 | 7733 MB/s | 12160 MB/s | 30566 MB/s | 58969 MB/s |
+
Speed is generally at or above Snappy. Small blocks get a significant speedup, although at the expense of size.
@@ -543,42 +525,23 @@ So individual benchmarks should only be seen as a guideline and the overall pict
| Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec |
|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------|
-| html | 22843 | 19833 | 16246 MB/s | 7731 MB/s | 40972 MB/s | 40292 MB/s |
-| urls.10K | 335492 | 253529 | 7943 MB/s | 3980 MB/s | 22523 MB/s | 20981 MB/s |
-| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 9760 MB/s | 718321 MB/s | 823698 MB/s |
-| fireworks.jpeg (200B) | 146 | 142 | 8869 MB/s | 594 MB/s | 33691 MB/s | 30101 MB/s |
-| paper-100k.pdf | 85304 | 82915 | 167546 MB/s | 7470 MB/s | 326905 MB/s | 198869 MB/s |
-| html_x_4 | 92234 | 19841 | 15194 MB/s | 23403 MB/s | 30843 MB/s | 30937 MB/s |
-| alice29.txt | 88034 | 73218 | 5936 MB/s | 2945 MB/s | 12882 MB/s | 16611 MB/s |
-| asyoulik.txt | 77503 | 66844 | 5517 MB/s | 2739 MB/s | 12735 MB/s | 14975 MB/s |
-| lcet10.txt | 234661 | 190589 | 6235 MB/s | 3099 MB/s | 14519 MB/s | 16634 MB/s |
-| plrabn12.txt | 319267 | 270828 | 5159 MB/s | 2600 MB/s | 11923 MB/s | 13382 MB/s |
-| geo.protodata | 23335 | 18278 | 21220 MB/s | 11208 MB/s | 56271 MB/s | 57961 MB/s |
-| kppkn.gtb | 69526 | 61851 | 9732 MB/s | 4556 MB/s | 18491 MB/s | 16524 MB/s |
-| alice29.txt (128B) | 80 | 81 | 6691 MB/s | 529 MB/s | 31883 MB/s | 34225 MB/s |
-| alice29.txt (1000B) | 774 | 748 | 12204 MB/s | 1943 MB/s | 48056 MB/s | 42068 MB/s |
-| alice29.txt (10000B) | 6648 | 6234 | 10044 MB/s | 2949 MB/s | 32378 MB/s | 28813 MB/s |
-| alice29.txt (20000B) | 12686 | 11584 | 7733 MB/s | 2822 MB/s | 30566 MB/s | 27315 MB/s |
-
-
-| Relative Perf | Snappy size | Better size | Better Speed | Better dec |
-|-----------------------|-------------|-------------|--------------|------------|
-| html | 22.31% | 13.18% | 0.48x | 0.98x |
-| urls.10K | 47.78% | 24.43% | 0.50x | 0.93x |
-| fireworks.jpeg | 99.95% | -0.05% | 0.03x | 1.15x |
-| fireworks.jpeg (200B) | 73.00% | 2.74% | 0.07x | 0.89x |
-| paper-100k.pdf | 83.30% | 2.80% | 0.07x | 0.61x |
-| html_x_4 | 22.52% | 78.49% | 0.04x | 1.00x |
-| alice29.txt | 57.88% | 16.83% | 1.54x | 1.29x |
-| asyoulik.txt | 61.91% | 13.75% | 0.50x | 1.18x |
-| lcet10.txt | 54.99% | 18.78% | 0.50x | 1.15x |
-| plrabn12.txt | 66.26% | 15.17% | 0.50x | 1.12x |
-| geo.protodata | 19.68% | 21.67% | 0.50x | 1.03x |
-| kppkn.gtb | 37.72% | 11.04% | 0.53x | 0.89x |
-| alice29.txt (128B) | 62.50% | -1.25% | 0.47x | 1.07x |
-| alice29.txt (1000B) | 77.40% | 3.36% | 0.08x | 0.88x |
-| alice29.txt (10000B) | 66.48% | 6.23% | 0.16x | 0.89x |
-| alice29.txt (20000B) | 63.43% | 8.69% | 0.29x | 0.89x |
+| html | 22843 | 18972 | 16246 MB/s | 8621 MB/s | 40972 MB/s | 40292 MB/s |
+| urls.10K | 335492 | 248079 | 7943 MB/s | 5104 MB/s | 22523 MB/s | 20981 MB/s |
+| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 84429 MB/s | 718321 MB/s | 823698 MB/s |
+| fireworks.jpeg (200B) | 146 | 149 | 8869 MB/s | 7125 MB/s | 33691 MB/s | 30101 MB/s |
+| paper-100k.pdf | 85304 | 82887 | 167546 MB/s | 11087 MB/s | 326905 MB/s | 198869 MB/s |
+| html_x_4 | 92234 | 18982 | 15194 MB/s | 29316 MB/s | 30843 MB/s | 30937 MB/s |
+| alice29.txt | 88034 | 71611 | 5936 MB/s | 3709 MB/s | 12882 MB/s | 16611 MB/s |
+| asyoulik.txt | 77503 | 65941 | 5517 MB/s | 3380 MB/s | 12735 MB/s | 14975 MB/s |
+| lcet10.txt | 234661 | 184939 | 6235 MB/s | 3537 MB/s | 14519 MB/s | 16634 MB/s |
+| plrabn12.txt | 319267 | 264990 | 5159 MB/s | 2960 MB/s | 11923 MB/s | 13382 MB/s |
+| geo.protodata | 23335 | 17689 | 21220 MB/s | 10859 MB/s | 56271 MB/s | 57961 MB/s |
+| kppkn.gtb | 69526 | 55398 | 9732 MB/s | 5206 MB/s | 18491 MB/s | 16524 MB/s |
+| alice29.txt (128B) | 80 | 78 | 6691 MB/s | 7422 MB/s | 31883 MB/s | 34225 MB/s |
+| alice29.txt (1000B) | 774 | 746 | 12204 MB/s | 5734 MB/s | 48056 MB/s | 42068 MB/s |
+| alice29.txt (10000B) | 6648 | 6218 | 10044 MB/s | 6055 MB/s | 32378 MB/s | 28813 MB/s |
+| alice29.txt (20000B) | 12686 | 11492 | 7733 MB/s | 3143 MB/s | 30566 MB/s | 27315 MB/s |
+
Except for the mostly incompressible JPEG image, compression is better and usually in the
double digits in terms of percentage reduction over Snappy.
@@ -605,33 +568,150 @@ Some examples compared on 16 core CPU, amd64 assembly used:
```
* enwik10
-Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
-Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
-Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s
+Default... 10000000000 -> 4759950115 [47.60%]; 1.03s, 9263.0MB/s
+Better... 10000000000 -> 4084706676 [40.85%]; 2.16s, 4415.4MB/s
+Best... 10000000000 -> 3615520079 [36.16%]; 42.259s, 225.7MB/s
* github-june-2days-2019.json
-Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
-Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
-Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s
+Default... 6273951764 -> 1041700255 [16.60%]; 431ms, 13882.3MB/s
+Better... 6273951764 -> 945841238 [15.08%]; 547ms, 10938.4MB/s
+Best... 6273951764 -> 826392576 [13.17%]; 9.455s, 632.8MB/s
* nyc-taxi-data-10M.csv
-Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
-Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
-Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s
+Default... 3325605752 -> 1093516949 [32.88%]; 324ms, 9788.7MB/s
+Better... 3325605752 -> 885394158 [26.62%]; 491ms, 6459.4MB/s
+Best... 3325605752 -> 773681257 [23.26%]; 8.29s, 412.0MB/s
* 10gb.tar
-Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
-Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
-Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/
+Default... 10065157632 -> 5915541066 [58.77%]; 1.028s, 9337.4MB/s
+Better... 10065157632 -> 5453844650 [54.19%]; 1.597s, 4862.7MB/s
+Best... 10065157632 -> 5192495021 [51.59%]; 32.78s, 308.2MB/s
* consensus.db.10gb
-Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
-Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
-Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
+Default... 10737418240 -> 4549762344 [42.37%]; 882ms, 12118.4MB/s
+Better... 10737418240 -> 4438535064 [41.34%]; 1.533s, 3500.9MB/s
+Best... 10737418240 -> 4210602774 [39.21%]; 42.96s, 254.4MB/s
```
Decompression speed should be around the same as using the 'better' compression mode.
+## Dictionaries
+
+*Note: S2 dictionary compression is currently at an early implementation stage, with no assembly for
+either encoding or decoding. Performance improvements can be expected in the future.*
+
+Adding dictionaries allows providing a custom dictionary that will serve as lookup in the beginning of blocks.
+
+The same dictionary *must* be used for both encoding and decoding.
+S2 does not keep track of whether the same dictionary is used,
+and using the wrong dictionary will most often not result in an error when decompressing.
+
+Blocks encoded *without* dictionaries can be decompressed seamlessly *with* a dictionary.
+This means it is possible to switch from an encoding without dictionaries to an encoding with dictionaries
+and treat the blocks similarly.
+
+Similar to [zStandard dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression),
+the same usage scenario applies to S2 dictionaries.
+
+> Training works if there is some correlation in a family of small data samples. The more data-specific a dictionary is, the more efficient it is (there is no universal dictionary). Hence, deploying one dictionary per type of data will provide the greatest benefits. Dictionary gains are mostly effective in the first few KB. Then, the compression algorithm will gradually use previously decoded content to better compress the rest of the file.
+
+S2 further limits the dictionary to only be enabled on the first 64KB of a block.
+This will remove any negative (speed) impacts of the dictionaries on bigger blocks.
+
+### Compression
+
+Using the [github_users_sample_set](https://github.com/facebook/zstd/releases/download/v1.1.3/github_users_sample_set.tar.zst)
+and a 64KB dictionary trained with zStandard the following sizes can be achieved.
+
+| | Default | Better | Best |
+|--------------------|------------------|------------------|-----------------------|
+| Without Dictionary | 3362023 (44.92%) | 3083163 (41.19%) | 3057944 (40.86%) |
+| With Dictionary | 921524 (12.31%) | 873154 (11.67%) | 785503 bytes (10.49%) |
+
+So for highly repetitive content, this case provides an almost 3x reduction in size.
+
+For less uniform data we will use the Go source code tree.
+Compressing First 64KB of all `.go` files in `go/src`, Go 1.19.5, 8912 files, 51253563 bytes input:
+
+| | Default | Better | Best |
+|--------------------|-------------------|-------------------|-------------------|
+| Without Dictionary | 22955767 (44.79%) | 20189613 (39.39%) | 19482828 (38.01%) |
+| With Dictionary | 19654568 (38.35%) | 16289357 (31.78%) | 15184589 (29.63%) |
+| Saving/file | 362 bytes | 428 bytes | 472 bytes |
+
+
+### Creating Dictionaries
+
+There are no tools to create dictionaries in S2.
+However, there are multiple ways to create a useful dictionary:
+
+#### Using a Sample File
+
+If your input is very uniform, you can just use a sample file as the dictionary.
+
+For example in the `github_users_sample_set` above, the average compression only goes up from
+10.49% to 11.48% by using the first file as dictionary compared to using a dedicated dictionary.
+
+```Go
+ // Read a sample
+ sample, err := os.ReadFile("sample.json")
+
+ // Create a dictionary.
+ dict := s2.MakeDict(sample, nil)
+
+ // b := dict.Bytes() will provide a dictionary that can be saved
+ // and reloaded with s2.NewDict(b).
+
+ // To encode:
+ encoded := dict.Encode(nil, file)
+
+ // To decode:
+ decoded, err := dict.Decode(nil, file)
+```
+
+#### Using Zstandard
+
+Zstandard dictionaries can easily be converted to S2 dictionaries.
+
+This can be helpful to generate dictionaries for files that don't have a fixed structure.
+
+
+Example, with training set files placed in `./training-set`:
+
+`λ zstd -r --train-fastcover training-set/* --maxdict=65536 -o name.dict`
+
+This will create a dictionary of 64KB, that can be converted to a dictionary like this:
+
+```Go
+ // Decode the Zstandard dictionary.
+ insp, err := zstd.InspectDictionary(zdict)
+ if err != nil {
+ panic(err)
+ }
+
+ // We are only interested in the contents.
+ // Assume that files start with "// Copyright (c) 2023".
+ // Search for the longest match for that.
+ // This may save a few bytes.
+ dict := s2.MakeDict(insp.Content(), []byte("// Copyright (c) 2023"))
+
+ // b := dict.Bytes() will provide a dictionary that can be saved
+ // and reloaded with s2.NewDict(b).
+
+ // We can now encode using this dictionary
+ encodedWithDict := dict.Encode(nil, payload)
+
+ // To decode content:
+ decoded, err := dict.Decode(nil, encodedWithDict)
+```
+
+It is recommended to save the dictionary returned by `b := dict.Bytes()`, since that will contain only the S2 dictionary.
+
+This dictionary can later be loaded using `s2.NewDict(b)`. The dictionary then no longer requires `zstd` to be initialized.
+
+Also note how `s2.MakeDict` allows you to search for a common starting sequence of your files.
+This can be omitted, at the expense of a few bytes.
+
# Snappy Compatibility
S2 now offers full compatibility with Snappy.
@@ -648,10 +728,10 @@ If you would like more control, you can use the s2 package as described below:
Snappy compatible blocks can be generated with the S2 encoder.
Compression and speed are typically a bit better, and `MaxEncodedLen` is also smaller for smaller memory usage. Replace
-| Snappy | S2 replacement |
-|----------------------------|-------------------------|
-| snappy.Encode(...) | s2.EncodeSnappy(...) |
-| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
+| Snappy | S2 replacement |
+|---------------------------|-----------------------|
+| snappy.Encode(...) | s2.EncodeSnappy(...) |
+| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output.
@@ -660,12 +740,12 @@ Compression and speed is typically a bit better `MaxEncodedLen` is also smaller
Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
-| Encoder | Size | MB/s | Reduction |
-|-----------------------|------------|------------|------------
-| snappy.Encode | 1128706759 | 725.59 | 71.89% |
-| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% |
-| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% |
-| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%**|
+| Encoder | Size | MB/s | Reduction |
+|-----------------------|------------|------------|------------|
+| snappy.Encode | 1128706759 | 725.59 | 71.89% |
+| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% |
+| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% |
+| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%** |
## Streams
@@ -835,6 +915,13 @@ This is done using the regular "Skip" function:
This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset.
+# Compact storage
+
+For compact storage [RemoveIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RemoveIndexHeaders) can be used to remove any redundant info from
+a serialized index. If you remove the header it must be restored before [Loading](https://pkg.go.dev/github.com/klauspost/compress/s2#Index.Load).
+
+This is expected to save 20 bytes. These can be restored using [RestoreIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RestoreIndexHeaders). This removes a layer of security, but is the most compact representation. Returns nil if headers contain errors.
+
## Index Format:
Each block is structured as a snappy skippable block, with the chunk ID 0x99.
@@ -844,20 +931,20 @@ The block can be read from the front, but contains information so it can be read
Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding),
with un-encoded value length of 64 bits, unless other limits are specified.
-| Content | Format |
-|---------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
-| ID, `[1]byte` | Always 0x99. |
-| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. |
-| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". |
-| UncompressedSize, Varint | Total Uncompressed size. |
-| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. |
-| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. |
-| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. |
-| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. |
-| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. |
-| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. |
-| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. |
-| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
+| Content | Format |
+|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
+| ID, `[1]byte` | Always 0x99. |
+| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. |
+| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". |
+| UncompressedSize, Varint | Total Uncompressed size. |
+| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. |
+| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. |
+| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. |
+| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. |
+| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. |
+| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. |
+| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. |
+| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
For regular streams the uncompressed offsets are fully predictable,
so `HasUncompressedOffsets` allows to specify that compressed blocks all have
@@ -929,6 +1016,7 @@ To decode from any given uncompressed offset `(wantOffset)`:
See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface.
+
# Format Extensions
* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
@@ -951,13 +1039,80 @@ The length is specified by reading the 3-bit length specified in the tag and dec
| 7 | 65540 + read 3 bytes |
This allows any repeat offset + length to be represented by 2 to 5 bytes.
+It also allows emitting matches longer than 64 bytes with one copy + one repeat instead of several 64 byte copies.
Lengths are stored as little endian values.
-The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams.
+The first copy of a block cannot be a repeat offset and the offset is reset on every block in streams.
Default streaming block size is 1MB.
+# Dictionary Encoding
+
+Adding dictionaries allows providing a custom dictionary that will serve as lookup in the beginning of blocks.
+
+A dictionary provides an initial repeat value that can be used to point to a common header.
+
+Other than that the dictionary contains values that can be used as back-references.
+
+Often used data should be placed at the *end* of the dictionary since offsets < 2048 bytes will be smaller.
+
+## Format
+
+Dictionary *content* must be at least 16 bytes and less than or equal to 64KiB (65536 bytes).
+
+Encoding: `[repeat value (uvarint)][dictionary content...]`
+
+Before the dictionary content, an unsigned base-128 (uvarint) encoded value specifying the initial repeat offset.
+This value is an offset into the dictionary content and not a back-reference offset,
+so setting this to 0 will make the repeat value point to the first value of the dictionary.
+
+The value must be less than the dictionary length-8
+
+## Encoding
+
+From the decoder point of view the dictionary content is seen as preceding the encoded content.
+
+`[dictionary content][decoded output]`
+
+Backreferences to the dictionary are encoded as ordinary backreferences that have an offset before the start of the decoded block.
+
+Matches copying from the dictionary are **not** allowed to cross from the dictionary into the decoded data.
+However, if a copy ends at the end of the dictionary the next repeat will point to the start of the decoded buffer, which is allowed.
+
+The first match can be a repeat value, which will use the repeat offset stored in the dictionary.
+
+When 64KB (65536 bytes) has been en/decoded it is no longer allowed to reference the dictionary,
+neither by a copy nor repeat operations.
+If the boundary is crossed while copying from the dictionary, the operation should complete,
+but the next instruction is not allowed to reference the dictionary.
+
+Valid blocks encoded *without* a dictionary can be decoded with any dictionary.
+There are no checks whether the supplied dictionary is the correct one for a block.
+Because of this there is no overhead by using a dictionary.
+
+## Example
+
+This is the dictionary content. Elements are separated by `[]`.
+
+Dictionary: `[0x0a][Yesterday 25 bananas were added to Benjamins brown bag]`.
+
+Initial repeat offset is set at 10, which is the letter `2`.
+
+Encoded `[LIT "10"][REPEAT len=10][LIT "hich"][MATCH off=50 len=6][MATCH off=31 len=6][MATCH off=61 len=10]`
+
+Decoded: `[10][ bananas w][hich][ were ][brown ][were added]`
+
+Output: `10 bananas which were brown were added`
+
+
+## Streams
+
+For streams each block can use the dictionary.
+
+The dictionary cannot currently be provided on the stream.
+
+
# LICENSE
This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation.
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go
index b5fa4d3f..b7c9adfd 100644
--- a/vendor/github.com/klauspost/compress/s2/decode.go
+++ b/vendor/github.com/klauspost/compress/s2/decode.go
@@ -11,7 +11,9 @@ import (
"fmt"
"io"
"io/ioutil"
+ "math"
"runtime"
+ "strconv"
"sync"
)
@@ -719,7 +721,11 @@ func (r *Reader) Skip(n int64) error {
// decoded[i:j] contains decoded bytes that have not yet been passed on.
left := int64(r.j - r.i)
if left >= n {
- r.i += int(n)
+ tmp := int64(r.i) + n
+ if tmp > math.MaxInt32 {
+ return errors.New("s2: internal overflow in skip")
+ }
+ r.i = int(tmp)
return nil
}
n -= int64(r.j - r.i)
@@ -875,15 +881,20 @@ func (r *Reader) Skip(n int64) error {
// See Reader.ReadSeeker
type ReadSeeker struct {
*Reader
+ readAtMu sync.Mutex
}
-// ReadSeeker will return an io.ReadSeeker compatible version of the reader.
+// ReadSeeker will return an io.ReadSeeker and io.ReaderAt
+// compatible version of the reader.
// If 'random' is specified the returned io.Seeker can be used for
// random seeking, otherwise only forward seeking is supported.
// Enabling random seeking requires the original input to support
// the io.Seeker interface.
// A custom index can be specified which will be used if supplied.
// When using a custom index, it will not be read from the input stream.
+// The ReadAt position will affect regular reads and the current position of Seek.
+// So using Read after ReadAt will continue from where the ReadAt stopped.
+// No functions should be used concurrently.
// The returned ReadSeeker contains a shallow reference to the existing Reader,
// meaning changes performed to one is reflected in the other.
func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) {
@@ -947,44 +958,61 @@ func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) {
// Seek allows seeking in compressed data.
func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {
if r.err != nil {
+ if !errors.Is(r.err, io.EOF) {
+ return 0, r.err
+ }
+ // Reset on EOF
+ r.err = nil
+ }
+
+ // Calculate absolute offset.
+ absOffset := offset
+
+ switch whence {
+ case io.SeekStart:
+ case io.SeekCurrent:
+ absOffset = r.blockStart + int64(r.i) + offset
+ case io.SeekEnd:
+ if r.index == nil {
+ return 0, ErrUnsupported
+ }
+ absOffset = r.index.TotalUncompressed + offset
+ default:
+ r.err = ErrUnsupported
return 0, r.err
}
- if offset == 0 && whence == io.SeekCurrent {
- return r.blockStart + int64(r.i), nil
+
+ if absOffset < 0 {
+ return 0, errors.New("seek before start of file")
}
+
if !r.readHeader {
// Make sure we read the header.
_, r.err = r.Read([]byte{})
+ if r.err != nil {
+ return 0, r.err
+ }
+ }
+
+ // If we are inside current block no need to seek.
+ // This includes no offset changes.
+ if absOffset >= r.blockStart && absOffset < r.blockStart+int64(r.j) {
+ r.i = int(absOffset - r.blockStart)
+ return r.blockStart + int64(r.i), nil
}
+
rs, ok := r.r.(io.ReadSeeker)
if r.index == nil || !ok {
- if whence == io.SeekCurrent && offset >= 0 {
- err := r.Skip(offset)
- return r.blockStart + int64(r.i), err
- }
- if whence == io.SeekStart && offset >= r.blockStart+int64(r.i) {
- err := r.Skip(offset - r.blockStart - int64(r.i))
+ currOffset := r.blockStart + int64(r.i)
+ if absOffset >= currOffset {
+ err := r.Skip(absOffset - currOffset)
return r.blockStart + int64(r.i), err
}
return 0, ErrUnsupported
-
- }
-
- switch whence {
- case io.SeekCurrent:
- offset += r.blockStart + int64(r.i)
- case io.SeekEnd:
- if offset > 0 {
- return 0, errors.New("seek after end of file")
- }
- offset = r.index.TotalUncompressed + offset
- }
-
- if offset < 0 {
- return 0, errors.New("seek before start of file")
}
- c, u, err := r.index.Find(offset)
+ // We can seek and we have an index.
+ c, u, err := r.index.Find(absOffset)
if err != nil {
return r.blockStart + int64(r.i), err
}
@@ -995,12 +1023,57 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {
return 0, err
}
- r.i = r.j // Remove rest of current block.
- if u < offset {
+ r.i = r.j // Remove rest of current block.
+ r.blockStart = u - int64(r.j) // Adjust current block start for accounting.
+ if u < absOffset {
// Forward inside block
- return offset, r.Skip(offset - u)
+ return absOffset, r.Skip(absOffset - u)
+ }
+ if u > absOffset {
+ return 0, fmt.Errorf("s2 seek: (internal error) u (%d) > absOffset (%d)", u, absOffset)
}
- return offset, nil
+ return absOffset, nil
+}
+
+// ReadAt reads len(p) bytes into p starting at offset off in the
+// underlying input source. It returns the number of bytes
+// read (0 <= n <= len(p)) and any error encountered.
+//
+// When ReadAt returns n < len(p), it returns a non-nil error
+// explaining why more bytes were not returned. In this respect,
+// ReadAt is stricter than Read.
+//
+// Even if ReadAt returns n < len(p), it may use all of p as scratch
+// space during the call. If some data is available but not len(p) bytes,
+// ReadAt blocks until either all the data is available or an error occurs.
+// In this respect ReadAt is different from Read.
+//
+// If the n = len(p) bytes returned by ReadAt are at the end of the
+// input source, ReadAt may return either err == EOF or err == nil.
+//
+// If ReadAt is reading from an input source with a seek offset,
+// ReadAt should not affect nor be affected by the underlying
+// seek offset.
+//
+// Clients of ReadAt can execute parallel ReadAt calls on the
+// same input source. This is however not recommended.
+func (r *ReadSeeker) ReadAt(p []byte, offset int64) (int, error) {
+ r.readAtMu.Lock()
+ defer r.readAtMu.Unlock()
+ _, err := r.Seek(offset, io.SeekStart)
+ if err != nil {
+ return 0, err
+ }
+ n := 0
+ for n < len(p) {
+ n2, err := r.Read(p[n:])
+ if err != nil {
+ // This will include io.EOF
+ return n + n2, err
+ }
+ n += n2
+ }
+ return n, nil
}
// ReadByte satisfies the io.ByteReader interface.
@@ -1039,3 +1112,370 @@ func (r *Reader) SkippableCB(id uint8, fn func(r io.Reader) error) error {
r.skippableCB[id] = fn
return nil
}
+
+// s2DecodeDict writes the decoding of src to dst. It assumes that the varint-encoded
+// length of the decompressed bytes has already been read, and that len(dst)
+// equals that length.
+//
+// It returns 0 on success or a decodeErrCodeXxx error code on failure.
+func s2DecodeDict(dst, src []byte, dict *Dict) int {
+ if dict == nil {
+ return s2Decode(dst, src)
+ }
+ const debug = false
+ const debugErrs = debug
+
+ if debug {
+ fmt.Println("Starting decode, dst len:", len(dst))
+ }
+ var d, s, length int
+ offset := len(dict.dict) - dict.repeat
+
+ // As long as we can read at least 5 bytes...
+ for s < len(src)-5 {
+ // Removing bounds checks is SLOWER when doing
+ // in := src[s:s+5]
+ // Checked on Go 1.18
+ switch src[s] & 0x03 {
+ case tagLiteral:
+ x := uint32(src[s] >> 2)
+ switch {
+ case x < 60:
+ s++
+ case x == 60:
+ s += 2
+ x = uint32(src[s-1])
+ case x == 61:
+ in := src[s : s+3]
+ x = uint32(in[1]) | uint32(in[2])<<8
+ s += 3
+ case x == 62:
+ in := src[s : s+4]
+ // Load as 32 bit and shift down.
+ x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
+ x >>= 8
+ s += 4
+ case x == 63:
+ in := src[s : s+5]
+ x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24
+ s += 5
+ }
+ length = int(x) + 1
+ if debug {
+ fmt.Println("literals, length:", length, "d-after:", d+length)
+ }
+ if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
+ if debugErrs {
+ fmt.Println("corrupt literal: length:", length, "d-left:", len(dst)-d, "src-left:", len(src)-s)
+ }
+ return decodeErrCodeCorrupt
+ }
+
+ copy(dst[d:], src[s:s+length])
+ d += length
+ s += length
+ continue
+
+ case tagCopy1:
+ s += 2
+ toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+ length = int(src[s-2]) >> 2 & 0x7
+ if toffset == 0 {
+ if debug {
+ fmt.Print("(repeat) ")
+ }
+ // keep last offset
+ switch length {
+ case 5:
+ length = int(src[s]) + 4
+ s += 1
+ case 6:
+ in := src[s : s+2]
+ length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8)
+ s += 2
+ case 7:
+ in := src[s : s+3]
+ length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16)
+ s += 3
+ default: // 0-> 4
+ }
+ } else {
+ offset = toffset
+ }
+ length += 4
+ case tagCopy2:
+ in := src[s : s+3]
+ offset = int(uint32(in[1]) | uint32(in[2])<<8)
+ length = 1 + int(in[0])>>2
+ s += 3
+
+ case tagCopy4:
+ in := src[s : s+5]
+ offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24)
+ length = 1 + int(in[0])>>2
+ s += 5
+ }
+
+ if offset <= 0 || length > len(dst)-d {
+ if debugErrs {
+ fmt.Println("match error; offset:", offset, "length:", length, "dst-left:", len(dst)-d)
+ }
+ return decodeErrCodeCorrupt
+ }
+
+ // copy from dict
+ if d < offset {
+ if d > MaxDictSrcOffset {
+ if debugErrs {
+ fmt.Println("dict after", MaxDictSrcOffset, "d:", d, "offset:", offset, "length:", length)
+ }
+ return decodeErrCodeCorrupt
+ }
+ startOff := len(dict.dict) - offset + d
+ if startOff < 0 || startOff+length > len(dict.dict) {
+ if debugErrs {
+ fmt.Printf("offset (%d) + length (%d) bigger than dict (%d)\n", offset, length, len(dict.dict))
+ }
+ return decodeErrCodeCorrupt
+ }
+ if debug {
+ fmt.Println("dict copy, length:", length, "offset:", offset, "d-after:", d+length, "dict start offset:", startOff)
+ }
+ copy(dst[d:d+length], dict.dict[startOff:])
+ d += length
+ continue
+ }
+
+ if debug {
+ fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
+ }
+
+ // Copy from an earlier sub-slice of dst to a later sub-slice.
+ // If no overlap, use the built-in copy:
+ if offset > length {
+ copy(dst[d:d+length], dst[d-offset:])
+ d += length
+ continue
+ }
+
+ // Unlike the built-in copy function, this byte-by-byte copy always runs
+ // forwards, even if the slices overlap. Conceptually, this is:
+ //
+ // d += forwardCopy(dst[d:d+length], dst[d-offset:])
+ //
+ // We align the slices into a and b and show the compiler they are the same size.
+ // This allows the loop to run without bounds checks.
+ a := dst[d : d+length]
+ b := dst[d-offset:]
+ b = b[:len(a)]
+ for i := range a {
+ a[i] = b[i]
+ }
+ d += length
+ }
+
+ // Remaining with extra checks...
+ for s < len(src) {
+ switch src[s] & 0x03 {
+ case tagLiteral:
+ x := uint32(src[s] >> 2)
+ switch {
+ case x < 60:
+ s++
+ case x == 60:
+ s += 2
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ x = uint32(src[s-1])
+ case x == 61:
+ s += 3
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ x = uint32(src[s-2]) | uint32(src[s-1])<<8
+ case x == 62:
+ s += 4
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+ case x == 63:
+ s += 5
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+ }
+ length = int(x) + 1
+ if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
+ if debugErrs {
+ fmt.Println("corrupt literal: length:", length, "d-left:", len(dst)-d, "src-left:", len(src)-s)
+ }
+ return decodeErrCodeCorrupt
+ }
+ if debug {
+ fmt.Println("literals, length:", length, "d-after:", d+length)
+ }
+
+ copy(dst[d:], src[s:s+length])
+ d += length
+ s += length
+ continue
+
+ case tagCopy1:
+ s += 2
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ length = int(src[s-2]) >> 2 & 0x7
+ toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+ if toffset == 0 {
+ if debug {
+ fmt.Print("(repeat) ")
+ }
+ // keep last offset
+ switch length {
+ case 5:
+ s += 1
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ length = int(uint32(src[s-1])) + 4
+ case 6:
+ s += 2
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
+ case 7:
+ s += 3
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
+ default: // 0-> 4
+ }
+ } else {
+ offset = toffset
+ }
+ length += 4
+ case tagCopy2:
+ s += 3
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ length = 1 + int(src[s-3])>>2
+ offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+
+ case tagCopy4:
+ s += 5
+ if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+ if debugErrs {
+ fmt.Println("src went oob")
+ }
+ return decodeErrCodeCorrupt
+ }
+ length = 1 + int(src[s-5])>>2
+ offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+ }
+
+ if offset <= 0 || length > len(dst)-d {
+ if debugErrs {
+ fmt.Println("match error; offset:", offset, "length:", length, "dst-left:", len(dst)-d)
+ }
+ return decodeErrCodeCorrupt
+ }
+
+ // copy from dict
+ if d < offset {
+ if d > MaxDictSrcOffset {
+ if debugErrs {
+ fmt.Println("dict after", MaxDictSrcOffset, "d:", d, "offset:", offset, "length:", length)
+ }
+ return decodeErrCodeCorrupt
+ }
+ rOff := len(dict.dict) - (offset - d)
+ if debug {
+ fmt.Println("starting dict entry from dict offset", len(dict.dict)-rOff)
+ }
+ if rOff+length > len(dict.dict) {
+ if debugErrs {
+ fmt.Println("err: END offset", rOff+length, "bigger than dict", len(dict.dict), "dict offset:", rOff, "length:", length)
+ }
+ return decodeErrCodeCorrupt
+ }
+ if rOff < 0 {
+ if debugErrs {
+ fmt.Println("err: START offset", rOff, "less than 0", len(dict.dict), "dict offset:", rOff, "length:", length)
+ }
+ return decodeErrCodeCorrupt
+ }
+ copy(dst[d:d+length], dict.dict[rOff:])
+ d += length
+ continue
+ }
+
+ if debug {
+ fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
+ }
+
+ // Copy from an earlier sub-slice of dst to a later sub-slice.
+ // If no overlap, use the built-in copy:
+ if offset > length {
+ copy(dst[d:d+length], dst[d-offset:])
+ d += length
+ continue
+ }
+
+ // Unlike the built-in copy function, this byte-by-byte copy always runs
+ // forwards, even if the slices overlap. Conceptually, this is:
+ //
+ // d += forwardCopy(dst[d:d+length], dst[d-offset:])
+ //
+ // We align the slices into a and b and show the compiler they are the same size.
+ // This allows the loop to run without bounds checks.
+ a := dst[d : d+length]
+ b := dst[d-offset:]
+ b = b[:len(a)]
+ for i := range a {
+ a[i] = b[i]
+ }
+ d += length
+ }
+
+ if d != len(dst) {
+ if debugErrs {
+ fmt.Println("wanted length", len(dst), "got", d)
+ }
+ return decodeErrCodeCorrupt
+ }
+ return 0
+}
diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go
index 1074ebd2..2cb55c2c 100644
--- a/vendor/github.com/klauspost/compress/s2/decode_other.go
+++ b/vendor/github.com/klauspost/compress/s2/decode_other.go
@@ -28,6 +28,9 @@ func s2Decode(dst, src []byte) int {
// As long as we can read at least 5 bytes...
for s < len(src)-5 {
+ // Removing bounds checks is SLOWER when doing
+ // in := src[s:s+5]
+ // Checked on Go 1.18
switch src[s] & 0x03 {
case tagLiteral:
x := uint32(src[s] >> 2)
@@ -38,17 +41,25 @@ func s2Decode(dst, src []byte) int {
s += 2
x = uint32(src[s-1])
case x == 61:
+ in := src[s : s+3]
+ x = uint32(in[1]) | uint32(in[2])<<8
s += 3
- x = uint32(src[s-2]) | uint32(src[s-1])<<8
case x == 62:
+ in := src[s : s+4]
+ // Load as 32 bit and shift down.
+ x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
+ x >>= 8
s += 4
- x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
case x == 63:
+ in := src[s : s+5]
+ x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24
s += 5
- x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
}
length = int(x) + 1
if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
+ if debug {
+ fmt.Println("corrupt: lit size", length)
+ }
return decodeErrCodeCorrupt
}
if debug {
@@ -62,8 +73,8 @@ func s2Decode(dst, src []byte) int {
case tagCopy1:
s += 2
- length = int(src[s-2]) >> 2 & 0x7
toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+ length = int(src[s-2]) >> 2 & 0x7
if toffset == 0 {
if debug {
fmt.Print("(repeat) ")
@@ -71,14 +82,16 @@ func s2Decode(dst, src []byte) int {
// keep last offset
switch length {
case 5:
+ length = int(src[s]) + 4
s += 1
- length = int(uint32(src[s-1])) + 4
case 6:
+ in := src[s : s+2]
+ length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8)
s += 2
- length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
case 7:
+ in := src[s : s+3]
+ length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16)
s += 3
- length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
default: // 0-> 4
}
} else {
@@ -86,17 +99,23 @@ func s2Decode(dst, src []byte) int {
}
length += 4
case tagCopy2:
+ in := src[s : s+3]
+ offset = int(uint32(in[1]) | uint32(in[2])<<8)
+ length = 1 + int(in[0])>>2
s += 3
- length = 1 + int(src[s-3])>>2
- offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
case tagCopy4:
+ in := src[s : s+5]
+ offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24)
+ length = 1 + int(in[0])>>2
s += 5
- length = 1 + int(src[s-5])>>2
- offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
}
if offset <= 0 || d < offset || length > len(dst)-d {
+ if debug {
+ fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d)
+ }
+
return decodeErrCodeCorrupt
}
@@ -163,6 +182,9 @@ func s2Decode(dst, src []byte) int {
}
length = int(x) + 1
if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
+ if debug {
+ fmt.Println("corrupt: lit size", length)
+ }
return decodeErrCodeCorrupt
}
if debug {
@@ -229,6 +251,9 @@ func s2Decode(dst, src []byte) int {
}
if offset <= 0 || d < offset || length > len(dst)-d {
+ if debug {
+ fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d)
+ }
return decodeErrCodeCorrupt
}
diff --git a/vendor/github.com/klauspost/compress/s2/dict.go b/vendor/github.com/klauspost/compress/s2/dict.go
new file mode 100644
index 00000000..24f7ce80
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/dict.go
@@ -0,0 +1,331 @@
+// Copyright (c) 2022+ Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+ "bytes"
+ "encoding/binary"
+ "sync"
+)
+
+const (
+ // MinDictSize is the minimum dictionary size, not counting the leading repeat varint.
+ MinDictSize = 16
+
+ // MaxDictSize is the maximum dictionary size, not counting the leading repeat varint.
+ MaxDictSize = 65536
+
+ // MaxDictSrcOffset is the maximum offset where a dictionary entry can start.
+ MaxDictSrcOffset = 65535
+)
+
+// Dict contains a dictionary that can be used for encoding and decoding s2
+type Dict struct {
+ dict []byte
+ repeat int // Repeat as index of dict
+
+ fast, better, best sync.Once
+ fastTable *[1 << 14]uint16
+
+ betterTableShort *[1 << 14]uint16
+ betterTableLong *[1 << 17]uint16
+
+ bestTableShort *[1 << 16]uint32
+ bestTableLong *[1 << 19]uint32
+}
+
+// NewDict will read a dictionary.
+// It will return nil if the dictionary is invalid.
+func NewDict(dict []byte) *Dict {
+ if len(dict) == 0 {
+ return nil
+ }
+ var d Dict
+ // Repeat is the first value of the dict
+ r, n := binary.Uvarint(dict)
+ if n <= 0 {
+ return nil
+ }
+ dict = dict[n:]
+ d.dict = dict
+ if cap(d.dict) < len(d.dict)+16 {
+ d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
+ }
+ if len(dict) < MinDictSize || len(dict) > MaxDictSize {
+ return nil
+ }
+ d.repeat = int(r)
+ if d.repeat > len(dict) {
+ return nil
+ }
+ return &d
+}
+
+// Bytes will return a serialized version of the dictionary.
+// The output can be sent to NewDict.
+func (d *Dict) Bytes() []byte {
+ dst := make([]byte, binary.MaxVarintLen16+len(d.dict))
+ return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...)
+}
+
+// MakeDict will create a dictionary.
+// 'data' must be at least MinDictSize.
+// If data is longer than MaxDictSize only the last MaxDictSize bytes will be used.
+// If searchStart is set the start repeat value will be set to the last
+// match of this content.
+// If no matches are found, it will attempt to find shorter matches.
+// This content should match the typical start of a block.
+// If at least 5 bytes cannot be matched, repeat is left at 0 (the start of the dictionary).
+func MakeDict(data []byte, searchStart []byte) *Dict {
+ if len(data) == 0 {
+ return nil
+ }
+ if len(data) > MaxDictSize {
+ data = data[len(data)-MaxDictSize:]
+ }
+ var d Dict
+ dict := data
+ d.dict = dict
+ if cap(d.dict) < len(d.dict)+16 {
+ d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
+ }
+ if len(dict) < MinDictSize {
+ return nil
+ }
+
+ // Find the longest match possible, last entry if multiple.
+ for s := len(searchStart); s > 4; s-- {
+ if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 {
+ d.repeat = idx
+ break
+ }
+ }
+
+ return &d
+}
+
+// Encode returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// The blocks will require the same amount of memory to decode as encoding,
+// and do not allow for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may be undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func (d *Dict) Encode(dst, src []byte) []byte {
+ if n := MaxEncodedLen(len(src)); n < 0 {
+ panic(ErrTooLarge)
+ } else if cap(dst) < n {
+ dst = make([]byte, n)
+ } else {
+ dst = dst[:n]
+ }
+
+ // The block starts with the varint-encoded length of the decompressed bytes.
+ dstP := binary.PutUvarint(dst, uint64(len(src)))
+
+ if len(src) == 0 {
+ return dst[:dstP]
+ }
+ if len(src) < minNonLiteralBlockSize {
+ dstP += emitLiteral(dst[dstP:], src)
+ return dst[:dstP]
+ }
+ n := encodeBlockDictGo(dst[dstP:], src, d)
+ if n > 0 {
+ dstP += n
+ return dst[:dstP]
+ }
+ // Not compressible
+ dstP += emitLiteral(dst[dstP:], src)
+ return dst[:dstP]
+}
+
+// EncodeBetter returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// EncodeBetter compresses better than Encode but typically with a
+// 10-40% speed decrease on both compression and decompression.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// The blocks will require the same amount of memory to decode as encoding,
+// and do not allow for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may be undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func (d *Dict) EncodeBetter(dst, src []byte) []byte {
+ if n := MaxEncodedLen(len(src)); n < 0 {
+ panic(ErrTooLarge)
+ } else if len(dst) < n {
+ dst = make([]byte, n)
+ }
+
+ // The block starts with the varint-encoded length of the decompressed bytes.
+ dstP := binary.PutUvarint(dst, uint64(len(src)))
+
+ if len(src) == 0 {
+ return dst[:dstP]
+ }
+ if len(src) < minNonLiteralBlockSize {
+ dstP += emitLiteral(dst[dstP:], src)
+ return dst[:dstP]
+ }
+ n := encodeBlockBetterDict(dst[dstP:], src, d)
+ if n > 0 {
+ dstP += n
+ return dst[:dstP]
+ }
+ // Not compressible
+ dstP += emitLiteral(dst[dstP:], src)
+ return dst[:dstP]
+}
+
+// EncodeBest returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// EncodeBest compresses as well as reasonably possible but with a
+// big speed decrease.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// The blocks will require the same amount of memory to decode as encoding,
+// and do not allow for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may be undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func (d *Dict) EncodeBest(dst, src []byte) []byte {
+ if n := MaxEncodedLen(len(src)); n < 0 {
+ panic(ErrTooLarge)
+ } else if len(dst) < n {
+ dst = make([]byte, n)
+ }
+
+ // The block starts with the varint-encoded length of the decompressed bytes.
+ dstP := binary.PutUvarint(dst, uint64(len(src)))
+
+ if len(src) == 0 {
+ return dst[:dstP]
+ }
+ if len(src) < minNonLiteralBlockSize {
+ dstP += emitLiteral(dst[dstP:], src)
+ return dst[:dstP]
+ }
+ n := encodeBlockBest(dst[dstP:], src, d)
+ if n > 0 {
+ dstP += n
+ return dst[:dstP]
+ }
+ // Not compressible
+ dstP += emitLiteral(dst[dstP:], src)
+ return dst[:dstP]
+}
+
+// Decode returns the decoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire decoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+func (d *Dict) Decode(dst, src []byte) ([]byte, error) {
+ dLen, s, err := decodedLen(src)
+ if err != nil {
+ return nil, err
+ }
+ if dLen <= cap(dst) {
+ dst = dst[:dLen]
+ } else {
+ dst = make([]byte, dLen)
+ }
+ if s2DecodeDict(dst, src[s:], d) != 0 {
+ return nil, ErrCorrupt
+ }
+ return dst, nil
+}
+
+func (d *Dict) initFast() {
+ d.fast.Do(func() {
+ const (
+ tableBits = 14
+ maxTableSize = 1 << tableBits
+ )
+
+ var table [maxTableSize]uint16
+ // We stop so any entry of length 8 can always be read.
+ for i := 0; i < len(d.dict)-8-2; i += 3 {
+ x0 := load64(d.dict, i)
+ h0 := hash6(x0, tableBits)
+ h1 := hash6(x0>>8, tableBits)
+ h2 := hash6(x0>>16, tableBits)
+ table[h0] = uint16(i)
+ table[h1] = uint16(i + 1)
+ table[h2] = uint16(i + 2)
+ }
+ d.fastTable = &table
+ })
+}
+
+func (d *Dict) initBetter() {
+ d.better.Do(func() {
+ const (
+ // Long hash matches.
+ lTableBits = 17
+ maxLTableSize = 1 << lTableBits
+
+ // Short hash matches.
+ sTableBits = 14
+ maxSTableSize = 1 << sTableBits
+ )
+
+ var lTable [maxLTableSize]uint16
+ var sTable [maxSTableSize]uint16
+
+ // We stop so any entry of length 8 can always be read.
+ for i := 0; i < len(d.dict)-8; i++ {
+ cv := load64(d.dict, i)
+ lTable[hash7(cv, lTableBits)] = uint16(i)
+ sTable[hash4(cv, sTableBits)] = uint16(i)
+ }
+ d.betterTableShort = &sTable
+ d.betterTableLong = &lTable
+ })
+}
+
+func (d *Dict) initBest() {
+ d.best.Do(func() {
+ const (
+ // Long hash matches.
+ lTableBits = 19
+ maxLTableSize = 1 << lTableBits
+
+ // Short hash matches.
+ sTableBits = 16
+ maxSTableSize = 1 << sTableBits
+ )
+
+ var lTable [maxLTableSize]uint32
+ var sTable [maxSTableSize]uint32
+
+ // We stop so any entry of length 8 can always be read.
+ for i := 0; i < len(d.dict)-8; i++ {
+ cv := load64(d.dict, i)
+ hashL := hash8(cv, lTableBits)
+ hashS := hash4(cv, sTableBits)
+ candidateL := lTable[hashL]
+ candidateS := sTable[hashS]
+ lTable[hashL] = uint32(i) | candidateL<<16
+ sTable[hashS] = uint32(i) | candidateS<<16
+ }
+ d.bestTableShort = &sTable
+ d.bestTableLong = &lTable
+ })
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go
index 1aefabf3..c2ca7236 100644
--- a/vendor/github.com/klauspost/compress/s2/encode.go
+++ b/vendor/github.com/klauspost/compress/s2/encode.go
@@ -58,6 +58,32 @@ func Encode(dst, src []byte) []byte {
return dst[:d]
}
+// EstimateBlockSize will perform a very fast compression
+// without outputting the result and return the compressed output size.
+// The function returns -1 if no improvement could be achieved.
+// Using actual compression will most often produce better compression than the estimate.
+func EstimateBlockSize(src []byte) (d int) {
+ if len(src) < 6 || int64(len(src)) > 0xffffffff {
+ return -1
+ }
+ if len(src) <= 1024 {
+ d = calcBlockSizeSmall(src)
+ } else {
+ d = calcBlockSize(src)
+ }
+
+ if d == 0 {
+ return -1
+ }
+ // Size of the varint encoded block size.
+ d += (bits.Len64(uint64(len(src))) + 7) / 7
+
+ if d >= len(src) {
+ return -1
+ }
+ return d
+}
+
// EncodeBetter returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
@@ -132,7 +158,7 @@ func EncodeBest(dst, src []byte) []byte {
d += emitLiteral(dst[d:], src)
return dst[:d]
}
- n := encodeBlockBest(dst[d:], src)
+ n := encodeBlockBest(dst[d:], src, nil)
if n > 0 {
d += n
return dst[:d]
@@ -404,10 +430,11 @@ type Writer struct {
buffers sync.Pool
pad int
- writer io.Writer
- randSrc io.Reader
- writerWg sync.WaitGroup
- index Index
+ writer io.Writer
+ randSrc io.Reader
+ writerWg sync.WaitGroup
+ index Index
+ customEnc func(dst, src []byte) int
// wroteStreamHeader is whether we have written the stream header.
wroteStreamHeader bool
@@ -773,6 +800,9 @@ func (w *Writer) EncodeBuffer(buf []byte) (err error) {
}
func (w *Writer) encodeBlock(obuf, uncompressed []byte) int {
+ if w.customEnc != nil {
+ return w.customEnc(obuf, uncompressed)
+ }
if w.snappy {
switch w.level {
case levelFast:
@@ -790,7 +820,7 @@ func (w *Writer) encodeBlock(obuf, uncompressed []byte) int {
case levelBetter:
return encodeBlockBetter(obuf, uncompressed)
case levelBest:
- return encodeBlockBest(obuf, uncompressed)
+ return encodeBlockBest(obuf, uncompressed, nil)
}
return 0
}
@@ -1339,3 +1369,15 @@ func WriterFlushOnWrite() WriterOption {
return nil
}
}
+
+// WriterCustomEncoder allows overriding the encoder for blocks on the stream.
+// The function must compress 'src' into 'dst' and return the bytes used in dst as an integer.
+// Block size (initial varint) should not be added by the encoder.
+// Returning value 0 indicates the block could not be compressed.
+// The function should expect to be called concurrently.
+func WriterCustomEncoder(fn func(dst, src []byte) int) WriterOption {
+ return func(w *Writer) error {
+ w.customEnc = fn
+ return nil
+ }
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go
index 8b16c38a..11657f09 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_all.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_all.go
@@ -8,6 +8,7 @@ package s2
import (
"bytes"
"encoding/binary"
+ "fmt"
"math/bits"
)
@@ -58,8 +59,9 @@ func encodeGo(dst, src []byte) []byte {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockGo(dst, src []byte) (d int) {
// Initialize the hash table.
const (
@@ -454,3 +456,594 @@ emitRemainder:
}
return d
}
+
+// encodeBlockDictGo encodes a non-empty src to a guaranteed-large-enough dst,
+// searching both dict and src for matches. It assumes that the varint-encoded
+// length of the decompressed bytes has already been written.
+//
+// It also assumes that:
+//
+// len(dst) >= MaxEncodedLen(len(src)) &&
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockDictGo(dst, src []byte, dict *Dict) (d int) {
+ // Initialize the hash table.
+ const (
+ tableBits = 14
+ maxTableSize = 1 << tableBits
+ maxAhead = 8 // maximum bytes ahead without checking sLimit
+
+ debug = false
+ )
+ dict.initFast()
+
+ var table [maxTableSize]uint32
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+ if sLimit > MaxDictSrcOffset-maxAhead {
+ sLimit = MaxDictSrcOffset - maxAhead
+ }
+
+ // Bail if we can't compress to at least this.
+ dstLimit := len(src) - len(src)>>5 - 5
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form can start with a dict entry (copy or repeat).
+ s := 0
+
+ // Convert dict repeat to offset
+ repeat := len(dict.dict) - dict.repeat
+ cv := load64(src, 0)
+
+ // While in dict
+searchDict:
+ for {
+ // Next src position to check
+ nextS := s + (s-nextEmit)>>6 + 4
+ hash0 := hash6(cv, tableBits)
+ hash1 := hash6(cv>>8, tableBits)
+ if nextS > sLimit {
+ if debug {
+ fmt.Println("slimit reached", s, nextS)
+ }
+ break searchDict
+ }
+ candidateDict := int(dict.fastTable[hash0])
+ candidateDict2 := int(dict.fastTable[hash1])
+ candidate2 := int(table[hash1])
+ candidate := int(table[hash0])
+ table[hash0] = uint32(s)
+ table[hash1] = uint32(s + 1)
+ hash2 := hash6(cv>>16, tableBits)
+
+ // Check repeat at offset checkRep.
+ const checkRep = 1
+
+ if repeat > s {
+ candidate := len(dict.dict) - repeat + s
+ if repeat-s >= 4 && uint32(cv) == load32(dict.dict, candidate) {
+ // Extend back
+ base := s
+ for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ if debug && nextEmit != base {
+ fmt.Println("emitted ", base-nextEmit, "literals")
+ }
+ s += 4
+ candidate += 4
+ for candidate < len(dict.dict)-8 && s <= len(src)-8 {
+ if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+ d += emitRepeat(dst[d:], repeat, s-base)
+ if debug {
+ fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
+ }
+ nextEmit = s
+ if s >= sLimit {
+ break searchDict
+ }
+ cv = load64(src, s)
+ continue
+ }
+ } else if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+ base := s + checkRep
+ // Extend back
+ for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ if debug && nextEmit != base {
+ fmt.Println("emitted ", base-nextEmit, "literals")
+ }
+
+ // Extend forward
+ candidate := s - repeat + 4 + checkRep
+ s += 4 + checkRep
+ for s <= sLimit {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+ if debug {
+ // Validate match.
+ if s <= candidate {
+ panic("s <= candidate")
+ }
+ a := src[base:s]
+ b := src[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+
+ if nextEmit > 0 {
+ // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+ d += emitRepeat(dst[d:], repeat, s-base)
+ } else {
+ // First match, cannot be repeat.
+ d += emitCopy(dst[d:], repeat, s-base)
+ }
+
+ nextEmit = s
+ if s >= sLimit {
+ break searchDict
+ }
+ if debug {
+ fmt.Println("emitted reg repeat", s-base, "s:", s)
+ }
+ cv = load64(src, s)
+ continue searchDict
+ }
+ if s == 0 {
+ cv = load64(src, nextS)
+ s = nextS
+ continue searchDict
+ }
+ // Start with table. These matches will always be closer.
+ if uint32(cv) == load32(src, candidate) {
+ goto emitMatch
+ }
+ candidate = int(table[hash2])
+ if uint32(cv>>8) == load32(src, candidate2) {
+ table[hash2] = uint32(s + 2)
+ candidate = candidate2
+ s++
+ goto emitMatch
+ }
+
+ // Check dict. Dicts have longer offsets, so we want longer matches.
+ if cv == load64(dict.dict, candidateDict) {
+ table[hash2] = uint32(s + 2)
+ goto emitDict
+ }
+
+ candidateDict = int(dict.fastTable[hash2])
+ // Check if upper 7 bytes match
+ if candidateDict2 >= 1 {
+ if cv^load64(dict.dict, candidateDict2-1) < (1 << 8) {
+ table[hash2] = uint32(s + 2)
+ candidateDict = candidateDict2
+ s++
+ goto emitDict
+ }
+ }
+
+ table[hash2] = uint32(s + 2)
+ if uint32(cv>>16) == load32(src, candidate) {
+ s += 2
+ goto emitMatch
+ }
+ if candidateDict >= 2 {
+ // Check if upper 6 bytes match
+ if cv^load64(dict.dict, candidateDict-2) < (1 << 16) {
+ s += 2
+ goto emitDict
+ }
+ }
+
+ cv = load64(src, nextS)
+ s = nextS
+ continue searchDict
+
+ emitDict:
+ {
+ if debug {
+ if load32(dict.dict, candidateDict) != load32(src, s) {
+ panic("dict emit mismatch")
+ }
+ }
+ // Extend backwards.
+ // The top bytes will be rechecked to get the full match.
+ for candidateDict > 0 && s > nextEmit && dict.dict[candidateDict-1] == src[s-1] {
+ candidateDict--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+
+ d += emitLiteral(dst[d:], src[nextEmit:s])
+ if debug && nextEmit != s {
+ fmt.Println("emitted ", s-nextEmit, "literals")
+ }
+ {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+ repeat = s + (len(dict.dict)) - candidateDict
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidateDict += 4
+ for s <= len(src)-8 && len(dict.dict)-candidateDict >= 8 {
+ if diff := load64(src, s) ^ load64(dict.dict, candidateDict); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidateDict += 8
+ }
+
+ // Matches that end past sLimit and are at least 8 bytes long are split.
+ if s <= sLimit || s-base < 8 {
+ d += emitCopy(dst[d:], repeat, s-base)
+ } else {
+ // Split to ensure we don't start a copy within next block
+ d += emitCopy(dst[d:], repeat, 4)
+ d += emitRepeat(dst[d:], repeat, s-base-4)
+ }
+ if false {
+ // Validate match.
+ if s <= candidate {
+ panic("s <= candidate")
+ }
+ a := src[base:s]
+ b := dict.dict[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+ if debug {
+ fmt.Println("emitted dict copy, length", s-base, "offset:", repeat, "s:", s)
+ }
+ nextEmit = s
+ if s >= sLimit {
+ break searchDict
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+
+ // Index and continue loop to try new candidate.
+ x := load64(src, s-2)
+ m2Hash := hash6(x, tableBits)
+ currHash := hash6(x>>8, tableBits)
+ candidate = int(table[currHash])
+ table[m2Hash] = uint32(s - 2)
+ table[currHash] = uint32(s - 1)
+ cv = load64(src, s)
+ }
+ continue
+ }
+ emitMatch:
+
+ // Extend backwards.
+ // The top bytes will be rechecked to get the full match.
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+ candidate--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+
+ d += emitLiteral(dst[d:], src[nextEmit:s])
+ if debug && nextEmit != s {
+ fmt.Println("emitted ", s-nextEmit, "literals")
+ }
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+ repeat = base - candidate
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidate += 4
+ for s <= len(src)-8 {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+
+ d += emitCopy(dst[d:], repeat, s-base)
+ if debug {
+ // Validate match.
+ if s <= candidate {
+ panic("s <= candidate")
+ }
+ a := src[base:s]
+ b := src[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+ if debug {
+ fmt.Println("emitted src copy, length", s-base, "offset:", repeat, "s:", s)
+ }
+ nextEmit = s
+ if s >= sLimit {
+ break searchDict
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+ // Check for an immediate match, otherwise start search at s+1
+ x := load64(src, s-2)
+ m2Hash := hash6(x, tableBits)
+ currHash := hash6(x>>16, tableBits)
+ candidate = int(table[currHash])
+ table[m2Hash] = uint32(s - 2)
+ table[currHash] = uint32(s)
+ if debug && s == candidate {
+ panic("s == candidate")
+ }
+ if uint32(x>>16) != load32(src, candidate) {
+ cv = load64(src, s+1)
+ s++
+ break
+ }
+ }
+ }
+
+ // Search without dict:
+ if repeat > s {
+ repeat = 0
+ }
+
+ // No more dict
+ sLimit = len(src) - inputMargin
+ if s >= sLimit {
+ goto emitRemainder
+ }
+ if debug {
+ fmt.Println("non-dict matching at", s, "repeat:", repeat)
+ }
+ cv = load64(src, s)
+ if debug {
+ fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
+ }
+ for {
+ candidate := 0
+ for {
+ // Next src position to check
+ nextS := s + (s-nextEmit)>>6 + 4
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ hash0 := hash6(cv, tableBits)
+ hash1 := hash6(cv>>8, tableBits)
+ candidate = int(table[hash0])
+ candidate2 := int(table[hash1])
+ table[hash0] = uint32(s)
+ table[hash1] = uint32(s + 1)
+ hash2 := hash6(cv>>16, tableBits)
+
+ // Check repeat at offset checkRep.
+ const checkRep = 1
+ if repeat > 0 && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+ base := s + checkRep
+ // Extend back
+ for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ if debug && nextEmit != base {
+ fmt.Println("emitted ", base-nextEmit, "literals")
+ }
+ // Extend forward
+ candidate := s - repeat + 4 + checkRep
+ s += 4 + checkRep
+ for s <= sLimit {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+ if debug {
+ // Validate match.
+ if s <= candidate {
+ panic("s <= candidate")
+ }
+ a := src[base:s]
+ b := src[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+ if nextEmit > 0 {
+ // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+ d += emitRepeat(dst[d:], repeat, s-base)
+ } else {
+ // First match, cannot be repeat.
+ d += emitCopy(dst[d:], repeat, s-base)
+ }
+ if debug {
+ fmt.Println("emitted src repeat length", s-base, "offset:", repeat, "s:", s)
+ }
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ cv = load64(src, s)
+ continue
+ }
+
+ if uint32(cv) == load32(src, candidate) {
+ break
+ }
+ candidate = int(table[hash2])
+ if uint32(cv>>8) == load32(src, candidate2) {
+ table[hash2] = uint32(s + 2)
+ candidate = candidate2
+ s++
+ break
+ }
+ table[hash2] = uint32(s + 2)
+ if uint32(cv>>16) == load32(src, candidate) {
+ s += 2
+ break
+ }
+
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards.
+ // The top bytes will be rechecked to get the full match.
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+ candidate--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+
+ d += emitLiteral(dst[d:], src[nextEmit:s])
+ if debug && nextEmit != s {
+ fmt.Println("emitted ", s-nextEmit, "literals")
+ }
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+ repeat = base - candidate
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidate += 4
+ for s <= len(src)-8 {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+
+ d += emitCopy(dst[d:], repeat, s-base)
+ if debug {
+ // Validate match.
+ if s <= candidate {
+ panic("s <= candidate")
+ }
+ a := src[base:s]
+ b := src[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+ if debug {
+ fmt.Println("emitted src copy, length", s-base, "offset:", repeat, "s:", s)
+ }
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+ // Check for an immediate match, otherwise start search at s+1
+ x := load64(src, s-2)
+ m2Hash := hash6(x, tableBits)
+ currHash := hash6(x>>16, tableBits)
+ candidate = int(table[currHash])
+ table[m2Hash] = uint32(s - 2)
+ table[currHash] = uint32(s)
+ if debug && s == candidate {
+ panic("s == candidate")
+ }
+ if uint32(x>>16) != load32(src, candidate) {
+ cv = load64(src, s+1)
+ s++
+ break
+ }
+ }
+ }
+
+emitRemainder:
+ if nextEmit < len(src) {
+ // Bail if we exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:])
+ if debug && nextEmit != s {
+ fmt.Println("emitted ", len(src)-nextEmit, "literals")
+ }
+ }
+ return d
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
index e612225f..ebc332ad 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_amd64.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
@@ -3,13 +3,16 @@
package s2
+const hasAmd64Asm = true
+
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
@@ -43,8 +46,9 @@ func encodeBlock(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetter(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
@@ -78,8 +82,9 @@ func encodeBlockBetter(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockSnappy(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
@@ -112,8 +117,9 @@ func encodeBlockSnappy(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappy(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go
index 4bc80bc6..1d13e869 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_best.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_best.go
@@ -7,6 +7,7 @@ package s2
import (
"fmt"
+ "math"
"math/bits"
)
@@ -15,9 +16,10 @@ import (
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
-func encodeBlockBest(dst, src []byte) (d int) {
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
// Initialize the hash tables.
const (
// Long hash matches.
@@ -29,6 +31,8 @@ func encodeBlockBest(dst, src []byte) (d int) {
maxSTableSize = 1 << sTableBits
inputMargin = 8 + 2
+
+ debug = false
)
// sLimit is when to stop looking for offset/length copies. The inputMargin
@@ -38,6 +42,10 @@ func encodeBlockBest(dst, src []byte) (d int) {
if len(src) < minNonLiteralBlockSize {
return 0
}
+ sLimitDict := len(src) - inputMargin
+ if sLimitDict > MaxDictSrcOffset-inputMargin {
+ sLimitDict = MaxDictSrcOffset - inputMargin
+ }
var lTable [maxLTableSize]uint64
var sTable [maxSTableSize]uint64
@@ -51,10 +59,15 @@ func encodeBlockBest(dst, src []byte) (d int) {
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
+ repeat := 1
+ if dict != nil {
+ dict.initBest()
+ s = 0
+ repeat = len(dict.dict) - dict.repeat
+ }
cv := load64(src, s)
// We search for a repeat at -1, but don't output repeats when nextEmit == 0
- repeat := 1
const lowbitMask = 0xffffffff
getCur := func(x uint64) int {
return int(x & lowbitMask)
@@ -66,11 +79,11 @@ func encodeBlockBest(dst, src []byte) (d int) {
for {
type match struct {
- offset int
- s int
- length int
- score int
- rep bool
+ offset int
+ s int
+ length int
+ score int
+ rep, dict bool
}
var best match
for {
@@ -84,6 +97,12 @@ func encodeBlockBest(dst, src []byte) (d int) {
if nextS > sLimit {
goto emitRemainder
}
+ if dict != nil && s >= MaxDictSrcOffset {
+ dict = nil
+ if repeat > s {
+ repeat = math.MinInt32
+ }
+ }
hashL := hash8(cv, lTableBits)
hashS := hash4(cv, sTableBits)
candidateL := lTable[hashL]
@@ -113,7 +132,15 @@ func encodeBlockBest(dst, src []byte) (d int) {
}
m := match{offset: offset, s: s, length: 4 + offset, rep: rep}
s += 4
- for s <= sLimit {
+ for s < len(src) {
+ if len(src)-s < 8 {
+ if src[s] == src[m.length] {
+ m.length++
+ s++
+ continue
+ }
+ break
+ }
if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
m.length += bits.TrailingZeros64(diff) >> 3
break
@@ -129,6 +156,62 @@ func encodeBlockBest(dst, src []byte) (d int) {
}
return m
}
+ matchDict := func(candidate, s int, first uint32, rep bool) match {
+ // Calculate offset as if in continuous array with s
+ offset := -len(dict.dict) + candidate
+ if best.length != 0 && best.s-best.offset == s-offset && !rep {
+ // Don't retest if we have the same offset.
+ return match{offset: offset, s: s}
+ }
+
+ if load32(dict.dict, candidate) != first {
+ return match{offset: offset, s: s}
+ }
+ m := match{offset: offset, s: s, length: 4 + candidate, rep: rep, dict: true}
+ s += 4
+ if !rep {
+ for s < sLimitDict && m.length < len(dict.dict) {
+ if len(src)-s < 8 || len(dict.dict)-m.length < 8 {
+ if src[s] == dict.dict[m.length] {
+ m.length++
+ s++
+ continue
+ }
+ break
+ }
+ if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 {
+ m.length += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ m.length += 8
+ }
+ } else {
+ for s < len(src) && m.length < len(dict.dict) {
+ if len(src)-s < 8 || len(dict.dict)-m.length < 8 {
+ if src[s] == dict.dict[m.length] {
+ m.length++
+ s++
+ continue
+ }
+ break
+ }
+ if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 {
+ m.length += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ m.length += 8
+ }
+ }
+ m.length -= candidate
+ m.score = score(m)
+ if m.score <= -m.s {
+ // Eliminate if no savings, we might find a better one.
+ m.length = 0
+ }
+ return m
+ }
bestOf := func(a, b match) match {
if b.length == 0 {
@@ -145,45 +228,99 @@ func encodeBlockBest(dst, src []byte) (d int) {
return b
}
- best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false))
- best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false))
- best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false))
-
+ if s > 0 {
+ best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false))
+ }
+ if dict != nil {
+ candidateL := dict.bestTableLong[hashL]
+ candidateS := dict.bestTableShort[hashS]
+ best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
+ best = bestOf(best, matchDict(int(candidateL>>16), s, uint32(cv), false))
+ best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
+ best = bestOf(best, matchDict(int(candidateS>>16), s, uint32(cv), false))
+ }
{
- best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
+ if (dict == nil || repeat <= s) && repeat > 0 {
+ best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
+ } else if s-repeat < -4 && dict != nil {
+ candidate := len(dict.dict) - (repeat - s)
+ best = bestOf(best, matchDict(candidate, s, uint32(cv), true))
+ candidate++
+ best = bestOf(best, matchDict(candidate, s+1, uint32(cv>>8), true))
+ }
+
if best.length > 0 {
+ hashS := hash4(cv>>8, sTableBits)
// s+1
- nextShort := sTable[hash4(cv>>8, sTableBits)]
+ nextShort := sTable[hashS]
s := s + 1
cv := load64(src, s)
- nextLong := lTable[hash8(cv, lTableBits)]
+ hashL := hash8(cv, lTableBits)
+ nextLong := lTable[hashL]
best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
- // Repeat at + 2
- best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
+
+ // Dict at + 1
+ if dict != nil {
+ candidateL := dict.bestTableLong[hashL]
+ candidateS := dict.bestTableShort[hashS]
+
+ best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
+ best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
+ }
// s+2
if true {
- nextShort = sTable[hash4(cv>>8, sTableBits)]
+ hashS := hash4(cv>>8, sTableBits)
+
+ nextShort = sTable[hashS]
s++
cv = load64(src, s)
- nextLong = lTable[hash8(cv, lTableBits)]
+ hashL := hash8(cv, lTableBits)
+ nextLong = lTable[hashL]
+
+ if (dict == nil || repeat <= s) && repeat > 0 {
+ // Repeat at + 2
+ best = bestOf(best, matchAt(s-repeat, s, uint32(cv), true))
+ } else if repeat-s > 4 && dict != nil {
+ candidate := len(dict.dict) - (repeat - s)
+ best = bestOf(best, matchDict(candidate, s, uint32(cv), true))
+ }
best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
+
+ // Dict at +2
+ // Very small gain
+ if dict != nil {
+ candidateL := dict.bestTableLong[hashL]
+ candidateS := dict.bestTableShort[hashS]
+
+ best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
+ best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
+ }
}
// Search for a match at best match end, see if that is better.
- if sAt := best.s + best.length; sAt < sLimit {
- sBack := best.s
- backL := best.length
+ // Allow some bytes at the beginning to mismatch.
+ // Sweet spot is around 1-2 bytes, but depends on input.
+ // The skipped bytes are tested in Extend backwards,
+ // and still picked up as part of the match if they do.
+ const skipBeginning = 2
+ const skipEnd = 1
+ if sAt := best.s + best.length - skipEnd; sAt < sLimit {
+
+ sBack := best.s + skipBeginning - skipEnd
+ backL := best.length - skipBeginning
// Load initial values
cv = load64(src, sBack)
- // Search for mismatch
+
+ // Grab candidates...
next := lTable[hash8(load64(src, sAt), lTableBits)]
- //next := sTable[hash4(load64(src, sAt), sTableBits)]
if checkAt := getCur(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
@@ -191,6 +328,16 @@ func encodeBlockBest(dst, src []byte) (d int) {
if checkAt := getPrev(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
+ // Disabled: Extremely small gain
+ if false {
+ next = sTable[hash4(load64(src, sAt), sTableBits)]
+ if checkAt := getCur(next) - backL; checkAt > 0 {
+ best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+ }
+ if checkAt := getPrev(next) - backL; checkAt > 0 {
+ best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+ }
+ }
}
}
}
@@ -209,7 +356,7 @@ func encodeBlockBest(dst, src []byte) (d int) {
// Extend backwards, not needed for repeats...
s = best.s
- if !best.rep {
+ if !best.rep && !best.dict {
for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
best.offset--
best.length++
@@ -226,7 +373,6 @@ func encodeBlockBest(dst, src []byte) (d int) {
base := s
offset := s - best.offset
-
s += best.length
if offset > 65535 && s-base <= 5 && !best.rep {
@@ -238,16 +384,28 @@ func encodeBlockBest(dst, src []byte) (d int) {
cv = load64(src, s)
continue
}
+ if debug && nextEmit != base {
+ fmt.Println("EMIT", base-nextEmit, "literals. base-after:", base)
+ }
d += emitLiteral(dst[d:], src[nextEmit:base])
if best.rep {
- if nextEmit > 0 {
+ if nextEmit > 0 || best.dict {
+ if debug {
+ fmt.Println("REPEAT, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best)
+ }
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], offset, best.length)
} else {
- // First match, cannot be repeat.
+ // First match without dict cannot be a repeat.
+ if debug {
+ fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best)
+ }
d += emitCopy(dst[d:], offset, best.length)
}
} else {
+ if debug {
+ fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best)
+ }
d += emitCopy(dst[d:], offset, best.length)
}
repeat = offset
@@ -278,6 +436,9 @@ emitRemainder:
if d+len(src)-nextEmit > dstLimit {
return 0
}
+ if debug && nextEmit != s {
+ fmt.Println("emitted ", len(src)-nextEmit, "literals")
+ }
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
@@ -288,8 +449,9 @@ emitRemainder:
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBestSnappy(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
@@ -546,6 +708,7 @@ emitRemainder:
// emitCopySize returns the size to encode the offset+length
//
// It assumes that:
+//
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
func emitCopySize(offset, length int) int {
@@ -584,6 +747,7 @@ func emitCopySize(offset, length int) int {
// emitCopyNoRepeatSize returns the size to encode the offset+length
//
// It assumes that:
+//
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
func emitCopyNoRepeatSize(offset, length int) int {
@@ -621,7 +785,6 @@ func emitRepeatSize(offset, length int) int {
left := 0
if length > maxRepeat {
left = length - maxRepeat + 4
- length = maxRepeat - 4
}
if left > 0 {
return 5 + emitRepeatSize(offset, left)
diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go
index 943215b8..f46adb41 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_better.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_better.go
@@ -6,6 +6,8 @@
package s2
import (
+ "bytes"
+ "fmt"
"math/bits"
)
@@ -42,8 +44,9 @@ func hash8(u uint64, h uint8) uint32 {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterGo(dst, src []byte) (d int) {
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -56,7 +59,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
// Long hash matches.
- lTableBits = 16
+ lTableBits = 17
maxLTableSize = 1 << lTableBits
// Short hash matches.
@@ -97,9 +100,26 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
lTable[hashL] = uint32(s)
sTable[hashS] = uint32(s)
+ valLong := load64(src, candidateL)
+ valShort := load64(src, candidateS)
+
+ // If long matches at least 8 bytes, use that.
+ if cv == valLong {
+ break
+ }
+ if cv == valShort {
+ candidateL = candidateS
+ break
+ }
+
// Check repeat at offset checkRep.
const checkRep = 1
- if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+ // Minimum length of a repeat. Tested with various values.
+ // While 4-5 offers improvements in some, 6 reduces
+ // regressions significantly.
+ const wantRepeatBytes = 6
+ const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
+ if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
base := s + checkRep
// Extend back
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
@@ -109,8 +129,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
d += emitLiteral(dst[d:], src[nextEmit:base])
// Extend forward
- candidate := s - repeat + 4 + checkRep
- s += 4 + checkRep
+ candidate := s - repeat + wantRepeatBytes + checkRep
+ s += wantRepeatBytes + checkRep
for s < len(src) {
if len(src)-s < 8 {
if src[s] == src[candidate] {
@@ -127,28 +147,40 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
s += 8
candidate += 8
}
- if nextEmit > 0 {
- // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
- d += emitRepeat(dst[d:], repeat, s-base)
- } else {
- // First match, cannot be repeat.
- d += emitCopy(dst[d:], repeat, s-base)
- }
+ // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+ d += emitRepeat(dst[d:], repeat, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
+ // Index in-between
+ index0 := base + 1
+ index1 := s - 2
+
+ cv = load64(src, s)
+ for index0 < index1 {
+ cv0 := load64(src, index0)
+ cv1 := load64(src, index1)
+ lTable[hash7(cv0, lTableBits)] = uint32(index0)
+ sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
+ sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 2
+ index1 -= 2
+ }
cv = load64(src, s)
continue
}
- if uint32(cv) == load32(src, candidateL) {
+ // Long likely matches 7, so take that.
+ if uint32(cv) == uint32(valLong) {
break
}
// Check our short candidate
- if uint32(cv) == load32(src, candidateS) {
+ if uint32(cv) == uint32(valShort) {
// Try a long candidate at s+1
hashL = hash7(cv>>8, lTableBits)
candidateL = int(lTable[hashL])
@@ -227,21 +259,29 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
- // Index match start+1 (long) and start+2 (short)
+
+ // Index short & long
index0 := base + 1
- // Index match end-2 (long) and end-1 (short)
index1 := s - 2
cv0 := load64(src, index0)
cv1 := load64(src, index1)
- cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
- lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
- lTable[hash7(cv1, lTableBits)] = uint32(index1)
- lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
- sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 1
+ index1 -= 1
+ cv = load64(src, s)
+
+ // index every second long in between.
+ for index0 < index1 {
+ lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+ lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
+ index0 += 2
+ index1 -= 2
+ }
}
emitRemainder:
@@ -260,8 +300,9 @@ emitRemainder:
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -402,21 +443,649 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
- // Index match start+1 (long) and start+2 (short)
+
+ // Index short & long
index0 := base + 1
- // Index match end-2 (long) and end-1 (short)
index1 := s - 2
cv0 := load64(src, index0)
cv1 := load64(src, index1)
+ lTable[hash7(cv0, lTableBits)] = uint32(index0)
+ sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
+ sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 1
+ index1 -= 1
cv = load64(src, s)
+
+ // index every second long in between.
+ for index0 < index1 {
+ lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+ lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
+ index0 += 2
+ index1 -= 2
+ }
+ }
+
+emitRemainder:
+ if nextEmit < len(src) {
+ // Bail if we exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:])
+ }
+ return d
+}
+
+// encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//
+// len(dst) >= MaxEncodedLen(len(src)) &&
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) {
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ // Initialize the hash tables.
+ const (
+ // Long hash matches.
+ lTableBits = 17
+ maxLTableSize = 1 << lTableBits
+
+ // Short hash matches.
+ sTableBits = 14
+ maxSTableSize = 1 << sTableBits
+
+ maxAhead = 8 // maximum bytes ahead without checking sLimit
+
+ debug = false
+ )
+
+ sLimit := len(src) - inputMargin
+ if sLimit > MaxDictSrcOffset-maxAhead {
+ sLimit = MaxDictSrcOffset - maxAhead
+ }
+ if len(src) < minNonLiteralBlockSize {
+ return 0
+ }
+
+ dict.initBetter()
+
+ var lTable [maxLTableSize]uint32
+ var sTable [maxSTableSize]uint32
+
+ // Bail if we can't compress to at least this.
+ dstLimit := len(src) - len(src)>>5 - 6
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // Searching starts at s == 0: the dictionary repeat is checked there, while
+ // matches against src itself and the dict hash tables are skipped at s == 0.
+ s := 0
+ cv := load64(src, s)
+
+ // Seed repeat from the dictionary so that src position s maps to dict.dict[dict.repeat+s].
+ repeat := len(dict.dict) - dict.repeat
+
+ // While in dict
+searchDict:
+ for {
+ candidateL := 0
+ nextS := 0
+ for {
+ // Next src position to check
+ nextS = s + (s-nextEmit)>>7 + 1
+ if nextS > sLimit {
+ break searchDict
+ }
+ hashL := hash7(cv, lTableBits)
+ hashS := hash4(cv, sTableBits)
+ candidateL = int(lTable[hashL])
+ candidateS := int(sTable[hashS])
+ dictL := int(dict.betterTableLong[hashL])
+ dictS := int(dict.betterTableShort[hashS])
+ lTable[hashL] = uint32(s)
+ sTable[hashS] = uint32(s)
+
+ valLong := load64(src, candidateL)
+ valShort := load64(src, candidateS)
+
+ // If long matches at least 8 bytes, use that.
+ if s != 0 {
+ if cv == valLong {
+ goto emitMatch
+ }
+ if cv == valShort {
+ candidateL = candidateS
+ goto emitMatch
+ }
+ }
+
+ // Check dict repeat.
+ if repeat >= s+4 {
+ candidate := len(dict.dict) - repeat + s
+ if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) {
+ // Extend back
+ base := s
+ for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ if debug && nextEmit != base {
+ fmt.Println("emitted ", base-nextEmit, "literals")
+ }
+ s += 4
+ candidate += 4
+ for candidate < len(dict.dict)-8 && s <= len(src)-8 {
+ if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+ d += emitRepeat(dst[d:], repeat, s-base)
+ if debug {
+ fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
+ }
+ nextEmit = s
+ if s >= sLimit {
+ break searchDict
+ }
+ cv = load64(src, s)
+ // Index in-between
+ index0 := base + 1
+ index1 := s - 2
+
+ cv = load64(src, s)
+ for index0 < index1 {
+ cv0 := load64(src, index0)
+ cv1 := load64(src, index1)
+ lTable[hash7(cv0, lTableBits)] = uint32(index0)
+ sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
+ sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 2
+ index1 -= 2
+ }
+ continue
+ }
+ }
+ // Don't try to find match at s==0
+ if s == 0 {
+ cv = load64(src, nextS)
+ s = nextS
+ continue
+ }
+
+ // Long likely matches 7, so take that.
+ if uint32(cv) == uint32(valLong) {
+ goto emitMatch
+ }
+
+ // Long dict...
+ if uint32(cv) == load32(dict.dict, dictL) {
+ candidateL = dictL
+ goto emitDict
+ }
+
+ // Check our short candidate
+ if uint32(cv) == uint32(valShort) {
+ // Try a long candidate at s+1
+ hashL = hash7(cv>>8, lTableBits)
+ candidateL = int(lTable[hashL])
+ lTable[hashL] = uint32(s + 1)
+ if uint32(cv>>8) == load32(src, candidateL) {
+ s++
+ goto emitMatch
+ }
+ // Use our short candidate.
+ candidateL = candidateS
+ goto emitMatch
+ }
+ if uint32(cv) == load32(dict.dict, dictS) {
+ // Try a long candidate at s+1
+ hashL = hash7(cv>>8, lTableBits)
+ candidateL = int(lTable[hashL])
+ lTable[hashL] = uint32(s + 1)
+ if uint32(cv>>8) == load32(src, candidateL) {
+ s++
+ goto emitMatch
+ }
+ candidateL = dictS
+ goto emitDict
+ }
+ cv = load64(src, nextS)
+ s = nextS
+ }
+ emitDict:
+ {
+ if debug {
+ if load32(dict.dict, candidateL) != load32(src, s) {
+ panic("dict emit mismatch")
+ }
+ }
+ // Extend backwards.
+ // The top bytes will be rechecked to get the full match.
+ for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] {
+ candidateL--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+
+ d += emitLiteral(dst[d:], src[nextEmit:s])
+ if debug && nextEmit != s {
+ fmt.Println("emitted ", s-nextEmit, "literals")
+ }
+ {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+ offset := s + (len(dict.dict)) - candidateL
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidateL += 4
+ for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 {
+ if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidateL += 8
+ }
+
+ if repeat == offset {
+ if debug {
+ fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
+ }
+ d += emitRepeat(dst[d:], offset, s-base)
+ } else {
+ if debug {
+ fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
+ }
+ // Emit a single copy unless the match runs past sLimit and is at least 8 bytes long.
+ if s <= sLimit || s-base < 8 {
+ d += emitCopy(dst[d:], offset, s-base)
+ } else {
+ // Split to ensure we don't start a copy within next block.
+ d += emitCopy(dst[d:], offset, 4)
+ d += emitRepeat(dst[d:], offset, s-base-4)
+ }
+ repeat = offset
+ }
+ if false {
+ // Validate match.
+ if s <= candidateL {
+ panic("s <= candidate")
+ }
+ a := src[base:s]
+ b := dict.dict[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+
+ nextEmit = s
+ if s >= sLimit {
+ break searchDict
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+
+ // Index short & long
+ index0 := base + 1
+ index1 := s - 2
+
+ cv0 := load64(src, index0)
+ cv1 := load64(src, index1)
+ lTable[hash7(cv0, lTableBits)] = uint32(index0)
+ sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
+ sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 1
+ index1 -= 1
+ cv = load64(src, s)
+
+ // index every second long in between.
+ for index0 < index1 {
+ lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+ lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
+ index0 += 2
+ index1 -= 2
+ }
+ }
+ continue
+ }
+ emitMatch:
+
+ // Extend backwards
+ for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
+ candidateL--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ base := s
+ offset := base - candidateL
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidateL += 4
+ for s < len(src) {
+ if len(src)-s < 8 {
+ if src[s] == src[candidateL] {
+ s++
+ candidateL++
+ continue
+ }
+ break
+ }
+ if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidateL += 8
+ }
+
+ if offset > 65535 && s-base <= 5 && repeat != offset {
+ // Bail if the match is equal or worse to the encoding.
+ s = nextS + 1
+ if s >= sLimit {
+ goto emitRemainder
+ }
+ cv = load64(src, s)
+ continue
+ }
+
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ if debug && nextEmit != s {
+ fmt.Println("emitted ", s-nextEmit, "literals")
+ }
+ if repeat == offset {
+ if debug {
+ fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s)
+ }
+ d += emitRepeat(dst[d:], offset, s-base)
+ } else {
+ if debug {
+ fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s)
+ }
+ d += emitCopy(dst[d:], offset, s-base)
+ repeat = offset
+ }
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+
+ // Index short & long
+ index0 := base + 1
+ index1 := s - 2
+
+ cv0 := load64(src, index0)
+ cv1 := load64(src, index1)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
- lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
+ sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+
lTable[hash7(cv1, lTableBits)] = uint32(index1)
- lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
+ sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 1
+ index1 -= 1
+ cv = load64(src, s)
+
+ // index every second long in between.
+ for index0 < index1 {
+ lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+ lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
+ index0 += 2
+ index1 -= 2
+ }
+ }
+
+ // Search without dict:
+ if repeat > s {
+ repeat = 0
+ }
+
+ // No more dict
+ sLimit = len(src) - inputMargin
+ if s >= sLimit {
+ goto emitRemainder
+ }
+ cv = load64(src, s)
+ if debug {
+ fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
+ }
+ for {
+ candidateL := 0
+ nextS := 0
+ for {
+ // Next src position to check
+ nextS = s + (s-nextEmit)>>7 + 1
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ hashL := hash7(cv, lTableBits)
+ hashS := hash4(cv, sTableBits)
+ candidateL = int(lTable[hashL])
+ candidateS := int(sTable[hashS])
+ lTable[hashL] = uint32(s)
+ sTable[hashS] = uint32(s)
+
+ valLong := load64(src, candidateL)
+ valShort := load64(src, candidateS)
+
+ // If long matches at least 8 bytes, use that.
+ if cv == valLong {
+ break
+ }
+ if cv == valShort {
+ candidateL = candidateS
+ break
+ }
+
+ // Check repeat at offset checkRep.
+ const checkRep = 1
+ // Minimum length of a repeat. Tested with various values.
+ // While 4-5 offers improvements in some, 6 reduces
+ // regressions significantly.
+ const wantRepeatBytes = 6
+ const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
+ if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
+ base := s + checkRep
+ // Extend back
+ for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+
+ // Extend forward
+ candidate := s - repeat + wantRepeatBytes + checkRep
+ s += wantRepeatBytes + checkRep
+ for s < len(src) {
+ if len(src)-s < 8 {
+ if src[s] == src[candidate] {
+ s++
+ candidate++
+ continue
+ }
+ break
+ }
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+ // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+ d += emitRepeat(dst[d:], repeat, s-base)
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+ // Index in-between
+ index0 := base + 1
+ index1 := s - 2
+
+ cv = load64(src, s)
+ for index0 < index1 {
+ cv0 := load64(src, index0)
+ cv1 := load64(src, index1)
+ lTable[hash7(cv0, lTableBits)] = uint32(index0)
+ sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
+ sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 2
+ index1 -= 2
+ }
+
+ cv = load64(src, s)
+ continue
+ }
+
+ // Long likely matches 7, so take that.
+ if uint32(cv) == uint32(valLong) {
+ break
+ }
+
+ // Check our short candidate
+ if uint32(cv) == uint32(valShort) {
+ // Try a long candidate at s+1
+ hashL = hash7(cv>>8, lTableBits)
+ candidateL = int(lTable[hashL])
+ lTable[hashL] = uint32(s + 1)
+ if uint32(cv>>8) == load32(src, candidateL) {
+ s++
+ break
+ }
+ // Use our short candidate.
+ candidateL = candidateS
+ break
+ }
+
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards
+ for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
+ candidateL--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ base := s
+ offset := base - candidateL
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidateL += 4
+ for s < len(src) {
+ if len(src)-s < 8 {
+ if src[s] == src[candidateL] {
+ s++
+ candidateL++
+ continue
+ }
+ break
+ }
+ if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidateL += 8
+ }
+
+ if offset > 65535 && s-base <= 5 && repeat != offset {
+ // Bail if the match is equal or worse to the encoding.
+ s = nextS + 1
+ if s >= sLimit {
+ goto emitRemainder
+ }
+ cv = load64(src, s)
+ continue
+ }
+
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ if repeat == offset {
+ d += emitRepeat(dst[d:], offset, s-base)
+ } else {
+ d += emitCopy(dst[d:], offset, s-base)
+ repeat = offset
+ }
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+
+ // Index short & long
+ index0 := base + 1
+ index1 := s - 2
+
+ cv0 := load64(src, index0)
+ cv1 := load64(src, index1)
+ lTable[hash7(cv0, lTableBits)] = uint32(index0)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
- sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 1
+ index1 -= 1
+ cv = load64(src, s)
+
+ // index every second long in between.
+ for index0 < index1 {
+ lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+ lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
+ index0 += 2
+ index1 -= 2
+ }
}
emitRemainder:
diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go
index 94784b82..d7749d75 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_go.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_go.go
@@ -4,14 +4,18 @@
package s2
import (
+ "bytes"
"math/bits"
)
+const hasAmd64Asm = false
+
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlock(dst, src []byte) (d int) {
if len(src) < minNonLiteralBlockSize {
@@ -25,6 +29,7 @@ func encodeBlock(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlockBetter(dst, src []byte) (d int) {
return encodeBlockBetterGo(dst, src)
@@ -35,6 +40,7 @@ func encodeBlockBetter(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlockBetterSnappy(dst, src []byte) (d int) {
return encodeBlockBetterSnappyGo(dst, src)
@@ -45,6 +51,7 @@ func encodeBlockBetterSnappy(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlockSnappy(dst, src []byte) (d int) {
if len(src) < minNonLiteralBlockSize {
@@ -56,6 +63,7 @@ func encodeBlockSnappy(dst, src []byte) (d int) {
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
+//
// dst is long enough to hold the encoded bytes
// 0 <= len(lit) && len(lit) <= math.MaxUint32
func emitLiteral(dst, lit []byte) int {
@@ -146,6 +154,7 @@ func emitRepeat(dst []byte, offset, length int) int {
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
+//
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
@@ -214,6 +223,7 @@ func emitCopy(dst []byte, offset, length int) int {
// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
+//
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
@@ -273,8 +283,8 @@ func emitCopyNoRepeat(dst []byte, offset, length int) int {
// matchLen returns how many bytes match in a and b
//
// It assumes that:
-// len(a) <= len(b)
//
+// len(a) <= len(b)
func matchLen(a []byte, b []byte) int {
b = b[:len(a)]
var checked int
@@ -305,3 +315,405 @@ func matchLen(a []byte, b []byte) int {
}
return len(a) + checked
}
+
+func calcBlockSize(src []byte) (d int) {
+ // Size-only pass: nothing is written; d accumulates the values returned by emitLiteralSize and emitCopyNoRepeatSize. Initialize the hash table.
+ const (
+ tableBits = 13
+ maxTableSize = 1 << tableBits
+ )
+
+ var table [maxTableSize]uint32
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // keeps the 8-byte loads in the main loop inside src while we are
+ // looking for copies (NOTE(review): inputMargin is defined elsewhere - confirm).
+ sLimit := len(src) - inputMargin
+
+ // Bail (return 0) if the accumulated size would exceed this.
+ dstLimit := len(src) - len(src)>>5 - 5
+
+ // nextEmit is where in src the next not-yet-counted literal run starts.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ cv := load64(src, s)
+
+ // Start with a repeat offset of 1. Repeat matches are costed with emitCopyNoRepeatSize, like any other copy.
+ repeat := 1
+
+ for {
+ candidate := 0
+ for {
+ // Next src position to check
+ nextS := s + (s-nextEmit)>>6 + 4
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ hash0 := hash6(cv, tableBits)
+ hash1 := hash6(cv>>8, tableBits)
+ candidate = int(table[hash0])
+ candidate2 := int(table[hash1])
+ table[hash0] = uint32(s)
+ table[hash1] = uint32(s + 1)
+ hash2 := hash6(cv>>16, tableBits)
+
+ // Check repeat at offset checkRep.
+ const checkRep = 1
+ if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+ base := s + checkRep
+ // Extend back
+ for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteralSize(src[nextEmit:base])
+
+ // Extend forward
+ candidate := s - repeat + 4 + checkRep
+ s += 4 + checkRep
+ for s <= sLimit {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+
+ d += emitCopyNoRepeatSize(repeat, s-base)
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ cv = load64(src, s)
+ continue
+ }
+
+ if uint32(cv) == load32(src, candidate) {
+ break
+ }
+ candidate = int(table[hash2])
+ if uint32(cv>>8) == load32(src, candidate2) {
+ table[hash2] = uint32(s + 2)
+ candidate = candidate2
+ s++
+ break
+ }
+ table[hash2] = uint32(s + 2)
+ if uint32(cv>>16) == load32(src, candidate) {
+ s += 2
+ break
+ }
+
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+ candidate--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Count
+ // them as literal bytes; nothing is written.
+
+ d += emitLiteralSize(src[nextEmit:s])
+
+ // Account for the copy with emitCopyNoRepeatSize, and then see if another
+ // copy could be our next move. Repeat until we find no match for the input
+ // immediately after what was consumed by the last copy.
+ //
+ // If we exit this loop normally then a literal run comes next, though we
+ // don't yet know how big it will be. We handle that by proceeding to the
+ // next iteration of the main loop. We also can exit this loop via goto if
+ // we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+ repeat = base - candidate
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidate += 4
+ for s <= len(src)-8 {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+
+ d += emitCopyNoRepeatSize(repeat, s-base)
+ if false {
+ // Validate match (dead code: guarded by if false).
+ a := src[base:s]
+ b := src[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // The running size already exceeds the limit, so bail.
+ return 0
+ }
+ // Check for an immediate match, otherwise start search at s+1
+ x := load64(src, s-2)
+ m2Hash := hash6(x, tableBits)
+ currHash := hash6(x>>16, tableBits)
+ candidate = int(table[currHash])
+ table[m2Hash] = uint32(s - 2)
+ table[currHash] = uint32(s)
+ if uint32(x>>16) != load32(src, candidate) {
+ cv = load64(src, s+1)
+ s++
+ break
+ }
+ }
+ }
+
+emitRemainder:
+ if nextEmit < len(src) {
+ // Bail if counting the remaining bytes as a literal would exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteralSize(src[nextEmit:])
+ }
+ return d
+}
+
+func calcBlockSizeSmall(src []byte) (d int) {
+ // Size-only pass using a smaller (1<<9 entry) hash table: nothing is written; d accumulates the values returned by emitLiteralSize and emitCopyNoRepeatSize. Initialize the hash table.
+ const (
+ tableBits = 9
+ maxTableSize = 1 << tableBits
+ )
+
+ var table [maxTableSize]uint32
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // keeps the 8-byte loads in the main loop inside src while we are
+ // looking for copies (NOTE(review): inputMargin is defined elsewhere - confirm).
+ sLimit := len(src) - inputMargin
+
+ // Bail (return 0) if the accumulated size would exceed this.
+ dstLimit := len(src) - len(src)>>5 - 5
+
+ // nextEmit is where in src the next not-yet-counted literal run starts.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ cv := load64(src, s)
+
+ // Start with a repeat offset of 1. Repeat matches are costed with emitCopyNoRepeatSize, like any other copy.
+ repeat := 1
+
+ for {
+ candidate := 0
+ for {
+ // Next src position to check
+ nextS := s + (s-nextEmit)>>6 + 4
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ hash0 := hash6(cv, tableBits)
+ hash1 := hash6(cv>>8, tableBits)
+ candidate = int(table[hash0])
+ candidate2 := int(table[hash1])
+ table[hash0] = uint32(s)
+ table[hash1] = uint32(s + 1)
+ hash2 := hash6(cv>>16, tableBits)
+
+ // Check repeat at offset checkRep.
+ const checkRep = 1
+ if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+ base := s + checkRep
+ // Extend back
+ for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteralSize(src[nextEmit:base])
+
+ // Extend forward
+ candidate := s - repeat + 4 + checkRep
+ s += 4 + checkRep
+ for s <= sLimit {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+
+ d += emitCopyNoRepeatSize(repeat, s-base)
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ cv = load64(src, s)
+ continue
+ }
+
+ if uint32(cv) == load32(src, candidate) {
+ break
+ }
+ candidate = int(table[hash2])
+ if uint32(cv>>8) == load32(src, candidate2) {
+ table[hash2] = uint32(s + 2)
+ candidate = candidate2
+ s++
+ break
+ }
+ table[hash2] = uint32(s + 2)
+ if uint32(cv>>16) == load32(src, candidate) {
+ s += 2
+ break
+ }
+
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+ candidate--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Count
+ // them as literal bytes; nothing is written.
+
+ d += emitLiteralSize(src[nextEmit:s])
+
+ // Account for the copy with emitCopyNoRepeatSize, and then see if another
+ // copy could be our next move. Repeat until we find no match for the input
+ // immediately after what was consumed by the last copy.
+ //
+ // If we exit this loop normally then a literal run comes next, though we
+ // don't yet know how big it will be. We handle that by proceeding to the
+ // next iteration of the main loop. We also can exit this loop via goto if
+ // we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+ repeat = base - candidate
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidate += 4
+ for s <= len(src)-8 {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+
+ d += emitCopyNoRepeatSize(repeat, s-base)
+ if false {
+ // Validate match (dead code: guarded by if false).
+ a := src[base:s]
+ b := src[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // The running size already exceeds the limit, so bail.
+ return 0
+ }
+ // Check for an immediate match, otherwise start search at s+1
+ x := load64(src, s-2)
+ m2Hash := hash6(x, tableBits)
+ currHash := hash6(x>>16, tableBits)
+ candidate = int(table[currHash])
+ table[m2Hash] = uint32(s - 2)
+ table[currHash] = uint32(s)
+ if uint32(x>>16) != load32(src, candidate) {
+ cv = load64(src, s+1)
+ s++
+ break
+ }
+ }
+ }
+
+emitRemainder:
+ if nextEmit < len(src) {
+ // Bail if counting the remaining bytes as a literal would exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteralSize(src[nextEmit:])
+ }
+ return d
+}
+
+// emitLiteralSize returns the size emitLiteral would presumably produce for lit, without writing anything.
+//
+// The result is 0 for an empty lit, otherwise len(lit) plus 1 to 5 bytes depending on len(lit).
+//
+// It assumes that:
+// 0 <= len(lit) && len(lit) <= math.MaxUint32
+func emitLiteralSize(lit []byte) int {
+ if len(lit) == 0 {
+ return 0
+ }
+ switch {
+ case len(lit) <= 60:
+ return len(lit) + 1
+ case len(lit) <= 1<<8:
+ return len(lit) + 2
+ case len(lit) <= 1<<16:
+ return len(lit) + 3
+ case len(lit) <= 1<<24:
+ return len(lit) + 4
+ default:
+ return len(lit) + 5
+ }
+}
+
+func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) {
+ panic("cvtLZ4BlockAsm should be unreachable") // placeholder in the non-assembly build; presumably callers gate on hasAmd64Asm (false in this file) - confirm
+}
+
+func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) {
+ panic("cvtLZ4BlockSnappyAsm should be unreachable") // placeholder in the non-assembly build; presumably callers gate on hasAmd64Asm (false in this file) - confirm
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
index 88f27c09..9f3dc8c2 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
@@ -1,7 +1,6 @@
// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
package s2
@@ -147,11 +146,26 @@ func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
//go:noescape
func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
+// calcBlockSize encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4294967295 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func calcBlockSize(src []byte) int
+
+// calcBlockSizeSmall encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 1024 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func calcBlockSizeSmall(src []byte) int
+
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
-// dst is long enough to hold the encoded bytes with margin of 0 bytes
-// 0 <= len(lit) && len(lit) <= math.MaxUint32
+//
+// dst is long enough to hold the encoded bytes with margin of 0 bytes
+// 0 <= len(lit) && len(lit) <= math.MaxUint32
//
//go:noescape
func emitLiteral(dst []byte, lit []byte) int
@@ -165,9 +179,10 @@ func emitRepeat(dst []byte, offset int, length int) int
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
-// dst is long enough to hold the encoded bytes
-// 1 <= offset && offset <= math.MaxUint32
-// 4 <= length && length <= 1 << 24
+//
+// dst is long enough to hold the encoded bytes
+// 1 <= offset && offset <= math.MaxUint32
+// 4 <= length && length <= 1 << 24
//
//go:noescape
func emitCopy(dst []byte, offset int, length int) int
@@ -175,9 +190,10 @@ func emitCopy(dst []byte, offset int, length int) int
// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
-// dst is long enough to hold the encoded bytes
-// 1 <= offset && offset <= math.MaxUint32
-// 4 <= length && length <= 1 << 24
+//
+// dst is long enough to hold the encoded bytes
+// 1 <= offset && offset <= math.MaxUint32
+// 4 <= length && length <= 1 << 24
//
//go:noescape
func emitCopyNoRepeat(dst []byte, offset int, length int) int
@@ -185,7 +201,18 @@ func emitCopyNoRepeat(dst []byte, offset int, length int) int
// matchLen returns how many bytes match in a and b
//
// It assumes that:
-// len(a) <= len(b)
+//
+// len(a) <= len(b)
//
//go:noescape
func matchLen(a []byte, b []byte) int
+
+// cvtLZ4BlockAsm converts an LZ4 block to S2
+//
+//go:noescape
+func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+
+// cvtLZ4BlockSnappyAsm converts an LZ4 block to Snappy
+//
+//go:noescape
+func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
index 36915d94..19bd5237 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -1,7 +1,6 @@
// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
#include "textflag.h"
@@ -37,8 +36,8 @@ zero_loop_encodeBlockAsm:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -48,609 +47,601 @@ zero_loop_encodeBlockAsm:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
SHLQ $0x10, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x32, R10
- SHLQ $0x10, R11
- IMULQ R9, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm
repeat_extend_back_loop_encodeBlockAsm:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm
repeat_extend_back_end_encodeBlockAsm:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_repeat_emit_encodeBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_repeat_emit_encodeBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_repeat_emit_encodeBlockAsm
four_bytes_repeat_emit_encodeBlockAsm:
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_repeat_emit_encodeBlockAsm
three_bytes_repeat_emit_encodeBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm
two_bytes_repeat_emit_encodeBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm
JMP memmove_long_repeat_emit_encodeBlockAsm
one_byte_repeat_emit_encodeBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm
memmove_long_repeat_emit_encodeBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm
matchlen_loopback_repeat_extend_encodeBlockAsm:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm
matchlen_loop_repeat_extend_encodeBlockAsm:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm
JZ repeat_extend_forward_end_encodeBlockAsm
matchlen_match4_repeat_extend_encodeBlockAsm:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm
// emitRepeat
emit_repeat_again_match_repeat_encodeBlockAsm:
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_match_repeat_encodeBlockAsm
cant_repeat_two_offset_match_repeat_encodeBlockAsm:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_match_repeat_encodeBlockAsm
- CMPL SI, $0x0100ffff
+ CMPL BX, $0x0100ffff
JLT repeat_five_match_repeat_encodeBlockAsm
- LEAL -16842747(SI), SI
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(BX), BX
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_repeat_encodeBlockAsm
repeat_five_match_repeat_encodeBlockAsm:
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_four_match_repeat_encodeBlockAsm:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_three_match_repeat_encodeBlockAsm:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_match_repeat_encodeBlockAsm:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_offset_match_repeat_encodeBlockAsm:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_as_copy_encodeBlockAsm:
// emitCopy
- CMPL DI, $0x00010000
+ CMPL SI, $0x00010000
JL two_byte_offset_repeat_as_copy_encodeBlockAsm
-
-four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm
MOVB $0xff, (AX)
- MOVL DI, 1(AX)
- LEAL -64(SI), SI
+ MOVL SI, 1(AX)
+ LEAL -64(BX), BX
ADDQ $0x05, AX
- CMPL SI, $0x04
+ CMPL BX, $0x04
JL four_bytes_remain_repeat_as_copy_encodeBlockAsm
// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
- CMPL SI, $0x0100ffff
+ CMPL BX, $0x0100ffff
JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
- LEAL -16842747(SI), SI
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(BX), BX
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
- JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm
four_bytes_remain_repeat_as_copy_encodeBlockAsm:
- TESTL SI, SI
+ TESTL BX, BX
JZ repeat_end_emit_encodeBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVL DI, 1(AX)
+ XORL DI, DI
+ LEAL -1(DI)(BX*4), BX
+ MOVB BL, (AX)
+ MOVL SI, 1(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
two_byte_offset_repeat_as_copy_encodeBlockAsm:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- MOVL DI, R9
- SHRL $0x08, R9
- SHLL $0x05, R9
- ORL R9, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ MOVL SI, R8
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- CMPL SI, $0x0100ffff
+ CMPL BX, $0x0100ffff
JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- LEAL -16842747(SI), SI
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(BX), BX
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
long_offset_short_repeat_as_copy_encodeBlockAsm:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
- CMPL SI, $0x0100ffff
+ CMPL BX, $0x0100ffff
JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
- LEAL -16842747(SI), SI
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(BX), BX
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm
two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
emit_copy_three_repeat_as_copy_encodeBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm:
@@ -658,16 +649,16 @@ repeat_end_emit_encodeBlockAsm:
JMP search_loop_encodeBlockAsm
no_repeat_found_encodeBlockAsm:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm
@@ -677,549 +668,542 @@ candidate3_match_encodeBlockAsm:
JMP candidate_match_encodeBlockAsm
candidate2_match_encodeBlockAsm:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm
match_extend_back_loop_encodeBlockAsm:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm
JMP match_extend_back_loop_encodeBlockAsm
match_extend_back_end_encodeBlockAsm:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 5(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JLT three_bytes_match_emit_encodeBlockAsm
- CMPL R8, $0x01000000
+ CMPL DI, $0x01000000
JLT four_bytes_match_emit_encodeBlockAsm
MOVB $0xfc, (AX)
- MOVL R8, 1(AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_encodeBlockAsm
four_bytes_match_emit_encodeBlockAsm:
- MOVL R8, R10
- SHRL $0x10, R10
+ MOVL DI, R9
+ SHRL $0x10, R9
MOVB $0xf8, (AX)
- MOVW R8, 1(AX)
- MOVB R10, 3(AX)
+ MOVW DI, 1(AX)
+ MOVB R9, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeBlockAsm
three_bytes_match_emit_encodeBlockAsm:
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm
two_bytes_match_emit_encodeBlockAsm:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm
JMP memmove_long_match_emit_encodeBlockAsm
one_byte_match_emit_encodeBlockAsm:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm
memmove_long_match_emit_encodeBlockAsm:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm:
match_nolit_loop_encodeBlockAsm:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm
matchlen_loopback_match_nolit_encodeBlockAsm:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm
matchlen_loop_match_nolit_encodeBlockAsm:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm
JZ match_nolit_end_encodeBlockAsm
matchlen_match4_match_nolit_encodeBlockAsm:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JL two_byte_offset_match_nolit_encodeBlockAsm
-
-four_bytes_loop_back_match_nolit_encodeBlockAsm:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE four_bytes_remain_match_nolit_encodeBlockAsm
MOVB $0xff, (AX)
- MOVL SI, 1(AX)
- LEAL -64(R10), R10
+ MOVL BX, 1(AX)
+ LEAL -64(R9), R9
ADDQ $0x05, AX
- CMPL R10, $0x04
+ CMPL R9, $0x04
JL four_bytes_remain_match_nolit_encodeBlockAsm
// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy
- CMPL R10, $0x0100ffff
+ CMPL R9, $0x0100ffff
JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy
- LEAL -16842747(R10), R10
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R9), R9
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
repeat_five_match_nolit_encodeBlockAsm_emit_copy:
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_four_match_nolit_encodeBlockAsm_emit_copy:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_three_match_nolit_encodeBlockAsm_emit_copy:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_match_nolit_encodeBlockAsm_emit_copy:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
- JMP four_bytes_loop_back_match_nolit_encodeBlockAsm
four_bytes_remain_match_nolit_encodeBlockAsm:
- TESTL R10, R10
+ TESTL R9, R9
JZ match_nolit_emitcopy_end_encodeBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVL SI, 1(AX)
+ XORL SI, SI
+ LEAL -1(SI)(R9*4), R9
+ MOVB R9, (AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
two_byte_offset_match_nolit_encodeBlockAsm:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- MOVL SI, R8
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ MOVL BX, DI
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
- CMPL R10, $0x0100ffff
+ CMPL R9, $0x0100ffff
JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
- LEAL -16842747(R10), R10
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R9), R9
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b
repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
long_offset_short_match_nolit_encodeBlockAsm:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
- CMPL R10, $0x0100ffff
+ CMPL R9, $0x0100ffff
JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
- LEAL -16842747(R10), R10
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R9), R9
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
- JMP two_byte_offset_match_nolit_encodeBlockAsm
two_byte_offset_short_match_nolit_encodeBlockAsm:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
emit_copy_three_match_nolit_encodeBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm:
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x10, R8
- IMULQ R9, R8
- SHRQ $0x32, R8
- SHLQ $0x10, SI
- IMULQ R9, SI
- SHRQ $0x32, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x32, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm
INCL CX
JMP search_loop_encodeBlockAsm
@@ -1423,8 +1407,8 @@ zero_loop_encodeBlockAsm4MB:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -1434,555 +1418,551 @@ zero_loop_encodeBlockAsm4MB:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm4MB:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm4MB
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- SHLQ $0x10, R11
- IMULQ R9, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
SHLQ $0x10, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x32, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm4MB
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm4MB
repeat_extend_back_loop_encodeBlockAsm4MB:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm4MB
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm4MB
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm4MB
repeat_extend_back_end_encodeBlockAsm4MB:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm4MB
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm4MB
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_repeat_emit_encodeBlockAsm4MB
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_repeat_emit_encodeBlockAsm4MB
three_bytes_repeat_emit_encodeBlockAsm4MB:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm4MB
two_bytes_repeat_emit_encodeBlockAsm4MB:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm4MB
JMP memmove_long_repeat_emit_encodeBlockAsm4MB
one_byte_repeat_emit_encodeBlockAsm4MB:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB
memmove_long_repeat_emit_encodeBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm4MB:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm4MB
matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm4MB
matchlen_loop_repeat_extend_encodeBlockAsm4MB:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB
JZ repeat_extend_forward_end_encodeBlockAsm4MB
matchlen_match4_repeat_extend_encodeBlockAsm4MB:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm4MB
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm4MB:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm4MB
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm4MB:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm4MB
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm4MB
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm4MB:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm4MB
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm4MB
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB
cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm4MB
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_match_repeat_encodeBlockAsm4MB
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_four_match_repeat_encodeBlockAsm4MB:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_three_match_repeat_encodeBlockAsm4MB:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_match_repeat_encodeBlockAsm4MB:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_offset_match_repeat_encodeBlockAsm4MB:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_as_copy_encodeBlockAsm4MB:
// emitCopy
- CMPL DI, $0x00010000
+ CMPL SI, $0x00010000
JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
-
-four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
MOVB $0xff, (AX)
- MOVL DI, 1(AX)
- LEAL -64(SI), SI
+ MOVL SI, 1(AX)
+ LEAL -64(BX), BX
ADDQ $0x05, AX
- CMPL SI, $0x04
+ CMPL BX, $0x04
JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
- JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB
four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
- TESTL SI, SI
+ TESTL BX, BX
JZ repeat_end_emit_encodeBlockAsm4MB
- MOVB $0x03, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVL DI, 1(AX)
+ XORL DI, DI
+ LEAL -1(DI)(BX*4), BX
+ MOVB BL, (AX)
+ MOVL SI, 1(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm4MB:
@@ -1990,16 +1970,16 @@ repeat_end_emit_encodeBlockAsm4MB:
JMP search_loop_encodeBlockAsm4MB
no_repeat_found_encodeBlockAsm4MB:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm4MB
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm4MB
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm4MB
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm4MB
@@ -2009,506 +1989,502 @@ candidate3_match_encodeBlockAsm4MB:
JMP candidate_match_encodeBlockAsm4MB
candidate2_match_encodeBlockAsm4MB:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm4MB:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm4MB
match_extend_back_loop_encodeBlockAsm4MB:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm4MB
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm4MB
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm4MB
JMP match_extend_back_loop_encodeBlockAsm4MB
match_extend_back_end_encodeBlockAsm4MB:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 4(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 4(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm4MB
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm4MB:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm4MB
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm4MB
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JLT three_bytes_match_emit_encodeBlockAsm4MB
- MOVL R8, R10
- SHRL $0x10, R10
+ MOVL DI, R9
+ SHRL $0x10, R9
MOVB $0xf8, (AX)
- MOVW R8, 1(AX)
- MOVB R10, 3(AX)
+ MOVW DI, 1(AX)
+ MOVB R9, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeBlockAsm4MB
three_bytes_match_emit_encodeBlockAsm4MB:
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm4MB
two_bytes_match_emit_encodeBlockAsm4MB:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm4MB
JMP memmove_long_match_emit_encodeBlockAsm4MB
one_byte_match_emit_encodeBlockAsm4MB:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm4MB:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm4MB:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm4MB
memmove_long_match_emit_encodeBlockAsm4MB:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm4MB:
match_nolit_loop_encodeBlockAsm4MB:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm4MB
matchlen_loopback_match_nolit_encodeBlockAsm4MB:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm4MB
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm4MB
matchlen_loop_match_nolit_encodeBlockAsm4MB:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB
JZ match_nolit_end_encodeBlockAsm4MB
matchlen_match4_match_nolit_encodeBlockAsm4MB:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm4MB
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm4MB
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm4MB:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm4MB
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm4MB
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm4MB:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm4MB
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm4MB
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm4MB:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JL two_byte_offset_match_nolit_encodeBlockAsm4MB
-
-four_bytes_loop_back_match_nolit_encodeBlockAsm4MB:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB
MOVB $0xff, (AX)
- MOVL SI, 1(AX)
- LEAL -64(R10), R10
+ MOVL BX, 1(AX)
+ LEAL -64(R9), R9
ADDQ $0x05, AX
- CMPL R10, $0x04
+ CMPL R9, $0x04
JL four_bytes_remain_match_nolit_encodeBlockAsm4MB
// emitRepeat
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
- JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB
four_bytes_remain_match_nolit_encodeBlockAsm4MB:
- TESTL R10, R10
+ TESTL R9, R9
JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
- MOVB $0x03, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVL SI, 1(AX)
+ XORL SI, SI
+ LEAL -1(SI)(R9*4), R9
+ MOVB R9, (AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
two_byte_offset_match_nolit_encodeBlockAsm4MB:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm4MB
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
long_offset_short_match_nolit_encodeBlockAsm4MB:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
- JMP two_byte_offset_match_nolit_encodeBlockAsm4MB
two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
emit_copy_three_match_nolit_encodeBlockAsm4MB:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm4MB:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm4MB
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm4MB
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm4MB:
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x10, R8
- IMULQ R9, R8
- SHRQ $0x32, R8
- SHLQ $0x10, SI
- IMULQ R9, SI
- SHRQ $0x32, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x32, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm4MB
INCL CX
JMP search_loop_encodeBlockAsm4MB
@@ -2704,8 +2680,8 @@ zero_loop_encodeBlockAsm12B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -2715,428 +2691,426 @@ zero_loop_encodeBlockAsm12B:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm12B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm12B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x000000cf1bbcdcbb, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x18, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- SHLQ $0x18, R11
- IMULQ R9, R11
- SHRQ $0x34, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x000000cf1bbcdcbb, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x18, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
SHLQ $0x18, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x34, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x18, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm12B
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm12B
repeat_extend_back_loop_encodeBlockAsm12B:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm12B
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm12B
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm12B
repeat_extend_back_end_encodeBlockAsm12B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm12B
two_bytes_repeat_emit_encodeBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm12B
JMP memmove_long_repeat_emit_encodeBlockAsm12B
one_byte_repeat_emit_encodeBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm12B
memmove_long_repeat_emit_encodeBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm12B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm12B
matchlen_loopback_repeat_extend_encodeBlockAsm12B:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm12B
matchlen_loop_repeat_extend_encodeBlockAsm12B:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B
JZ repeat_extend_forward_end_encodeBlockAsm12B
matchlen_match4_repeat_extend_encodeBlockAsm12B:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm12B
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm12B
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm12B:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm12B
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm12B
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm12B:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm12B
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm12B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm12B:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm12B
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm12B
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_match_repeat_encodeBlockAsm12B
cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm12B
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_three_match_repeat_encodeBlockAsm12B:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_match_repeat_encodeBlockAsm12B:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_offset_match_repeat_encodeBlockAsm12B:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_as_copy_encodeBlockAsm12B:
// emitCopy
-two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
long_offset_short_repeat_as_copy_encodeBlockAsm12B:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B
two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm12B:
@@ -3144,16 +3118,16 @@ repeat_end_emit_encodeBlockAsm12B:
JMP search_loop_encodeBlockAsm12B
no_repeat_found_encodeBlockAsm12B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm12B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm12B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm12B
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm12B
@@ -3163,391 +3137,389 @@ candidate3_match_encodeBlockAsm12B:
JMP candidate_match_encodeBlockAsm12B
candidate2_match_encodeBlockAsm12B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm12B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm12B
match_extend_back_loop_encodeBlockAsm12B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm12B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm12B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm12B
JMP match_extend_back_loop_encodeBlockAsm12B
match_extend_back_end_encodeBlockAsm12B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm12B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm12B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm12B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm12B
two_bytes_match_emit_encodeBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm12B
JMP memmove_long_match_emit_encodeBlockAsm12B
one_byte_match_emit_encodeBlockAsm12B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm12B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm12B
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm12B
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm12B
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm12B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm12B
memmove_long_match_emit_encodeBlockAsm12B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm12B:
match_nolit_loop_encodeBlockAsm12B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm12B
matchlen_loopback_match_nolit_encodeBlockAsm12B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm12B
matchlen_loop_match_nolit_encodeBlockAsm12B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm12B
JZ match_nolit_end_encodeBlockAsm12B
matchlen_match4_match_nolit_encodeBlockAsm12B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm12B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm12B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm12B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm12B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm12B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm12B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm12B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm12B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm12B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBlockAsm12B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm12B
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
long_offset_short_match_nolit_encodeBlockAsm12B:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
- JMP two_byte_offset_match_nolit_encodeBlockAsm12B
two_byte_offset_short_match_nolit_encodeBlockAsm12B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm12B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
emit_copy_three_match_nolit_encodeBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm12B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm12B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm12B:
- MOVQ $0x000000cf1bbcdcbb, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x18, R8
- IMULQ R9, R8
- SHRQ $0x34, R8
- SHLQ $0x18, SI
- IMULQ R9, SI
- SHRQ $0x34, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x000000cf1bbcdcbb, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x18, DI
+ IMULQ R8, DI
+ SHRQ $0x34, DI
+ SHLQ $0x18, BX
+ IMULQ R8, BX
+ SHRQ $0x34, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm12B
INCL CX
JMP search_loop_encodeBlockAsm12B
@@ -3732,8 +3704,8 @@ zero_loop_encodeBlockAsm10B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -3743,428 +3715,426 @@ zero_loop_encodeBlockAsm10B:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm10B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm10B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x20, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R9, R11
- SHRQ $0x36, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
SHLQ $0x20, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x36, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm10B
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm10B
repeat_extend_back_loop_encodeBlockAsm10B:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm10B
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm10B
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm10B
repeat_extend_back_end_encodeBlockAsm10B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm10B
two_bytes_repeat_emit_encodeBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm10B
JMP memmove_long_repeat_emit_encodeBlockAsm10B
one_byte_repeat_emit_encodeBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
memmove_long_repeat_emit_encodeBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm10B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm10B
matchlen_loopback_repeat_extend_encodeBlockAsm10B:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm10B
matchlen_loop_repeat_extend_encodeBlockAsm10B:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B
JZ repeat_extend_forward_end_encodeBlockAsm10B
matchlen_match4_repeat_extend_encodeBlockAsm10B:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm10B
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm10B
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm10B:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm10B
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm10B
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm10B:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm10B
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm10B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm10B:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm10B
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm10B
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_match_repeat_encodeBlockAsm10B
cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm10B
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_three_match_repeat_encodeBlockAsm10B:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_match_repeat_encodeBlockAsm10B:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_offset_match_repeat_encodeBlockAsm10B:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_as_copy_encodeBlockAsm10B:
// emitCopy
-two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
long_offset_short_repeat_as_copy_encodeBlockAsm10B:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B
two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm10B:
@@ -4172,16 +4142,16 @@ repeat_end_emit_encodeBlockAsm10B:
JMP search_loop_encodeBlockAsm10B
no_repeat_found_encodeBlockAsm10B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm10B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm10B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm10B
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm10B
@@ -4191,391 +4161,389 @@ candidate3_match_encodeBlockAsm10B:
JMP candidate_match_encodeBlockAsm10B
candidate2_match_encodeBlockAsm10B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm10B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm10B
match_extend_back_loop_encodeBlockAsm10B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm10B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm10B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm10B
JMP match_extend_back_loop_encodeBlockAsm10B
match_extend_back_end_encodeBlockAsm10B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm10B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm10B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm10B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm10B
two_bytes_match_emit_encodeBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm10B
JMP memmove_long_match_emit_encodeBlockAsm10B
one_byte_match_emit_encodeBlockAsm10B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm10B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm10B
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm10B
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm10B
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm10B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm10B
memmove_long_match_emit_encodeBlockAsm10B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm10B:
match_nolit_loop_encodeBlockAsm10B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm10B
matchlen_loopback_match_nolit_encodeBlockAsm10B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm10B
matchlen_loop_match_nolit_encodeBlockAsm10B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm10B
JZ match_nolit_end_encodeBlockAsm10B
matchlen_match4_match_nolit_encodeBlockAsm10B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm10B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm10B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm10B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm10B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm10B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm10B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm10B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm10B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm10B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBlockAsm10B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm10B
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
long_offset_short_match_nolit_encodeBlockAsm10B:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
- JMP two_byte_offset_match_nolit_encodeBlockAsm10B
two_byte_offset_short_match_nolit_encodeBlockAsm10B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm10B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
emit_copy_three_match_nolit_encodeBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm10B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm10B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm10B:
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x20, R8
- IMULQ R9, R8
- SHRQ $0x36, R8
- SHLQ $0x20, SI
- IMULQ R9, SI
- SHRQ $0x36, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x36, DI
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x36, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm10B
INCL CX
JMP search_loop_encodeBlockAsm10B
@@ -4760,8 +4728,8 @@ zero_loop_encodeBlockAsm8B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -4771,414 +4739,412 @@ zero_loop_encodeBlockAsm8B:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm8B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x04, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm8B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x20, R10
- IMULQ R9, R10
- SHRQ $0x38, R10
- SHLQ $0x20, R11
- IMULQ R9, R11
- SHRQ $0x38, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x38, R9
SHLQ $0x20, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x38, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x38, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm8B
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm8B
repeat_extend_back_loop_encodeBlockAsm8B:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm8B
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm8B
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm8B
repeat_extend_back_end_encodeBlockAsm8B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm8B
two_bytes_repeat_emit_encodeBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm8B
JMP memmove_long_repeat_emit_encodeBlockAsm8B
one_byte_repeat_emit_encodeBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
memmove_long_repeat_emit_encodeBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm8B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm8B
matchlen_loopback_repeat_extend_encodeBlockAsm8B:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm8B
matchlen_loop_repeat_extend_encodeBlockAsm8B:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B
JZ repeat_extend_forward_end_encodeBlockAsm8B
matchlen_match4_repeat_extend_encodeBlockAsm8B:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm8B
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm8B
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm8B:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm8B
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm8B
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm8B:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm8B
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm8B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm8B:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm8B
// emitRepeat
- MOVL SI, DI
- LEAL -4(SI), SI
- CMPL DI, $0x08
+ MOVL BX, SI
+ LEAL -4(BX), BX
+ CMPL SI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm8B
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm8B
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_three_match_repeat_encodeBlockAsm8B:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_two_match_repeat_encodeBlockAsm8B:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_as_copy_encodeBlockAsm8B:
// emitCopy
-two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
- MOVL SI, DI
- LEAL -4(SI), SI
- CMPL DI, $0x08
+ MOVL BX, SI
+ LEAL -4(BX), BX
+ CMPL SI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
long_offset_short_repeat_as_copy_encodeBlockAsm8B:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
- MOVL SI, DI
- LEAL -4(SI), SI
- CMPL DI, $0x08
+ MOVL BX, SI
+ LEAL -4(BX), BX
+ CMPL SI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B
two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm8B:
@@ -5186,16 +5152,16 @@ repeat_end_emit_encodeBlockAsm8B:
JMP search_loop_encodeBlockAsm8B
no_repeat_found_encodeBlockAsm8B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm8B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm8B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm8B
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm8B
@@ -5205,381 +5171,379 @@ candidate3_match_encodeBlockAsm8B:
JMP candidate_match_encodeBlockAsm8B
candidate2_match_encodeBlockAsm8B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm8B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm8B
match_extend_back_loop_encodeBlockAsm8B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm8B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm8B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm8B
JMP match_extend_back_loop_encodeBlockAsm8B
match_extend_back_end_encodeBlockAsm8B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm8B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm8B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm8B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm8B
two_bytes_match_emit_encodeBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm8B
JMP memmove_long_match_emit_encodeBlockAsm8B
one_byte_match_emit_encodeBlockAsm8B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm8B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm8B
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm8B
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm8B
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm8B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm8B
memmove_long_match_emit_encodeBlockAsm8B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm8B:
match_nolit_loop_encodeBlockAsm8B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm8B
matchlen_loopback_match_nolit_encodeBlockAsm8B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm8B
matchlen_loop_match_nolit_encodeBlockAsm8B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm8B
JZ match_nolit_end_encodeBlockAsm8B
matchlen_match4_match_nolit_encodeBlockAsm8B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm8B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm8B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm8B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm8B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm8B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm8B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm8B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm8B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm8B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBlockAsm8B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm8B
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
- MOVL R10, SI
- LEAL -4(R10), R10
- CMPL SI, $0x08
+ MOVL R9, BX
+ LEAL -4(R9), R9
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
long_offset_short_match_nolit_encodeBlockAsm8B:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
- MOVL R10, SI
- LEAL -4(R10), R10
- CMPL SI, $0x08
+ MOVL R9, BX
+ LEAL -4(R9), R9
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
- JMP two_byte_offset_match_nolit_encodeBlockAsm8B
two_byte_offset_short_match_nolit_encodeBlockAsm8B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
emit_copy_three_match_nolit_encodeBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm8B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm8B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm8B:
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x20, R8
- IMULQ R9, R8
- SHRQ $0x38, R8
- SHLQ $0x20, SI
- IMULQ R9, SI
- SHRQ $0x38, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x38, DI
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x38, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm8B
INCL CX
JMP search_loop_encodeBlockAsm8B
@@ -5743,9 +5707,9 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B:
// func encodeBetterBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
-TEXT ·encodeBetterBlockAsm(SB), $327704-56
+TEXT ·encodeBetterBlockAsm(SB), $589848-56
MOVQ dst_base+0(FP), AX
- MOVQ $0x00000a00, CX
+ MOVQ $0x00001200, CX
LEAQ 24(SP), DX
PXOR X0, X0
@@ -5764,8 +5728,8 @@ zero_loop_encodeBetterBlockAsm:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -5775,808 +5739,810 @@ zero_loop_encodeBetterBlockAsm:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x07, SI
- CMPL SI, $0x63
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x07, BX
+ CMPL BX, $0x63
JLE check_maxskip_ok_encodeBetterBlockAsm
- LEAL 100(CX), SI
+ LEAL 100(CX), BX
JMP check_maxskip_cont_encodeBetterBlockAsm
check_maxskip_ok_encodeBetterBlockAsm:
- LEAL 1(CX)(SI*1), SI
+ LEAL 1(CX)(BX*1), BX
check_maxskip_cont_encodeBetterBlockAsm:
- CMPL SI, 8(SP)
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x00cf1bbcdcbfa563, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 262168(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 262168(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 524312(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 524312(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm
+ CMPQ R10, SI
+ JNE no_short_found_encodeBetterBlockAsm
+ MOVL DI, BX
+ JMP candidate_match_encodeBetterBlockAsm
+
+no_short_found_encodeBetterBlockAsm:
+ CMPL R9, SI
+ JEQ candidate_match_encodeBetterBlockAsm
+ CMPL R10, SI
+ JEQ candidateS_match_encodeBetterBlockAsm
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm
candidateS_match_encodeBetterBlockAsm:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm
match_extend_back_loop_encodeBetterBlockAsm:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm
JMP match_extend_back_loop_encodeBetterBlockAsm
match_extend_back_end_encodeBetterBlockAsm:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 5(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm
matchlen_loopback_match_nolit_encodeBetterBlockAsm:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm
matchlen_loop_match_nolit_encodeBetterBlockAsm:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm
JZ match_nolit_end_encodeBetterBlockAsm
matchlen_match4_match_nolit_encodeBetterBlockAsm:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm
- CMPL R12, $0x01
+ CMPL R11, $0x01
JG match_length_ok_encodeBetterBlockAsm
- CMPL R8, $0x0000ffff
+ CMPL DI, $0x0000ffff
JLE match_length_ok_encodeBetterBlockAsm
MOVL 20(SP), CX
INCL CX
JMP search_loop_encodeBetterBlockAsm
match_length_ok_encodeBetterBlockAsm:
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_encodeBetterBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_match_emit_encodeBetterBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm
four_bytes_match_emit_encodeBetterBlockAsm:
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm
three_bytes_match_emit_encodeBetterBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm
two_bytes_match_emit_encodeBetterBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm
JMP memmove_long_match_emit_encodeBetterBlockAsm
one_byte_match_emit_encodeBetterBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm
memmove_long_match_emit_encodeBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JL two_byte_offset_match_nolit_encodeBetterBlockAsm
-
-four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm
MOVB $0xff, (AX)
- MOVL R8, 1(AX)
- LEAL -64(R12), R12
+ MOVL DI, 1(AX)
+ LEAL -64(R11), R11
ADDQ $0x05, AX
- CMPL R12, $0x04
+ CMPL R11, $0x04
JL four_bytes_remain_match_nolit_encodeBetterBlockAsm
// emitRepeat
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
- CMPL R12, $0x0100ffff
+ CMPL R11, $0x0100ffff
JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
- LEAL -16842747(R12), R12
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R11), R11
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
- JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm
four_bytes_remain_match_nolit_encodeBetterBlockAsm:
- TESTL R12, R12
+ TESTL R11, R11
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVL R8, 1(AX)
+ XORL BX, BX
+ LEAL -1(BX)(R11*4), R11
+ MOVB R11, (AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
two_byte_offset_match_nolit_encodeBetterBlockAsm:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- MOVL R8, R9
- SHRL $0x08, R9
- SHLL $0x05, R9
- ORL R9, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ MOVL DI, R8
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- CMPL R12, $0x0100ffff
+ CMPL R11, $0x0100ffff
JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- LEAL -16842747(R12), R12
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R11), R11
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
long_offset_short_match_nolit_encodeBetterBlockAsm:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
- CMPL R12, $0x0100ffff
+ CMPL R11, $0x0100ffff
JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
- LEAL -16842747(R12), R12
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R11), R11
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm
two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
emit_copy_three_match_nolit_encodeBetterBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
match_is_repeat_encodeBetterBlockAsm:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
four_bytes_match_emit_repeat_encodeBetterBlockAsm:
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
three_bytes_match_emit_repeat_encodeBetterBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
two_bytes_match_emit_repeat_encodeBetterBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
one_byte_match_emit_repeat_encodeBetterBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
memmove_long_match_emit_repeat_encodeBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm
- CMPL R12, $0x0100ffff
+ CMPL R11, $0x0100ffff
JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm
- LEAL -16842747(R12), R12
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R11), R11
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm:
@@ -6588,54 +6554,51 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm:
RET
match_nolit_dst_ok_encodeBetterBlockAsm:
- MOVQ $0x00cf1bbcdcbfa563, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
+ MOVQ $0x00cf1bbcdcbfa563, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x08, R11
+ IMULQ BX, R11
+ SHRQ $0x2f, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x32, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 262168(SP)(R11*4)
- MOVL R15, 262168(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 262168(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 524312(SP)(R10*4)
+ MOVL R13, 524312(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeBetterBlockAsm:
+ CMPQ SI, R8
+ JAE search_loop_encodeBetterBlockAsm
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x08, DI
+ IMULQ BX, DI
+ SHRQ $0x2f, DI
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeBetterBlockAsm
emit_remainder_encodeBetterBlockAsm:
MOVQ src_len+32(FP), CX
@@ -6815,9 +6778,9 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm:
// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
// Requires: BMI, SSE2
-TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
+TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56
MOVQ dst_base+0(FP), AX
- MOVQ $0x00000a00, CX
+ MOVQ $0x00001200, CX
LEAQ 24(SP), DX
PXOR X0, X0
@@ -6836,8 +6799,8 @@ zero_loop_encodeBetterBlockAsm4MB:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -6847,746 +6810,752 @@ zero_loop_encodeBetterBlockAsm4MB:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm4MB:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x07, SI
- CMPL SI, $0x63
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x07, BX
+ CMPL BX, $0x63
JLE check_maxskip_ok_encodeBetterBlockAsm4MB
- LEAL 100(CX), SI
+ LEAL 100(CX), BX
JMP check_maxskip_cont_encodeBetterBlockAsm4MB
check_maxskip_ok_encodeBetterBlockAsm4MB:
- LEAL 1(CX)(SI*1), SI
+ LEAL 1(CX)(BX*1), BX
check_maxskip_cont_encodeBetterBlockAsm4MB:
- CMPL SI, 8(SP)
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm4MB
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x00cf1bbcdcbfa563, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 262168(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 262168(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 524312(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 524312(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm4MB
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm4MB
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm4MB
+ CMPQ R10, SI
+ JNE no_short_found_encodeBetterBlockAsm4MB
+ MOVL DI, BX
+ JMP candidate_match_encodeBetterBlockAsm4MB
+
+no_short_found_encodeBetterBlockAsm4MB:
+ CMPL R9, SI
+ JEQ candidate_match_encodeBetterBlockAsm4MB
+ CMPL R10, SI
+ JEQ candidateS_match_encodeBetterBlockAsm4MB
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm4MB
candidateS_match_encodeBetterBlockAsm4MB:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm4MB
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm4MB:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm4MB
match_extend_back_loop_encodeBetterBlockAsm4MB:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm4MB
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm4MB
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm4MB
JMP match_extend_back_loop_encodeBetterBlockAsm4MB
match_extend_back_end_encodeBetterBlockAsm4MB:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 4(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 4(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm4MB
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm4MB:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm4MB
matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB
JZ match_nolit_end_encodeBetterBlockAsm4MB
matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm4MB
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm4MB
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm4MB:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm4MB
- CMPL R12, $0x01
+ CMPL R11, $0x01
JG match_length_ok_encodeBetterBlockAsm4MB
- CMPL R8, $0x0000ffff
+ CMPL DI, $0x0000ffff
JLE match_length_ok_encodeBetterBlockAsm4MB
MOVL 20(SP), CX
INCL CX
JMP search_loop_encodeBetterBlockAsm4MB
match_length_ok_encodeBetterBlockAsm4MB:
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm4MB
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm4MB
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_encodeBetterBlockAsm4MB
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
three_bytes_match_emit_encodeBetterBlockAsm4MB:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
two_bytes_match_emit_encodeBetterBlockAsm4MB:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm4MB
JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
one_byte_match_emit_encodeBetterBlockAsm4MB:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
memmove_long_match_emit_encodeBetterBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
-
-four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
MOVB $0xff, (AX)
- MOVL R8, 1(AX)
- LEAL -64(R12), R12
+ MOVL DI, 1(AX)
+ LEAL -64(R11), R11
ADDQ $0x05, AX
- CMPL R12, $0x04
+ CMPL R11, $0x04
JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
- JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB
four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
- TESTL R12, R12
+ TESTL R11, R11
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
- MOVB $0x03, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVL R8, 1(AX)
+ XORL BX, BX
+ LEAL -1(BX)(R11*4), R11
+ MOVB R11, (AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
match_is_repeat_encodeBetterBlockAsm4MB:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
@@ -7598,54 +7567,51 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
RET
match_nolit_dst_ok_encodeBetterBlockAsm4MB:
- MOVQ $0x00cf1bbcdcbfa563, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
+ MOVQ $0x00cf1bbcdcbfa563, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x08, R11
+ IMULQ BX, R11
+ SHRQ $0x2f, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x32, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 262168(SP)(R11*4)
- MOVL R15, 262168(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 262168(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm4MB
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 524312(SP)(R10*4)
+ MOVL R13, 524312(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeBetterBlockAsm4MB:
+ CMPQ SI, R8
+ JAE search_loop_encodeBetterBlockAsm4MB
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x08, DI
+ IMULQ BX, DI
+ SHRQ $0x2f, DI
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeBetterBlockAsm4MB
emit_remainder_encodeBetterBlockAsm4MB:
MOVQ src_len+32(FP), CX
@@ -7838,8 +7804,8 @@ zero_loop_encodeBetterBlockAsm12B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -7849,591 +7815,599 @@ zero_loop_encodeBetterBlockAsm12B:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm12B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm12B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x34, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 65560(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 65560(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x34, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 65560(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 65560(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm12B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm12B
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm12B
+ CMPQ R10, SI
+ JNE no_short_found_encodeBetterBlockAsm12B
+ MOVL DI, BX
+ JMP candidate_match_encodeBetterBlockAsm12B
+
+no_short_found_encodeBetterBlockAsm12B:
+ CMPL R9, SI
+ JEQ candidate_match_encodeBetterBlockAsm12B
+ CMPL R10, SI
+ JEQ candidateS_match_encodeBetterBlockAsm12B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm12B
candidateS_match_encodeBetterBlockAsm12B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm12B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm12B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm12B
match_extend_back_loop_encodeBetterBlockAsm12B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm12B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm12B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm12B
JMP match_extend_back_loop_encodeBetterBlockAsm12B
match_extend_back_end_encodeBetterBlockAsm12B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm12B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm12B
matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm12B
matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B
JZ match_nolit_end_encodeBetterBlockAsm12B
matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm12B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm12B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm12B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm12B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm12B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm12B
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm12B
two_bytes_match_emit_encodeBetterBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm12B
JMP memmove_long_match_emit_encodeBetterBlockAsm12B
one_byte_match_emit_encodeBetterBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
memmove_long_match_emit_encodeBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm12B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
long_offset_short_match_nolit_encodeBetterBlockAsm12B:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B
two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
match_is_repeat_encodeBetterBlockAsm12B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm12B
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
@@ -8445,54 +8419,51 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
RET
match_nolit_dst_ok_encodeBetterBlockAsm12B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x32, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x32, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x34, R11
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x32, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x34, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x32, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x34, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 65560(SP)(R11*4)
- MOVL R15, 65560(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x34, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x32, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 65560(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm12B
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 65560(SP)(R10*4)
+ MOVL R13, 65560(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeBetterBlockAsm12B:
+ CMPQ SI, R8
+ JAE search_loop_encodeBetterBlockAsm12B
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x32, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeBetterBlockAsm12B
emit_remainder_encodeBetterBlockAsm12B:
MOVQ src_len+32(FP), CX
@@ -8674,8 +8645,8 @@ zero_loop_encodeBetterBlockAsm10B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -8685,591 +8656,599 @@ zero_loop_encodeBetterBlockAsm10B:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm10B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm10B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x36, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 16408(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 16408(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x36, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 16408(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 16408(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm10B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm10B
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm10B
+ CMPQ R10, SI
+ JNE no_short_found_encodeBetterBlockAsm10B
+ MOVL DI, BX
+ JMP candidate_match_encodeBetterBlockAsm10B
+
+no_short_found_encodeBetterBlockAsm10B:
+ CMPL R9, SI
+ JEQ candidate_match_encodeBetterBlockAsm10B
+ CMPL R10, SI
+ JEQ candidateS_match_encodeBetterBlockAsm10B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm10B
candidateS_match_encodeBetterBlockAsm10B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm10B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm10B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm10B
match_extend_back_loop_encodeBetterBlockAsm10B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm10B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm10B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm10B
JMP match_extend_back_loop_encodeBetterBlockAsm10B
match_extend_back_end_encodeBetterBlockAsm10B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm10B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm10B
matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm10B
matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
JZ match_nolit_end_encodeBetterBlockAsm10B
matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm10B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm10B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm10B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm10B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm10B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm10B
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm10B
two_bytes_match_emit_encodeBetterBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm10B
JMP memmove_long_match_emit_encodeBetterBlockAsm10B
one_byte_match_emit_encodeBetterBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
memmove_long_match_emit_encodeBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm10B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
long_offset_short_match_nolit_encodeBetterBlockAsm10B:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B
two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
match_is_repeat_encodeBetterBlockAsm10B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm10B
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
@@ -9281,54 +9260,51 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
RET
match_nolit_dst_ok_encodeBetterBlockAsm10B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x34, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x34, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x36, R11
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x34, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x34, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x36, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 16408(SP)(R11*4)
- MOVL R15, 16408(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x36, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x34, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 16408(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm10B
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 16408(SP)(R10*4)
+ MOVL R13, 16408(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeBetterBlockAsm10B:
+ CMPQ SI, R8
+ JAE search_loop_encodeBetterBlockAsm10B
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x34, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x34, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeBetterBlockAsm10B
emit_remainder_encodeBetterBlockAsm10B:
MOVQ src_len+32(FP), CX
@@ -9510,8 +9486,8 @@ zero_loop_encodeBetterBlockAsm8B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -9521,577 +9497,585 @@ zero_loop_encodeBetterBlockAsm8B:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm8B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x04, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm8B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x38, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 4120(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 4120(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x38, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 4120(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 4120(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm8B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm8B
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm8B
+ CMPQ R10, SI
+ JNE no_short_found_encodeBetterBlockAsm8B
+ MOVL DI, BX
+ JMP candidate_match_encodeBetterBlockAsm8B
+
+no_short_found_encodeBetterBlockAsm8B:
+ CMPL R9, SI
+ JEQ candidate_match_encodeBetterBlockAsm8B
+ CMPL R10, SI
+ JEQ candidateS_match_encodeBetterBlockAsm8B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm8B
candidateS_match_encodeBetterBlockAsm8B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm8B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm8B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm8B
match_extend_back_loop_encodeBetterBlockAsm8B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm8B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm8B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm8B
JMP match_extend_back_loop_encodeBetterBlockAsm8B
match_extend_back_end_encodeBetterBlockAsm8B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm8B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm8B
matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm8B
matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B
JZ match_nolit_end_encodeBetterBlockAsm8B
matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm8B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm8B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm8B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm8B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm8B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm8B
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm8B
two_bytes_match_emit_encodeBetterBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm8B
JMP memmove_long_match_emit_encodeBetterBlockAsm8B
one_byte_match_emit_encodeBetterBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B
memmove_long_match_emit_encodeBetterBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm8B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
long_offset_short_match_nolit_encodeBetterBlockAsm8B:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B
two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
match_is_repeat_encodeBetterBlockAsm8B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm8B
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm8B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x04
+ CMPQ DI, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
- MOVL (R9), R10
- MOVL R10, (AX)
+ MOVL (R8), R9
+ MOVL R9, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
- MOVL (R9), R10
- MOVL -4(R9)(R8*1), R9
- MOVL R10, (AX)
- MOVL R9, -4(AX)(R8*1)
+ MOVL (R8), R9
+ MOVL -4(R8)(DI*1), R8
+ MOVL R9, (AX)
+ MOVL R8, -4(AX)(DI*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R13
- SUBQ R10, R13
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R12
+ SUBQ R9, R12
+ DECQ R10
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R9)(R13*1), R10
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R8)(R12*1), R9
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R12
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R9)(R13*1), X4
- MOVOU -16(R9)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R8, R13
+ MOVOU -32(R8)(R12*1), X4
+ MOVOU -16(R8)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ DI, R12
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
@@ -10103,54 +10087,51 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
RET
match_nolit_dst_ok_encodeBetterBlockAsm8B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x36, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x36, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x38, R11
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x36, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x38, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x36, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x38, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 4120(SP)(R11*4)
- MOVL R15, 4120(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x38, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x36, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 4120(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm8B
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 4120(SP)(R10*4)
+ MOVL R13, 4120(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeBetterBlockAsm8B:
+ CMPQ SI, R8
+ JAE search_loop_encodeBetterBlockAsm8B
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x36, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x36, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeBetterBlockAsm8B
emit_remainder_encodeBetterBlockAsm8B:
MOVQ src_len+32(FP), CX
@@ -10332,8 +10313,8 @@ zero_loop_encodeSnappyBlockAsm:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -10343,321 +10324,321 @@ zero_loop_encodeSnappyBlockAsm:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- SHLQ $0x10, R11
- IMULQ R9, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
SHLQ $0x10, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x32, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm
repeat_extend_back_loop_encodeSnappyBlockAsm:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm
repeat_extend_back_end_encodeSnappyBlockAsm:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_repeat_emit_encodeSnappyBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_repeat_emit_encodeSnappyBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
four_bytes_repeat_emit_encodeSnappyBlockAsm:
- MOVL SI, R10
- SHRL $0x10, R10
+ MOVL BX, R9
+ SHRL $0x10, R9
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R10, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R9, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
three_bytes_repeat_emit_encodeSnappyBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
two_bytes_repeat_emit_encodeSnappyBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
one_byte_repeat_emit_encodeSnappyBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
memmove_long_repeat_emit_encodeSnappyBlockAsm:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm
matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm
JZ repeat_extend_forward_end_encodeSnappyBlockAsm
matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
- CMPL DI, $0x00010000
+ CMPL SI, $0x00010000
JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
MOVB $0xff, (AX)
- MOVL DI, 1(AX)
- LEAL -64(SI), SI
+ MOVL SI, 1(AX)
+ LEAL -64(BX), BX
ADDQ $0x05, AX
- CMPL SI, $0x04
+ CMPL BX, $0x04
JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
- TESTL SI, SI
+ TESTL BX, BX
JZ repeat_end_emit_encodeSnappyBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVL DI, 1(AX)
+ XORL DI, DI
+ LEAL -1(DI)(BX*4), BX
+ MOVB BL, (AX)
+ MOVL SI, 1(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeSnappyBlockAsm
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm:
@@ -10665,16 +10646,16 @@ repeat_end_emit_encodeSnappyBlockAsm:
JMP search_loop_encodeSnappyBlockAsm
no_repeat_found_encodeSnappyBlockAsm:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm
@@ -10684,331 +10665,331 @@ candidate3_match_encodeSnappyBlockAsm:
JMP candidate_match_encodeSnappyBlockAsm
candidate2_match_encodeSnappyBlockAsm:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm
match_extend_back_loop_encodeSnappyBlockAsm:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm
JMP match_extend_back_loop_encodeSnappyBlockAsm
match_extend_back_end_encodeSnappyBlockAsm:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 5(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JLT three_bytes_match_emit_encodeSnappyBlockAsm
- CMPL R8, $0x01000000
+ CMPL DI, $0x01000000
JLT four_bytes_match_emit_encodeSnappyBlockAsm
MOVB $0xfc, (AX)
- MOVL R8, 1(AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm
four_bytes_match_emit_encodeSnappyBlockAsm:
- MOVL R8, R10
- SHRL $0x10, R10
+ MOVL DI, R9
+ SHRL $0x10, R9
MOVB $0xf8, (AX)
- MOVW R8, 1(AX)
- MOVB R10, 3(AX)
+ MOVW DI, 1(AX)
+ MOVB R9, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm
three_bytes_match_emit_encodeSnappyBlockAsm:
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm
two_bytes_match_emit_encodeSnappyBlockAsm:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm
JMP memmove_long_match_emit_encodeSnappyBlockAsm
one_byte_match_emit_encodeSnappyBlockAsm:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
memmove_long_match_emit_encodeSnappyBlockAsm:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm:
match_nolit_loop_encodeSnappyBlockAsm:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm
matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm
matchlen_loop_match_nolit_encodeSnappyBlockAsm:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm
JZ match_nolit_end_encodeSnappyBlockAsm
matchlen_match4_match_nolit_encodeSnappyBlockAsm:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JL two_byte_offset_match_nolit_encodeSnappyBlockAsm
four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
MOVB $0xff, (AX)
- MOVL SI, 1(AX)
- LEAL -64(R10), R10
+ MOVL BX, 1(AX)
+ LEAL -64(R9), R9
ADDQ $0x05, AX
- CMPL R10, $0x04
+ CMPL R9, $0x04
JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm
JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
- TESTL R10, R10
+ TESTL R9, R9
JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVL SI, 1(AX)
+ XORL SI, SI
+ LEAL -1(SI)(R9*4), R9
+ MOVB R9, (AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
two_byte_offset_match_nolit_encodeSnappyBlockAsm:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
emit_copy_three_match_nolit_encodeSnappyBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm:
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x10, R8
- IMULQ R9, R8
- SHRQ $0x32, R8
- SHLQ $0x10, SI
- IMULQ R9, SI
- SHRQ $0x32, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x32, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm
INCL CX
JMP search_loop_encodeSnappyBlockAsm
@@ -11212,8 +11193,8 @@ zero_loop_encodeSnappyBlockAsm64K:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -11223,278 +11204,278 @@ zero_loop_encodeSnappyBlockAsm64K:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm64K:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm64K
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
SHLQ $0x10, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x32, R10
- SHLQ $0x10, R11
- IMULQ R9, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm64K
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm64K
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
repeat_extend_back_loop_encodeSnappyBlockAsm64K:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm64K
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
repeat_extend_back_end_encodeSnappyBlockAsm64K:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm64K
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
one_byte_repeat_emit_encodeSnappyBlockAsm64K:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm64K:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm64K
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm64K:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm64K
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm64K:
@@ -11502,16 +11483,16 @@ repeat_end_emit_encodeSnappyBlockAsm64K:
JMP search_loop_encodeSnappyBlockAsm64K
no_repeat_found_encodeSnappyBlockAsm64K:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm64K
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm64K
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm64K
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm64K
@@ -11521,288 +11502,288 @@ candidate3_match_encodeSnappyBlockAsm64K:
JMP candidate_match_encodeSnappyBlockAsm64K
candidate2_match_encodeSnappyBlockAsm64K:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm64K:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm64K
match_extend_back_loop_encodeSnappyBlockAsm64K:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm64K
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm64K
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm64K
JMP match_extend_back_loop_encodeSnappyBlockAsm64K
match_extend_back_end_encodeSnappyBlockAsm64K:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm64K
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm64K:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm64K
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm64K
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
two_bytes_match_emit_encodeSnappyBlockAsm64K:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm64K
JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
one_byte_match_emit_encodeSnappyBlockAsm64K:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm64K:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
memmove_long_match_emit_encodeSnappyBlockAsm64K:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
match_nolit_loop_encodeSnappyBlockAsm64K:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm64K
matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
JZ match_nolit_end_encodeSnappyBlockAsm64K
matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm64K
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm64K
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm64K:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm64K
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm64K
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm64K:
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x10, R8
- IMULQ R9, R8
- SHRQ $0x32, R8
- SHLQ $0x10, SI
- IMULQ R9, SI
- SHRQ $0x32, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x32, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm64K
INCL CX
JMP search_loop_encodeSnappyBlockAsm64K
@@ -11987,8 +11968,8 @@ zero_loop_encodeSnappyBlockAsm12B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -11998,278 +11979,278 @@ zero_loop_encodeSnappyBlockAsm12B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm12B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm12B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x000000cf1bbcdcbb, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x000000cf1bbcdcbb, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x18, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
SHLQ $0x18, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x34, R10
- SHLQ $0x18, R11
- IMULQ R9, R11
- SHRQ $0x34, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
- SHLQ $0x18, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm12B
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x18, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm12B
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
repeat_extend_back_loop_encodeSnappyBlockAsm12B:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm12B
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
repeat_extend_back_end_encodeSnappyBlockAsm12B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm12B
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
one_byte_repeat_emit_encodeSnappyBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm12B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm12B
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm12B:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm12B
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm12B:
@@ -12277,16 +12258,16 @@ repeat_end_emit_encodeSnappyBlockAsm12B:
JMP search_loop_encodeSnappyBlockAsm12B
no_repeat_found_encodeSnappyBlockAsm12B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm12B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm12B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm12B
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm12B
@@ -12296,288 +12277,288 @@ candidate3_match_encodeSnappyBlockAsm12B:
JMP candidate_match_encodeSnappyBlockAsm12B
candidate2_match_encodeSnappyBlockAsm12B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm12B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm12B
match_extend_back_loop_encodeSnappyBlockAsm12B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm12B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm12B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm12B
JMP match_extend_back_loop_encodeSnappyBlockAsm12B
match_extend_back_end_encodeSnappyBlockAsm12B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm12B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm12B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm12B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
two_bytes_match_emit_encodeSnappyBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm12B
JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
one_byte_match_emit_encodeSnappyBlockAsm12B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm12B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
memmove_long_match_emit_encodeSnappyBlockAsm12B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
match_nolit_loop_encodeSnappyBlockAsm12B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm12B
matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
JZ match_nolit_end_encodeSnappyBlockAsm12B
matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm12B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm12B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm12B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm12B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm12B:
- MOVQ $0x000000cf1bbcdcbb, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x18, R8
- IMULQ R9, R8
- SHRQ $0x34, R8
- SHLQ $0x18, SI
- IMULQ R9, SI
- SHRQ $0x34, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x000000cf1bbcdcbb, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x18, DI
+ IMULQ R8, DI
+ SHRQ $0x34, DI
+ SHLQ $0x18, BX
+ IMULQ R8, BX
+ SHRQ $0x34, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm12B
INCL CX
JMP search_loop_encodeSnappyBlockAsm12B
@@ -12762,8 +12743,8 @@ zero_loop_encodeSnappyBlockAsm10B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -12773,278 +12754,278 @@ zero_loop_encodeSnappyBlockAsm10B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm10B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm10B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x20, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R9, R11
- SHRQ $0x36, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
SHLQ $0x20, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x36, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm10B
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm10B
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm10B
repeat_extend_back_loop_encodeSnappyBlockAsm10B:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm10B
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B
repeat_extend_back_end_encodeSnappyBlockAsm10B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm10B
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
one_byte_repeat_emit_encodeSnappyBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm10B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm10B
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm10B:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm10B
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm10B:
@@ -13052,16 +13033,16 @@ repeat_end_emit_encodeSnappyBlockAsm10B:
JMP search_loop_encodeSnappyBlockAsm10B
no_repeat_found_encodeSnappyBlockAsm10B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm10B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm10B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm10B
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm10B
@@ -13071,288 +13052,288 @@ candidate3_match_encodeSnappyBlockAsm10B:
JMP candidate_match_encodeSnappyBlockAsm10B
candidate2_match_encodeSnappyBlockAsm10B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm10B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm10B
match_extend_back_loop_encodeSnappyBlockAsm10B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm10B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm10B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm10B
JMP match_extend_back_loop_encodeSnappyBlockAsm10B
match_extend_back_end_encodeSnappyBlockAsm10B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm10B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm10B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm10B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
two_bytes_match_emit_encodeSnappyBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm10B
JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
one_byte_match_emit_encodeSnappyBlockAsm10B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm10B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
memmove_long_match_emit_encodeSnappyBlockAsm10B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
match_nolit_loop_encodeSnappyBlockAsm10B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm10B
matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
JZ match_nolit_end_encodeSnappyBlockAsm10B
matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm10B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm10B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm10B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm10B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm10B:
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x20, R8
- IMULQ R9, R8
- SHRQ $0x36, R8
- SHLQ $0x20, SI
- IMULQ R9, SI
- SHRQ $0x36, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x36, DI
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x36, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm10B
INCL CX
JMP search_loop_encodeSnappyBlockAsm10B
@@ -13537,8 +13518,8 @@ zero_loop_encodeSnappyBlockAsm8B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -13548,276 +13529,276 @@ zero_loop_encodeSnappyBlockAsm8B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm8B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x04, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm8B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x20, R10
- IMULQ R9, R10
- SHRQ $0x38, R10
- SHLQ $0x20, R11
- IMULQ R9, R11
- SHRQ $0x38, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x38, R9
SHLQ $0x20, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x38, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm8B
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x38, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm8B
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
repeat_extend_back_loop_encodeSnappyBlockAsm8B:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm8B
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
repeat_extend_back_end_encodeSnappyBlockAsm8B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm8B
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
one_byte_repeat_emit_encodeSnappyBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm8B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm8B
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm8B:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm8B
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm8B:
@@ -13825,16 +13806,16 @@ repeat_end_emit_encodeSnappyBlockAsm8B:
JMP search_loop_encodeSnappyBlockAsm8B
no_repeat_found_encodeSnappyBlockAsm8B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm8B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm8B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm8B
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm8B
@@ -13844,286 +13825,286 @@ candidate3_match_encodeSnappyBlockAsm8B:
JMP candidate_match_encodeSnappyBlockAsm8B
candidate2_match_encodeSnappyBlockAsm8B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm8B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm8B
match_extend_back_loop_encodeSnappyBlockAsm8B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm8B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm8B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm8B
JMP match_extend_back_loop_encodeSnappyBlockAsm8B
match_extend_back_end_encodeSnappyBlockAsm8B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm8B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm8B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm8B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
two_bytes_match_emit_encodeSnappyBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm8B
JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
one_byte_match_emit_encodeSnappyBlockAsm8B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm8B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
memmove_long_match_emit_encodeSnappyBlockAsm8B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
match_nolit_loop_encodeSnappyBlockAsm8B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm8B
matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
JZ match_nolit_end_encodeSnappyBlockAsm8B
matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm8B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm8B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm8B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm8B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm8B:
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x20, R8
- IMULQ R9, R8
- SHRQ $0x38, R8
- SHLQ $0x20, SI
- IMULQ R9, SI
- SHRQ $0x38, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x38, DI
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x38, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm8B
INCL CX
JMP search_loop_encodeSnappyBlockAsm8B
@@ -14287,9 +14268,9 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
-TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
+TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56
MOVQ dst_base+0(FP), AX
- MOVQ $0x00000a00, CX
+ MOVQ $0x00001200, CX
LEAQ 24(SP), DX
PXOR X0, X0
@@ -14308,8 +14289,8 @@ zero_loop_encodeSnappyBetterBlockAsm:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -14319,359 +14300,369 @@ zero_loop_encodeSnappyBetterBlockAsm:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x07, SI
- CMPL SI, $0x63
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x07, BX
+ CMPL BX, $0x63
JLE check_maxskip_ok_encodeSnappyBetterBlockAsm
- LEAL 100(CX), SI
+ LEAL 100(CX), BX
JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
check_maxskip_ok_encodeSnappyBetterBlockAsm:
- LEAL 1(CX)(SI*1), SI
+ LEAL 1(CX)(BX*1), BX
check_maxskip_cont_encodeSnappyBetterBlockAsm:
- CMPL SI, 8(SP)
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x00cf1bbcdcbfa563, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 262168(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 262168(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 524312(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 524312(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm
+ CMPQ R10, SI
+ JNE no_short_found_encodeSnappyBetterBlockAsm
+ MOVL DI, BX
+ JMP candidate_match_encodeSnappyBetterBlockAsm
+
+no_short_found_encodeSnappyBetterBlockAsm:
+ CMPL R9, SI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm
+ CMPL R10, SI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm
candidateS_match_encodeSnappyBetterBlockAsm:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm
match_extend_back_loop_encodeSnappyBetterBlockAsm:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
match_extend_back_end_encodeSnappyBetterBlockAsm:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 5(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
JZ match_nolit_end_encodeSnappyBetterBlockAsm
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL R12, $0x01
+ CMPL R11, $0x01
JG match_length_ok_encodeSnappyBetterBlockAsm
- CMPL R8, $0x0000ffff
+ CMPL DI, $0x0000ffff
JLE match_length_ok_encodeSnappyBetterBlockAsm
MOVL 20(SP), CX
INCL CX
JMP search_loop_encodeSnappyBetterBlockAsm
match_length_ok_encodeSnappyBetterBlockAsm:
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
four_bytes_match_emit_encodeSnappyBetterBlockAsm:
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
three_bytes_match_emit_encodeSnappyBetterBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
two_bytes_match_emit_encodeSnappyBetterBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
one_byte_match_emit_encodeSnappyBetterBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
memmove_long_match_emit_encodeSnappyBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
MOVB $0xff, (AX)
- MOVL R8, 1(AX)
- LEAL -64(R12), R12
+ MOVL DI, 1(AX)
+ LEAL -64(R11), R11
ADDQ $0x05, AX
- CMPL R12, $0x04
+ CMPL R11, $0x04
JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
- TESTL R12, R12
+ TESTL R11, R11
JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVL R8, 1(AX)
+ XORL BX, BX
+ LEAL -1(BX)(R11*4), R11
+ MOVB R11, (AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
@@ -14683,54 +14674,51 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
- MOVQ $0x00cf1bbcdcbfa563, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
+ MOVQ $0x00cf1bbcdcbfa563, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x08, R11
+ IMULQ BX, R11
+ SHRQ $0x2f, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x32, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 262168(SP)(R11*4)
- MOVL R15, 262168(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 262168(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 524312(SP)(R10*4)
+ MOVL R13, 524312(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeSnappyBetterBlockAsm:
+ CMPQ SI, R8
+ JAE search_loop_encodeSnappyBetterBlockAsm
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x08, DI
+ IMULQ BX, DI
+ SHRQ $0x2f, DI
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeSnappyBetterBlockAsm
emit_remainder_encodeSnappyBetterBlockAsm:
MOVQ src_len+32(FP), CX
@@ -14931,8 +14919,8 @@ zero_loop_encodeSnappyBetterBlockAsm64K:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -14942,299 +14930,309 @@ zero_loop_encodeSnappyBetterBlockAsm64K:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm64K:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x07, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x07, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm64K
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x00cf1bbcdcbfa563, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 262168(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 262168(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x30, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 262168(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 262168(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm64K
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm64K
+ CMPQ R10, SI
+ JNE no_short_found_encodeSnappyBetterBlockAsm64K
+ MOVL DI, BX
+ JMP candidate_match_encodeSnappyBetterBlockAsm64K
+
+no_short_found_encodeSnappyBetterBlockAsm64K:
+ CMPL R9, SI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm64K
+ CMPL R10, SI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm64K
candidateS_match_encodeSnappyBetterBlockAsm64K:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x30, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm64K
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm64K:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
match_extend_back_end_encodeSnappyBetterBlockAsm64K:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm64K
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm64K:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm64K
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm64K:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm64K
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm64K:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
@@ -15246,54 +15244,51 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
- MOVQ $0x00cf1bbcdcbfa563, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
+ MOVQ $0x00cf1bbcdcbfa563, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x30, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x08, R11
+ IMULQ BX, R11
+ SHRQ $0x30, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x32, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 262168(SP)(R11*4)
- MOVL R15, 262168(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 262168(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm64K
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 262168(SP)(R10*4)
+ MOVL R13, 262168(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeSnappyBetterBlockAsm64K:
+ CMPQ SI, R8
+ JAE search_loop_encodeSnappyBetterBlockAsm64K
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x08, DI
+ IMULQ BX, DI
+ SHRQ $0x30, DI
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x30, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeSnappyBetterBlockAsm64K
emit_remainder_encodeSnappyBetterBlockAsm64K:
MOVQ src_len+32(FP), CX
@@ -15475,8 +15470,8 @@ zero_loop_encodeSnappyBetterBlockAsm12B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -15486,299 +15481,309 @@ zero_loop_encodeSnappyBetterBlockAsm12B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm12B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm12B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x34, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 65560(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 65560(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x34, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 65560(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 65560(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm12B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm12B
+ CMPQ R10, SI
+ JNE no_short_found_encodeSnappyBetterBlockAsm12B
+ MOVL DI, BX
+ JMP candidate_match_encodeSnappyBetterBlockAsm12B
+
+no_short_found_encodeSnappyBetterBlockAsm12B:
+ CMPL R9, SI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm12B
+ CMPL R10, SI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm12B
candidateS_match_encodeSnappyBetterBlockAsm12B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm12B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm12B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
match_extend_back_end_encodeSnappyBetterBlockAsm12B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm12B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm12B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm12B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm12B
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
@@ -15790,54 +15795,51 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x32, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x32, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x34, R11
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x32, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x34, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x32, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x34, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 65560(SP)(R11*4)
- MOVL R15, 65560(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x34, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x32, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 65560(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm12B
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 65560(SP)(R10*4)
+ MOVL R13, 65560(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeSnappyBetterBlockAsm12B:
+ CMPQ SI, R8
+ JAE search_loop_encodeSnappyBetterBlockAsm12B
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x32, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeSnappyBetterBlockAsm12B
emit_remainder_encodeSnappyBetterBlockAsm12B:
MOVQ src_len+32(FP), CX
@@ -16019,8 +16021,8 @@ zero_loop_encodeSnappyBetterBlockAsm10B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -16030,299 +16032,309 @@ zero_loop_encodeSnappyBetterBlockAsm10B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm10B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm10B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x36, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 16408(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 16408(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x36, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 16408(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 16408(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm10B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm10B
+ CMPQ R10, SI
+ JNE no_short_found_encodeSnappyBetterBlockAsm10B
+ MOVL DI, BX
+ JMP candidate_match_encodeSnappyBetterBlockAsm10B
+
+no_short_found_encodeSnappyBetterBlockAsm10B:
+ CMPL R9, SI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm10B
+ CMPL R10, SI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm10B
candidateS_match_encodeSnappyBetterBlockAsm10B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm10B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm10B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
match_extend_back_end_encodeSnappyBetterBlockAsm10B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm10B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm10B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm10B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm10B
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
@@ -16334,54 +16346,51 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x34, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x34, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x36, R11
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x34, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x34, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x36, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 16408(SP)(R11*4)
- MOVL R15, 16408(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x36, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x34, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 16408(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm10B
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 16408(SP)(R10*4)
+ MOVL R13, 16408(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeSnappyBetterBlockAsm10B:
+ CMPQ SI, R8
+ JAE search_loop_encodeSnappyBetterBlockAsm10B
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x34, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x34, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeSnappyBetterBlockAsm10B
emit_remainder_encodeSnappyBetterBlockAsm10B:
MOVQ src_len+32(FP), CX
@@ -16563,8 +16572,8 @@ zero_loop_encodeSnappyBetterBlockAsm8B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -16574,297 +16583,307 @@ zero_loop_encodeSnappyBetterBlockAsm8B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm8B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x04, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm8B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x38, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 4120(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 4120(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x38, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 4120(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 4120(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm8B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm8B
+ CMPQ R10, SI
+ JNE no_short_found_encodeSnappyBetterBlockAsm8B
+ MOVL DI, BX
+ JMP candidate_match_encodeSnappyBetterBlockAsm8B
+
+no_short_found_encodeSnappyBetterBlockAsm8B:
+ CMPL R9, SI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm8B
+ CMPL R10, SI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm8B
candidateS_match_encodeSnappyBetterBlockAsm8B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm8B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm8B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
match_extend_back_end_encodeSnappyBetterBlockAsm8B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm8B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm8B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm8B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm8B
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
@@ -16876,54 +16895,51 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x36, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x36, R13
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x38, R11
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x36, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x38, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x36, R11
SHLQ $0x20, R12
- IMULQ R8, R12
+ IMULQ DI, R12
SHRQ $0x38, R12
- MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 4120(SP)(R11*4)
- MOVL R15, 4120(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x38, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x36, R13
- MOVL R9, 24(SP)(R10*4)
- MOVL DI, 4120(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm8B
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 4120(SP)(R10*4)
+ MOVL R13, 4120(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
+
+index_loop_encodeSnappyBetterBlockAsm8B:
+ CMPQ SI, R8
+ JAE search_loop_encodeSnappyBetterBlockAsm8B
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x36, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x36, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
+ JMP index_loop_encodeSnappyBetterBlockAsm8B
emit_remainder_encodeSnappyBetterBlockAsm8B:
MOVQ src_len+32(FP), CX
@@ -17082,6 +17098,1008 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
MOVQ AX, ret+48(FP)
RET
+// func calcBlockSize(src []byte) int
+// Requires: BMI, SSE2
+TEXT ·calcBlockSize(SB), $32792-32 // Adds up in AX the bytes of the literal and copy elements found for src; nothing is written besides the stack table. Returns 0 once that total reaches the limit kept at (SP).
+ XORQ AX, AX // AX = running output size in bytes
+ MOVQ $0x00000100, CX // 256 iterations x 128 bytes clear the 32768-byte table at 24(SP)
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_calcBlockSize:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_calcBlockSize
+ MOVL $0x00000000, 12(SP) // 12(SP) = start of the input not yet counted (pending literals)
+ MOVQ src_len+8(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP) // 8(SP) = len(src)-8; searching stops once the position reaches it
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = len(src) - len(src)>>5 - 9, the size limit (AX is still 0 here)
+ MOVL $0x00000001, CX // CX = current position, starting at 1
+ MOVL CX, 16(SP) // 16(SP) = current repeat offset, initially 1
+ MOVQ src_base+0(FP), DX // DX = src base pointer from here on
+
+search_loop_calcBlockSize:
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 4(CX)(BX*1), BX // BX = next position: CX + 4 + (CX - 12(SP))>>5, so the stride grows while nothing matches
+ CMPL BX, 8(SP)
+ JGE emit_remainder_calcBlockSize
+ MOVQ (DX)(CX*1), SI // SI = 8 input bytes at the current position
+ MOVL BX, 20(SP) // 20(SP) = next position, used when no candidate matches
+ MOVQ $0x0000cf1bbcdcbf9b, R8 // hash multiplier
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9 // hash = ((x << 16) * multiplier) >> 51: low 6 bytes of x -> 13-bit table index
+ IMULQ R8, R9
+ SHRQ $0x33, R9
+ SHLQ $0x10, R10
+ IMULQ R8, R10
+ SHRQ $0x33, R10
+ MOVL 24(SP)(R9*4), BX // BX = previous position stored for the hash at CX
+ MOVL 24(SP)(R10*4), DI // DI = previous position stored for the hash at CX+1
+ MOVL CX, 24(SP)(R9*4) // record CX and CX+1 in the table
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x33, R9 // R9 = table index for the bytes at CX+2
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10 // R10 = 4 bytes located one repeat offset before CX+1
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10 // repeat check: do the 4 bytes at CX+1 equal those one repeat offset earlier?
+ JNE no_repeat_found_calcBlockSize
+ LEAL 1(CX), SI // SI = start of the repeat match (CX+1)
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI // DI = matching position one repeat offset back
+ JZ repeat_extend_back_end_calcBlockSize
+
+repeat_extend_back_loop_calcBlockSize:
+ CMPL SI, BX // extend the match backwards, but never before 12(SP)
+ JLE repeat_extend_back_end_calcBlockSize
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
+ JNE repeat_extend_back_end_calcBlockSize
+ LEAL -1(SI), SI
+ DECL DI
+ JNZ repeat_extend_back_loop_calcBlockSize
+
+repeat_extend_back_end_calcBlockSize:
+ MOVL 12(SP), BX
+ CMPL BX, SI
+ JEQ emit_literal_done_repeat_emit_calcBlockSize // no pending literals before the match
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI // DI = literal length
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c // literal header size: 1 byte if length-1 < 60, else 2, 3, 4 or 5 bytes by magnitude
+ JLT one_byte_repeat_emit_calcBlockSize
+ CMPL BX, $0x00000100
+ JLT two_bytes_repeat_emit_calcBlockSize
+ CMPL BX, $0x00010000
+ JLT three_bytes_repeat_emit_calcBlockSize
+ CMPL BX, $0x01000000
+ JLT four_bytes_repeat_emit_calcBlockSize
+ ADDQ $0x05, AX
+ JMP memmove_long_repeat_emit_calcBlockSize
+
+four_bytes_repeat_emit_calcBlockSize:
+ ADDQ $0x04, AX
+ JMP memmove_long_repeat_emit_calcBlockSize
+
+three_bytes_repeat_emit_calcBlockSize:
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_calcBlockSize
+
+two_bytes_repeat_emit_calcBlockSize:
+ ADDQ $0x02, AX
+ CMPL BX, $0x40
+ JL memmove_repeat_emit_calcBlockSize
+ JMP memmove_long_repeat_emit_calcBlockSize
+
+one_byte_repeat_emit_calcBlockSize:
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_calcBlockSize:
+ LEAQ (AX)(DI*1), AX // count the literal bytes themselves; no data is copied
+ JMP emit_literal_done_repeat_emit_calcBlockSize
+
+memmove_long_repeat_emit_calcBlockSize:
+ LEAQ (AX)(DI*1), AX // same accounting as the short path
+
+emit_literal_done_repeat_emit_calcBlockSize:
+ ADDL $0x05, CX // continue after the 4 bytes already compared at CX+1
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+8(FP), DI
+ SUBL CX, DI // DI = input bytes remaining from CX
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
+
+ // matchLen
+ XORL R10, R10 // R10 = count of further matching bytes
+ CMPL DI, $0x08
+ JL matchlen_match4_repeat_extend_calcBlockSize
+
+matchlen_loopback_repeat_extend_calcBlockSize:
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9 // compare 8 bytes at a time; nonzero XOR marks a difference
+ TESTQ R9, R9
+ JZ matchlen_loop_repeat_extend_calcBlockSize
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9 // bit index of the first difference / 8 = matching bytes in this word
+ LEAL (R10)(R9*1), R10
+ JMP repeat_extend_forward_end_calcBlockSize
+
+matchlen_loop_repeat_extend_calcBlockSize:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_repeat_extend_calcBlockSize
+ JZ repeat_extend_forward_end_calcBlockSize
+
+matchlen_match4_repeat_extend_calcBlockSize:
+ CMPL DI, $0x04 // fewer than 8 bytes left: finish with 4-, 2- and 1-byte compares
+ JL matchlen_match2_repeat_extend_calcBlockSize
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
+ JNE matchlen_match2_repeat_extend_calcBlockSize
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_repeat_extend_calcBlockSize:
+ CMPL DI, $0x02
+ JL matchlen_match1_repeat_extend_calcBlockSize
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
+ JNE matchlen_match1_repeat_extend_calcBlockSize
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_repeat_extend_calcBlockSize:
+ CMPL DI, $0x01
+ JL repeat_extend_forward_end_calcBlockSize
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
+ JNE repeat_extend_forward_end_calcBlockSize
+ LEAL 1(R10), R10
+
+repeat_extend_forward_end_calcBlockSize:
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX // BX = total match length (end position - match start)
+ MOVL 16(SP), SI // SI = match offset
+
+ // emitCopy
+ CMPL SI, $0x00010000 // offsets of 65536 or more are counted with 5-byte copies
+ JL two_byte_offset_repeat_as_copy_calcBlockSize
+
+four_bytes_loop_back_repeat_as_copy_calcBlockSize:
+ CMPL BX, $0x40
+ JLE four_bytes_remain_repeat_as_copy_calcBlockSize
+ LEAL -64(BX), BX // each 64-byte chunk of the match costs 5 bytes
+ ADDQ $0x05, AX
+ CMPL BX, $0x04
+ JL four_bytes_remain_repeat_as_copy_calcBlockSize
+ JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize
+
+four_bytes_remain_repeat_as_copy_calcBlockSize:
+ TESTL BX, BX
+ JZ repeat_end_emit_calcBlockSize
+ XORL BX, BX
+ ADDQ $0x05, AX // the remaining length costs one more 5-byte copy
+ JMP repeat_end_emit_calcBlockSize
+
+two_byte_offset_repeat_as_copy_calcBlockSize:
+ CMPL BX, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_calcBlockSize
+ LEAL -60(BX), BX // while length > 64: take 60 bytes off for 3 bytes of output
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_calcBlockSize
+
+two_byte_offset_short_repeat_as_copy_calcBlockSize:
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c // 2 bytes only if length < 12 and offset < 2048, otherwise 3 bytes
+ JGE emit_copy_three_repeat_as_copy_calcBlockSize
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_calcBlockSize
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_calcBlockSize
+
+emit_copy_three_repeat_as_copy_calcBlockSize:
+ ADDQ $0x03, AX
+
+repeat_end_emit_calcBlockSize:
+ MOVL CX, 12(SP) // everything up to CX is now counted
+ JMP search_loop_calcBlockSize
+
+no_repeat_found_calcBlockSize:
+ CMPL (DX)(BX*1), SI // table candidate for CX: do its 4 bytes equal the 4 bytes at CX?
+ JEQ candidate_match_calcBlockSize
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX // BX = table candidate for CX+2
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI // table candidate for CX+1
+ JEQ candidate2_match_calcBlockSize
+ MOVL R8, 24(SP)(R9*4) // record CX+2 in the table
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI // table candidate for CX+2
+ JEQ candidate3_match_calcBlockSize
+ MOVL 20(SP), CX // nothing matched: jump ahead to the precomputed next position
+ JMP search_loop_calcBlockSize
+
+candidate3_match_calcBlockSize:
+ ADDL $0x02, CX // match starts at CX+2
+ JMP candidate_match_calcBlockSize
+
+candidate2_match_calcBlockSize:
+ MOVL R8, 24(SP)(R9*4)
+ INCL CX // match starts at CX+1; its candidate is in DI
+ MOVL DI, BX
+
+candidate_match_calcBlockSize:
+ MOVL 12(SP), SI
+ TESTL BX, BX
+ JZ match_extend_back_end_calcBlockSize
+
+match_extend_back_loop_calcBlockSize:
+ CMPL CX, SI // extend the match backwards while bytes agree, not before 12(SP) and not below candidate 0
+ JLE match_extend_back_end_calcBlockSize
+ MOVB -1(DX)(BX*1), DI
+ MOVB -1(DX)(CX*1), R8
+ CMPB DI, R8
+ JNE match_extend_back_end_calcBlockSize
+ LEAL -1(CX), CX
+ DECL BX
+ JZ match_extend_back_end_calcBlockSize
+ JMP match_extend_back_loop_calcBlockSize
+
+match_extend_back_end_calcBlockSize:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI // size so far + pending literal length + 5
+ CMPQ SI, (SP)
+ JL match_dst_size_check_calcBlockSize
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+match_dst_size_check_calcBlockSize:
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
+ JEQ emit_literal_done_match_emit_calcBlockSize // no pending literals before the match
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8 // R8 = literal length
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c // literal header size: 1 byte if length-1 < 60, else 2, 3, 4 or 5 bytes by magnitude
+ JLT one_byte_match_emit_calcBlockSize
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_calcBlockSize
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_calcBlockSize
+ CMPL SI, $0x01000000
+ JLT four_bytes_match_emit_calcBlockSize
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_calcBlockSize
+
+four_bytes_match_emit_calcBlockSize:
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_calcBlockSize
+
+three_bytes_match_emit_calcBlockSize:
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_calcBlockSize
+
+two_bytes_match_emit_calcBlockSize:
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_calcBlockSize
+ JMP memmove_long_match_emit_calcBlockSize
+
+one_byte_match_emit_calcBlockSize:
+ ADDQ $0x01, AX
+
+memmove_match_emit_calcBlockSize:
+ LEAQ (AX)(R8*1), AX // count the literal bytes themselves; no data is copied
+ JMP emit_literal_done_match_emit_calcBlockSize
+
+memmove_long_match_emit_calcBlockSize:
+ LEAQ (AX)(R8*1), AX // same accounting as the short path
+
+emit_literal_done_match_emit_calcBlockSize:
+match_nolit_loop_calcBlockSize:
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP) // new repeat offset = position - candidate
+ ADDL $0x04, CX // the first 4 bytes are already known to match
+ ADDL $0x04, BX
+ MOVQ src_len+8(FP), SI
+ SUBL CX, SI // SI = input bytes remaining from CX
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
+
+ // matchLen
+ XORL R9, R9 // R9 = count of further matching bytes
+ CMPL SI, $0x08
+ JL matchlen_match4_match_nolit_calcBlockSize
+
+matchlen_loopback_match_nolit_calcBlockSize:
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8 // compare 8 bytes at a time; nonzero XOR marks a difference
+ TESTQ R8, R8
+ JZ matchlen_loop_match_nolit_calcBlockSize
+
+#ifdef GOAMD64_v3
+ TZCNTQ R8, R8
+
+#else
+ BSFQ R8, R8
+
+#endif
+ SARQ $0x03, R8 // bit index of the first difference / 8 = matching bytes in this word
+ LEAL (R9)(R8*1), R9
+ JMP match_nolit_end_calcBlockSize
+
+matchlen_loop_match_nolit_calcBlockSize:
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
+ JGE matchlen_loopback_match_nolit_calcBlockSize
+ JZ match_nolit_end_calcBlockSize
+
+matchlen_match4_match_nolit_calcBlockSize:
+ CMPL SI, $0x04 // fewer than 8 bytes left: finish with 4-, 2- and 1-byte compares
+ JL matchlen_match2_match_nolit_calcBlockSize
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
+ JNE matchlen_match2_match_nolit_calcBlockSize
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
+
+matchlen_match2_match_nolit_calcBlockSize:
+ CMPL SI, $0x02
+ JL matchlen_match1_match_nolit_calcBlockSize
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
+ JNE matchlen_match1_match_nolit_calcBlockSize
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
+
+matchlen_match1_match_nolit_calcBlockSize:
+ CMPL SI, $0x01
+ JL match_nolit_end_calcBlockSize
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
+ JNE match_nolit_end_calcBlockSize
+ LEAL 1(R9), R9
+
+match_nolit_end_calcBlockSize:
+ ADDL R9, CX
+ MOVL 16(SP), BX // BX = match offset
+ ADDL $0x04, R9 // R9 = total match length including the first 4 bytes
+ MOVL CX, 12(SP) // everything up to CX is now counted
+
+ // emitCopy
+ CMPL BX, $0x00010000 // offsets of 65536 or more are counted with 5-byte copies
+ JL two_byte_offset_match_nolit_calcBlockSize
+
+four_bytes_loop_back_match_nolit_calcBlockSize:
+ CMPL R9, $0x40
+ JLE four_bytes_remain_match_nolit_calcBlockSize
+ LEAL -64(R9), R9 // each 64-byte chunk of the match costs 5 bytes
+ ADDQ $0x05, AX
+ CMPL R9, $0x04
+ JL four_bytes_remain_match_nolit_calcBlockSize
+ JMP four_bytes_loop_back_match_nolit_calcBlockSize
+
+four_bytes_remain_match_nolit_calcBlockSize:
+ TESTL R9, R9
+ JZ match_nolit_emitcopy_end_calcBlockSize
+ XORL BX, BX
+ ADDQ $0x05, AX // the remaining length costs one more 5-byte copy
+ JMP match_nolit_emitcopy_end_calcBlockSize
+
+two_byte_offset_match_nolit_calcBlockSize:
+ CMPL R9, $0x40
+ JLE two_byte_offset_short_match_nolit_calcBlockSize
+ LEAL -60(R9), R9 // while length > 64: take 60 bytes off for 3 bytes of output
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_calcBlockSize
+
+two_byte_offset_short_match_nolit_calcBlockSize:
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c // 2 bytes only if length < 12 and offset < 2048, otherwise 3 bytes
+ JGE emit_copy_three_match_nolit_calcBlockSize
+ CMPL BX, $0x00000800
+ JGE emit_copy_three_match_nolit_calcBlockSize
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_calcBlockSize
+
+emit_copy_three_match_nolit_calcBlockSize:
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_calcBlockSize:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_calcBlockSize
+ MOVQ -2(DX)(CX*1), SI // SI = 8 input bytes starting at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_calcBlockSize
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+match_nolit_dst_ok_calcBlockSize:
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI // SI = bytes starting at CX
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x33, DI // DI = table index for the bytes at CX-2
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x33, BX // BX = table index for the bytes at CX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX // BX = table candidate for CX
+ MOVL R8, 24(SP)(DI*4) // record CX-2 and CX in the table
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI // immediate 4-byte match at CX: count another copy with no literals in between
+ JEQ match_nolit_loop_calcBlockSize
+ INCL CX
+ JMP search_loop_calcBlockSize
+
+emit_remainder_calcBlockSize:
+ MOVQ src_len+8(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 5(AX)(CX*1), CX // size so far + trailing literal length + 5
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_calcBlockSize
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+emit_remainder_ok_calcBlockSize:
+ MOVQ src_len+8(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_calcBlockSize // nothing left to count
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI // SI = trailing literal length
+ LEAL -1(SI), CX
+ CMPL CX, $0x3c // literal header size: 1 byte if length-1 < 60, else 2, 3, 4 or 5 bytes by magnitude
+ JLT one_byte_emit_remainder_calcBlockSize
+ CMPL CX, $0x00000100
+ JLT two_bytes_emit_remainder_calcBlockSize
+ CMPL CX, $0x00010000
+ JLT three_bytes_emit_remainder_calcBlockSize
+ CMPL CX, $0x01000000
+ JLT four_bytes_emit_remainder_calcBlockSize
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_calcBlockSize
+
+four_bytes_emit_remainder_calcBlockSize:
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_calcBlockSize
+
+three_bytes_emit_remainder_calcBlockSize:
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_calcBlockSize
+
+two_bytes_emit_remainder_calcBlockSize:
+ ADDQ $0x02, AX
+ CMPL CX, $0x40
+ JL memmove_emit_remainder_calcBlockSize
+ JMP memmove_long_emit_remainder_calcBlockSize
+
+one_byte_emit_remainder_calcBlockSize:
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_calcBlockSize:
+ LEAQ (AX)(SI*1), AX // count the literal bytes themselves; no data is copied
+ JMP emit_literal_done_emit_remainder_calcBlockSize
+
+memmove_long_emit_remainder_calcBlockSize:
+ LEAQ (AX)(SI*1), AX // same accounting as the short path
+
+emit_literal_done_emit_remainder_calcBlockSize:
+ MOVQ AX, ret+24(FP) // return the accumulated size
+ RET
+
+// func calcBlockSizeSmall(src []byte) int
+// Requires: BMI, SSE2
+TEXT ·calcBlockSizeSmall(SB), $2072-32 // Adds up in AX the bytes of the literal and copy elements found for src, using a 2048-byte stack table; nothing else is written. Returns 0 once that total reaches the limit kept at (SP).
+ XORQ AX, AX // AX = running output size in bytes
+ MOVQ $0x00000010, CX // 16 iterations x 128 bytes clear the 2048-byte table at 24(SP)
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_calcBlockSizeSmall:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_calcBlockSizeSmall
+ MOVL $0x00000000, 12(SP) // 12(SP) = start of the input not yet counted (pending literals)
+ MOVQ src_len+8(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP) // 8(SP) = len(src)-8; searching stops once the position reaches it
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = len(src) - len(src)>>5 - 9, the size limit (AX is still 0 here)
+ MOVL $0x00000001, CX // CX = current position, starting at 1
+ MOVL CX, 16(SP) // 16(SP) = current repeat offset, initially 1
+ MOVQ src_base+0(FP), DX // DX = src base pointer from here on
+
+search_loop_calcBlockSizeSmall:
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX
+ LEAL 4(CX)(BX*1), BX // BX = next position: CX + 4 + (CX - 12(SP))>>4, so the stride grows while nothing matches
+ CMPL BX, 8(SP)
+ JGE emit_remainder_calcBlockSizeSmall
+ MOVQ (DX)(CX*1), SI // SI = 8 input bytes at the current position
+ MOVL BX, 20(SP) // 20(SP) = next position, used when no candidate matches
+ MOVQ $0x9e3779b1, R8 // hash multiplier
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9 // hash = ((x << 32) * multiplier) >> 55: low 4 bytes of x -> 9-bit table index
+ IMULQ R8, R9
+ SHRQ $0x37, R9
+ SHLQ $0x20, R10
+ IMULQ R8, R10
+ SHRQ $0x37, R10
+ MOVL 24(SP)(R9*4), BX // BX = previous position stored for the hash at CX
+ MOVL 24(SP)(R10*4), DI // DI = previous position stored for the hash at CX+1
+ MOVL CX, 24(SP)(R9*4) // record CX and CX+1 in the table
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x37, R9 // R9 = table index for the bytes at CX+2
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10 // R10 = 4 bytes located one repeat offset before CX+1
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10 // repeat check: do the 4 bytes at CX+1 equal those one repeat offset earlier?
+ JNE no_repeat_found_calcBlockSizeSmall
+ LEAL 1(CX), SI // SI = start of the repeat match (CX+1)
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI // DI = matching position one repeat offset back
+ JZ repeat_extend_back_end_calcBlockSizeSmall
+
+repeat_extend_back_loop_calcBlockSizeSmall:
+ CMPL SI, BX // extend the match backwards, but never before 12(SP)
+ JLE repeat_extend_back_end_calcBlockSizeSmall
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
+ JNE repeat_extend_back_end_calcBlockSizeSmall
+ LEAL -1(SI), SI
+ DECL DI
+ JNZ repeat_extend_back_loop_calcBlockSizeSmall
+
+repeat_extend_back_end_calcBlockSizeSmall:
+ MOVL 12(SP), BX
+ CMPL BX, SI
+ JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall // no pending literals before the match
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI // DI = literal length
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c // literal header size: 1 byte if length-1 < 60, 2 bytes if < 256, otherwise 3
+ JLT one_byte_repeat_emit_calcBlockSizeSmall
+ CMPL BX, $0x00000100
+ JLT two_bytes_repeat_emit_calcBlockSizeSmall
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_calcBlockSizeSmall
+
+two_bytes_repeat_emit_calcBlockSizeSmall:
+ ADDQ $0x02, AX
+ CMPL BX, $0x40
+ JL memmove_repeat_emit_calcBlockSizeSmall
+ JMP memmove_long_repeat_emit_calcBlockSizeSmall
+
+one_byte_repeat_emit_calcBlockSizeSmall:
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_calcBlockSizeSmall:
+ LEAQ (AX)(DI*1), AX // count the literal bytes themselves; no data is copied
+ JMP emit_literal_done_repeat_emit_calcBlockSizeSmall
+
+memmove_long_repeat_emit_calcBlockSizeSmall:
+ LEAQ (AX)(DI*1), AX // same accounting as the short path
+
+emit_literal_done_repeat_emit_calcBlockSizeSmall:
+ ADDL $0x05, CX // continue after the 4 bytes already compared at CX+1
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+8(FP), DI
+ SUBL CX, DI // DI = input bytes remaining from CX
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
+
+ // matchLen
+ XORL R10, R10 // R10 = count of further matching bytes
+ CMPL DI, $0x08
+ JL matchlen_match4_repeat_extend_calcBlockSizeSmall
+
+matchlen_loopback_repeat_extend_calcBlockSizeSmall:
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9 // compare 8 bytes at a time; nonzero XOR marks a difference
+ TESTQ R9, R9
+ JZ matchlen_loop_repeat_extend_calcBlockSizeSmall
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9 // bit index of the first difference / 8 = matching bytes in this word
+ LEAL (R10)(R9*1), R10
+ JMP repeat_extend_forward_end_calcBlockSizeSmall
+
+matchlen_loop_repeat_extend_calcBlockSizeSmall:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_repeat_extend_calcBlockSizeSmall
+ JZ repeat_extend_forward_end_calcBlockSizeSmall
+
+matchlen_match4_repeat_extend_calcBlockSizeSmall:
+ CMPL DI, $0x04 // fewer than 8 bytes left: finish with 4-, 2- and 1-byte compares
+ JL matchlen_match2_repeat_extend_calcBlockSizeSmall
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
+ JNE matchlen_match2_repeat_extend_calcBlockSizeSmall
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_repeat_extend_calcBlockSizeSmall:
+ CMPL DI, $0x02
+ JL matchlen_match1_repeat_extend_calcBlockSizeSmall
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
+ JNE matchlen_match1_repeat_extend_calcBlockSizeSmall
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_repeat_extend_calcBlockSizeSmall:
+ CMPL DI, $0x01
+ JL repeat_extend_forward_end_calcBlockSizeSmall
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
+ JNE repeat_extend_forward_end_calcBlockSizeSmall
+ LEAL 1(R10), R10
+
+repeat_extend_forward_end_calcBlockSizeSmall:
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX // BX = total match length (end position - match start)
+ MOVL 16(SP), SI
+
+ // emitCopy
+two_byte_offset_repeat_as_copy_calcBlockSizeSmall:
+ CMPL BX, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall
+ LEAL -60(BX), BX // while length > 64: take 60 bytes off for 3 bytes of output
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall
+
+two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall:
+ MOVL BX, SI
+ SHLL $0x02, SI
+ CMPL BX, $0x0c // 2 bytes if length < 12, otherwise 3; the offset is not examined in this variant
+ JGE emit_copy_three_repeat_as_copy_calcBlockSizeSmall
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_calcBlockSizeSmall
+
+emit_copy_three_repeat_as_copy_calcBlockSizeSmall:
+ ADDQ $0x03, AX
+
+repeat_end_emit_calcBlockSizeSmall:
+ MOVL CX, 12(SP) // everything up to CX is now counted
+ JMP search_loop_calcBlockSizeSmall
+
+no_repeat_found_calcBlockSizeSmall:
+ CMPL (DX)(BX*1), SI // table candidate for CX: do its 4 bytes equal the 4 bytes at CX?
+ JEQ candidate_match_calcBlockSizeSmall
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX // BX = table candidate for CX+2
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI // table candidate for CX+1
+ JEQ candidate2_match_calcBlockSizeSmall
+ MOVL R8, 24(SP)(R9*4) // record CX+2 in the table
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI // table candidate for CX+2
+ JEQ candidate3_match_calcBlockSizeSmall
+ MOVL 20(SP), CX // nothing matched: jump ahead to the precomputed next position
+ JMP search_loop_calcBlockSizeSmall
+
+candidate3_match_calcBlockSizeSmall:
+ ADDL $0x02, CX // match starts at CX+2
+ JMP candidate_match_calcBlockSizeSmall
+
+candidate2_match_calcBlockSizeSmall:
+ MOVL R8, 24(SP)(R9*4)
+ INCL CX // match starts at CX+1; its candidate is in DI
+ MOVL DI, BX
+
+candidate_match_calcBlockSizeSmall:
+ MOVL 12(SP), SI
+ TESTL BX, BX
+ JZ match_extend_back_end_calcBlockSizeSmall
+
+match_extend_back_loop_calcBlockSizeSmall:
+ CMPL CX, SI // extend the match backwards while bytes agree, not before 12(SP) and not below candidate 0
+ JLE match_extend_back_end_calcBlockSizeSmall
+ MOVB -1(DX)(BX*1), DI
+ MOVB -1(DX)(CX*1), R8
+ CMPB DI, R8
+ JNE match_extend_back_end_calcBlockSizeSmall
+ LEAL -1(CX), CX
+ DECL BX
+ JZ match_extend_back_end_calcBlockSizeSmall
+ JMP match_extend_back_loop_calcBlockSizeSmall
+
+match_extend_back_end_calcBlockSizeSmall:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI // size so far + pending literal length + 3
+ CMPQ SI, (SP)
+ JL match_dst_size_check_calcBlockSizeSmall
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+match_dst_size_check_calcBlockSizeSmall:
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
+ JEQ emit_literal_done_match_emit_calcBlockSizeSmall // no pending literals before the match
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8 // R8 = literal length
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c // literal header size: 1 byte if length-1 < 60, 2 bytes if < 256, otherwise 3
+ JLT one_byte_match_emit_calcBlockSizeSmall
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_calcBlockSizeSmall
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_calcBlockSizeSmall
+
+two_bytes_match_emit_calcBlockSizeSmall:
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_calcBlockSizeSmall
+ JMP memmove_long_match_emit_calcBlockSizeSmall
+
+one_byte_match_emit_calcBlockSizeSmall:
+ ADDQ $0x01, AX
+
+memmove_match_emit_calcBlockSizeSmall:
+ LEAQ (AX)(R8*1), AX // count the literal bytes themselves; no data is copied
+ JMP emit_literal_done_match_emit_calcBlockSizeSmall
+
+memmove_long_match_emit_calcBlockSizeSmall:
+ LEAQ (AX)(R8*1), AX // same accounting as the short path
+
+emit_literal_done_match_emit_calcBlockSizeSmall:
+match_nolit_loop_calcBlockSizeSmall:
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP) // new repeat offset = position - candidate
+ ADDL $0x04, CX // the first 4 bytes are already known to match
+ ADDL $0x04, BX
+ MOVQ src_len+8(FP), SI
+ SUBL CX, SI // SI = input bytes remaining from CX
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
+
+ // matchLen
+ XORL R9, R9 // R9 = count of further matching bytes
+ CMPL SI, $0x08
+ JL matchlen_match4_match_nolit_calcBlockSizeSmall
+
+matchlen_loopback_match_nolit_calcBlockSizeSmall:
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8 // compare 8 bytes at a time; nonzero XOR marks a difference
+ TESTQ R8, R8
+ JZ matchlen_loop_match_nolit_calcBlockSizeSmall
+
+#ifdef GOAMD64_v3
+ TZCNTQ R8, R8
+
+#else
+ BSFQ R8, R8
+
+#endif
+ SARQ $0x03, R8 // bit index of the first difference / 8 = matching bytes in this word
+ LEAL (R9)(R8*1), R9
+ JMP match_nolit_end_calcBlockSizeSmall
+
+matchlen_loop_match_nolit_calcBlockSizeSmall:
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
+ JGE matchlen_loopback_match_nolit_calcBlockSizeSmall
+ JZ match_nolit_end_calcBlockSizeSmall
+
+matchlen_match4_match_nolit_calcBlockSizeSmall:
+ CMPL SI, $0x04 // fewer than 8 bytes left: finish with 4-, 2- and 1-byte compares
+ JL matchlen_match2_match_nolit_calcBlockSizeSmall
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
+ JNE matchlen_match2_match_nolit_calcBlockSizeSmall
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
+
+matchlen_match2_match_nolit_calcBlockSizeSmall:
+ CMPL SI, $0x02
+ JL matchlen_match1_match_nolit_calcBlockSizeSmall
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
+ JNE matchlen_match1_match_nolit_calcBlockSizeSmall
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
+
+matchlen_match1_match_nolit_calcBlockSizeSmall:
+ CMPL SI, $0x01
+ JL match_nolit_end_calcBlockSizeSmall
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
+ JNE match_nolit_end_calcBlockSizeSmall
+ LEAL 1(R9), R9
+
+match_nolit_end_calcBlockSizeSmall:
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9 // R9 = total match length including the first 4 bytes
+ MOVL CX, 12(SP) // everything up to CX is now counted
+
+ // emitCopy
+two_byte_offset_match_nolit_calcBlockSizeSmall:
+ CMPL R9, $0x40
+ JLE two_byte_offset_short_match_nolit_calcBlockSizeSmall
+ LEAL -60(R9), R9 // while length > 64: take 60 bytes off for 3 bytes of output
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_calcBlockSizeSmall
+
+two_byte_offset_short_match_nolit_calcBlockSizeSmall:
+ MOVL R9, BX
+ SHLL $0x02, BX
+ CMPL R9, $0x0c // 2 bytes if length < 12, otherwise 3; the offset is not examined in this variant
+ JGE emit_copy_three_match_nolit_calcBlockSizeSmall
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_calcBlockSizeSmall
+
+emit_copy_three_match_nolit_calcBlockSizeSmall:
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_calcBlockSizeSmall:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_calcBlockSizeSmall
+ MOVQ -2(DX)(CX*1), SI // SI = 8 input bytes starting at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_calcBlockSizeSmall
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+match_nolit_dst_ok_calcBlockSizeSmall:
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI // SI = bytes starting at CX
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x37, DI // DI = table index for the bytes at CX-2
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x37, BX // BX = table index for the bytes at CX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX // BX = table candidate for CX
+ MOVL R8, 24(SP)(DI*4) // record CX-2 and CX in the table
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI // immediate 4-byte match at CX: count another copy with no literals in between
+ JEQ match_nolit_loop_calcBlockSizeSmall
+ INCL CX
+ JMP search_loop_calcBlockSizeSmall
+
+emit_remainder_calcBlockSizeSmall:
+ MOVQ src_len+8(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX // size so far + trailing literal length + 3
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_calcBlockSizeSmall
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+emit_remainder_ok_calcBlockSizeSmall:
+ MOVQ src_len+8(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall // nothing left to count
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI // SI = trailing literal length
+ LEAL -1(SI), CX
+ CMPL CX, $0x3c // literal header size: 1 byte if length-1 < 60, 2 bytes if < 256, otherwise 3
+ JLT one_byte_emit_remainder_calcBlockSizeSmall
+ CMPL CX, $0x00000100
+ JLT two_bytes_emit_remainder_calcBlockSizeSmall
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_calcBlockSizeSmall
+
+two_bytes_emit_remainder_calcBlockSizeSmall:
+ ADDQ $0x02, AX
+ CMPL CX, $0x40
+ JL memmove_emit_remainder_calcBlockSizeSmall
+ JMP memmove_long_emit_remainder_calcBlockSizeSmall
+
+one_byte_emit_remainder_calcBlockSizeSmall:
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_calcBlockSizeSmall:
+ LEAQ (AX)(SI*1), AX // count the literal bytes themselves; no data is copied
+ JMP emit_literal_done_emit_remainder_calcBlockSizeSmall
+
+memmove_long_emit_remainder_calcBlockSizeSmall:
+ LEAQ (AX)(SI*1), AX // same accounting as the short path
+
+emit_literal_done_emit_remainder_calcBlockSizeSmall:
+ MOVQ AX, ret+24(FP) // return the accumulated size
+ RET
+
// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
@@ -17274,8 +18292,7 @@ cant_repeat_two_offset_standalone:
CMPL DX, $0x0100ffff
JLT repeat_five_standalone
LEAL -16842747(DX), DX
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
ADDQ $0x05, BX
@@ -17341,8 +18358,6 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48
// emitCopy
CMPL CX, $0x00010000
JL two_byte_offset_standalone
-
-four_bytes_loop_back_standalone:
CMPL DX, $0x40
JLE four_bytes_remain_standalone
MOVB $0xff, (AX)
@@ -17372,8 +18387,7 @@ cant_repeat_two_offset_standalone_emit_copy:
CMPL DX, $0x0100ffff
JLT repeat_five_standalone_emit_copy
LEAL -16842747(DX), DX
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
ADDQ $0x05, BX
@@ -17425,13 +18439,12 @@ repeat_two_offset_standalone_emit_copy:
ADDQ $0x02, BX
ADDQ $0x02, AX
JMP gen_emit_copy_end
- JMP four_bytes_loop_back_standalone
four_bytes_remain_standalone:
TESTL DX, DX
JZ gen_emit_copy_end
- MOVB $0x03, SI
- LEAL -4(SI)(DX*4), DX
+ XORL SI, SI
+ LEAL -1(SI)(DX*4), DX
MOVB DL, (AX)
MOVL CX, 1(AX)
ADDQ $0x05, BX
@@ -17477,8 +18490,7 @@ cant_repeat_two_offset_standalone_emit_copy_short_2b:
CMPL DX, $0x0100ffff
JLT repeat_five_standalone_emit_copy_short_2b
LEAL -16842747(DX), DX
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
ADDQ $0x05, BX
@@ -17557,8 +18569,7 @@ cant_repeat_two_offset_standalone_emit_copy_short:
CMPL DX, $0x0100ffff
JLT repeat_five_standalone_emit_copy_short
LEAL -16842747(DX), DX
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
ADDQ $0x05, BX
@@ -17610,28 +18621,27 @@ repeat_two_offset_standalone_emit_copy_short:
ADDQ $0x02, BX
ADDQ $0x02, AX
JMP gen_emit_copy_end
- JMP two_byte_offset_standalone
two_byte_offset_short_standalone:
+ MOVL DX, SI
+ SHLL $0x02, SI
CMPL DX, $0x0c
JGE emit_copy_three_standalone
CMPL CX, $0x00000800
JGE emit_copy_three_standalone
- MOVB $0x01, SI
- LEAL -16(SI)(DX*4), DX
+ LEAL -15(SI), SI
MOVB CL, 1(AX)
SHRL $0x08, CX
SHLL $0x05, CX
- ORL CX, DX
- MOVB DL, (AX)
+ ORL CX, SI
+ MOVB SI, (AX)
ADDQ $0x02, BX
ADDQ $0x02, AX
JMP gen_emit_copy_end
emit_copy_three_standalone:
- MOVB $0x02, SI
- LEAL -4(SI)(DX*4), DX
- MOVB DL, (AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
MOVW CX, 1(AX)
ADDQ $0x03, BX
ADDQ $0x03, AX
@@ -17666,8 +18676,8 @@ four_bytes_loop_back_standalone_snappy:
four_bytes_remain_standalone_snappy:
TESTL DX, DX
JZ gen_emit_copy_end_snappy
- MOVB $0x03, SI
- LEAL -4(SI)(DX*4), DX
+ XORL SI, SI
+ LEAL -1(SI)(DX*4), DX
MOVB DL, (AX)
MOVL CX, 1(AX)
ADDQ $0x05, BX
@@ -17685,25 +18695,25 @@ two_byte_offset_standalone_snappy:
JMP two_byte_offset_standalone_snappy
two_byte_offset_short_standalone_snappy:
+ MOVL DX, SI
+ SHLL $0x02, SI
CMPL DX, $0x0c
JGE emit_copy_three_standalone_snappy
CMPL CX, $0x00000800
JGE emit_copy_three_standalone_snappy
- MOVB $0x01, SI
- LEAL -16(SI)(DX*4), DX
+ LEAL -15(SI), SI
MOVB CL, 1(AX)
SHRL $0x08, CX
SHLL $0x05, CX
- ORL CX, DX
- MOVB DL, (AX)
+ ORL CX, SI
+ MOVB SI, (AX)
ADDQ $0x02, BX
ADDQ $0x02, AX
JMP gen_emit_copy_end_snappy
emit_copy_three_standalone_snappy:
- MOVB $0x02, SI
- LEAL -4(SI)(DX*4), DX
- MOVB DL, (AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
MOVW CX, 1(AX)
ADDQ $0x03, BX
ADDQ $0x03, AX
@@ -17777,3 +18787,752 @@ matchlen_match1_standalone:
gen_match_len_end:
MOVQ SI, ret+48(FP)
RET
+
+// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+// Requires: SSE2
+TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
+ XORQ SI, SI
+ MOVQ dst_base+0(FP), AX
+ MOVQ dst_len+8(FP), CX
+ MOVQ src_base+24(FP), DX
+ MOVQ src_len+32(FP), BX
+ LEAQ (DX)(BX*1), BX
+ LEAQ -10(AX)(CX*1), CX
+ XORQ DI, DI
+
+lz4_s2_loop:
+ CMPQ DX, BX
+ JAE lz4_s2_corrupt
+ CMPQ AX, CX
+ JAE lz4_s2_dstfull
+ MOVBQZX (DX), R8
+ MOVQ R8, R9
+ MOVQ R8, R10
+ SHRQ $0x04, R9
+ ANDQ $0x0f, R10
+ CMPQ R8, $0xf0
+ JB lz4_s2_ll_end
+
+lz4_s2_ll_loop:
+ INCQ DX
+ CMPQ DX, BX
+ JAE lz4_s2_corrupt
+ MOVBQZX (DX), R8
+ ADDQ R8, R9
+ CMPQ R8, $0xff
+ JEQ lz4_s2_ll_loop
+
+lz4_s2_ll_end:
+ LEAQ (DX)(R9*1), R8
+ ADDQ $0x04, R10
+ CMPQ R8, BX
+ JAE lz4_s2_corrupt
+ INCQ DX
+ INCQ R8
+ TESTQ R9, R9
+ JZ lz4_s2_lits_done
+ LEAQ (AX)(R9*1), R11
+ CMPQ R11, CX
+ JAE lz4_s2_dstfull
+ ADDQ R9, SI
+ LEAL -1(R9), R11
+ CMPL R11, $0x3c
+ JLT one_byte_lz4_s2
+ CMPL R11, $0x00000100
+ JLT two_bytes_lz4_s2
+ CMPL R11, $0x00010000
+ JLT three_bytes_lz4_s2
+ CMPL R11, $0x01000000
+ JLT four_bytes_lz4_s2
+ MOVB $0xfc, (AX)
+ MOVL R11, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_lz4_s2
+
+four_bytes_lz4_s2:
+ MOVL R11, R12
+ SHRL $0x10, R12
+ MOVB $0xf8, (AX)
+ MOVW R11, 1(AX)
+ MOVB R12, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_lz4_s2
+
+three_bytes_lz4_s2:
+ MOVB $0xf4, (AX)
+ MOVW R11, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_lz4_s2
+
+two_bytes_lz4_s2:
+ MOVB $0xf0, (AX)
+ MOVB R11, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R11, $0x40
+ JL memmove_lz4_s2
+ JMP memmove_long_lz4_s2
+
+one_byte_lz4_s2:
+ SHLB $0x02, R11
+ MOVB R11, (AX)
+ ADDQ $0x01, AX
+
+memmove_lz4_s2:
+ LEAQ (AX)(R9*1), R11
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_lz4_s2_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_lz4_s2_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_lz4_s2_memmove_move_17through32
+ JMP emit_lit_memmove_lz4_s2_memmove_move_33through64
+
+emit_lit_memmove_lz4_s2_memmove_move_8:
+ MOVQ (DX), R12
+ MOVQ R12, (AX)
+ JMP memmove_end_copy_lz4_s2
+
+emit_lit_memmove_lz4_s2_memmove_move_8through16:
+ MOVQ (DX), R12
+ MOVQ -8(DX)(R9*1), DX
+ MOVQ R12, (AX)
+ MOVQ DX, -8(AX)(R9*1)
+ JMP memmove_end_copy_lz4_s2
+
+emit_lit_memmove_lz4_s2_memmove_move_17through32:
+ MOVOU (DX), X0
+ MOVOU -16(DX)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_lz4_s2
+
+emit_lit_memmove_lz4_s2_memmove_move_33through64:
+ MOVOU (DX), X0
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R9*1), X2
+ MOVOU -16(DX)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_lz4_s2:
+ MOVQ R11, AX
+ JMP lz4_s2_lits_emit_done
+
+memmove_long_lz4_s2:
+ LEAQ (AX)(R9*1), R11
+
+ // genMemMoveLong
+ MOVOU (DX), X0
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R9*1), X2
+ MOVOU -16(DX)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R12
+ ANDL $0x0000001f, R12
+ MOVQ $0x00000040, R14
+ SUBQ R12, R14
+ DECQ R13
+ JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
+ LEAQ -32(DX)(R14*1), R12
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_lz4_s2large_big_loop_back:
+ MOVOU (R12), X4
+ MOVOU 16(R12), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R12
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_lz4_s2large_big_loop_back
+
+emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32:
+ MOVOU -32(DX)(R14*1), X4
+ MOVOU -16(DX)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R11, AX
+
+lz4_s2_lits_emit_done:
+ MOVQ R8, DX
+
+lz4_s2_lits_done:
+ CMPQ DX, BX
+ JNE lz4_s2_match
+ CMPQ R10, $0x04
+ JEQ lz4_s2_done
+ JMP lz4_s2_corrupt
+
+lz4_s2_match:
+ LEAQ 2(DX), R8
+ CMPQ R8, BX
+ JAE lz4_s2_corrupt
+ MOVWQZX (DX), R9
+ MOVQ R8, DX
+ TESTQ R9, R9
+ JZ lz4_s2_corrupt
+ CMPQ R9, SI
+ JA lz4_s2_corrupt
+ CMPQ R10, $0x13
+ JNE lz4_s2_ml_done
+
+lz4_s2_ml_loop:
+ MOVBQZX (DX), R8
+ INCQ DX
+ ADDQ R8, R10
+ CMPQ DX, BX
+ JAE lz4_s2_corrupt
+ CMPQ R8, $0xff
+ JEQ lz4_s2_ml_loop
+
+lz4_s2_ml_done:
+ ADDQ R10, SI
+ CMPQ R9, DI
+ JNE lz4_s2_docopy
+
+ // emitRepeat
+emit_repeat_again_lz4_s2:
+ MOVL R10, R8
+ LEAL -4(R10), R10
+ CMPL R8, $0x08
+ JLE repeat_two_lz4_s2
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_lz4_s2
+ CMPL R9, $0x00000800
+ JLT repeat_two_offset_lz4_s2
+
+cant_repeat_two_offset_lz4_s2:
+ CMPL R10, $0x00000104
+ JLT repeat_three_lz4_s2
+ CMPL R10, $0x00010100
+ JLT repeat_four_lz4_s2
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_lz4_s2
+ LEAL -16842747(R10), R10
+ MOVL $0xfffb001d, (AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_lz4_s2
+
+repeat_five_lz4_s2:
+ LEAL -65536(R10), R10
+ MOVL R10, R9
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, R9
+ MOVB R9, 4(AX)
+ ADDQ $0x05, AX
+ JMP lz4_s2_loop
+
+repeat_four_lz4_s2:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP lz4_s2_loop
+
+repeat_three_lz4_s2:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP lz4_s2_loop
+
+repeat_two_lz4_s2:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+repeat_two_offset_lz4_s2:
+ XORQ R8, R8
+ LEAL 1(R8)(R10*4), R10
+ MOVB R9, 1(AX)
+ SARL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+lz4_s2_docopy:
+ MOVQ R9, DI
+
+ // emitCopy
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_lz4_s2
+ CMPL R9, $0x00000800
+ JAE long_offset_short_lz4_s2
+ MOVL $0x00000001, R8
+ LEAL 16(R8), R8
+ MOVB R9, 1(AX)
+ MOVL R9, R11
+ SHRL $0x08, R11
+ SHLL $0x05, R11
+ ORL R11, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R10
+
+ // emitRepeat
+ LEAL -4(R10), R10
+ JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
+
+emit_repeat_again_lz4_s2_emit_copy_short_2b:
+ MOVL R10, R8
+ LEAL -4(R10), R10
+ CMPL R8, $0x08
+ JLE repeat_two_lz4_s2_emit_copy_short_2b
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
+ CMPL R9, $0x00000800
+ JLT repeat_two_offset_lz4_s2_emit_copy_short_2b
+
+cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
+ CMPL R10, $0x00000104
+ JLT repeat_three_lz4_s2_emit_copy_short_2b
+ CMPL R10, $0x00010100
+ JLT repeat_four_lz4_s2_emit_copy_short_2b
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_lz4_s2_emit_copy_short_2b
+ LEAL -16842747(R10), R10
+ MOVL $0xfffb001d, (AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
+
+repeat_five_lz4_s2_emit_copy_short_2b:
+ LEAL -65536(R10), R10
+ MOVL R10, R9
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, R9
+ MOVB R9, 4(AX)
+ ADDQ $0x05, AX
+ JMP lz4_s2_loop
+
+repeat_four_lz4_s2_emit_copy_short_2b:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP lz4_s2_loop
+
+repeat_three_lz4_s2_emit_copy_short_2b:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP lz4_s2_loop
+
+repeat_two_lz4_s2_emit_copy_short_2b:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+repeat_two_offset_lz4_s2_emit_copy_short_2b:
+ XORQ R8, R8
+ LEAL 1(R8)(R10*4), R10
+ MOVB R9, 1(AX)
+ SARL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+long_offset_short_lz4_s2:
+ MOVB $0xee, (AX)
+ MOVW R9, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+
+ // emitRepeat
+emit_repeat_again_lz4_s2_emit_copy_short:
+ MOVL R10, R8
+ LEAL -4(R10), R10
+ CMPL R8, $0x08
+ JLE repeat_two_lz4_s2_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_lz4_s2_emit_copy_short
+ CMPL R9, $0x00000800
+ JLT repeat_two_offset_lz4_s2_emit_copy_short
+
+cant_repeat_two_offset_lz4_s2_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_lz4_s2_emit_copy_short
+ CMPL R10, $0x00010100
+ JLT repeat_four_lz4_s2_emit_copy_short
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_lz4_s2_emit_copy_short
+ LEAL -16842747(R10), R10
+ MOVL $0xfffb001d, (AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_lz4_s2_emit_copy_short
+
+repeat_five_lz4_s2_emit_copy_short:
+ LEAL -65536(R10), R10
+ MOVL R10, R9
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, R9
+ MOVB R9, 4(AX)
+ ADDQ $0x05, AX
+ JMP lz4_s2_loop
+
+repeat_four_lz4_s2_emit_copy_short:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP lz4_s2_loop
+
+repeat_three_lz4_s2_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP lz4_s2_loop
+
+repeat_two_lz4_s2_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+repeat_two_offset_lz4_s2_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(R10*4), R10
+ MOVB R9, 1(AX)
+ SARL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+two_byte_offset_short_lz4_s2:
+ MOVL R10, R8
+ SHLL $0x02, R8
+ CMPL R10, $0x0c
+ JGE emit_copy_three_lz4_s2
+ CMPL R9, $0x00000800
+ JGE emit_copy_three_lz4_s2
+ LEAL -15(R8), R8
+ MOVB R9, 1(AX)
+ SHRL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+emit_copy_three_lz4_s2:
+ LEAL -2(R8), R8
+ MOVB R8, (AX)
+ MOVW R9, 1(AX)
+ ADDQ $0x03, AX
+ JMP lz4_s2_loop
+
+lz4_s2_done:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ SI, uncompressed+48(FP)
+ MOVQ AX, dstUsed+56(FP)
+ RET
+
+lz4_s2_corrupt:
+ XORQ AX, AX
+ LEAQ -1(AX), SI
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+lz4_s2_dstfull:
+ XORQ AX, AX
+ LEAQ -2(AX), SI
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+// Requires: SSE2
+//
+// Walks the LZ4 sequences in src and writes literal and copy elements to dst.
+//
+// Register use, as read from the code below:
+//   AX  dst write pointer
+//   CX  dst_base + dst_len - 10 (output limit used by the bound checks)
+//   DX  src read pointer
+//   BX  src_base + src_len (end of input)
+//   SI  running uncompressed size
+//   DI  token byte, later scratch / pointer past the literals
+//   R8  literal length, later the 16-bit match offset
+//   R9  match length (low token nibble + 4, plus extension bytes)
+//
+// Results:
+//   success:        uncompressed = SI, dstUsed = AX - dst_base
+//   corrupt input:  uncompressed = -1 (dstUsed is not written)
+//   dst too small:  uncompressed = -2 (dstUsed is not written)
+TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
+ XORQ SI, SI
+ MOVQ dst_base+0(FP), AX
+ MOVQ dst_len+8(FP), CX
+ MOVQ src_base+24(FP), DX
+ MOVQ src_len+32(FP), BX
+ LEAQ (DX)(BX*1), BX
+ LEAQ -10(AX)(CX*1), CX
+
+ // Main loop: one LZ4 sequence (token, literals, match) per iteration.
+lz4_snappy_loop:
+ CMPQ DX, BX
+ JAE lz4_snappy_corrupt
+ CMPQ AX, CX
+ JAE lz4_snappy_dstfull
+ // Split the token: high nibble -> R8 (literal length), low nibble -> R9.
+ MOVBQZX (DX), DI
+ MOVQ DI, R8
+ MOVQ DI, R9
+ SHRQ $0x04, R8
+ ANDQ $0x0f, R9
+ // A high nibble of 15 (token >= 0xf0) means extension bytes follow.
+ CMPQ DI, $0xf0
+ JB lz4_snappy_ll_end
+
+ // Add extension bytes to the literal length until one is not 0xff.
+lz4_snappy_ll_loop:
+ INCQ DX
+ CMPQ DX, BX
+ JAE lz4_snappy_corrupt
+ MOVBQZX (DX), DI
+ ADDQ DI, R8
+ CMPQ DI, $0xff
+ JEQ lz4_snappy_ll_loop
+
+lz4_snappy_ll_end:
+ // DX still points at the last header byte read. DI = DX + literal length
+ // must stay inside src; R9 becomes the real match length (nibble + 4).
+ LEAQ (DX)(R8*1), DI
+ ADDQ $0x04, R9
+ CMPQ DI, BX
+ JAE lz4_snappy_corrupt
+ // Step past the header byte: DX -> first literal, DI -> byte after literals.
+ INCQ DX
+ INCQ DI
+ TESTQ R8, R8
+ JZ lz4_snappy_lits_done
+ // The literal payload must end before the output limit.
+ LEAQ (AX)(R8*1), R10
+ CMPQ R10, CX
+ JAE lz4_snappy_dstfull
+ ADDQ R8, SI
+ // R10 = literal length - 1; its magnitude selects the header size.
+ LEAL -1(R8), R10
+ CMPL R10, $0x3c
+ JLT one_byte_lz4_snappy
+ CMPL R10, $0x00000100
+ JLT two_bytes_lz4_snappy
+ CMPL R10, $0x00010000
+ JLT three_bytes_lz4_snappy
+ CMPL R10, $0x01000000
+ JLT four_bytes_lz4_snappy
+ // Five byte header: tag 0xfc (63<<2) followed by a 32-bit length-1.
+ MOVB $0xfc, (AX)
+ MOVL R10, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_lz4_snappy
+
+ // Four byte header: tag 0xf8 (62<<2) followed by a 24-bit length-1.
+four_bytes_lz4_snappy:
+ MOVL R10, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW R10, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_lz4_snappy
+
+ // Three byte header: tag 0xf4 (61<<2) followed by a 16-bit length-1.
+three_bytes_lz4_snappy:
+ MOVB $0xf4, (AX)
+ MOVW R10, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_lz4_snappy
+
+ // Two byte header: tag 0xf0 (60<<2) followed by an 8-bit length-1.
+ // Lengths up to 64 (length-1 < 0x40) still use the short copy routine.
+two_bytes_lz4_snappy:
+ MOVB $0xf0, (AX)
+ MOVB R10, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R10, $0x40
+ JL memmove_lz4_snappy
+ JMP memmove_long_lz4_snappy
+
+ // One byte header: (length-1)<<2.
+one_byte_lz4_snappy:
+ SHLB $0x02, R10
+ MOVB R10, (AX)
+ ADDQ $0x01, AX
+
+ // Short literal copy (R8 <= 64 bytes). R10 = dst pointer after the copy.
+memmove_lz4_snappy:
+ LEAQ (AX)(R8*1), R10
+
+ // genMemMoveShort
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_lz4_snappy_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32
+ JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64
+
+ // Lengths <= 8: a single fixed 8-byte load/store, regardless of R8.
+emit_lit_memmove_lz4_snappy_memmove_move_8:
+ MOVQ (DX), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_lz4_snappy
+
+ // Lengths 9..16: first 8 and last 8 bytes (the two moves may overlap).
+ // DX is used as scratch here; it is reloaded from DI after the copy.
+emit_lit_memmove_lz4_snappy_memmove_move_8through16:
+ MOVQ (DX), R11
+ MOVQ -8(DX)(R8*1), DX
+ MOVQ R11, (AX)
+ MOVQ DX, -8(AX)(R8*1)
+ JMP memmove_end_copy_lz4_snappy
+
+ // Lengths 17..32: first 16 and last 16 bytes (may overlap).
+emit_lit_memmove_lz4_snappy_memmove_move_17through32:
+ MOVOU (DX), X0
+ MOVOU -16(DX)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_lz4_snappy
+
+ // Lengths 33..64: first 32 and last 32 bytes (may overlap).
+emit_lit_memmove_lz4_snappy_memmove_move_33through64:
+ MOVOU (DX), X0
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R8*1), X2
+ MOVOU -16(DX)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_lz4_snappy:
+ MOVQ R10, AX
+ JMP lz4_snappy_lits_emit_done
+
+ // Long literal copy. R10 = dst pointer after the copy.
+memmove_long_lz4_snappy:
+ LEAQ (AX)(R8*1), R10
+
+ // genMemMoveLong
+ // X0..X3 keep the first and last 32 source bytes; they are stored last,
+ // with unaligned moves, after the block loop below.
+ MOVOU (DX), X0
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R8*1), X2
+ MOVOU -16(DX)(R8*1), X3
+ // R12 = length / 32. R13 = 64 - (AX & 31), so AX+R13-32 is 32-byte
+ // aligned, which is what permits the MOVOA stores in the loops.
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13
+ // NOTE(review): DECQ does not modify CF, so the JA/JNA below combine the
+ // zero flag from DECQ with the carry left by the preceding SUBQ/ADDQ.
+ // Left exactly as generated; confirm against the code generator.
+ DECQ R12
+ JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
+ LEAQ -32(DX)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_lz4_snappylarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back
+
+ // Copy 32 bytes per iteration at offset R13-32 while R8 >= R13.
+emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32:
+ MOVOU -32(DX)(R13*1), X4
+ MOVOU -16(DX)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
+ JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
+ // Write the saved head and tail blocks.
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ R10, AX
+
+ // Literals written: move the src pointer past them.
+lz4_snappy_lits_emit_done:
+ MOVQ DI, DX
+
+ // At the end of input the block is valid only if the match nibble was 0
+ // (R9 == 4); otherwise the input is reported as corrupt.
+lz4_snappy_lits_done:
+ CMPQ DX, BX
+ JNE lz4_snappy_match
+ CMPQ R9, $0x04
+ JEQ lz4_snappy_done
+ JMP lz4_snappy_corrupt
+
+ // Read the 16-bit match offset into R8. The two offset bytes must be
+ // followed by at least one more input byte.
+lz4_snappy_match:
+ LEAQ 2(DX), DI
+ CMPQ DI, BX
+ JAE lz4_snappy_corrupt
+ MOVWQZX (DX), R8
+ MOVQ DI, DX
+ // Offset 0, or an offset beyond the data produced so far, is corrupt.
+ TESTQ R8, R8
+ JZ lz4_snappy_corrupt
+ CMPQ R8, SI
+ JA lz4_snappy_corrupt
+ // R9 == 0x13 (4 + 15) means match length extension bytes follow.
+ CMPQ R9, $0x13
+ JNE lz4_snappy_ml_done
+
+ // Add extension bytes to the match length until one is not 0xff.
+ // Input must not end directly after an extension byte.
+lz4_snappy_ml_loop:
+ MOVBQZX (DX), DI
+ INCQ DX
+ ADDQ DI, R9
+ CMPQ DX, BX
+ JAE lz4_snappy_corrupt
+ CMPQ DI, $0xff
+ JEQ lz4_snappy_ml_loop
+
+lz4_snappy_ml_done:
+ ADDQ R9, SI
+
+ // emitCopy
+ // While more than 64 bytes remain, emit a 3-byte element (tag 0xee plus
+ // the 16-bit offset) and reduce the remaining length by 60.
+two_byte_offset_lz4_s2:
+ CMPL R9, $0x40
+ JLE two_byte_offset_short_lz4_s2
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R9), R9
+ ADDQ $0x03, AX
+ // When the output limit is reached control goes back to the main loop,
+ // whose own AX/CX comparison then branches to lz4_snappy_dstfull.
+ CMPQ AX, CX
+ JAE lz4_snappy_loop
+ JMP two_byte_offset_lz4_s2
+
+ // Remaining length <= 64. DI = length << 2.
+ // Length >= 12 or offset >= 2048 takes the 3-byte form, else the 2-byte form.
+two_byte_offset_short_lz4_s2:
+ MOVL R9, DI
+ SHLL $0x02, DI
+ CMPL R9, $0x0c
+ JGE emit_copy_three_lz4_s2
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_lz4_s2
+ // 2-byte form: (length<<2)-15 == ((length-4)<<2)|1, OR-ed with
+ // (offset>>8)<<5; the low offset byte is stored at 1(AX).
+ LEAL -15(DI), DI
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, DI
+ MOVB DI, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_snappy_loop
+
+ // 3-byte form: (length<<2)-2 == ((length-1)<<2)|2, then the 16-bit offset.
+emit_copy_three_lz4_s2:
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP lz4_snappy_loop
+
+ // Success: dstUsed = AX - dst_base, uncompressed = SI.
+lz4_snappy_done:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ SI, uncompressed+48(FP)
+ MOVQ AX, dstUsed+56(FP)
+ RET
+
+ // Corrupt input: uncompressed = -1.
+lz4_snappy_corrupt:
+ XORQ AX, AX
+ LEAQ -1(AX), SI
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+ // Destination too small: uncompressed = -2.
+lz4_snappy_dstfull:
+ XORQ AX, AX
+ LEAQ -2(AX), SI
+ MOVQ SI, uncompressed+48(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/s2/lz4convert.go b/vendor/github.com/klauspost/compress/s2/lz4convert.go
new file mode 100644
index 00000000..46ed908e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/lz4convert.go
@@ -0,0 +1,585 @@
+// Copyright (c) 2022 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+)
+
+// LZ4Converter provides conversion from LZ4 blocks as defined here:
+// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md
+//
+// The struct has no fields, so a zero value is ready to use.
+type LZ4Converter struct {
+}
+
+// ErrDstTooSmall is returned when provided destination is too small.
+var ErrDstTooSmall = errors.New("s2: destination too small")
+
+// ConvertBlock will convert an LZ4 block and append it as an S2
+// block without block length to dst.
+// The uncompressed size is returned as well.
+// dst must have capacity to contain the entire compressed block.
+//
+// An empty src returns dst unchanged with size 0.
+// On any error the returned slice is nil and the size is 0. The error is
+// ErrCorrupt for malformed LZ4 input and ErrDstTooSmall when the capacity of
+// dst is insufficient.
+func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) {
+	if len(src) == 0 {
+		return dst, 0, nil
+	}
+	const debug = false
+	const inline = true
+	const lz4MinMatch = 4
+
+	// s is the read position in src, d the write position in dst.
+	s, d := 0, len(dst)
+	dst = dst[:cap(dst)]
+	if !debug && hasAmd64Asm {
+		res, sz := cvtLZ4BlockAsm(dst[d:], src)
+		if res < 0 {
+			// Negative results are error codes from the assembly.
+			const (
+				errCorrupt     = -1
+				errDstTooSmall = -2
+			)
+			switch res {
+			case errCorrupt:
+				return nil, 0, ErrCorrupt
+			case errDstTooSmall:
+				return nil, 0, ErrDstTooSmall
+			default:
+				return nil, 0, fmt.Errorf("unexpected result: %d", res)
+			}
+		}
+		if d+sz > len(dst) {
+			return nil, 0, ErrDstTooSmall
+		}
+		return dst[:d+sz], res, nil
+	}
+
+	// Keep 10 bytes of headroom at the end of dst.
+	dLimit := len(dst) - 10
+	var lastOffset uint16
+	var uncompressed int
+	if debug {
+		fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
+	}
+
+	for {
+		if s >= len(src) {
+			// Return nil like every other error path (and the assembly path).
+			return nil, 0, ErrCorrupt
+		}
+		// Read literal info
+		token := src[s]
+		ll := int(token >> 4)
+		ml := int(lz4MinMatch + (token & 0xf))
+
+		// If upper nibble is 15, literal length is extended
+		if token >= 0xf0 {
+			for {
+				s++
+				if s >= len(src) {
+					if debug {
+						fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src))
+					}
+					return nil, 0, ErrCorrupt
+				}
+				val := src[s]
+				ll += int(val)
+				if val != 255 {
+					break
+				}
+			}
+		}
+		// Skip past token
+		if s+ll >= len(src) {
+			if debug {
+				fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src))
+			}
+			return nil, 0, ErrCorrupt
+		}
+		s++
+		if ll > 0 {
+			if d+ll > dLimit {
+				return nil, 0, ErrDstTooSmall
+			}
+			if debug {
+				fmt.Printf("emit %d literals\n", ll)
+			}
+			d += emitLiteralGo(dst[d:], src[s:s+ll])
+			s += ll
+			uncompressed += ll
+		}
+
+		// Check if we are done...
+		if s == len(src) && ml == lz4MinMatch {
+			break
+		}
+		// 2 byte offset
+		if s >= len(src)-2 {
+			if debug {
+				fmt.Printf("s (%d) >= len(src)-2 (%d)\n", s, len(src)-2)
+			}
+			return nil, 0, ErrCorrupt
+		}
+		offset := binary.LittleEndian.Uint16(src[s:])
+		s += 2
+		if offset == 0 {
+			if debug {
+				fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
+			}
+			return nil, 0, ErrCorrupt
+		}
+		if int(offset) > uncompressed {
+			if debug {
+				fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
+			}
+			return nil, 0, ErrCorrupt
+		}
+
+		// A low nibble of 15 means the match length is extended.
+		if ml == lz4MinMatch+15 {
+			for {
+				if s >= len(src) {
+					if debug {
+						fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
+					}
+					return nil, 0, ErrCorrupt
+				}
+				val := src[s]
+				s++
+				ml += int(val)
+				if val != 255 {
+					// Input must not end directly after the match length.
+					if s >= len(src) {
+						if debug {
+							fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
+						}
+						return nil, 0, ErrCorrupt
+					}
+					break
+				}
+			}
+		}
+		if offset == lastOffset {
+			if debug {
+				fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset)
+			}
+			if !inline {
+				d += emitRepeat16(dst[d:], offset, ml)
+			} else {
+				// Manually inlined emitRepeat16. The loop body runs at most
+				// once; "for" is used so that break can leave it early.
+				length := ml
+				dst := dst[d:]
+				for len(dst) > 5 {
+					// Repeat offset, make length cheaper
+					length -= 4
+					if length <= 4 {
+						dst[0] = uint8(length)<<2 | tagCopy1
+						dst[1] = 0
+						d += 2
+						break
+					}
+					if length < 8 && offset < 2048 {
+						// Encode WITH offset
+						dst[1] = uint8(offset)
+						dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
+						d += 2
+						break
+					}
+					if length < (1<<8)+4 {
+						length -= 4
+						dst[2] = uint8(length)
+						dst[1] = 0
+						dst[0] = 5<<2 | tagCopy1
+						d += 3
+						break
+					}
+					if length < (1<<16)+(1<<8) {
+						length -= 1 << 8
+						dst[3] = uint8(length >> 8)
+						dst[2] = uint8(length >> 0)
+						dst[1] = 0
+						dst[0] = 6<<2 | tagCopy1
+						d += 4
+						break
+					}
+					const maxRepeat = (1 << 24) - 1
+					length -= 1 << 16
+					left := 0
+					if length > maxRepeat {
+						left = length - maxRepeat + 4
+						length = maxRepeat - 4
+					}
+					dst[4] = uint8(length >> 16)
+					dst[3] = uint8(length >> 8)
+					dst[2] = uint8(length >> 0)
+					dst[1] = 0
+					dst[0] = 7<<2 | tagCopy1
+					if left > 0 {
+						d += 5 + emitRepeat16(dst[5:], offset, left)
+						break
+					}
+					d += 5
+					break
+				}
+			}
+		} else {
+			if debug {
+				fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset)
+			}
+			if !inline {
+				d += emitCopy16(dst[d:], offset, ml)
+			} else {
+				// Manually inlined emitCopy16. The loop body runs at most
+				// once; "for" is used so that break can leave it early.
+				length := ml
+				dst := dst[d:]
+				for len(dst) > 5 {
+					// Offset no more than 2 bytes.
+					if length > 64 {
+						off := 3
+						if offset < 2048 {
+							// emit 8 bytes as tagCopy1, rest as repeats.
+							dst[1] = uint8(offset)
+							dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
+							length -= 8
+							off = 2
+						} else {
+							// Emit a length 60 copy, encoded as 3 bytes.
+							// Emit remaining as repeat value (minimum 4 bytes).
+							dst[2] = uint8(offset >> 8)
+							dst[1] = uint8(offset)
+							dst[0] = 59<<2 | tagCopy2
+							length -= 60
+						}
+						// Emit remaining as repeats, at least 4 bytes remain.
+						d += off + emitRepeat16(dst[off:], offset, length)
+						break
+					}
+					if length >= 12 || offset >= 2048 {
+						// Emit the remaining copy, encoded as 3 bytes.
+						dst[2] = uint8(offset >> 8)
+						dst[1] = uint8(offset)
+						dst[0] = uint8(length-1)<<2 | tagCopy2
+						d += 3
+						break
+					}
+					// Emit the remaining copy, encoded as 2 bytes.
+					dst[1] = uint8(offset)
+					dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+					d += 2
+					break
+				}
+			}
+			lastOffset = offset
+		}
+		uncompressed += ml
+		// If the inlined emitters above were skipped for lack of room,
+		// d is necessarily past dLimit and this check reports it.
+		if d > dLimit {
+			return nil, 0, ErrDstTooSmall
+		}
+	}
+
+	return dst[:d], uncompressed, nil
+}
+
+// ConvertBlockSnappy converts an LZ4 block to a Snappy block (without the
+// block length prefix) and appends the result to dst.
+// It returns the extended slice together with the uncompressed size.
+// The capacity of dst must be large enough for the whole converted block.
+//
+// An empty src returns dst unchanged with size 0. On error a nil slice and
+// either ErrCorrupt or ErrDstTooSmall is returned.
+func (l *LZ4Converter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) {
+	const (
+		debug       = false
+		lz4MinMatch = 4
+	)
+	if len(src) == 0 {
+		return dst, 0, nil
+	}
+
+	// in is the read cursor in src, out the write cursor in dst.
+	in, out := 0, len(dst)
+	dst = dst[:cap(dst)]
+
+	// Use assembly when possible
+	if hasAmd64Asm && !debug {
+		res, used := cvtLZ4BlockSnappyAsm(dst[out:], src)
+		switch {
+		case res >= 0:
+			if out+used > len(dst) {
+				return nil, 0, ErrDstTooSmall
+			}
+			return dst[:out+used], res, nil
+		case res == -1:
+			// Assembly reported corrupt input.
+			return nil, 0, ErrCorrupt
+		case res == -2:
+			// Assembly ran out of destination space.
+			return nil, 0, ErrDstTooSmall
+		default:
+			return nil, 0, fmt.Errorf("unexpected result: %d", res)
+		}
+	}
+
+	// Always keep 10 bytes of headroom at the end of dst.
+	outLimit := len(dst) - 10
+	total := 0
+	if debug {
+		fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
+	}
+
+	for {
+		if in >= len(src) {
+			return nil, 0, ErrCorrupt
+		}
+		// The token holds the literal length (high nibble) and the
+		// match length minus lz4MinMatch (low nibble).
+		token := src[in]
+		litLen := int(token >> 4)
+		matchLen := lz4MinMatch + int(token&0xf)
+
+		// A nibble value of 15 means extra length bytes follow,
+		// terminated by the first byte that is not 255.
+		if litLen == 15 {
+			for more := true; more; {
+				in++
+				if in >= len(src) {
+					if debug {
+						fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", in, len(src))
+					}
+					return nil, 0, ErrCorrupt
+				}
+				b := src[in]
+				litLen += int(b)
+				more = b == 255
+			}
+		}
+
+		// litEnd is the index of the first byte after the literals.
+		litEnd := in + 1 + litLen
+		if litEnd > len(src) {
+			if debug {
+				fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", in, litLen, len(src))
+			}
+			return nil, 0, ErrCorrupt
+		}
+		in++
+		if litLen > 0 {
+			if out+litLen > outLimit {
+				return nil, 0, ErrDstTooSmall
+			}
+			if debug {
+				fmt.Printf("emit %d literals\n", litLen)
+			}
+			out += emitLiteralGo(dst[out:], src[in:litEnd])
+			in = litEnd
+			total += litLen
+		}
+
+		// The final sequence has literals only and a zero match nibble.
+		if matchLen == lz4MinMatch && in == len(src) {
+			break
+		}
+
+		// The two offset bytes must be followed by at least one more byte.
+		if in+2 >= len(src) {
+			if debug {
+				fmt.Printf("s (%d) >= len(src)-2 (%d)", in, len(src)-2)
+			}
+			return nil, 0, ErrCorrupt
+		}
+		offset := binary.LittleEndian.Uint16(src[in:])
+		in += 2
+		if offset == 0 {
+			if debug {
+				fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", matchLen, len(src)-in)
+			}
+			return nil, 0, ErrCorrupt
+		}
+		if int(offset) > total {
+			if debug {
+				fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, total)
+			}
+			return nil, 0, ErrCorrupt
+		}
+
+		// Extended match length, same scheme as the literal length.
+		if matchLen == lz4MinMatch+15 {
+			for {
+				if in >= len(src) {
+					if debug {
+						fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", in, len(src))
+					}
+					return nil, 0, ErrCorrupt
+				}
+				b := src[in]
+				in++
+				matchLen += int(b)
+				if b == 255 {
+					continue
+				}
+				// Input must not end right after the match length.
+				if in >= len(src) {
+					if debug {
+						fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", in, len(src))
+					}
+					return nil, 0, ErrCorrupt
+				}
+				break
+			}
+		}
+		if debug {
+			fmt.Printf("emit copy, length: %d, offset: %d\n", matchLen, offset)
+		}
+
+		// Emit the match as copy elements; repeat codes are not used here.
+		// Offsets are at most two bytes.
+		for remaining := matchLen; remaining > 0; {
+			if out >= outLimit {
+				return nil, 0, ErrDstTooSmall
+			}
+			switch {
+			case remaining > 64:
+				// Length 64 copy in 3 bytes, then go around again.
+				dst[out+2] = uint8(offset >> 8)
+				dst[out+1] = uint8(offset)
+				dst[out+0] = 63<<2 | tagCopy2
+				remaining -= 64
+				out += 3
+				continue
+			case remaining >= 12 || offset >= 2048 || remaining < 4:
+				// Rest of the match in 3 bytes.
+				dst[out+2] = uint8(offset >> 8)
+				dst[out+1] = uint8(offset)
+				dst[out+0] = uint8(remaining-1)<<2 | tagCopy2
+				out += 3
+			default:
+				// Rest of the match in 2 bytes.
+				dst[out+1] = uint8(offset)
+				dst[out+0] = uint8(offset>>8)<<5 | uint8(remaining-4)<<2 | tagCopy1
+				out += 2
+			}
+			break
+		}
+		total += matchLen
+		if out > outLimit {
+			return nil, 0, ErrDstTooSmall
+		}
+	}
+
+	return dst[:out], total, nil
+}
+
+// emitRepeat16 encodes a repeat of the previous offset covering length bytes
+// into dst and returns how many bytes were written.
+//
+// length must be at least 4. Lengths too large for a single 5-byte element
+// are split, with the remainder emitted by a recursive call.
+// dst must be long enough to hold the encoded bytes.
+func emitRepeat16(dst []byte, offset uint16, length int) int {
+	const maxRepeat = (1 << 24) - 1
+
+	// Repeats store length-4, which makes short lengths cheaper.
+	n := length - 4
+	switch {
+	case n <= 4:
+		dst[0] = uint8(n)<<2 | tagCopy1
+		dst[1] = 0
+		return 2
+	case n < 8 && offset < 2048:
+		// Small offset: encode WITH the offset.
+		dst[1] = uint8(offset)
+		dst[0] = uint8(offset>>8)<<5 | uint8(n)<<2 | tagCopy1
+		return 2
+	case n < (1<<8)+4:
+		// One extra length byte.
+		dst[2] = uint8(n - 4)
+		dst[1] = 0
+		dst[0] = 5<<2 | tagCopy1
+		return 3
+	case n < (1<<16)+(1<<8):
+		// Two extra length bytes.
+		n -= 1 << 8
+		dst[3] = uint8(n >> 8)
+		dst[2] = uint8(n)
+		dst[1] = 0
+		dst[0] = 6<<2 | tagCopy1
+		return 4
+	}
+
+	// Three extra length bytes; anything beyond that is carried in rest.
+	n -= 1 << 16
+	rest := 0
+	if n > maxRepeat {
+		rest = n - maxRepeat + 4
+		n = maxRepeat - 4
+	}
+	dst[4] = uint8(n >> 16)
+	dst[3] = uint8(n >> 8)
+	dst[2] = uint8(n)
+	dst[1] = 0
+	dst[0] = 7<<2 | tagCopy1
+	if rest > 0 {
+		return 5 + emitRepeat16(dst[5:], offset, rest)
+	}
+	return 5
+}
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+//
+// dst is long enough to hold the encoded bytes
+// 1 <= offset && offset <= math.MaxUint16
+// 4 <= length && length <= math.MaxUint32
+func emitCopy16(dst []byte, offset uint16, length int) int {
+ // Offset no more than 2 bytes.
+ if length > 64 {
+ off := 3
+ if offset < 2048 {
+ // emit 8 bytes as tagCopy1, rest as repeats.
+ dst[1] = uint8(offset)
+ dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
+ length -= 8
+ off = 2
+ } else {
+ // Emit a length 60 copy, encoded as 3 bytes.
+ // Emit remaining as repeat value (minimum 4 bytes).
+ dst[2] = uint8(offset >> 8)
+ dst[1] = uint8(offset)
+ dst[0] = 59<<2 | tagCopy2
+ length -= 60
+ }
+ // Emit remaining as repeats, at least 4 bytes remain.
+ return off + emitRepeat16(dst[off:], offset, length)
+ }
+ if length >= 12 || offset >= 2048 {
+ // Emit the remaining copy, encoded as 3 bytes.
+ dst[2] = uint8(offset >> 8)
+ dst[1] = uint8(offset)
+ dst[0] = uint8(length-1)<<2 | tagCopy2
+ return 3
+ }
+ // Emit the remaining copy, encoded as 2 bytes.
+ dst[1] = uint8(offset)
+ dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+ return 2
+}
+
+// emitLiteralGo encodes lit as a literal chunk (header followed by the raw
+// bytes) into dst and returns how many bytes were written.
+//
+// It assumes that:
+//
+//	dst is long enough to hold the encoded bytes
+//	0 <= len(lit) && len(lit) <= math.MaxUint32
+func emitLiteralGo(dst, lit []byte) int {
+	if len(lit) == 0 {
+		return 0
+	}
+	n := uint(len(lit) - 1)
+
+	// Short literals carry length-1 directly in the tag byte.
+	if n < 60 {
+		dst[0] = uint8(n)<<2 | tagLiteral
+		return 1 + copy(dst[1:], lit)
+	}
+
+	// Otherwise length-1 follows the tag as 1 to 4 little-endian bytes.
+	extra := 1
+	for extra < 4 && n >= uint(1)<<(8*uint(extra)) {
+		extra++
+	}
+	// Highest index first, then the tag (60..63 selects the byte count).
+	for i := extra; i >= 1; i-- {
+		dst[i] = uint8(n >> (8 * uint(i-1)))
+	}
+	dst[0] = uint8(59+extra)<<2 | tagLiteral
+
+	hdr := extra + 1
+	return hdr + copy(dst[hdr:], lit)
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index beb7fa87..65b38abe 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -12,6 +12,8 @@ The `zstd` package is provided as open source software using a Go standard licen
Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors.
+For seekable zstd streams, see [this excellent package](https://github.com/SaveTheRbtz/zstd-seekable-format-go).
+
## Installation
Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index 7eed729b..2445bb4f 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -10,7 +10,6 @@ import (
"errors"
"fmt"
"io"
- "io/ioutil"
"os"
"path/filepath"
"sync"
@@ -83,8 +82,9 @@ type blockDec struct {
err error
- // Check against this crc
- checkCRC []byte
+ // Check against this crc, if hasCRC is true.
+ checkCRC uint32
+ hasCRC bool
// Frame to use for singlethreaded decoding.
// Should not be used by the decoder itself since parent may be another frame.
@@ -192,16 +192,14 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
}
// Read block data.
- if cap(b.dataStorage) < cSize {
+ if _, ok := br.(*byteBuf); !ok && cap(b.dataStorage) < cSize {
+ // byteBuf doesn't need a destination buffer.
if b.lowMem || cSize > maxCompressedBlockSize {
b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
} else {
b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
}
}
- if cap(b.dst) <= maxSize {
- b.dst = make([]byte, 0, maxSize+1)
- }
b.data, err = br.readBig(cSize, b.dataStorage)
if err != nil {
if debugDecoder {
@@ -210,6 +208,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
}
return err
}
+ if cap(b.dst) <= maxSize {
+ b.dst = make([]byte, 0, maxSize+1)
+ }
return nil
}
@@ -233,7 +234,7 @@ func (b *blockDec) decodeBuf(hist *history) error {
if b.lowMem {
b.dst = make([]byte, b.RLESize)
} else {
- b.dst = make([]byte, maxBlockSize)
+ b.dst = make([]byte, maxCompressedBlockSize)
}
}
b.dst = b.dst[:b.RLESize]
@@ -651,7 +652,7 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
buf.Write(in)
- ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
+ os.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
}
return nil
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index 4493baa7..176788f2 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -7,7 +7,6 @@ package zstd
import (
"fmt"
"io"
- "io/ioutil"
)
type byteBuffer interface {
@@ -23,7 +22,7 @@ type byteBuffer interface {
readByte() (byte, error)
// Skip n bytes.
- skipN(n int) error
+ skipN(n int64) error
}
// in-memory buffer
@@ -62,9 +61,12 @@ func (b *byteBuf) readByte() (byte, error) {
return r, nil
}
-func (b *byteBuf) skipN(n int) error {
+func (b *byteBuf) skipN(n int64) error {
bb := *b
- if len(bb) < n {
+ if n < 0 {
+ return fmt.Errorf("negative skip (%d) requested", n)
+ }
+ if int64(len(bb)) < n {
return io.ErrUnexpectedEOF
}
*b = bb[n:]
@@ -120,9 +122,9 @@ func (r *readerWrapper) readByte() (byte, error) {
return r.tmp[0], nil
}
-func (r *readerWrapper) skipN(n int) error {
- n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
- if n2 != int64(n) {
+func (r *readerWrapper) skipN(n int64) error {
+ n2, err := io.CopyN(io.Discard, r.r, n)
+ if n2 != n {
err = io.ErrUnexpectedEOF
}
return err
diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
index 5022e71c..f6a24097 100644
--- a/vendor/github.com/klauspost/compress/zstd/decodeheader.go
+++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
@@ -4,7 +4,6 @@
package zstd
import (
- "bytes"
"encoding/binary"
"errors"
"io"
@@ -102,8 +101,8 @@ func (h *Header) Decode(in []byte) error {
}
h.HeaderSize += 4
b, in := in[:4], in[4:]
- if !bytes.Equal(b, frameMagic) {
- if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
+ if string(b) != frameMagic {
+ if string(b[1:4]) != skippableFrameMagic || b[0]&0xf0 != 0x50 {
return ErrMagicMismatch
}
if len(in) < 4 {
@@ -153,7 +152,7 @@ func (h *Header) Decode(in []byte) error {
}
b, in = in[:size], in[size:]
h.HeaderSize += int(size)
- switch size {
+ switch len(b) {
case 1:
h.DictionaryID = uint32(b[0])
case 2:
@@ -183,7 +182,7 @@ func (h *Header) Decode(in []byte) error {
}
b, in = in[:fcsSize], in[fcsSize:]
h.HeaderSize += int(fcsSize)
- switch fcsSize {
+ switch len(b) {
case 1:
h.FrameContentSize = uint64(b[0])
case 2:
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 286c8f9d..7113e69e 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -5,7 +5,6 @@
package zstd
import (
- "bytes"
"context"
"encoding/binary"
"io"
@@ -35,13 +34,13 @@ type Decoder struct {
br readerWrapper
enabled bool
inFrame bool
+ dstBuf []byte
}
frame *frameDec
// Custom dictionaries.
- // Always uses copies.
- dicts map[uint32]dict
+ dicts map[uint32]*dict
// streamWg is the waitgroup for all streams
streamWg sync.WaitGroup
@@ -103,7 +102,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
}
// Transfer option dicts.
- d.dicts = make(map[uint32]dict, len(d.o.dicts))
+ d.dicts = make(map[uint32]*dict, len(d.o.dicts))
for _, dc := range d.o.dicts {
d.dicts[dc.id] = dc
}
@@ -187,21 +186,23 @@ func (d *Decoder) Reset(r io.Reader) error {
}
// If bytes buffer and < 5MB, do sync decoding anyway.
- if bb, ok := r.(byter); ok && bb.Len() < 5<<20 {
+ if bb, ok := r.(byter); ok && bb.Len() < d.o.decodeBufsBelow && !d.o.limitToCap {
bb2 := bb
if debugDecoder {
println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
}
b := bb2.Bytes()
var dst []byte
- if cap(d.current.b) > 0 {
- dst = d.current.b
+ if cap(d.syncStream.dstBuf) > 0 {
+ dst = d.syncStream.dstBuf[:0]
}
- dst, err := d.DecodeAll(b, dst[:0])
+ dst, err := d.DecodeAll(b, dst)
if err == nil {
err = io.EOF
}
+ // Save output buffer
+ d.syncStream.dstBuf = dst
d.current.b = dst
d.current.err = err
d.current.flushed = true
@@ -216,6 +217,7 @@ func (d *Decoder) Reset(r io.Reader) error {
d.current.err = nil
d.current.flushed = false
d.current.d = nil
+ d.syncStream.dstBuf = nil
// Ensure no-one else is still running...
d.streamWg.Wait()
@@ -312,6 +314,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
// Grab a block decoder and frame decoder.
block := <-d.decoders
frame := block.localFrame
+ initialSize := len(dst)
defer func() {
if debugDecoder {
printf("re-adding decoder: %p", block)
@@ -337,21 +340,26 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
return dst, err
}
- if frame.DictionaryID != nil {
- dict, ok := d.dicts[*frame.DictionaryID]
- if !ok {
- return nil, ErrUnknownDictionary
- }
- if debugDecoder {
- println("setting dict", frame.DictionaryID)
- }
- frame.history.setDict(&dict)
+ if err = d.setDict(frame); err != nil {
+ return nil, err
}
if frame.WindowSize > d.o.maxWindowSize {
+ if debugDecoder {
+ println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
+ }
return dst, ErrWindowSizeExceeded
}
if frame.FrameContentSize != fcsUnknown {
- if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+ if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)-initialSize) {
+ if debugDecoder {
+ println("decoder size exceeded; fcs:", frame.FrameContentSize, "> mcs:", d.o.maxDecodedSize-uint64(len(dst)-initialSize), "len:", len(dst))
+ }
+ return dst, ErrDecoderSizeExceeded
+ }
+ if d.o.limitToCap && frame.FrameContentSize > uint64(cap(dst)-len(dst)) {
+ if debugDecoder {
+ println("decoder size exceeded; fcs:", frame.FrameContentSize, "> (cap-len)", cap(dst)-len(dst))
+ }
return dst, ErrDecoderSizeExceeded
}
if cap(dst)-len(dst) < int(frame.FrameContentSize) {
@@ -361,7 +369,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
}
- if cap(dst) == 0 {
+ if cap(dst) == 0 && !d.o.limitToCap {
// Allocate len(input) * 2 by default if nothing is provided
// and we didn't get frame content size.
size := len(input) * 2
@@ -379,6 +387,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
if err != nil {
return dst, err
}
+ if uint64(len(dst)-initialSize) > d.o.maxDecodedSize {
+ return dst, ErrDecoderSizeExceeded
+ }
if len(frame.bBuf) == 0 {
if debugDecoder {
println("frame dbuf empty")
@@ -439,7 +450,11 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
}
- if !d.o.ignoreChecksum && len(next.b) > 0 {
+ if d.o.ignoreChecksum {
+ return true
+ }
+
+ if len(next.b) > 0 {
n, err := d.current.crc.Write(next.b)
if err == nil {
if n != len(next.b) {
@@ -447,18 +462,16 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
}
}
}
- if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 {
- got := d.current.crc.Sum64()
- var tmp [4]byte
- binary.LittleEndian.PutUint32(tmp[:], uint32(got))
- if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
+ if next.err == nil && next.d != nil && next.d.hasCRC {
+ got := uint32(d.current.crc.Sum64())
+ if got != next.d.checkCRC {
if debugDecoder {
- println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
+ printf("CRC Check Failed: %08x (got) != %08x (on stream)\n", got, next.d.checkCRC)
}
d.current.err = ErrCRCMismatch
} else {
if debugDecoder {
- println("CRC ok", tmp[:])
+ printf("CRC ok %08x\n", got)
}
}
}
@@ -474,18 +487,12 @@ func (d *Decoder) nextBlockSync() (ok bool) {
if !d.syncStream.inFrame {
d.frame.history.reset()
d.current.err = d.frame.reset(&d.syncStream.br)
+ if d.current.err == nil {
+ d.current.err = d.setDict(d.frame)
+ }
if d.current.err != nil {
return false
}
- if d.frame.DictionaryID != nil {
- dict, ok := d.dicts[*d.frame.DictionaryID]
- if !ok {
- d.current.err = ErrUnknownDictionary
- return false
- } else {
- d.frame.history.setDict(&dict)
- }
- }
if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
d.current.err = ErrDecoderSizeExceeded
return false
@@ -664,6 +671,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
}
+ hist.reset()
hist.decoders = block.async.newHist.decoders
hist.recentOffsets = block.async.newHist.recentOffsets
hist.windowSize = block.async.newHist.windowSize
@@ -695,6 +703,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
seqExecute <- block
}
close(seqExecute)
+ hist.reset()
}()
var wg sync.WaitGroup
@@ -718,6 +727,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("Async 2: new history")
}
+ hist.reset()
hist.windowSize = block.async.newHist.windowSize
hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
if block.async.newHist.dict != nil {
@@ -747,7 +757,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if block.lowMem {
block.dst = make([]byte, block.RLESize)
} else {
- block.dst = make([]byte, maxBlockSize)
+ block.dst = make([]byte, maxCompressedBlockSize)
}
}
block.dst = block.dst[:block.RLESize]
@@ -799,13 +809,14 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("decoder goroutines finished")
}
+ hist.reset()
}()
+ var hist history
decodeStream:
for {
- var hist history
var hasErr bool
-
+ hist.reset()
decodeBlock := func(block *blockDec) {
if hasErr {
if block != nil {
@@ -840,15 +851,14 @@ decodeStream:
if debugDecoder && err != nil {
println("Frame decoder returned", err)
}
- if err == nil && frame.DictionaryID != nil {
- dict, ok := d.dicts[*frame.DictionaryID]
- if !ok {
- err = ErrUnknownDictionary
- } else {
- frame.history.setDict(&dict)
- }
+ if err == nil {
+ err = d.setDict(frame)
}
if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
+ if debugDecoder {
+ println("decoder size exceeded, fws:", d.frame.WindowSize, "> mws:", d.o.maxWindowSize)
+ }
+
err = ErrDecoderSizeExceeded
}
if err != nil {
@@ -890,18 +900,22 @@ decodeStream:
println("next block returned error:", err)
}
dec.err = err
- dec.checkCRC = nil
+ dec.hasCRC = false
if dec.Last && frame.HasCheckSum && err == nil {
crc, err := frame.rawInput.readSmall(4)
- if err != nil {
+ if len(crc) < 4 {
+ if err == nil {
+ err = io.ErrUnexpectedEOF
+
+ }
println("CRC missing?", err)
dec.err = err
- }
- var tmp [4]byte
- copy(tmp[:], crc)
- dec.checkCRC = tmp[:]
- if debugDecoder {
- println("found crc to check:", dec.checkCRC)
+ } else {
+ dec.checkCRC = binary.LittleEndian.Uint32(crc)
+ dec.hasCRC = true
+ if debugDecoder {
+ printf("found crc to check: %08x\n", dec.checkCRC)
+ }
}
}
err = dec.err
@@ -917,5 +931,23 @@ decodeStream:
}
close(seqDecode)
wg.Wait()
+ hist.reset()
d.frame.history.b = frameHistCache
}
+
+func (d *Decoder) setDict(frame *frameDec) (err error) {
+ dict, ok := d.dicts[frame.DictionaryID]
+ if ok {
+ if debugDecoder {
+ println("setting dict", frame.DictionaryID)
+ }
+ frame.history.setDict(dict)
+ } else if frame.DictionaryID != 0 {
+ // A zero or missing dictionary id is ambiguous:
+ // either dictionary zero, or no dictionary. In particular,
+ // zstd --patch-from uses this id for the source file,
+ // so only return an error if the dictionary id is not zero.
+ err = ErrUnknownDictionary
+ }
+ return err
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index c70e6fa0..07a90dd7 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -6,6 +6,8 @@ package zstd
import (
"errors"
+ "fmt"
+ "math/bits"
"runtime"
)
@@ -14,20 +16,23 @@ type DOption func(*decoderOptions) error
// options retains accumulated state of multiple options.
type decoderOptions struct {
- lowMem bool
- concurrent int
- maxDecodedSize uint64
- maxWindowSize uint64
- dicts []dict
- ignoreChecksum bool
+ lowMem bool
+ concurrent int
+ maxDecodedSize uint64
+ maxWindowSize uint64
+ dicts []*dict
+ ignoreChecksum bool
+ limitToCap bool
+ decodeBufsBelow int
}
func (o *decoderOptions) setDefault() {
*o = decoderOptions{
// use less ram: true for now, but may change.
- lowMem: true,
- concurrent: runtime.GOMAXPROCS(0),
- maxWindowSize: MaxWindowSize,
+ lowMem: true,
+ concurrent: runtime.GOMAXPROCS(0),
+ maxWindowSize: MaxWindowSize,
+ decodeBufsBelow: 128 << 10,
}
if o.concurrent > 4 {
o.concurrent = 4
@@ -82,7 +87,13 @@ func WithDecoderMaxMemory(n uint64) DOption {
}
// WithDecoderDicts allows to register one or more dictionaries for the decoder.
-// If several dictionaries with the same ID is provided the last one will be used.
+//
+// Each slice in dict must be in the [dictionary format] produced by
+// "zstd --train" from the Zstandard reference implementation.
+//
+// If several dictionaries with the same ID are provided, the last one will be used.
+//
+// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithDecoderDicts(dicts ...[]byte) DOption {
return func(o *decoderOptions) error {
for _, b := range dicts {
@@ -90,12 +101,24 @@ func WithDecoderDicts(dicts ...[]byte) DOption {
if err != nil {
return err
}
- o.dicts = append(o.dicts, *d)
+ o.dicts = append(o.dicts, d)
}
return nil
}
}
+// WithDecoderDictRaw registers a dictionary that may be used by the decoder.
+// The slice content can be arbitrary data.
+func WithDecoderDictRaw(id uint32, content []byte) DOption {
+ return func(o *decoderOptions) error {
+ if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
+ return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
+ }
+ o.dicts = append(o.dicts, &dict{id: id, content: content, offsets: [3]int{1, 4, 8}})
+ return nil
+ }
+}
+
// WithDecoderMaxWindow allows to set a maximum window size for decodes.
// This allows rejecting packets that will cause big memory usage.
// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.
@@ -114,6 +137,29 @@ func WithDecoderMaxWindow(size uint64) DOption {
}
}
+// WithDecodeAllCapLimit will limit DecodeAll to decoding cap(dst)-len(dst) bytes,
+// or any size set in WithDecoderMaxMemory.
+// This can be used to limit decoding to a specific maximum output size.
+// Disabled by default.
+func WithDecodeAllCapLimit(b bool) DOption {
+ return func(o *decoderOptions) error {
+ o.limitToCap = b
+ return nil
+ }
+}
+
+// WithDecodeBuffersBelow will fully decode readers that have a
+// `Bytes() []byte` and `Len() int` interface similar to bytes.Buffer.
+// This typically uses fewer allocations but will have the full decompressed object in memory.
+// Note that WithDecodeAllCapLimit will disable this, as well as giving a size of 0 or less.
+// Default is 128KiB.
+func WithDecodeBuffersBelow(size int) DOption {
+ return func(o *decoderOptions) error {
+ o.decodeBufsBelow = size
+ return nil
+ }
+}
+
// IgnoreChecksum allows to forcibly ignore checksum checking.
func IgnoreChecksum(b bool) DOption {
return func(o *decoderOptions) error {
diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go
index a36ae83e..ca095145 100644
--- a/vendor/github.com/klauspost/compress/zstd/dict.go
+++ b/vendor/github.com/klauspost/compress/zstd/dict.go
@@ -1,7 +1,6 @@
package zstd
import (
- "bytes"
"encoding/binary"
"errors"
"fmt"
@@ -20,7 +19,10 @@ type dict struct {
content []byte
}
-var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
+const dictMagic = "\x37\xa4\x30\xec"
+
+// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB.
+const dictMaxLength = 1 << 31
// ID returns the dictionary id or 0 if d is nil.
func (d *dict) ID() uint32 {
@@ -30,14 +32,38 @@ func (d *dict) ID() uint32 {
return d.id
}
-// DictContentSize returns the dictionary content size or 0 if d is nil.
-func (d *dict) DictContentSize() int {
+// ContentSize returns the dictionary content size or 0 if d is nil.
+func (d *dict) ContentSize() int {
if d == nil {
return 0
}
return len(d.content)
}
+// Content returns the dictionary content.
+func (d *dict) Content() []byte {
+ if d == nil {
+ return nil
+ }
+ return d.content
+}
+
+// Offsets returns the initial offsets.
+func (d *dict) Offsets() [3]int {
+ if d == nil {
+ return [3]int{}
+ }
+ return d.offsets
+}
+
+// LitEncoder returns the literal encoder.
+func (d *dict) LitEncoder() *huff0.Scratch {
+ if d == nil {
+ return nil
+ }
+ return d.litEnc
+}
+
// Load a dictionary as described in
// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
func loadDict(b []byte) (*dict, error) {
@@ -50,7 +76,7 @@ func loadDict(b []byte) (*dict, error) {
ofDec: sequenceDec{fse: &fseDecoder{}},
mlDec: sequenceDec{fse: &fseDecoder{}},
}
- if !bytes.Equal(b[:4], dictMagic[:]) {
+ if string(b[:4]) != dictMagic {
return nil, ErrMagicMismatch
}
d.id = binary.LittleEndian.Uint32(b[4:8])
@@ -62,7 +88,7 @@ func loadDict(b []byte) (*dict, error) {
var err error
d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
if err != nil {
- return nil, err
+ return nil, fmt.Errorf("loading literal table: %w", err)
}
d.litEnc.Reuse = huff0.ReusePolicyMust
@@ -120,3 +146,16 @@ func loadDict(b []byte) (*dict, error) {
return &d, nil
}
+
+// InspectDictionary loads a zstd dictionary and provides functions to inspect the content.
+func InspectDictionary(b []byte) (interface {
+ ID() uint32
+ ContentSize() int
+ Content() []byte
+ Offsets() [3]int
+ LitEncoder() *huff0.Scratch
+}, error) {
+ initPredefined()
+ d, err := loadDict(b)
+ return d, err
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go
index 15ae8ee8..e008b992 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_base.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go
@@ -16,6 +16,7 @@ type fastBase struct {
cur int32
// maximum offset. Should be at least 2x block size.
maxMatchOff int32
+ bufferReset int32
hist []byte
crc *xxhash.Digest
tmp [8]byte
@@ -56,8 +57,8 @@ func (e *fastBase) Block() *blockEnc {
}
func (e *fastBase) addBlock(src []byte) int32 {
- if debugAsserts && e.cur > bufferReset {
- panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
+ if debugAsserts && e.cur > e.bufferReset {
+ panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, e.bufferReset))
}
// check if we have space already
if len(e.hist)+len(src) > cap(e.hist) {
@@ -126,24 +127,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
}
}
- a := src[s:]
- b := src[t:]
- b = b[:len(a)]
- end := int32((len(a) >> 3) << 3)
- for i := int32(0); i < end; i += 8 {
- if diff := load6432(a, i) ^ load6432(b, i); diff != 0 {
- return i + int32(bits.TrailingZeros64(diff)>>3)
- }
- }
-
- a = a[end:]
- b = b[end:]
- for i := range a {
- if a[i] != b[i] {
- return int32(i) + end
- }
- }
- return int32(len(a)) + end
+ return int32(matchLen(src[s:], src[t:]))
}
// Reset the encoding table.
@@ -165,13 +149,13 @@ func (e *fastBase) resetBase(d *dict, singleBlock bool) {
if singleBlock {
e.lowMem = true
}
- e.ensureHist(d.DictContentSize() + maxCompressedBlockSize)
+ e.ensureHist(d.ContentSize() + maxCompressedBlockSize)
e.lowMem = low
}
// We offset current position so everything will be out of reach.
// If above reset line, history will be purged.
- if e.cur < bufferReset {
+ if e.cur < e.bufferReset {
e.cur += e.maxMatchOff + int32(len(e.hist))
}
e.hist = e.hist[:0]
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go
index 96028ecd..830f5ba7 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_best.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go
@@ -32,6 +32,7 @@ type match struct {
length int32
rep int32
est int32
+ _ [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes
}
const highScore = 25000
@@ -84,14 +85,10 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
- for e.cur >= bufferReset {
+ for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
- for i := range e.table[:] {
- e.table[i] = prevEntry{}
- }
- for i := range e.longTable[:] {
- e.longTable[i] = prevEntry{}
- }
+ e.table = [bestShortTableSize]prevEntry{}
+ e.longTable = [bestLongTableSize]prevEntry{}
e.cur = e.maxMatchOff
break
}
@@ -192,8 +189,8 @@ encodeLoop:
panic("offset0 was 0")
}
- bestOf := func(a, b match) match {
- if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
+ bestOf := func(a, b *match) *match {
+ if a.est-b.est+(a.s-b.s)*bitsPerByte>>10 < 0 {
return a
}
return b
@@ -219,22 +216,26 @@ encodeLoop:
return m
}
- best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
- best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
- best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))
+ m1 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
+ m2 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
+ m3 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
+ m4 := matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)
+ best := bestOf(bestOf(&m1, &m2), bestOf(&m3, &m4))
if canRepeat && best.length < goodEnough {
cv32 := uint32(cv >> 8)
spp := s + 1
- best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
- best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
- best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
+ m1 := matchAt(spp-offset1, spp, cv32, 1)
+ m2 := matchAt(spp-offset2, spp, cv32, 2)
+ m3 := matchAt(spp-offset3, spp, cv32, 3)
+ best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
if best.length > 0 {
cv32 = uint32(cv >> 24)
spp += 2
- best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
- best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
- best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
+ m1 := matchAt(spp-offset1, spp, cv32, 1)
+ m2 := matchAt(spp-offset2, spp, cv32, 2)
+ m3 := matchAt(spp-offset3, spp, cv32, 3)
+ best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
}
}
// Load next and check...
@@ -261,26 +262,33 @@ encodeLoop:
candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
// Short at s+1
- best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
+ m1 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
// Long at s+1, s+2
- best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
- best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
- best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
- best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
+ m2 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
+ m3 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
+ m4 := matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
+ m5 := matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
+ best = bestOf(bestOf(bestOf(best, &m1), &m2), bestOf(bestOf(&m3, &m4), &m5))
if false {
// Short at s+3.
// Too often worse...
- best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1))
+ m := matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
+ best = bestOf(best, &m)
}
// See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match.
if sAt := best.s + best.length; sAt < sLimit {
nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
candidateEnd := e.longTable[nextHashL]
- if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
- bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
- if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 {
- bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1))
+ // Start check at a fixed offset to allow for a few mismatches.
+ // For this compression level 2 yields the best results.
+ const skipBeginning = 2
+ if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
+ m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
+ bestEnd := bestOf(best, &m)
+ if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
+ m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
+ bestEnd = bestOf(bestEnd, &m)
}
best = bestEnd
}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
index c769f694..8582f31a 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -62,14 +62,10 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
- for e.cur >= bufferReset {
+ for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
- for i := range e.table[:] {
- e.table[i] = tableEntry{}
- }
- for i := range e.longTable[:] {
- e.longTable[i] = prevEntry{}
- }
+ e.table = [betterShortTableSize]tableEntry{}
+ e.longTable = [betterLongTableSize]prevEntry{}
e.cur = e.maxMatchOff
break
}
@@ -416,15 +412,23 @@ encodeLoop:
// Try to find a better match by searching for a long match at the end of the current best match
if s+matched < sLimit {
+ // Allow some bytes at the beginning to mismatch.
+ // Sweet spot is around 3 bytes, but depends on input.
+ // The skipped bytes are tested in Extend backwards,
+ // and still picked up as part of the match if they match.
+ const skipBeginning = 3
+
nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
- cv := load3232(src, s)
+ s2 := s + skipBeginning
+ cv := load3232(src, s2)
candidateL := e.longTable[nextHashL]
- coffsetL := candidateL.offset - e.cur - matched
- if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+ coffsetL := candidateL.offset - e.cur - matched + skipBeginning
+ if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
// Found a long match, at least 4 bytes.
- matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+ matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
if matchedNext > matched {
t = coffsetL
+ s = s2
matched = matchedNext
if debugMatches {
println("long match at end-of-match")
@@ -434,12 +438,13 @@ encodeLoop:
// Check prev long...
if true {
- coffsetL = candidateL.prev - e.cur - matched
- if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+ coffsetL = candidateL.prev - e.cur - matched + skipBeginning
+ if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
// Found a long match, at least 4 bytes.
- matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+ matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
if matchedNext > matched {
t = coffsetL
+ s = s2
matched = matchedNext
if debugMatches {
println("prev long match at end-of-match")
@@ -578,7 +583,7 @@ func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
- for e.cur >= bufferReset {
+ for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index 7ff0c64f..7d425109 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -44,14 +44,10 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
- for e.cur >= bufferReset {
+ for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
- for i := range e.table[:] {
- e.table[i] = tableEntry{}
- }
- for i := range e.longTable[:] {
- e.longTable[i] = tableEntry{}
- }
+ e.table = [dFastShortTableSize]tableEntry{}
+ e.longTable = [dFastLongTableSize]tableEntry{}
e.cur = e.maxMatchOff
break
}
@@ -388,7 +384,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
- if e.cur >= bufferReset {
+ if e.cur >= e.bufferReset {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
@@ -685,7 +681,7 @@ encodeLoop:
}
// We do not store history, so we must offset e.cur to avoid false matches for next user.
- if e.cur < bufferReset {
+ if e.cur < e.bufferReset {
e.cur += int32(len(src))
}
}
@@ -700,7 +696,7 @@ func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
- for e.cur >= bufferReset {
+ for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
@@ -1103,7 +1099,8 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
}
if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
- copy(e.longTable[:], e.dictLongTable)
+ //copy(e.longTable[:], e.dictLongTable)
+ e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable)
for i := range e.longTableShardDirty {
e.longTableShardDirty[i] = false
}
@@ -1114,7 +1111,9 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
continue
}
- copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
+ // copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
+ *(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:])
+
e.longTableShardDirty[i] = false
}
}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index f51ab529..315b1a8f 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -43,7 +43,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
- for e.cur >= bufferReset {
+ for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
@@ -304,13 +304,13 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
minNonLiteralBlockSize = 1 + 1 + inputMargin
)
if debugEncoder {
- if len(src) > maxBlockSize {
+ if len(src) > maxCompressedBlockSize {
panic("src too big")
}
}
// Protect against e.cur wraparound.
- if e.cur >= bufferReset {
+ if e.cur >= e.bufferReset {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
@@ -538,7 +538,7 @@ encodeLoop:
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
// We do not store history, so we must offset e.cur to avoid false matches for next user.
- if e.cur < bufferReset {
+ if e.cur < e.bufferReset {
e.cur += int32(len(src))
}
}
@@ -555,11 +555,9 @@ func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
return
}
// Protect against e.cur wraparound.
- for e.cur >= bufferReset {
+ for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
- for i := range e.table[:] {
- e.table[i] = tableEntry{}
- }
+ e.table = [tableSize]tableEntry{}
e.cur = e.maxMatchOff
break
}
@@ -871,7 +869,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
const shardCnt = tableShardCnt
const shardSize = tableShardSize
if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
- copy(e.table[:], e.dictTable)
+ //copy(e.table[:], e.dictTable)
+ e.table = *(*[tableSize]tableEntry)(e.dictTable)
for i := range e.tableShardDirty {
e.tableShardDirty[i] = false
}
@@ -883,7 +882,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
continue
}
- copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+ //copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+ *(*[shardSize]tableEntry)(e.table[i*shardSize:]) = *(*[shardSize]tableEntry)(e.dictTable[i*shardSize:])
e.tableShardDirty[i] = false
}
e.allDirty = false
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index e6b1d01c..65c6c36d 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -8,6 +8,7 @@ import (
"crypto/rand"
"fmt"
"io"
+ "math"
rdebug "runtime/debug"
"sync"
@@ -528,8 +529,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
// If a non-single block is needed the encoder will reset again.
e.encoders <- enc
}()
- // Use single segments when above minimum window and below 1MB.
- single := len(src) < 1<<20 && len(src) > MinWindowSize
+ // Use single segments when above minimum window and below window size.
+ single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
if e.o.single != nil {
single = *e.o.single
}
@@ -639,3 +640,37 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
}
return dst
}
+
+// MaxEncodedSize returns the expected maximum
+// size of an encoded block or stream.
+func (e *Encoder) MaxEncodedSize(size int) int {
+ frameHeader := 4 + 2 // magic + frame header & window descriptor
+ if e.o.dict != nil {
+ frameHeader += 4
+ }
+ // Frame content size:
+ if size < 256 {
+ frameHeader++
+ } else if size < 65536+256 {
+ frameHeader += 2
+ } else if size < math.MaxInt32 {
+ frameHeader += 4
+ } else {
+ frameHeader += 8
+ }
+ // Final crc
+ if e.o.crc {
+ frameHeader += 4
+ }
+
+ // Max overhead is 3 bytes/block.
+ // There cannot be 0 blocks.
+ blocks := (size + e.o.blockSize) / e.o.blockSize
+
+ // Combine, add padding.
+ maxSz := frameHeader + 3*blocks + size
+ if e.o.pad > 1 {
+ maxSz += calcSkippableFrame(int64(maxSz), int64(e.o.pad))
+ }
+ return maxSz
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index 44d8dbd1..8e15be2f 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -3,6 +3,8 @@ package zstd
import (
"errors"
"fmt"
+ "math"
+ "math/bits"
"runtime"
"strings"
)
@@ -47,22 +49,22 @@ func (o encoderOptions) encoder() encoder {
switch o.level {
case SpeedFastest:
if o.dict != nil {
- return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+ return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
}
- return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+ return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedDefault:
if o.dict != nil {
- return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
+ return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}}
}
- return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+ return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
case SpeedBetterCompression:
if o.dict != nil {
- return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+ return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
}
- return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+ return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedBestCompression:
- return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+ return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
}
panic("unknown compression level")
}
@@ -283,7 +285,7 @@ func WithNoEntropyCompression(b bool) EOption {
// a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
// For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
// This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
-// If this is not specified, block encodes will automatically choose this based on the input size.
+// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
// This setting has no effect on streamed encodes.
func WithSingleSegment(b bool) EOption {
return func(o *encoderOptions) error {
@@ -304,7 +306,13 @@ func WithLowerEncoderMem(b bool) EOption {
}
// WithEncoderDict allows to register a dictionary that will be used for the encode.
+//
+// The slice dict must be in the [dictionary format] produced by
+// "zstd --train" from the Zstandard reference implementation.
+//
// The encoder *may* choose to use no dictionary instead for certain payloads.
+//
+// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithEncoderDict(dict []byte) EOption {
return func(o *encoderOptions) error {
d, err := loadDict(dict)
@@ -315,3 +323,17 @@ func WithEncoderDict(dict []byte) EOption {
return nil
}
}
+
+// WithEncoderDictRaw registers a dictionary that may be used by the encoder.
+//
+// The slice content may contain arbitrary data. It will be used as an initial
+// history.
+func WithEncoderDictRaw(id uint32, content []byte) EOption {
+ return func(o *encoderOptions) error {
+ if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
+ return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
+ }
+ o.dict = &dict{id: id, content: content, offsets: [3]int{1, 4, 8}}
+ return nil
+ }
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index fa0a633f..d8e8a05b 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -5,7 +5,7 @@
package zstd
import (
- "bytes"
+ "encoding/binary"
"encoding/hex"
"errors"
"io"
@@ -29,7 +29,7 @@ type frameDec struct {
FrameContentSize uint64
- DictionaryID *uint32
+ DictionaryID uint32
HasCheckSum bool
SingleSegment bool
}
@@ -43,9 +43,9 @@ const (
MaxWindowSize = 1 << 29
)
-var (
- frameMagic = []byte{0x28, 0xb5, 0x2f, 0xfd}
- skippableFrameMagic = []byte{0x2a, 0x4d, 0x18}
+const (
+ frameMagic = "\x28\xb5\x2f\xfd"
+ skippableFrameMagic = "\x2a\x4d\x18"
)
func newFrameDec(o decoderOptions) *frameDec {
@@ -89,9 +89,9 @@ func (d *frameDec) reset(br byteBuffer) error {
copy(signature[1:], b)
}
- if !bytes.Equal(signature[1:4], skippableFrameMagic) || signature[0]&0xf0 != 0x50 {
+ if string(signature[1:4]) != skippableFrameMagic || signature[0]&0xf0 != 0x50 {
if debugDecoder {
- println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString(skippableFrameMagic))
+ println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString([]byte(skippableFrameMagic)))
}
// Break if not skippable frame.
break
@@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
}
n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
println("Skipping frame with", n, "bytes.")
- err = br.skipN(int(n))
+ err = br.skipN(int64(n))
if err != nil {
if debugDecoder {
println("Reading discarded frame", err)
@@ -114,9 +114,9 @@ func (d *frameDec) reset(br byteBuffer) error {
return err
}
}
- if !bytes.Equal(signature[:], frameMagic) {
+ if string(signature[:]) != frameMagic {
if debugDecoder {
- println("Got magic numbers: ", signature, "want:", frameMagic)
+ println("Got magic numbers: ", signature, "want:", []byte(frameMagic))
}
return ErrMagicMismatch
}
@@ -155,7 +155,7 @@ func (d *frameDec) reset(br byteBuffer) error {
// Read Dictionary_ID
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
- d.DictionaryID = nil
+ d.DictionaryID = 0
if size := fhd & 3; size != 0 {
if size == 3 {
size = 4
@@ -167,7 +167,7 @@ func (d *frameDec) reset(br byteBuffer) error {
return err
}
var id uint32
- switch size {
+ switch len(b) {
case 1:
id = uint32(b[0])
case 2:
@@ -178,11 +178,7 @@ func (d *frameDec) reset(br byteBuffer) error {
if debugDecoder {
println("Dict size", size, "ID:", id)
}
- if id > 0 {
- // ID 0 means "sorry, no dictionary anyway".
- // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
- d.DictionaryID = &id
- }
+ d.DictionaryID = id
}
// Read Frame_Content_Size
@@ -204,7 +200,7 @@ func (d *frameDec) reset(br byteBuffer) error {
println("Reading Frame content", err)
return err
}
- switch fcsSize {
+ switch len(b) {
case 1:
d.FrameContentSize = uint64(b[0])
case 2:
@@ -231,20 +227,27 @@ func (d *frameDec) reset(br byteBuffer) error {
d.crc.Reset()
}
+ if d.WindowSize > d.o.maxWindowSize {
+ if debugDecoder {
+ printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ }
+ return ErrWindowSizeExceeded
+ }
+
if d.WindowSize == 0 && d.SingleSegment {
// We may not need window in this case.
d.WindowSize = d.FrameContentSize
if d.WindowSize < MinWindowSize {
d.WindowSize = MinWindowSize
}
- }
-
- if d.WindowSize > uint64(d.o.maxWindowSize) {
- if debugDecoder {
- printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ if d.WindowSize > d.o.maxDecodedSize {
+ if debugDecoder {
+ printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ }
+ return ErrDecoderSizeExceeded
}
- return ErrWindowSizeExceeded
}
+
// The minimum Window_Size is 1 KB.
if d.WindowSize < MinWindowSize {
if debugDecoder {
@@ -254,11 +257,16 @@ func (d *frameDec) reset(br byteBuffer) error {
}
d.history.windowSize = int(d.WindowSize)
if !d.o.lowMem || d.history.windowSize < maxBlockSize {
- // Alloc 2x window size if not low-mem, or very small window size.
+ // Alloc 2x window size if not low-mem, or window size below 2MB.
d.history.allocFrameBuffer = d.history.windowSize * 2
} else {
- // Alloc with one additional block
- d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
+ if d.o.lowMem {
+ // Alloc with 1MB extra.
+ d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize/2
+ } else {
+ // Alloc with 2MB extra.
+ d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
+ }
}
if debugDecoder {
@@ -293,7 +301,7 @@ func (d *frameDec) checkCRC() error {
}
// We can overwrite upper tmp now
- want, err := d.rawInput.readSmall(4)
+ buf, err := d.rawInput.readSmall(4)
if err != nil {
println("CRC missing?", err)
return err
@@ -303,22 +311,17 @@ func (d *frameDec) checkCRC() error {
return nil
}
- var tmp [4]byte
- got := d.crc.Sum64()
- // Flip to match file order.
- tmp[0] = byte(got >> 0)
- tmp[1] = byte(got >> 8)
- tmp[2] = byte(got >> 16)
- tmp[3] = byte(got >> 24)
+ want := binary.LittleEndian.Uint32(buf[:4])
+ got := uint32(d.crc.Sum64())
- if !bytes.Equal(tmp[:], want) {
+ if got != want {
if debugDecoder {
- println("CRC Check Failed:", tmp[:], "!=", want)
+ printf("CRC check failed: got %08x, want %08x\n", got, want)
}
return ErrCRCMismatch
}
if debugDecoder {
- println("CRC ok", tmp[:])
+ printf("CRC ok %08x\n", got)
}
return nil
}
@@ -336,7 +339,7 @@ func (d *frameDec) consumeCRC() error {
return nil
}
-// runDecoder will create a sync decoder that will decode a block of data.
+// runDecoder will run the decoder for the remainder of the frame.
func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
saved := d.history.b
@@ -346,12 +349,23 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
// Store input length, so we only check new data.
crcStart := len(dst)
d.history.decoders.maxSyncLen = 0
+ if d.o.limitToCap {
+ d.history.decoders.maxSyncLen = uint64(cap(dst) - len(dst))
+ }
if d.FrameContentSize != fcsUnknown {
- d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+ if !d.o.limitToCap || d.FrameContentSize+uint64(len(dst)) < d.history.decoders.maxSyncLen {
+ d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+ }
if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
+ if debugDecoder {
+ println("maxSyncLen:", d.history.decoders.maxSyncLen, "> maxDecodedSize:", d.o.maxDecodedSize)
+ }
return dst, ErrDecoderSizeExceeded
}
- if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
+ if debugDecoder {
+ println("maxSyncLen:", d.history.decoders.maxSyncLen)
+ }
+ if !d.o.limitToCap && uint64(cap(dst)) < d.history.decoders.maxSyncLen {
// Alloc for output
dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
copy(dst2, dst)
@@ -371,7 +385,13 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
if err != nil {
break
}
- if uint64(len(d.history.b)) > d.o.maxDecodedSize {
+ if uint64(len(d.history.b)-crcStart) > d.o.maxDecodedSize {
+ println("runDecoder: maxDecodedSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.o.maxDecodedSize)
+ err = ErrDecoderSizeExceeded
+ break
+ }
+ if d.o.limitToCap && len(d.history.b) > cap(dst) {
+ println("runDecoder: cap exceeded", uint64(len(d.history.b)), ">", cap(dst))
err = ErrDecoderSizeExceeded
break
}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
index e74df436..d04a829b 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
@@ -21,7 +21,8 @@ type buildDtableAsmContext struct {
// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
// Function returns non-zero exit code on error.
-// go:noescape
+//
+//go:noescape
func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
// please keep in sync with _generate/gen_fse.go
@@ -34,8 +35,8 @@ const (
// buildDtable will build the decoding table.
func (s *fseDecoder) buildDtable() error {
ctx := buildDtableAsmContext{
- stateTable: (*uint16)(&s.stateTable[0]),
- norm: (*int16)(&s.norm[0]),
+ stateTable: &s.stateTable[0],
+ norm: &s.norm[0],
dt: (*uint64)(&s.dt[0]),
}
code := buildDtable_asm(s, &ctx)
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
index da32b442..bcde3986 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
@@ -1,7 +1,6 @@
// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
TEXT ·buildDtable_asm(SB), $0-24
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index 28b40153..09164856 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -37,24 +37,21 @@ func (h *history) reset() {
h.ignoreBuffer = 0
h.error = false
h.recentOffsets = [3]int{1, 4, 8}
- if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
- fseDecoderPool.Put(f)
- }
- if f := h.decoders.offsets.fse; f != nil && !f.preDefined {
- fseDecoderPool.Put(f)
- }
- if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
- fseDecoderPool.Put(f)
- }
+ h.decoders.freeDecoders()
h.decoders = sequenceDecs{br: h.decoders.br}
+ h.freeHuffDecoder()
+ h.huffTree = nil
+ h.dict = nil
+ //printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
+}
+
+func (h *history) freeHuffDecoder() {
if h.huffTree != nil {
if h.dict == nil || h.dict.litEnc != h.huffTree {
huffDecoderPool.Put(h.huffTree)
+ h.huffTree = nil
}
}
- h.huffTree = nil
- h.dict = nil
- //printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
}
func (h *history) setDict(dict *dict) {
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
index 69aa3bb5..777290d4 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
@@ -2,12 +2,7 @@
VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package.
-
-[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
-[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash)
-
-xxhash is a Go implementation of the 64-bit
-[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
+xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a
high-quality hashing algorithm that is much faster than anything in the Go
standard library.
@@ -28,31 +23,49 @@ func (*Digest) WriteString(string) (int, error)
func (*Digest) Sum64() uint64
```
-This implementation provides a fast pure-Go implementation and an even faster
-assembly implementation for amd64.
+The package is written with optimized pure Go and also contains even faster
+assembly implementations for amd64 and arm64. If desired, the `purego` build tag
+opts into using the Go code even on those architectures.
+
+[xxHash]: http://cyan4973.github.io/xxHash/
+
+## Compatibility
+
+This package is in a module and the latest code is in version 2 of the module.
+You need a version of Go with at least "minimal module compatibility" to use
+github.com/cespare/xxhash/v2:
+
+* 1.9.7+ for Go 1.9
+* 1.10.3+ for Go 1.10
+* Go 1.11 or later
+
+I recommend using the latest release of Go.
## Benchmarks
Here are some quick benchmarks comparing the pure-Go and assembly
implementations of Sum64.
-| input size | purego | asm |
-| --- | --- | --- |
-| 5 B | 979.66 MB/s | 1291.17 MB/s |
-| 100 B | 7475.26 MB/s | 7973.40 MB/s |
-| 4 KB | 17573.46 MB/s | 17602.65 MB/s |
-| 10 MB | 17131.46 MB/s | 17142.16 MB/s |
+| input size | purego | asm |
+| ---------- | --------- | --------- |
+| 4 B | 1.3 GB/s | 1.2 GB/s |
+| 16 B | 2.9 GB/s | 3.5 GB/s |
+| 100 B | 6.9 GB/s | 8.1 GB/s |
+| 4 KB | 11.7 GB/s | 16.7 GB/s |
+| 10 MB | 12.0 GB/s | 17.3 GB/s |
-These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using
-the following commands under Go 1.11.2:
+These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C
+CPU using the following commands under Go 1.19.2:
```
-$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes'
-$ go test -benchtime 10s -bench '/xxhash,direct,bytes'
+benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$')
+benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$')
```
## Projects using this package
- [InfluxDB](https://github.com/influxdata/influxdb)
- [Prometheus](https://github.com/prometheus/prometheus)
+- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
- [FreeCache](https://github.com/coocood/freecache)
+- [FastCache](https://github.com/VictoriaMetrics/fastcache)
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
index 2c112a0a..fc40c820 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
@@ -18,19 +18,11 @@ const (
prime5 uint64 = 2870177450012600261
)
-// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
-// possible in the Go code is worth a small (but measurable) performance boost
-// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
-// convenience in the Go code in a few places where we need to intentionally
-// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
-// result overflows a uint64).
-var (
- prime1v = prime1
- prime2v = prime2
- prime3v = prime3
- prime4v = prime4
- prime5v = prime5
-)
+// Store the primes in an array as well.
+//
+// The consts are used when possible in Go code to avoid MOVs but we need a
+// contiguous array for the assembly code.
+var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
// Digest implements hash.Hash64.
type Digest struct {
@@ -52,10 +44,10 @@ func New() *Digest {
// Reset clears the Digest's state so that it can be reused.
func (d *Digest) Reset() {
- d.v1 = prime1v + prime2
+ d.v1 = primes[0] + prime2
d.v2 = prime2
d.v3 = 0
- d.v4 = -prime1v
+ d.v4 = -primes[0]
d.total = 0
d.n = 0
}
@@ -71,21 +63,23 @@ func (d *Digest) Write(b []byte) (n int, err error) {
n = len(b)
d.total += uint64(n)
+ memleft := d.mem[d.n&(len(d.mem)-1):]
+
if d.n+n < 32 {
// This new data doesn't even fill the current block.
- copy(d.mem[d.n:], b)
+ copy(memleft, b)
d.n += n
return
}
if d.n > 0 {
// Finish off the partial block.
- copy(d.mem[d.n:], b)
+ c := copy(memleft, b)
d.v1 = round(d.v1, u64(d.mem[0:8]))
d.v2 = round(d.v2, u64(d.mem[8:16]))
d.v3 = round(d.v3, u64(d.mem[16:24]))
d.v4 = round(d.v4, u64(d.mem[24:32]))
- b = b[32-d.n:]
+ b = b[c:]
d.n = 0
}
@@ -135,21 +129,20 @@ func (d *Digest) Sum64() uint64 {
h += d.total
- i, end := 0, d.n
- for ; i+8 <= end; i += 8 {
- k1 := round(0, u64(d.mem[i:i+8]))
+ b := d.mem[:d.n&(len(d.mem)-1)]
+ for ; len(b) >= 8; b = b[8:] {
+ k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
- if i+4 <= end {
- h ^= uint64(u32(d.mem[i:i+4])) * prime1
+ if len(b) >= 4 {
+ h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
- i += 4
+ b = b[4:]
}
- for i < end {
- h ^= uint64(d.mem[i]) * prime5
+ for ; len(b) > 0; b = b[1:] {
+ h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
- i++
}
h ^= h >> 33
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
index cea17856..ddb63aa9 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
@@ -1,3 +1,4 @@
+//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
@@ -5,212 +6,205 @@
#include "textflag.h"
-// Register allocation:
-// AX h
-// SI pointer to advance through b
-// DX n
-// BX loop end
-// R8 v1, k1
-// R9 v2
-// R10 v3
-// R11 v4
-// R12 tmp
-// R13 prime1v
-// R14 prime2v
-// DI prime4v
-
-// round reads from and advances the buffer pointer in SI.
-// It assumes that R13 has prime1v and R14 has prime2v.
-#define round(r) \
- MOVQ (SI), R12 \
- ADDQ $8, SI \
- IMULQ R14, R12 \
- ADDQ R12, r \
- ROLQ $31, r \
- IMULQ R13, r
-
-// mergeRound applies a merge round on the two registers acc and val.
-// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
-#define mergeRound(acc, val) \
- IMULQ R14, val \
- ROLQ $31, val \
- IMULQ R13, val \
- XORQ val, acc \
- IMULQ R13, acc \
- ADDQ DI, acc
+// Registers:
+#define h AX
+#define d AX
+#define p SI // pointer to advance through b
+#define n DX
+#define end BX // loop end
+#define v1 R8
+#define v2 R9
+#define v3 R10
+#define v4 R11
+#define x R12
+#define prime1 R13
+#define prime2 R14
+#define prime4 DI
+
+#define round(acc, x) \
+ IMULQ prime2, x \
+ ADDQ x, acc \
+ ROLQ $31, acc \
+ IMULQ prime1, acc
+
+// round0 performs the operation x = round(0, x).
+#define round0(x) \
+ IMULQ prime2, x \
+ ROLQ $31, x \
+ IMULQ prime1, x
+
+// mergeRound applies a merge round on the two registers acc and x.
+// It assumes that prime1, prime2, and prime4 have been loaded.
+#define mergeRound(acc, x) \
+ round0(x) \
+ XORQ x, acc \
+ IMULQ prime1, acc \
+ ADDQ prime4, acc
+
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that there is at least one block
+// to process.
+#define blockLoop() \
+loop: \
+ MOVQ +0(p), x \
+ round(v1, x) \
+ MOVQ +8(p), x \
+ round(v2, x) \
+ MOVQ +16(p), x \
+ round(v3, x) \
+ MOVQ +24(p), x \
+ round(v4, x) \
+ ADDQ $32, p \
+ CMPQ p, end \
+ JLE loop
// func Sum64(b []byte) uint64
-TEXT ·Sum64(SB), NOSPLIT, $0-32
+TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
// Load fixed primes.
- MOVQ ·prime1v(SB), R13
- MOVQ ·prime2v(SB), R14
- MOVQ ·prime4v(SB), DI
+ MOVQ ·primes+0(SB), prime1
+ MOVQ ·primes+8(SB), prime2
+ MOVQ ·primes+24(SB), prime4
// Load slice.
- MOVQ b_base+0(FP), SI
- MOVQ b_len+8(FP), DX
- LEAQ (SI)(DX*1), BX
+ MOVQ b_base+0(FP), p
+ MOVQ b_len+8(FP), n
+ LEAQ (p)(n*1), end
// The first loop limit will be len(b)-32.
- SUBQ $32, BX
+ SUBQ $32, end
// Check whether we have at least one block.
- CMPQ DX, $32
+ CMPQ n, $32
JLT noBlocks
// Set up initial state (v1, v2, v3, v4).
- MOVQ R13, R8
- ADDQ R14, R8
- MOVQ R14, R9
- XORQ R10, R10
- XORQ R11, R11
- SUBQ R13, R11
-
- // Loop until SI > BX.
-blockLoop:
- round(R8)
- round(R9)
- round(R10)
- round(R11)
-
- CMPQ SI, BX
- JLE blockLoop
-
- MOVQ R8, AX
- ROLQ $1, AX
- MOVQ R9, R12
- ROLQ $7, R12
- ADDQ R12, AX
- MOVQ R10, R12
- ROLQ $12, R12
- ADDQ R12, AX
- MOVQ R11, R12
- ROLQ $18, R12
- ADDQ R12, AX
-
- mergeRound(AX, R8)
- mergeRound(AX, R9)
- mergeRound(AX, R10)
- mergeRound(AX, R11)
+ MOVQ prime1, v1
+ ADDQ prime2, v1
+ MOVQ prime2, v2
+ XORQ v3, v3
+ XORQ v4, v4
+ SUBQ prime1, v4
+
+ blockLoop()
+
+ MOVQ v1, h
+ ROLQ $1, h
+ MOVQ v2, x
+ ROLQ $7, x
+ ADDQ x, h
+ MOVQ v3, x
+ ROLQ $12, x
+ ADDQ x, h
+ MOVQ v4, x
+ ROLQ $18, x
+ ADDQ x, h
+
+ mergeRound(h, v1)
+ mergeRound(h, v2)
+ mergeRound(h, v3)
+ mergeRound(h, v4)
JMP afterBlocks
noBlocks:
- MOVQ ·prime5v(SB), AX
+ MOVQ ·primes+32(SB), h
afterBlocks:
- ADDQ DX, AX
-
- // Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
- ADDQ $24, BX
-
- CMPQ SI, BX
- JG fourByte
-
-wordLoop:
- // Calculate k1.
- MOVQ (SI), R8
- ADDQ $8, SI
- IMULQ R14, R8
- ROLQ $31, R8
- IMULQ R13, R8
-
- XORQ R8, AX
- ROLQ $27, AX
- IMULQ R13, AX
- ADDQ DI, AX
-
- CMPQ SI, BX
- JLE wordLoop
-
-fourByte:
- ADDQ $4, BX
- CMPQ SI, BX
- JG singles
-
- MOVL (SI), R8
- ADDQ $4, SI
- IMULQ R13, R8
- XORQ R8, AX
-
- ROLQ $23, AX
- IMULQ R14, AX
- ADDQ ·prime3v(SB), AX
-
-singles:
- ADDQ $4, BX
- CMPQ SI, BX
+ ADDQ n, h
+
+ ADDQ $24, end
+ CMPQ p, end
+ JG try4
+
+loop8:
+ MOVQ (p), x
+ ADDQ $8, p
+ round0(x)
+ XORQ x, h
+ ROLQ $27, h
+ IMULQ prime1, h
+ ADDQ prime4, h
+
+ CMPQ p, end
+ JLE loop8
+
+try4:
+ ADDQ $4, end
+ CMPQ p, end
+ JG try1
+
+ MOVL (p), x
+ ADDQ $4, p
+ IMULQ prime1, x
+ XORQ x, h
+
+ ROLQ $23, h
+ IMULQ prime2, h
+ ADDQ ·primes+16(SB), h
+
+try1:
+ ADDQ $4, end
+ CMPQ p, end
JGE finalize
-singlesLoop:
- MOVBQZX (SI), R12
- ADDQ $1, SI
- IMULQ ·prime5v(SB), R12
- XORQ R12, AX
+loop1:
+ MOVBQZX (p), x
+ ADDQ $1, p
+ IMULQ ·primes+32(SB), x
+ XORQ x, h
+ ROLQ $11, h
+ IMULQ prime1, h
- ROLQ $11, AX
- IMULQ R13, AX
-
- CMPQ SI, BX
- JL singlesLoop
+ CMPQ p, end
+ JL loop1
finalize:
- MOVQ AX, R12
- SHRQ $33, R12
- XORQ R12, AX
- IMULQ R14, AX
- MOVQ AX, R12
- SHRQ $29, R12
- XORQ R12, AX
- IMULQ ·prime3v(SB), AX
- MOVQ AX, R12
- SHRQ $32, R12
- XORQ R12, AX
-
- MOVQ AX, ret+24(FP)
+ MOVQ h, x
+ SHRQ $33, x
+ XORQ x, h
+ IMULQ prime2, h
+ MOVQ h, x
+ SHRQ $29, x
+ XORQ x, h
+ IMULQ ·primes+16(SB), h
+ MOVQ h, x
+ SHRQ $32, x
+ XORQ x, h
+
+ MOVQ h, ret+24(FP)
RET
-// writeBlocks uses the same registers as above except that it uses AX to store
-// the d pointer.
-
// func writeBlocks(d *Digest, b []byte) int
-TEXT ·writeBlocks(SB), NOSPLIT, $0-40
+TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// Load fixed primes needed for round.
- MOVQ ·prime1v(SB), R13
- MOVQ ·prime2v(SB), R14
+ MOVQ ·primes+0(SB), prime1
+ MOVQ ·primes+8(SB), prime2
// Load slice.
- MOVQ b_base+8(FP), SI
- MOVQ b_len+16(FP), DX
- LEAQ (SI)(DX*1), BX
- SUBQ $32, BX
+ MOVQ b_base+8(FP), p
+ MOVQ b_len+16(FP), n
+ LEAQ (p)(n*1), end
+ SUBQ $32, end
// Load vN from d.
- MOVQ d+0(FP), AX
- MOVQ 0(AX), R8 // v1
- MOVQ 8(AX), R9 // v2
- MOVQ 16(AX), R10 // v3
- MOVQ 24(AX), R11 // v4
+ MOVQ s+0(FP), d
+ MOVQ 0(d), v1
+ MOVQ 8(d), v2
+ MOVQ 16(d), v3
+ MOVQ 24(d), v4
// We don't need to check the loop condition here; this function is
// always called with at least one block of data to process.
-blockLoop:
- round(R8)
- round(R9)
- round(R10)
- round(R11)
-
- CMPQ SI, BX
- JLE blockLoop
+ blockLoop()
// Copy vN back to d.
- MOVQ R8, 0(AX)
- MOVQ R9, 8(AX)
- MOVQ R10, 16(AX)
- MOVQ R11, 24(AX)
-
- // The number of bytes written is SI minus the old base pointer.
- SUBQ b_base+8(FP), SI
- MOVQ SI, ret+32(FP)
+ MOVQ v1, 0(d)
+ MOVQ v2, 8(d)
+ MOVQ v3, 16(d)
+ MOVQ v4, 24(d)
+
+ // The number of bytes written is p minus the old base pointer.
+ SUBQ b_base+8(FP), p
+ MOVQ p, ret+32(FP)
RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
index 4d64a17d..17901e08 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
@@ -1,13 +1,17 @@
-// +build gc,!purego,!noasm
+//go:build !appengine && gc && !purego && !noasm
+// +build !appengine
+// +build gc
+// +build !purego
+// +build !noasm
#include "textflag.h"
-// Register allocation.
+// Registers:
#define digest R1
-#define h R2 // Return value.
-#define p R3 // Input pointer.
-#define len R4
-#define nblocks R5 // len / 32.
+#define h R2 // return value
+#define p R3 // input pointer
+#define n R4 // input length
+#define nblocks R5 // n / 32
#define prime1 R7
#define prime2 R8
#define prime3 R9
@@ -25,60 +29,52 @@
#define round(acc, x) \
MADD prime2, acc, x, acc \
ROR $64-31, acc \
- MUL prime1, acc \
+ MUL prime1, acc
-// x = round(0, x).
+// round0 performs the operation x = round(0, x).
#define round0(x) \
MUL prime2, x \
ROR $64-31, x \
- MUL prime1, x \
-
-#define mergeRound(x) \
- round0(x) \
- EOR x, h \
- MADD h, prime4, prime1, h \
-
-// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
-#define blocksLoop() \
- LSR $5, len, nblocks \
- PCALIGN $16 \
- loop: \
- LDP.P 32(p), (x1, x2) \
- round(v1, x1) \
- LDP -16(p), (x3, x4) \
- round(v2, x2) \
- SUB $1, nblocks \
- round(v3, x3) \
- round(v4, x4) \
- CBNZ nblocks, loop \
-
-// The primes are repeated here to ensure that they're stored
-// in a contiguous array, so we can load them with LDP.
-DATA primes<> +0(SB)/8, $11400714785074694791
-DATA primes<> +8(SB)/8, $14029467366897019727
-DATA primes<>+16(SB)/8, $1609587929392839161
-DATA primes<>+24(SB)/8, $9650029242287828579
-DATA primes<>+32(SB)/8, $2870177450012600261
-GLOBL primes<>(SB), NOPTR+RODATA, $40
+ MUL prime1, x
+
+#define mergeRound(acc, x) \
+ round0(x) \
+ EOR x, acc \
+ MADD acc, prime4, prime1, acc
+
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that n >= 32.
+#define blockLoop() \
+ LSR $5, n, nblocks \
+ PCALIGN $16 \
+ loop: \
+ LDP.P 16(p), (x1, x2) \
+ LDP.P 16(p), (x3, x4) \
+ round(v1, x1) \
+ round(v2, x2) \
+ round(v3, x3) \
+ round(v4, x4) \
+ SUB $1, nblocks \
+ CBNZ nblocks, loop
// func Sum64(b []byte) uint64
-TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
- LDP b_base+0(FP), (p, len)
+TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
+ LDP b_base+0(FP), (p, n)
- LDP primes<> +0(SB), (prime1, prime2)
- LDP primes<>+16(SB), (prime3, prime4)
- MOVD primes<>+32(SB), prime5
+ LDP ·primes+0(SB), (prime1, prime2)
+ LDP ·primes+16(SB), (prime3, prime4)
+ MOVD ·primes+32(SB), prime5
- CMP $32, len
- CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
- BLO afterLoop
+ CMP $32, n
+ CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
+ BLT afterLoop
ADD prime1, prime2, v1
MOVD prime2, v2
MOVD $0, v3
NEG prime1, v4
- blocksLoop()
+ blockLoop()
ROR $64-1, v1, x1
ROR $64-7, v2, x2
@@ -88,71 +84,75 @@ TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
ADD x3, x4
ADD x2, x4, h
- mergeRound(v1)
- mergeRound(v2)
- mergeRound(v3)
- mergeRound(v4)
+ mergeRound(h, v1)
+ mergeRound(h, v2)
+ mergeRound(h, v3)
+ mergeRound(h, v4)
afterLoop:
- ADD len, h
+ ADD n, h
- TBZ $4, len, try8
+ TBZ $4, n, try8
LDP.P 16(p), (x1, x2)
round0(x1)
+
+ // NOTE: here and below, sequencing the EOR after the ROR (using a
+ // rotated register) is worth a small but measurable speedup for small
+ // inputs.
ROR $64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
round0(x2)
ROR $64-27, h
- EOR x2 @> 64-27, h
+ EOR x2 @> 64-27, h, h
MADD h, prime4, prime1, h
try8:
- TBZ $3, len, try4
+ TBZ $3, n, try4
MOVD.P 8(p), x1
round0(x1)
ROR $64-27, h
- EOR x1 @> 64-27, h
+ EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
try4:
- TBZ $2, len, try2
+ TBZ $2, n, try2
MOVWU.P 4(p), x2
MUL prime1, x2
ROR $64-23, h
- EOR x2 @> 64-23, h
+ EOR x2 @> 64-23, h, h
MADD h, prime3, prime2, h
try2:
- TBZ $1, len, try1
+ TBZ $1, n, try1
MOVHU.P 2(p), x3
AND $255, x3, x1
LSR $8, x3, x2
MUL prime5, x1
ROR $64-11, h
- EOR x1 @> 64-11, h
+ EOR x1 @> 64-11, h, h
MUL prime1, h
MUL prime5, x2
ROR $64-11, h
- EOR x2 @> 64-11, h
+ EOR x2 @> 64-11, h, h
MUL prime1, h
try1:
- TBZ $0, len, end
+ TBZ $0, n, finalize
MOVBU (p), x4
MUL prime5, x4
ROR $64-11, h
- EOR x4 @> 64-11, h
+ EOR x4 @> 64-11, h, h
MUL prime1, h
-end:
+finalize:
EOR h >> 33, h
MUL prime2, h
EOR h >> 29, h
@@ -163,24 +163,22 @@ end:
RET
// func writeBlocks(d *Digest, b []byte) int
-//
-// Assumes len(b) >= 32.
-TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
- LDP primes<>(SB), (prime1, prime2)
+TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
+ LDP ·primes+0(SB), (prime1, prime2)
// Load state. Assume v[1-4] are stored contiguously.
MOVD d+0(FP), digest
LDP 0(digest), (v1, v2)
LDP 16(digest), (v3, v4)
- LDP b_base+8(FP), (p, len)
+ LDP b_base+8(FP), (p, n)
- blocksLoop()
+ blockLoop()
// Store updated state.
STP (v1, v2), 0(digest)
STP (v3, v4), 16(digest)
- BIC $31, len
- MOVD len, ret+32(FP)
+ BIC $31, n
+ MOVD n, ret+32(FP)
RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
index 1a1fac9c..d4221edf 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
@@ -13,4 +13,4 @@ package xxhash
func Sum64(b []byte) uint64
//go:noescape
-func writeBlocks(d *Digest, b []byte) int
+func writeBlocks(s *Digest, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
index 209cb4a9..0be16cef 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
@@ -15,10 +15,10 @@ func Sum64(b []byte) uint64 {
var h uint64
if n >= 32 {
- v1 := prime1v + prime2
+ v1 := primes[0] + prime2
v2 := prime2
v3 := uint64(0)
- v4 := -prime1v
+ v4 := -primes[0]
for len(b) >= 32 {
v1 = round(v1, u64(b[0:8:len(b)]))
v2 = round(v2, u64(b[8:16:len(b)]))
@@ -37,19 +37,18 @@ func Sum64(b []byte) uint64 {
h += uint64(n)
- i, end := 0, len(b)
- for ; i+8 <= end; i += 8 {
- k1 := round(0, u64(b[i:i+8:len(b)]))
+ for ; len(b) >= 8; b = b[8:] {
+ k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
- if i+4 <= end {
- h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
+ if len(b) >= 4 {
+ h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
- i += 4
+ b = b[4:]
}
- for ; i < end; i++ {
- h ^= uint64(b[i]) * prime5
+ for ; len(b) > 0; b = b[1:] {
+ h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index df044720..f833d154 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -99,6 +99,21 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
return nil
}
+func (s *sequenceDecs) freeDecoders() {
+ if f := s.litLengths.fse; f != nil && !f.preDefined {
+ fseDecoderPool.Put(f)
+ s.litLengths.fse = nil
+ }
+ if f := s.offsets.fse; f != nil && !f.preDefined {
+ fseDecoderPool.Put(f)
+ s.offsets.fse = nil
+ }
+ if f := s.matchLengths.fse; f != nil && !f.preDefined {
+ fseDecoderPool.Put(f)
+ s.matchLengths.fse = nil
+ }
+}
+
// execute will execute the decoded sequence with the provided history.
// The sequence must be evaluated before being sent.
func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
@@ -299,7 +314,10 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
size := ll + ml + len(out)
if size-startSize > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+ if size-startSize == 424242 {
+ panic("here")
+ }
+ return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
if size > cap(out) {
// Not enough size, which can happen under high volume block streaming conditions
@@ -411,7 +429,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
// Check if space for literals
if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
+ return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
// Add final literals
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
index 847b322a..191384ad 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@@ -32,18 +32,22 @@ type decodeSyncAsmContext struct {
// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
+//
//go:noescape
func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
+//
//go:noescape
func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer.
+//
//go:noescape
func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer.
+//
//go:noescape
func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
@@ -55,16 +59,22 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
return false, nil
}
- useSafe := false
- if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
- useSafe = true
- }
- if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
- useSafe = true
- }
- if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
- useSafe = true
- }
+
+ // FIXME: Using unsafe memory copies leads to rare, random crashes
+ // with fuzz testing. It is therefore disabled for now.
+ const useSafe = true
+ /*
+ useSafe := false
+ if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
+ useSafe = true
+ }
+ if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
+ useSafe = true
+ }
+ if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+ useSafe = true
+ }
+ */
br := s.br
@@ -129,7 +139,7 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if debugDecoder {
println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
}
- return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+ return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
default:
return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
@@ -137,7 +147,8 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
- return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
+
}
err := br.close()
if err != nil {
@@ -195,20 +206,24 @@ const errorNotEnoughSpace = 5
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
+//
//go:noescape
func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
+//
//go:noescape
func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//
//go:noescape
func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//
//go:noescape
func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
@@ -275,7 +290,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {
@@ -302,10 +317,12 @@ type executeAsmContext struct {
// Returns false if a match offset is too big.
//
// Please refer to seqdec_generic.go for the reference implementation.
+//
//go:noescape
func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Same as above, but with safe memcopies
+//
//go:noescape
func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
index 71e64e06..b94993a0 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -1,7 +1,6 @@
// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
@@ -52,34 +51,46 @@ sequenceDecs_decode_amd64_fill_byte_by_byte:
sequenceDecs_decode_amd64_fill_end:
// Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, 16(R10)
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_of_update_zero:
+ MOVQ AX, 16(R10)
// Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, 8(R10)
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_ml_update_zero:
+ MOVQ AX, 8(R10)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@@ -107,19 +118,25 @@ sequenceDecs_decode_amd64_fill_2_byte_by_byte:
sequenceDecs_decode_amd64_fill_2_end:
// Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, (R10)
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_ll_update_zero:
+ MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
@@ -198,7 +215,7 @@ sequenceDecs_decode_amd64_skip_update:
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
- JMP sequenceDecs_decode_amd64_adjust_end
+ JMP sequenceDecs_decode_amd64_after_adjust
sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
@@ -210,7 +227,7 @@ sequenceDecs_decode_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
MOVQ R11, CX
- JMP sequenceDecs_decode_amd64_adjust_end
+ JMP sequenceDecs_decode_amd64_after_adjust
sequenceDecs_decode_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
@@ -247,7 +264,7 @@ sequenceDecs_decode_amd64_adjust_temp_valid:
MOVQ AX, R11
MOVQ AX, CX
-sequenceDecs_decode_amd64_adjust_end:
+sequenceDecs_decode_amd64_after_adjust:
MOVQ CX, 16(R10)
// Check values
@@ -303,10 +320,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
- // Return with not enough output space error
- MOVQ $0x00000005, ret+24(FP)
- RET
-
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
@@ -356,49 +369,67 @@ sequenceDecs_decode_56_amd64_fill_byte_by_byte:
sequenceDecs_decode_56_amd64_fill_end:
// Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, 16(R10)
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_of_update_zero:
+ MOVQ AX, 16(R10)
// Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, 8(R10)
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_ml_update_zero:
+ MOVQ AX, 8(R10)
// Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, (R10)
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_ll_update_zero:
+ MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
@@ -477,7 +508,7 @@ sequenceDecs_decode_56_amd64_skip_update:
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
- JMP sequenceDecs_decode_56_amd64_adjust_end
+ JMP sequenceDecs_decode_56_amd64_after_adjust
sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
@@ -489,7 +520,7 @@ sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
MOVQ R11, CX
- JMP sequenceDecs_decode_56_amd64_adjust_end
+ JMP sequenceDecs_decode_56_amd64_after_adjust
sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
@@ -526,7 +557,7 @@ sequenceDecs_decode_56_amd64_adjust_temp_valid:
MOVQ AX, R11
MOVQ AX, CX
-sequenceDecs_decode_56_amd64_adjust_end:
+sequenceDecs_decode_56_amd64_after_adjust:
MOVQ CX, 16(R10)
// Check values
@@ -582,10 +613,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
- // Return with not enough output space error
- MOVQ $0x00000005, ret+24(FP)
- RET
-
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
@@ -757,7 +784,7 @@ sequenceDecs_decode_bmi2_skip_update:
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
- JMP sequenceDecs_decode_bmi2_adjust_end
+ JMP sequenceDecs_decode_bmi2_after_adjust
sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
@@ -769,7 +796,7 @@ sequenceDecs_decode_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
MOVQ R10, CX
- JMP sequenceDecs_decode_bmi2_adjust_end
+ JMP sequenceDecs_decode_bmi2_after_adjust
sequenceDecs_decode_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
@@ -806,7 +833,7 @@ sequenceDecs_decode_bmi2_adjust_temp_valid:
MOVQ R13, R10
MOVQ R13, CX
-sequenceDecs_decode_bmi2_adjust_end:
+sequenceDecs_decode_bmi2_after_adjust:
MOVQ CX, 16(R9)
// Check values
@@ -862,10 +889,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
- // Return with not enough output space error
- MOVQ $0x00000005, ret+24(FP)
- RET
-
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
@@ -1012,7 +1035,7 @@ sequenceDecs_decode_56_bmi2_skip_update:
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
- JMP sequenceDecs_decode_56_bmi2_adjust_end
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
@@ -1024,7 +1047,7 @@ sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
MOVQ R10, CX
- JMP sequenceDecs_decode_56_bmi2_adjust_end
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
@@ -1061,7 +1084,7 @@ sequenceDecs_decode_56_bmi2_adjust_temp_valid:
MOVQ R13, R10
MOVQ R13, CX
-sequenceDecs_decode_56_bmi2_adjust_end:
+sequenceDecs_decode_56_bmi2_after_adjust:
MOVQ CX, 16(R9)
// Check values
@@ -1117,10 +1140,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
- // Return with not enough output space error
- MOVQ $0x00000005, ret+24(FP)
- RET
-
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
@@ -1354,8 +1373,7 @@ loop_finished:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
- MOVQ 80(AX), CX
- SUBQ CX, SI
+ SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@@ -1367,8 +1385,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
- MOVQ 80(AX), CX
- SUBQ CX, SI
+ SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@@ -1712,8 +1729,7 @@ loop_finished:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
- MOVQ 80(AX), CX
- SUBQ CX, SI
+ SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@@ -1725,8 +1741,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
- MOVQ 80(AX), CX
- SUBQ CX, SI
+ SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@@ -1749,6 +1764,10 @@ TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
+ XORQ CX, CX
+ MOVQ CX, 8(SP)
+ MOVQ CX, 16(SP)
+ MOVQ CX, 24(SP)
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
@@ -1798,34 +1817,46 @@ sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
sequenceDecs_decodeSync_amd64_fill_end:
// Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 8(SP)
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_of_update_zero:
+ MOVQ AX, 8(SP)
// Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 16(SP)
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_ml_update_zero:
+ MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@@ -1853,19 +1884,25 @@ sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
sequenceDecs_decodeSync_amd64_fill_2_end:
// Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 24(SP)
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_ll_update_zero:
+ MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
@@ -1945,7 +1982,7 @@ sequenceDecs_decodeSync_amd64_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_amd64_adjust_end
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@@ -1957,7 +1994,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_amd64_adjust_end
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
MOVQ R13, AX
@@ -1966,8 +2003,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
- LEAQ 144(CX), R15
- ADDQ (R15)(AX*8), R14
+ ADDQ 144(CX)(AX*8), R14
JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
@@ -1983,7 +2019,7 @@ sequenceDecs_decodeSync_amd64_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
-sequenceDecs_decodeSync_amd64_adjust_end:
+sequenceDecs_decodeSync_amd64_after_adjust:
MOVQ R13, 8(SP)
// Check values
@@ -2280,6 +2316,10 @@ TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
+ XORQ R9, R9
+ MOVQ R9, 8(SP)
+ MOVQ R9, 16(SP)
+ MOVQ R9, 24(SP)
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
@@ -2452,7 +2492,7 @@ sequenceDecs_decodeSync_bmi2_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_bmi2_adjust_end
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@@ -2464,7 +2504,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_bmi2_adjust_end
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
@@ -2473,8 +2513,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
- LEAQ 144(CX), R15
- ADDQ (R15)(R12*8), R14
+ ADDQ 144(CX)(R12*8), R14
JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
@@ -2490,7 +2529,7 @@ sequenceDecs_decodeSync_bmi2_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
-sequenceDecs_decodeSync_bmi2_adjust_end:
+sequenceDecs_decodeSync_bmi2_after_adjust:
MOVQ R13, 8(SP)
// Check values
@@ -2787,6 +2826,10 @@ TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
+ XORQ CX, CX
+ MOVQ CX, 8(SP)
+ MOVQ CX, 16(SP)
+ MOVQ CX, 24(SP)
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
@@ -2836,34 +2879,46 @@ sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
sequenceDecs_decodeSync_safe_amd64_fill_end:
// Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 8(SP)
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_of_update_zero:
+ MOVQ AX, 8(SP)
// Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 16(SP)
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
+ MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@@ -2891,19 +2946,25 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
sequenceDecs_decodeSync_safe_amd64_fill_2_end:
// Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 24(SP)
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
+ MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
@@ -2983,7 +3044,7 @@ sequenceDecs_decodeSync_safe_amd64_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@@ -2995,7 +3056,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
MOVQ R13, AX
@@ -3004,8 +3065,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
- LEAQ 144(CX), R15
- ADDQ (R15)(AX*8), R14
+ ADDQ 144(CX)(AX*8), R14
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
@@ -3021,7 +3081,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
-sequenceDecs_decodeSync_safe_amd64_adjust_end:
+sequenceDecs_decodeSync_safe_amd64_after_adjust:
MOVQ R13, 8(SP)
// Check values
@@ -3420,6 +3480,10 @@ TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
+ XORQ R9, R9
+ MOVQ R9, 8(SP)
+ MOVQ R9, 16(SP)
+ MOVQ R9, 24(SP)
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
@@ -3592,7 +3656,7 @@ sequenceDecs_decodeSync_safe_bmi2_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
+ JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@@ -3604,7 +3668,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
+ JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
@@ -3613,8 +3677,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
- LEAQ 144(CX), R15
- ADDQ (R15)(R12*8), R14
+ ADDQ 144(CX)(R12*8), R14
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
@@ -3630,7 +3693,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
-sequenceDecs_decodeSync_safe_bmi2_adjust_end:
+sequenceDecs_decodeSync_safe_bmi2_after_adjust:
MOVQ R13, 8(SP)
// Check values
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
index c3452bc3..ac2a80d2 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
@@ -111,7 +111,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
s.seqSize += ll + ml
if s.seqSize > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
litRemain -= ll
if litRemain < 0 {
@@ -149,7 +149,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
s.seqSize += litRemain
if s.seqSize > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index 3eb3f1c8..5ffa82f5 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -36,9 +36,6 @@ const forcePreDef = false
// zstdMinMatch is the minimum zstd match length.
const zstdMinMatch = 3
-// Reset the buffer offset when reaching this.
-const bufferReset = math.MaxInt32 - MaxWindowSize
-
// fcsUnknown is used for unknown frame content size.
const fcsUnknown = math.MaxUint64
@@ -75,7 +72,6 @@ var (
ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")
// ErrUnknownDictionary is returned if the dictionary ID is unknown.
- // For the time being dictionaries are not supported.
ErrUnknownDictionary = errors.New("unknown dictionary")
// ErrFrameSizeExceeded is returned if the stated frame size is exceeded.
@@ -110,26 +106,25 @@ func printf(format string, a ...interface{}) {
}
}
-// matchLen returns the maximum length.
+// matchLen returns the maximum common prefix length of a and b.
// a must be the shortest of the two.
-// The function also returns whether all bytes matched.
-func matchLen(a, b []byte) int {
- b = b[:len(a)]
- for i := 0; i < len(a)-7; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- return i + (bits.TrailingZeros64(diff) >> 3)
+func matchLen(a, b []byte) (n int) {
+ for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] {
+ diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
+ if diff != 0 {
+ return n + bits.TrailingZeros64(diff)>>3
}
+ n += 8
}
- checked := (len(a) >> 3) << 3
- a = a[checked:]
- b = b[checked:]
for i := range a {
if a[i] != b[i] {
- return i + checked
+ break
}
+ n++
}
- return len(a) + checked
+ return n
+
}
func load3232(b []byte, i int32) uint32 {
@@ -140,10 +135,6 @@ func load6432(b []byte, i int32) uint64 {
return binary.LittleEndian.Uint64(b[i:])
}
-func load64(b []byte, i int) uint64 {
- return binary.LittleEndian.Uint64(b[i:])
-}
-
type byter interface {
Bytes() []byte
Len() int
diff --git a/vendor/github.com/klauspost/cpuid/v2/README.md b/vendor/github.com/klauspost/cpuid/v2/README.md
index bc2f98f0..857a93e5 100644
--- a/vendor/github.com/klauspost/cpuid/v2/README.md
+++ b/vendor/github.com/klauspost/cpuid/v2/README.md
@@ -16,10 +16,17 @@ Package home: https://github.com/klauspost/cpuid
## installing
-`go get -u github.com/klauspost/cpuid/v2` using modules.
-
+`go get -u github.com/klauspost/cpuid/v2` using modules.
Drop `v2` for others.
+### Homebrew
+
+macOS/Linux users can install via [brew](https://brew.sh/):
+
+```sh
+$ brew install cpuid
+```
+
## example
```Go
@@ -77,10 +84,14 @@ We have Streaming SIMD 2 Extensions
The `cpuid.CPU` provides access to CPU features. Use `cpuid.CPU.Supports()` to check for CPU features.
A faster `cpuid.CPU.Has()` is provided which will usually be inlined by the gc compiler.
+To test a larger number of features, they can be combined using `f := CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SYSCALL, SSE, SSE2)`, etc.
+This can be used with `cpuid.CPU.HasAll(f)` to quickly test if all features are supported.
+
Note that for some cpu/os combinations some features will not be detected.
`amd64` has rather good support and should work reliably on all platforms.
-Note that hypervisors may not pass through all CPU features.
+Note that hypervisors may not pass all CPU features through to the guest OS,
+so even if your host supports a feature it may not be visible on guests.
## arm64 feature detection
@@ -132,6 +143,339 @@ func main() {
}
```
+## commandline
+
+Download as binary from: https://github.com/klauspost/cpuid/releases
+
+Install from source:
+
+`go install github.com/klauspost/cpuid/v2/cmd/cpuid@latest`
+
+### Example
+
+```
+λ cpuid
+Name: AMD Ryzen 9 3950X 16-Core Processor
+Vendor String: AuthenticAMD
+Vendor ID: AMD
+PhysicalCores: 16
+Threads Per Core: 2
+Logical Cores: 32
+CPU Family 23 Model: 113
+Features: ADX,AESNI,AVX,AVX2,BMI1,BMI2,CLMUL,CLZERO,CMOV,CMPXCHG8,CPBOOST,CX16,F16C,FMA3,FXSR,FXSROPT,HTT,HYPERVISOR,LAHF,LZCNT,MCAOVERFLOW,MMX,MMXEXT,MOVBE,NX,OSXSAVE,POPCNT,RDRAND,RDSEED,RDTSCP,SCE,SHA,SSE,SSE2,SSE3,SSE4,SSE42,SSE4A,SSSE3,SUCCOR,X87,XSAVE
+Microarchitecture level: 3
+Cacheline bytes: 64
+L1 Instruction Cache: 32768 bytes
+L1 Data Cache: 32768 bytes
+L2 Cache: 524288 bytes
+L3 Cache: 16777216 bytes
+
+```
+### JSON Output:
+
+```
+λ cpuid --json
+{
+ "BrandName": "AMD Ryzen 9 3950X 16-Core Processor",
+ "VendorID": 2,
+ "VendorString": "AuthenticAMD",
+ "PhysicalCores": 16,
+ "ThreadsPerCore": 2,
+ "LogicalCores": 32,
+ "Family": 23,
+ "Model": 113,
+ "CacheLine": 64,
+ "Hz": 0,
+ "BoostFreq": 0,
+ "Cache": {
+ "L1I": 32768,
+ "L1D": 32768,
+ "L2": 524288,
+ "L3": 16777216
+ },
+ "SGX": {
+ "Available": false,
+ "LaunchControl": false,
+ "SGX1Supported": false,
+ "SGX2Supported": false,
+ "MaxEnclaveSizeNot64": 0,
+ "MaxEnclaveSize64": 0,
+ "EPCSections": null
+ },
+ "Features": [
+ "ADX",
+ "AESNI",
+ "AVX",
+ "AVX2",
+ "BMI1",
+ "BMI2",
+ "CLMUL",
+ "CLZERO",
+ "CMOV",
+ "CMPXCHG8",
+ "CPBOOST",
+ "CX16",
+ "F16C",
+ "FMA3",
+ "FXSR",
+ "FXSROPT",
+ "HTT",
+ "HYPERVISOR",
+ "LAHF",
+ "LZCNT",
+ "MCAOVERFLOW",
+ "MMX",
+ "MMXEXT",
+ "MOVBE",
+ "NX",
+ "OSXSAVE",
+ "POPCNT",
+ "RDRAND",
+ "RDSEED",
+ "RDTSCP",
+ "SCE",
+ "SHA",
+ "SSE",
+ "SSE2",
+ "SSE3",
+ "SSE4",
+ "SSE42",
+ "SSE4A",
+ "SSSE3",
+ "SUCCOR",
+ "X87",
+ "XSAVE"
+ ],
+ "X64Level": 3
+}
+```
+
+### Check CPU microarch level
+
+```
+λ cpuid --check-level=3
+2022/03/18 17:04:40 AMD Ryzen 9 3950X 16-Core Processor
+2022/03/18 17:04:40 Microarchitecture level 3 is supported. Max level is 3.
+Exit Code 0
+
+λ cpuid --check-level=4
+2022/03/18 17:06:18 AMD Ryzen 9 3950X 16-Core Processor
+2022/03/18 17:06:18 Microarchitecture level 4 not supported. Max level is 3.
+Exit Code 1
+```
+
+
+## Available flags
+
+### x86 & amd64
+
+| Feature Flag | Description |
+|--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| ADX | Intel ADX (Multi-Precision Add-Carry Instruction Extensions) |
+| AESNI | Advanced Encryption Standard New Instructions |
+| AMD3DNOW | AMD 3DNOW |
+| AMD3DNOWEXT | AMD 3DNowExt |
+| AMXBF16 | Tile computational operations on BFLOAT16 numbers |
+| AMXINT8 | Tile computational operations on 8-bit integers |
+| AMXFP16 | Tile computational operations on FP16 numbers |
+| AMXTILE | Tile architecture |
+| AVX | AVX functions |
+| AVX2 | AVX2 functions |
+| AVX512BF16 | AVX-512 BFLOAT16 Instructions |
+| AVX512BITALG | AVX-512 Bit Algorithms |
+| AVX512BW | AVX-512 Byte and Word Instructions |
+| AVX512CD | AVX-512 Conflict Detection Instructions |
+| AVX512DQ | AVX-512 Doubleword and Quadword Instructions |
+| AVX512ER | AVX-512 Exponential and Reciprocal Instructions |
+| AVX512F | AVX-512 Foundation |
+| AVX512FP16 | AVX-512 FP16 Instructions |
+| AVX512IFMA | AVX-512 Integer Fused Multiply-Add Instructions |
+| AVX512PF | AVX-512 Prefetch Instructions |
+| AVX512VBMI | AVX-512 Vector Bit Manipulation Instructions |
+| AVX512VBMI2 | AVX-512 Vector Bit Manipulation Instructions, Version 2 |
+| AVX512VL | AVX-512 Vector Length Extensions |
+| AVX512VNNI | AVX-512 Vector Neural Network Instructions |
+| AVX512VP2INTERSECT | AVX-512 Intersect for D/Q |
+| AVX512VPOPCNTDQ | AVX-512 Vector Population Count Doubleword and Quadword |
+| AVXIFMA | AVX-IFMA instructions |
+| AVXNECONVERT | AVX-NE-CONVERT instructions |
+| AVXSLOW | Indicates the CPU performs 2 128 bit operations instead of one |
+| AVXVNNI | AVX (VEX encoded) VNNI neural network instructions |
+| AVXVNNIINT8 | AVX-VNNI-INT8 instructions |
+| BMI1 | Bit Manipulation Instruction Set 1 |
+| BMI2 | Bit Manipulation Instruction Set 2 |
+| CETIBT | Intel CET Indirect Branch Tracking |
+| CETSS | Intel CET Shadow Stack |
+| CLDEMOTE | Cache Line Demote |
+| CLMUL | Carry-less Multiplication |
+| CLZERO | CLZERO instruction supported |
+| CMOV | i686 CMOV |
+| CMPCCXADD | CMPCCXADD instructions |
+| CMPSB_SCADBS_SHORT | Fast short CMPSB and SCASB |
+| CMPXCHG8 | CMPXCHG8 instruction |
+| CPBOOST | Core Performance Boost |
+| CPPC | AMD: Collaborative Processor Performance Control |
+| CX16 | CMPXCHG16B Instruction |
+| EFER_LMSLE_UNS | AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ |
+| ENQCMD | Enqueue Command |
+| ERMS | Enhanced REP MOVSB/STOSB |
+| F16C | Half-precision floating-point conversion |
+| FLUSH_L1D | Flush L1D cache |
+| FMA3 | Intel FMA 3. Does not imply AVX. |
+| FMA4 | Bulldozer FMA4 functions |
+| FP128 | AMD: When set, the internal FP/SIMD execution datapath is 128-bits wide |
+| FP256 | AMD: When set, the internal FP/SIMD execution datapath is 256-bits wide |
+| FSRM | Fast Short Rep Mov |
+| FXSR | FXSAVE, FXRESTOR instructions, CR4 bit 9 |
+| FXSROPT | FXSAVE/FXRSTOR optimizations |
+| GFNI | Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage. |
+| HLE | Hardware Lock Elision |
+| HRESET | If set CPU supports history reset and the IA32_HRESET_ENABLE MSR |
+| HTT | Hyperthreading (enabled) |
+| HWA | Hardware assert supported. Indicates support for MSRC001_10 |
+| HYBRID_CPU | This part has CPUs of more than one type. |
+| HYPERVISOR | This bit has been reserved by Intel & AMD for use by hypervisors |
+| IA32_ARCH_CAP | IA32_ARCH_CAPABILITIES MSR (Intel) |
+| IA32_CORE_CAP | IA32_CORE_CAPABILITIES MSR |
+| IBPB | Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) |
+| IBRS | AMD: Indirect Branch Restricted Speculation |
+| IBRS_PREFERRED | AMD: IBRS is preferred over software solution |
+| IBRS_PROVIDES_SMP | AMD: IBRS provides Same Mode Protection |
+| IBS | Instruction Based Sampling (AMD) |
+| IBSBRNTRGT | Instruction Based Sampling Feature (AMD) |
+| IBSFETCHSAM | Instruction Based Sampling Feature (AMD) |
+| IBSFFV | Instruction Based Sampling Feature (AMD) |
+| IBSOPCNT | Instruction Based Sampling Feature (AMD) |
+| IBSOPCNTEXT | Instruction Based Sampling Feature (AMD) |
+| IBSOPSAM | Instruction Based Sampling Feature (AMD) |
+| IBSRDWROPCNT | Instruction Based Sampling Feature (AMD) |
+| IBSRIPINVALIDCHK | Instruction Based Sampling Feature (AMD) |
+| IBS_FETCH_CTLX | AMD: IBS fetch control extended MSR supported |
+| IBS_OPDATA4 | AMD: IBS op data 4 MSR supported |
+| IBS_OPFUSE | AMD: Indicates support for IbsOpFuse |
+| IBS_PREVENTHOST | Disallowing IBS use by the host supported |
+| IBS_ZEN4 | Fetch and Op IBS support IBS extensions added with Zen4 |
+| INT_WBINVD | WBINVD/WBNOINVD are interruptible. |
+| INVLPGB | INVLPGB and TLBSYNC instructions supported |
+| LAHF | LAHF/SAHF in long mode |
+| LAM | If set, CPU supports Linear Address Masking |
+| LBRVIRT | LBR virtualization |
+| LZCNT | LZCNT instruction |
+| MCAOVERFLOW | MCA overflow recovery support. |
+| MCDT_NO | Processors do not exhibit MXCSR Configuration Dependent Timing behavior and do not need to mitigate it. |
+| MCOMMIT | MCOMMIT instruction supported |
+| MD_CLEAR | VERW clears CPU buffers |
+| MMX | standard MMX |
+| MMXEXT | SSE integer functions or AMD MMX ext |
+| MOVBE | MOVBE instruction (big-endian) |
+| MOVDIR64B | Move 64 Bytes as Direct Store |
+| MOVDIRI | Move Doubleword as Direct Store |
+| MOVSB_ZL | Fast Zero-Length MOVSB |
+| MPX | Intel MPX (Memory Protection Extensions) |
+| MOVU | MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD |
+| MSRIRC | Instruction Retired Counter MSR available |
+| MSR_PAGEFLUSH | Page Flush MSR available |
+| NRIPS | Indicates support for NRIP save on VMEXIT |
+| NX | NX (No-Execute) bit |
+| OSXSAVE | XSAVE enabled by OS |
+| PCONFIG | PCONFIG for Intel Multi-Key Total Memory Encryption |
+| POPCNT | POPCNT instruction |
+| PPIN | AMD: Protected Processor Inventory Number support. Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled |
+| PREFETCHI | PREFETCHIT0/1 instructions |
+| PSFD | AMD: Predictive Store Forward Disable |
+| RDPRU | RDPRU instruction supported |
+| RDRAND | RDRAND instruction is available |
+| RDSEED | RDSEED instruction is available |
+| RDTSCP | RDTSCP Instruction |
+| RTM | Restricted Transactional Memory |
+| RTM_ALWAYS_ABORT | Indicates that the loaded microcode is forcing RTM abort. |
+| SERIALIZE | Serialize Instruction Execution |
+| SEV | AMD Secure Encrypted Virtualization supported |
+| SEV_64BIT | AMD SEV guest execution only allowed from a 64-bit host |
+| SEV_ALTERNATIVE | AMD SEV Alternate Injection supported |
+| SEV_DEBUGSWAP | Full debug state swap supported for SEV-ES guests |
+| SEV_ES | AMD SEV Encrypted State supported |
+| SEV_RESTRICTED | AMD SEV Restricted Injection supported |
+| SEV_SNP | AMD SEV Secure Nested Paging supported |
+| SGX | Software Guard Extensions |
+| SGXLC | Software Guard Extensions Launch Control |
+| SHA | Intel SHA Extensions |
+| SME | AMD Secure Memory Encryption supported |
+| SME_COHERENT | AMD Hardware cache coherency across encryption domains enforced |
+| SPEC_CTRL_SSBD | Speculative Store Bypass Disable |
+| SRBDS_CTRL | SRBDS mitigation MSR available |
+| SSE | SSE functions |
+| SSE2 | P4 SSE functions |
+| SSE3 | Prescott SSE3 functions |
+| SSE4 | Penryn SSE4.1 functions |
+| SSE42 | Nehalem SSE4.2 functions |
+| SSE4A | AMD Barcelona microarchitecture SSE4a instructions |
+| SSSE3 | Conroe SSSE3 functions |
+| STIBP | Single Thread Indirect Branch Predictors |
+| STIBP_ALWAYSON | AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On |
+| STOSB_SHORT | Fast short STOSB |
+| SUCCOR | Software uncorrectable error containment and recovery capability. |
+| SVM | AMD Secure Virtual Machine |
+| SVMDA | Indicates support for the SVM decode assists. |
+| SVMFBASID | SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control |
+| SVML | AMD SVM lock. Indicates support for SVM-Lock. |
+| SVMNP | AMD SVM nested paging |
+| SVMPF | SVM pause intercept filter. Indicates support for the pause intercept filter |
+| SVMPFT | SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold |
+| SYSCALL | System-Call Extension (SCE): SYSCALL and SYSRET instructions. |
+| SYSEE | SYSENTER and SYSEXIT instructions |
+| TBM | AMD Trailing Bit Manipulation |
+| TLB_FLUSH_NESTED | AMD: Flushing includes all the nested translations for guest translations |
+| TME | Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE. |
+| TOPEXT | TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX. |
+| TSCRATEMSR | MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104 |
+| TSXLDTRK | Intel TSX Suspend Load Address Tracking |
+| VAES | Vector AES. AVX(512) versions require additional checks. |
+| VMCBCLEAN | VMCB clean bits. Indicates support for VMCB clean bits. |
+| VMPL | AMD VM Permission Levels supported |
+| VMSA_REGPROT | AMD VMSA Register Protection supported |
+| VMX | Virtual Machine Extensions |
+| VPCLMULQDQ | Carry-Less Multiplication Quadword. Requires AVX for 3 register versions. |
+| VTE | AMD Virtual Transparent Encryption supported |
+| WAITPKG | TPAUSE, UMONITOR, UMWAIT |
+| WBNOINVD | Write Back and Do Not Invalidate Cache |
+| X87 | FPU |
+| XGETBV1 | Supports XGETBV with ECX = 1 |
+| XOP | Bulldozer XOP functions |
+| XSAVE | XSAVE, XRESTOR, XSETBV, XGETBV |
+| XSAVEC | Supports XSAVEC and the compacted form of XRSTOR. |
+| XSAVEOPT | XSAVEOPT available |
+| XSAVES | Supports XSAVES/XRSTORS and IA32_XSS |
+
+# ARM features:
+
+| Feature Flag | Description |
+|--------------|------------------------------------------------------------------|
+| AESARM | AES instructions |
+| ARMCPUID | Some CPU ID registers readable at user-level |
+| ASIMD | Advanced SIMD |
+| ASIMDDP | SIMD Dot Product |
+| ASIMDHP | Advanced SIMD half-precision floating point |
+| ASIMDRDM | Rounding Double Multiply Accumulate/Subtract (SQRDMLAH/SQRDMLSH) |
+| ATOMICS | Large System Extensions (LSE) |
+| CRC32 | CRC32/CRC32C instructions |
+| DCPOP | Data cache clean to Point of Persistence (DC CVAP) |
+| EVTSTRM | Generic timer |
+| FCMA | Floating-point complex number addition and multiplication |
+| FP | Single-precision and double-precision floating point |
+| FPHP | Half-precision floating point |
+| GPA | Generic Pointer Authentication |
+| JSCVT | Javascript-style double->int convert (FJCVTZS) |
+| LRCPC | Weaker release consistency (LDAPR, etc) |
+| PMULL | Polynomial Multiply instructions (PMULL/PMULL2) |
+| SHA1 | SHA-1 instructions (SHA1C, etc) |
+| SHA2 | SHA-2 instructions (SHA256H, etc) |
+| SHA3 | SHA-3 instructions (EOR3, RAXI, XAR, BCAX) |
+| SHA512 | SHA512 instructions |
+| SM3 | SM3 instructions |
+| SM4 | SM4 instructions |
+| SVE | Scalable Vector Extension |
+
# license
This code is published under an MIT license. See LICENSE file for more information.
diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid.go b/vendor/github.com/klauspost/cpuid/v2/cpuid.go
index 3d543ce9..cf2ae9c5 100644
--- a/vendor/github.com/klauspost/cpuid/v2/cpuid.go
+++ b/vendor/github.com/klauspost/cpuid/v2/cpuid.go
@@ -14,6 +14,7 @@ import (
"flag"
"fmt"
"math"
+ "math/bits"
"os"
"runtime"
"strings"
@@ -72,6 +73,7 @@ const (
AMD3DNOW // AMD 3DNOW
AMD3DNOWEXT // AMD 3DNowExt
AMXBF16 // Tile computational operations on BFLOAT16 numbers
+ AMXFP16 // Tile computational operations on FP16 numbers
AMXINT8 // Tile computational operations on 8-bit integers
AMXTILE // Tile architecture
AVX // AVX functions
@@ -92,7 +94,11 @@ const (
AVX512VNNI // AVX-512 Vector Neural Network Instructions
AVX512VP2INTERSECT // AVX-512 Intersect for D/Q
AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword
- AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one.
+ AVXIFMA // AVX-IFMA instructions
+ AVXNECONVERT // AVX-NE-CONVERT instructions
+ AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one
+ AVXVNNI // AVX (VEX encoded) VNNI neural network instructions
+ AVXVNNIINT8 // AVX-VNNI-INT8 instructions
BMI1 // Bit Manipulation Instruction Set 1
BMI2 // Bit Manipulation Instruction Set 2
CETIBT // Intel CET Indirect Branch Tracking
@@ -101,22 +107,37 @@ const (
CLMUL // Carry-less Multiplication
CLZERO // CLZERO instruction supported
CMOV // i686 CMOV
+ CMPCCXADD // CMPCCXADD instructions
+ CMPSB_SCADBS_SHORT // Fast short CMPSB and SCASB
CMPXCHG8 // CMPXCHG8 instruction
CPBOOST // Core Performance Boost
+ CPPC // AMD: Collaborative Processor Performance Control
CX16 // CMPXCHG16B Instruction
+ EFER_LMSLE_UNS // AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ
ENQCMD // Enqueue Command
ERMS // Enhanced REP MOVSB/STOSB
F16C // Half-precision floating-point conversion
+ FLUSH_L1D // Flush L1D cache
FMA3 // Intel FMA 3. Does not imply AVX.
FMA4 // Bulldozer FMA4 functions
+ FP128 // AMD: When set, the internal FP/SIMD execution datapath is no more than 128-bits wide
+ FP256 // AMD: When set, the internal FP/SIMD execution datapath is no more than 256-bits wide
+ FSRM // Fast Short Rep Mov
FXSR // FXSAVE, FXRESTOR instructions, CR4 bit 9
FXSROPT // FXSAVE/FXRSTOR optimizations
- GFNI // Galois Field New Instructions
+ GFNI // Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage.
HLE // Hardware Lock Elision
+ HRESET // If set CPU supports history reset and the IA32_HRESET_ENABLE MSR
HTT // Hyperthreading (enabled)
HWA // Hardware assert supported. Indicates support for MSRC001_10
+ HYBRID_CPU // This part has CPUs of more than one type.
HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors
+ IA32_ARCH_CAP // IA32_ARCH_CAPABILITIES MSR (Intel)
+ IA32_CORE_CAP // IA32_CORE_CAPABILITIES MSR
IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB)
+ IBRS // AMD: Indirect Branch Restricted Speculation
+ IBRS_PREFERRED // AMD: IBRS is preferred over software solution
+ IBRS_PROVIDES_SMP // AMD: IBRS provides Same Mode Protection
IBS // Instruction Based Sampling (AMD)
IBSBRNTRGT // Instruction Based Sampling Feature (AMD)
IBSFETCHSAM // Instruction Based Sampling Feature (AMD)
@@ -126,33 +147,60 @@ const (
IBSOPSAM // Instruction Based Sampling Feature (AMD)
IBSRDWROPCNT // Instruction Based Sampling Feature (AMD)
IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD)
+ IBS_FETCH_CTLX // AMD: IBS fetch control extended MSR supported
+ IBS_OPDATA4 // AMD: IBS op data 4 MSR supported
+ IBS_OPFUSE // AMD: Indicates support for IbsOpFuse
+ IBS_PREVENTHOST // Disallowing IBS use by the host supported
+ IBS_ZEN4 // AMD: Fetch and Op IBS support IBS extensions added with Zen4
INT_WBINVD // WBINVD/WBNOINVD are interruptible.
INVLPGB // NVLPGB and TLBSYNC instruction supported
LAHF // LAHF/SAHF in long mode
+ LAM // If set, CPU supports Linear Address Masking
+ LBRVIRT // LBR virtualization
LZCNT // LZCNT instruction
MCAOVERFLOW // MCA overflow recovery support.
+ MCDT_NO // Processors do not exhibit MXCSR Configuration Dependent Timing behavior and do not need to mitigate it.
MCOMMIT // MCOMMIT instruction supported
+ MD_CLEAR // VERW clears CPU buffers
MMX // standard MMX
MMXEXT // SSE integer functions or AMD MMX ext
MOVBE // MOVBE instruction (big-endian)
MOVDIR64B // Move 64 Bytes as Direct Store
MOVDIRI // Move Doubleword as Direct Store
+ MOVSB_ZL // Fast Zero-Length MOVSB
+ MOVU // AMD: MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD
MPX // Intel MPX (Memory Protection Extensions)
MSRIRC // Instruction Retired Counter MSR available
+ MSR_PAGEFLUSH // Page Flush MSR available
+ NRIPS // Indicates support for NRIP save on VMEXIT
NX // NX (No-Execute) bit
OSXSAVE // XSAVE enabled by OS
+ PCONFIG // PCONFIG for Intel Multi-Key Total Memory Encryption
POPCNT // POPCNT instruction
+ PPIN // AMD: Protected Processor Inventory Number support. Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled
+ PREFETCHI // PREFETCHIT0/1 instructions
+ PSFD // AMD: Predictive Store Forward Disable
RDPRU // RDPRU instruction supported
RDRAND // RDRAND instruction is available
RDSEED // RDSEED instruction is available
RDTSCP // RDTSCP Instruction
RTM // Restricted Transactional Memory
RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort.
- SCE // SYSENTER and SYSEXIT instructions
SERIALIZE // Serialize Instruction Execution
+ SEV // AMD Secure Encrypted Virtualization supported
+ SEV_64BIT // AMD SEV guest execution only allowed from a 64-bit host
+ SEV_ALTERNATIVE // AMD SEV Alternate Injection supported
+ SEV_DEBUGSWAP // Full debug state swap supported for SEV-ES guests
+ SEV_ES // AMD SEV Encrypted State supported
+ SEV_RESTRICTED // AMD SEV Restricted Injection supported
+ SEV_SNP // AMD SEV Secure Nested Paging supported
SGX // Software Guard Extensions
SGXLC // Software Guard Extensions Launch Control
SHA // Intel SHA Extensions
+ SME // AMD Secure Memory Encryption supported
+ SME_COHERENT // AMD Hardware cache coherency across encryption domains enforced
+ SPEC_CTRL_SSBD // Speculative Store Bypass Disable
+ SRBDS_CTRL // SRBDS mitigation MSR available
SSE // SSE functions
SSE2 // P4 SSE functions
SSE3 // Prescott SSE3 functions
@@ -161,17 +209,40 @@ const (
SSE4A // AMD Barcelona microarchitecture SSE4a instructions
SSSE3 // Conroe SSSE3 functions
STIBP // Single Thread Indirect Branch Predictors
+ STIBP_ALWAYSON // AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On
+ STOSB_SHORT // Fast short STOSB
SUCCOR // Software uncorrectable error containment and recovery capability.
+ SVM // AMD Secure Virtual Machine
+ SVMDA // Indicates support for the SVM decode assists.
+ SVMFBASID // SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control
+ SVML // AMD SVM lock. Indicates support for SVM-Lock.
+ SVMNP // AMD SVM nested paging
+ SVMPF // SVM pause intercept filter. Indicates support for the pause intercept filter
+ SVMPFT // SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold
+ SYSCALL // System-Call Extension (SCE): SYSCALL and SYSRET instructions.
+ SYSEE // SYSENTER and SYSEXIT instructions
TBM // AMD Trailing Bit Manipulation
+ TLB_FLUSH_NESTED // AMD: Flushing includes all the nested translations for guest translations
+ TME // Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE.
+ TOPEXT // TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX.
+ TSCRATEMSR // MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104
TSXLDTRK // Intel TSX Suspend Load Address Tracking
- VAES // Vector AES
+ VAES // Vector AES. AVX(512) versions require additional checks.
+ VMCBCLEAN // VMCB clean bits. Indicates support for VMCB clean bits.
+ VMPL // AMD VM Permission Levels supported
+ VMSA_REGPROT // AMD VMSA Register Protection supported
VMX // Virtual Machine Extensions
- VPCLMULQDQ // Carry-Less Multiplication Quadword
+ VPCLMULQDQ // Carry-Less Multiplication Quadword. Requires AVX for 3 register versions.
+ VTE // AMD Virtual Transparent Encryption supported
WAITPKG // TPAUSE, UMONITOR, UMWAIT
WBNOINVD // Write Back and Do Not Invalidate Cache
X87 // FPU
+ XGETBV1 // Supports XGETBV with ECX = 1
XOP // Bulldozer XOP functions
XSAVE // XSAVE, XRESTOR, XSETBV, XGETBV
+ XSAVEC // Supports XSAVEC and the compacted form of XRSTOR.
+ XSAVEOPT // XSAVEOPT available
+ XSAVES // Supports XSAVES/XRSTORS and IA32_XSS
// ARM features:
AESARM // AES instructions
@@ -198,7 +269,6 @@ const (
SM3 // SM3 instructions
SM4 // SM4 instructions
SVE // Scalable Vector Extension
-
// Keep it last. It automatically defines the size of []flagSet
lastID
@@ -216,6 +286,7 @@ type CPUInfo struct {
LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable.
Family int // CPU family number
Model int // CPU model number
+ Stepping int // CPU stepping info
CacheLine int // Cache line size in bytes. Will be 0 if undetectable.
Hz int64 // Clock speed, if known, 0 otherwise. Will attempt to contain base clock speed.
BoostFreq int64 // Max clock speed, if known, 0 otherwise
@@ -318,30 +389,61 @@ func (c CPUInfo) Supports(ids ...FeatureID) bool {
// Has allows for checking a single feature.
// Should be inlined by the compiler.
-func (c CPUInfo) Has(id FeatureID) bool {
+func (c *CPUInfo) Has(id FeatureID) bool {
return c.featureSet.inSet(id)
}
+// AnyOf returns whether the CPU supports one or more of the requested features.
+func (c CPUInfo) AnyOf(ids ...FeatureID) bool {
+ for _, id := range ids {
+ if c.featureSet.inSet(id) {
+ return true
+ }
+ }
+ return false
+}
+
+// Features contains several features combined for a fast check using
+// CPUInfo.HasAll
+type Features *flagSet
+
+// CombineFeatures allows combining several features for a close to constant time lookup.
+func CombineFeatures(ids ...FeatureID) Features {
+ var v flagSet
+ for _, id := range ids {
+ v.set(id)
+ }
+ return &v
+}
+
+func (c *CPUInfo) HasAll(f Features) bool {
+ return c.featureSet.hasSetP(f)
+}
+
// https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels
-var level1Features = flagSetWith(CMOV, CMPXCHG8, X87, FXSR, MMX, SCE, SSE, SSE2)
-var level2Features = flagSetWith(CMOV, CMPXCHG8, X87, FXSR, MMX, SCE, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3)
-var level3Features = flagSetWith(CMOV, CMPXCHG8, X87, FXSR, MMX, SCE, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3, AVX, AVX2, BMI1, BMI2, F16C, FMA3, LZCNT, MOVBE, OSXSAVE)
-var level4Features = flagSetWith(CMOV, CMPXCHG8, X87, FXSR, MMX, SCE, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3, AVX, AVX2, BMI1, BMI2, F16C, FMA3, LZCNT, MOVBE, OSXSAVE, AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL)
+var oneOfLevel = CombineFeatures(SYSEE, SYSCALL)
+var level1Features = CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SSE, SSE2)
+var level2Features = CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3)
+var level3Features = CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3, AVX, AVX2, BMI1, BMI2, F16C, FMA3, LZCNT, MOVBE, OSXSAVE)
+var level4Features = CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3, AVX, AVX2, BMI1, BMI2, F16C, FMA3, LZCNT, MOVBE, OSXSAVE, AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL)
// X64Level returns the microarchitecture level detected on the CPU.
// If features are lacking or non x64 mode, 0 is returned.
// See https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels
func (c CPUInfo) X64Level() int {
- if c.featureSet.hasSet(level4Features) {
+ if !c.featureSet.hasOneOf(oneOfLevel) {
+ return 0
+ }
+ if c.featureSet.hasSetP(level4Features) {
return 4
}
- if c.featureSet.hasSet(level3Features) {
+ if c.featureSet.hasSetP(level3Features) {
return 3
}
- if c.featureSet.hasSet(level2Features) {
+ if c.featureSet.hasSetP(level2Features) {
return 2
}
- if c.featureSet.hasSet(level1Features) {
+ if c.featureSet.hasSetP(level1Features) {
return 1
}
return 0
@@ -369,8 +471,9 @@ func (c CPUInfo) IsVendor(v Vendor) bool {
return c.VendorID == v
}
+// FeatureSet returns all available features as strings.
func (c CPUInfo) FeatureSet() []string {
- s := make([]string, 0)
+ s := make([]string, 0, c.featureSet.nEnabled())
s = append(s, c.featureSet.Strings()...)
return s
}
@@ -504,7 +607,7 @@ const flagMask = flagBits - 1
// flagSet contains detected cpu features and characteristics in an array of flags
type flagSet [(lastID + flagMask) / flagBits]flags
-func (s flagSet) inSet(feat FeatureID) bool {
+func (s *flagSet) inSet(feat FeatureID) bool {
return s[feat>>flagBitsLog2]&(1<<(feat&flagMask)) != 0
}
@@ -534,7 +637,7 @@ func (s *flagSet) or(other flagSet) {
}
// hasSet returns whether all features are present.
-func (s flagSet) hasSet(other flagSet) bool {
+func (s *flagSet) hasSet(other flagSet) bool {
for i, v := range other[:] {
if s[i]&v != v {
return false
@@ -543,6 +646,34 @@ func (s flagSet) hasSet(other flagSet) bool {
return true
}
+// hasSetP returns whether all features are present.
+func (s *flagSet) hasSetP(other *flagSet) bool {
+ for i, v := range other[:] {
+ if s[i]&v != v {
+ return false
+ }
+ }
+ return true
+}
+
+// hasOneOf returns whether one or more features are present.
+func (s *flagSet) hasOneOf(other *flagSet) bool {
+ for i, v := range other[:] {
+ if s[i]&v != 0 {
+ return true
+ }
+ }
+ return false
+}
+
+// nEnabled will return the number of enabled flags.
+func (s *flagSet) nEnabled() (n int) {
+ for _, v := range s[:] {
+ n += bits.OnesCount64(uint64(v))
+ }
+ return n
+}
+
func flagSetWith(feat ...FeatureID) flagSet {
var res flagSet
for _, f := range feat {
@@ -631,7 +762,7 @@ func threadsPerCore() int {
if vend == AMD {
// Workaround for AMD returning 0, assume 2 if >= Zen 2
// It will be more correct than not.
- fam, _ := familyModel()
+ fam, _, _ := familyModel()
_, _, _, d := cpuid(1)
if (d&(1<<28)) != 0 && fam >= 23 {
return 2
@@ -669,14 +800,27 @@ func logicalCores() int {
}
}
-func familyModel() (int, int) {
+func familyModel() (family, model, stepping int) {
if maxFunctionID() < 0x1 {
- return 0, 0
+ return 0, 0, 0
}
eax, _, _, _ := cpuid(1)
- family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff)
- model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0)
- return int(family), int(model)
+ // If BaseFamily[3:0] is less than Fh then ExtendedFamily[7:0] is reserved and Family is equal to BaseFamily[3:0].
+ family = int((eax >> 8) & 0xf)
+ extFam := family == 0x6 // Intel is 0x6, needs extended model.
+ if family == 0xf {
+ // Add ExtFamily
+ family += int((eax >> 20) & 0xff)
+ extFam = true
+ }
+ // If BaseFamily[3:0] is less than 0Fh then ExtendedModel[3:0] is reserved and Model is equal to BaseModel[3:0].
+ model = int((eax >> 4) & 0xf)
+ if extFam {
+ // Add ExtModel
+ model += int((eax >> 12) & 0xf0)
+ }
+ stepping = int(eax & 0xf)
+ return family, model, stepping
}
func physicalCores() int {
@@ -811,9 +955,14 @@ func (c *CPUInfo) cacheSize() {
c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
// CPUID Fn8000_001D_EAX_x[N:0] Cache Properties
- if maxExtendedFunction() < 0x8000001D {
+ if maxExtendedFunction() < 0x8000001D || !c.Has(TOPEXT) {
return
}
+
+ // Xen Hypervisor is buggy and returns the same entry no matter ECX value.
+ // Hack: When we encounter the same entry 100 times we break.
+ nSame := 0
+ var last uint32
for i := uint32(0); i < math.MaxUint32; i++ {
eax, ebx, ecx, _ := cpuidex(0x8000001D, i)
@@ -829,6 +978,16 @@ func (c *CPUInfo) cacheSize() {
return
}
+ // Check for the same value repeated.
+ comb := eax ^ ebx ^ ecx
+ if comb == last {
+ nSame++
+ if nSame == 100 {
+ return
+ }
+ }
+ last = comb
+
switch level {
case 1:
switch typ {
@@ -913,14 +1072,13 @@ func support() flagSet {
if mfi < 0x1 {
return fs
}
- family, model := familyModel()
+ family, model, _ := familyModel()
_, _, c, d := cpuid(1)
fs.setIf((d&(1<<0)) != 0, X87)
fs.setIf((d&(1<<8)) != 0, CMPXCHG8)
- fs.setIf((d&(1<<11)) != 0, SCE)
+ fs.setIf((d&(1<<11)) != 0, SYSEE)
fs.setIf((d&(1<<15)) != 0, CMOV)
- fs.setIf((d&(1<<22)) != 0, MMXEXT)
fs.setIf((d&(1<<23)) != 0, MMX)
fs.setIf((d&(1<<24)) != 0, FXSR)
fs.setIf((d&(1<<25)) != 0, FXSROPT)
@@ -928,9 +1086,9 @@ func support() flagSet {
fs.setIf((d&(1<<26)) != 0, SSE2)
fs.setIf((c&1) != 0, SSE3)
fs.setIf((c&(1<<5)) != 0, VMX)
- fs.setIf((c&0x00000200) != 0, SSSE3)
- fs.setIf((c&0x00080000) != 0, SSE4)
- fs.setIf((c&0x00100000) != 0, SSE42)
+ fs.setIf((c&(1<<9)) != 0, SSSE3)
+ fs.setIf((c&(1<<19)) != 0, SSE4)
+ fs.setIf((c&(1<<20)) != 0, SSE42)
fs.setIf((c&(1<<25)) != 0, AESNI)
fs.setIf((c&(1<<1)) != 0, CLMUL)
fs.setIf(c&(1<<22) != 0, MOVBE)
@@ -976,7 +1134,6 @@ func support() flagSet {
// Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
if mfi >= 7 {
_, ebx, ecx, edx := cpuidex(7, 0)
- eax1, _, _, _ := cpuidex(7, 1)
if fs.inSet(AVX) && (ebx&0x00000020) != 0 {
fs.set(AVX2)
}
@@ -993,21 +1150,52 @@ func support() flagSet {
fs.setIf(ebx&(1<<18) != 0, RDSEED)
fs.setIf(ebx&(1<<19) != 0, ADX)
fs.setIf(ebx&(1<<29) != 0, SHA)
+
// CPUID.(EAX=7, ECX=0).ECX
fs.setIf(ecx&(1<<5) != 0, WAITPKG)
fs.setIf(ecx&(1<<7) != 0, CETSS)
+ fs.setIf(ecx&(1<<8) != 0, GFNI)
+ fs.setIf(ecx&(1<<9) != 0, VAES)
+ fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ)
+ fs.setIf(ecx&(1<<13) != 0, TME)
fs.setIf(ecx&(1<<25) != 0, CLDEMOTE)
fs.setIf(ecx&(1<<27) != 0, MOVDIRI)
fs.setIf(ecx&(1<<28) != 0, MOVDIR64B)
fs.setIf(ecx&(1<<29) != 0, ENQCMD)
fs.setIf(ecx&(1<<30) != 0, SGXLC)
+
// CPUID.(EAX=7, ECX=0).EDX
+ fs.setIf(edx&(1<<4) != 0, FSRM)
+ fs.setIf(edx&(1<<9) != 0, SRBDS_CTRL)
+ fs.setIf(edx&(1<<10) != 0, MD_CLEAR)
fs.setIf(edx&(1<<11) != 0, RTM_ALWAYS_ABORT)
fs.setIf(edx&(1<<14) != 0, SERIALIZE)
+ fs.setIf(edx&(1<<15) != 0, HYBRID_CPU)
fs.setIf(edx&(1<<16) != 0, TSXLDTRK)
+ fs.setIf(edx&(1<<18) != 0, PCONFIG)
fs.setIf(edx&(1<<20) != 0, CETIBT)
fs.setIf(edx&(1<<26) != 0, IBPB)
fs.setIf(edx&(1<<27) != 0, STIBP)
+ fs.setIf(edx&(1<<28) != 0, FLUSH_L1D)
+ fs.setIf(edx&(1<<29) != 0, IA32_ARCH_CAP)
+ fs.setIf(edx&(1<<30) != 0, IA32_CORE_CAP)
+ fs.setIf(edx&(1<<31) != 0, SPEC_CTRL_SSBD)
+
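+ // CPUID.(EAX=7, ECX=1).EDX -- NOTE(review): edx still holds the (EAX=7, ECX=0) value here; cpuidex(7, 1) is only called further down and its EDX result is discarded, so these three checks appear to test the wrong sub-leaf. Confirm and read EDX from cpuidex(7, 1) instead.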
+ fs.setIf(edx&(1<<4) != 0, AVXVNNIINT8)
+ fs.setIf(edx&(1<<5) != 0, AVXNECONVERT)
+ fs.setIf(edx&(1<<14) != 0, PREFETCHI)
+
+ // CPUID.(EAX=7, ECX=1).EAX
+ eax1, _, _, _ := cpuidex(7, 1)
+ fs.setIf(fs.inSet(AVX) && eax1&(1<<4) != 0, AVXVNNI)
+ fs.setIf(eax1&(1<<7) != 0, CMPCCXADD)
+ fs.setIf(eax1&(1<<10) != 0, MOVSB_ZL)
+ fs.setIf(eax1&(1<<11) != 0, STOSB_SHORT)
+ fs.setIf(eax1&(1<<12) != 0, CMPSB_SCADBS_SHORT)
+ fs.setIf(eax1&(1<<22) != 0, HRESET)
+ fs.setIf(eax1&(1<<23) != 0, AVXIFMA)
+ fs.setIf(eax1&(1<<26) != 0, LAM)
// Only detect AVX-512 features if XGETBV is supported
if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
@@ -1033,9 +1221,6 @@ func support() flagSet {
// ecx
fs.setIf(ecx&(1<<1) != 0, AVX512VBMI)
fs.setIf(ecx&(1<<6) != 0, AVX512VBMI2)
- fs.setIf(ecx&(1<<8) != 0, GFNI)
- fs.setIf(ecx&(1<<9) != 0, VAES)
- fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ)
fs.setIf(ecx&(1<<11) != 0, AVX512VNNI)
fs.setIf(ecx&(1<<12) != 0, AVX512BITALG)
fs.setIf(ecx&(1<<14) != 0, AVX512VPOPCNTDQ)
@@ -1047,31 +1232,66 @@ func support() flagSet {
fs.setIf(edx&(1<<25) != 0, AMXINT8)
// eax1 = CPUID.(EAX=7, ECX=1).EAX
fs.setIf(eax1&(1<<5) != 0, AVX512BF16)
+ fs.setIf(eax1&(1<<21) != 0, AMXFP16)
}
}
+
+ // CPUID.(EAX=7, ECX=2)
+ _, _, _, edx = cpuidex(7, 2)
+ fs.setIf(edx&(1<<5) != 0, MCDT_NO)
}
+ // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, ECX = 1)
+ // EAX
+ // Bit 00: XSAVEOPT is available.
+ // Bit 01: Supports XSAVEC and the compacted form of XRSTOR if set.
+ // Bit 02: Supports XGETBV with ECX = 1 if set.
+ // Bit 03: Supports XSAVES/XRSTORS and IA32_XSS if set.
+ // Bits 31 - 04: Reserved.
+ // EBX
+ // Bits 31 - 00: The size in bytes of the XSAVE area containing all states enabled by XCR0 | IA32_XSS.
+ // ECX
+ // Bits 31 - 00: Reports the supported bits of the lower 32 bits of the IA32_XSS MSR. IA32_XSS[n] can be set to 1 only if ECX[n] is 1.
+ // EDX?
+ // Bits 07 - 00: Used for XCR0. Bit 08: PT state. Bit 09: Used for XCR0. Bits 12 - 10: Reserved. Bit 13: HWP state. Bits 31 - 14: Reserved.
+ if mfi >= 0xd {
+ if fs.inSet(XSAVE) {
+ eax, _, _, _ := cpuidex(0xd, 1)
+ fs.setIf(eax&(1<<0) != 0, XSAVEOPT)
+ fs.setIf(eax&(1<<1) != 0, XSAVEC)
+ fs.setIf(eax&(1<<2) != 0, XGETBV1)
+ fs.setIf(eax&(1<<3) != 0, XSAVES)
+ }
+ }
if maxExtendedFunction() >= 0x80000001 {
_, _, c, d := cpuid(0x80000001)
if (c & (1 << 5)) != 0 {
fs.set(LZCNT)
fs.set(POPCNT)
}
+ // ECX
fs.setIf((c&(1<<0)) != 0, LAHF)
- fs.setIf((c&(1<<10)) != 0, IBS)
- fs.setIf((d&(1<<31)) != 0, AMD3DNOW)
- fs.setIf((d&(1<<30)) != 0, AMD3DNOWEXT)
- fs.setIf((d&(1<<23)) != 0, MMX)
- fs.setIf((d&(1<<22)) != 0, MMXEXT)
+ fs.setIf((c&(1<<2)) != 0, SVM)
fs.setIf((c&(1<<6)) != 0, SSE4A)
+ fs.setIf((c&(1<<10)) != 0, IBS)
+ fs.setIf((c&(1<<22)) != 0, TOPEXT)
+
+ // EDX
+ fs.setIf(d&(1<<11) != 0, SYSCALL)
fs.setIf(d&(1<<20) != 0, NX)
+ fs.setIf(d&(1<<22) != 0, MMXEXT)
+ fs.setIf(d&(1<<23) != 0, MMX)
+ fs.setIf(d&(1<<24) != 0, FXSR)
+ fs.setIf(d&(1<<25) != 0, FXSROPT)
fs.setIf(d&(1<<27) != 0, RDTSCP)
+ fs.setIf(d&(1<<30) != 0, AMD3DNOWEXT)
+ fs.setIf(d&(1<<31) != 0, AMD3DNOW)
/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
* used unless the OS has AVX support. */
if fs.inSet(AVX) {
- fs.setIf((c&0x00000800) != 0, XOP)
- fs.setIf((c&0x00010000) != 0, FMA4)
+ fs.setIf((c&(1<<11)) != 0, XOP)
+ fs.setIf((c&(1<<16)) != 0, FMA4)
}
}
@@ -1085,15 +1305,48 @@ func support() flagSet {
if maxExtendedFunction() >= 0x80000008 {
_, b, _, _ := cpuid(0x80000008)
+ fs.setIf(b&(1<<28) != 0, PSFD)
+ fs.setIf(b&(1<<27) != 0, CPPC)
+ fs.setIf(b&(1<<24) != 0, SPEC_CTRL_SSBD)
+ fs.setIf(b&(1<<23) != 0, PPIN)
+ fs.setIf(b&(1<<21) != 0, TLB_FLUSH_NESTED)
+ fs.setIf(b&(1<<20) != 0, EFER_LMSLE_UNS)
+ fs.setIf(b&(1<<19) != 0, IBRS_PROVIDES_SMP)
+ fs.setIf(b&(1<<18) != 0, IBRS_PREFERRED)
+ fs.setIf(b&(1<<17) != 0, STIBP_ALWAYSON)
+ fs.setIf(b&(1<<15) != 0, STIBP)
+ fs.setIf(b&(1<<14) != 0, IBRS)
+ fs.setIf((b&(1<<13)) != 0, INT_WBINVD)
+ fs.setIf(b&(1<<12) != 0, IBPB)
fs.setIf((b&(1<<9)) != 0, WBNOINVD)
fs.setIf((b&(1<<8)) != 0, MCOMMIT)
- fs.setIf((b&(1<<13)) != 0, INT_WBINVD)
fs.setIf((b&(1<<4)) != 0, RDPRU)
fs.setIf((b&(1<<3)) != 0, INVLPGB)
fs.setIf((b&(1<<1)) != 0, MSRIRC)
fs.setIf((b&(1<<0)) != 0, CLZERO)
}
+ if fs.inSet(SVM) && maxExtendedFunction() >= 0x8000000A {
+ _, _, _, edx := cpuid(0x8000000A)
+ fs.setIf((edx>>0)&1 == 1, SVMNP)
+ fs.setIf((edx>>1)&1 == 1, LBRVIRT)
+ fs.setIf((edx>>2)&1 == 1, SVML)
+ fs.setIf((edx>>3)&1 == 1, NRIPS)
+ fs.setIf((edx>>4)&1 == 1, TSCRATEMSR)
+ fs.setIf((edx>>5)&1 == 1, VMCBCLEAN)
+ fs.setIf((edx>>6)&1 == 1, SVMFBASID)
+ fs.setIf((edx>>7)&1 == 1, SVMDA)
+ fs.setIf((edx>>10)&1 == 1, SVMPF)
+ fs.setIf((edx>>12)&1 == 1, SVMPFT)
+ }
+
+ if maxExtendedFunction() >= 0x8000001a {
+ eax, _, _, _ := cpuid(0x8000001a)
+ fs.setIf((eax>>0)&1 == 1, FP128)
+ fs.setIf((eax>>1)&1 == 1, MOVU)
+ fs.setIf((eax>>2)&1 == 1, FP256)
+ }
+
if maxExtendedFunction() >= 0x8000001b && fs.inSet(IBS) {
eax, _, _, _ := cpuid(0x8000001b)
fs.setIf((eax>>0)&1 == 1, IBSFFV)
@@ -1104,6 +1357,28 @@ func support() flagSet {
fs.setIf((eax>>5)&1 == 1, IBSBRNTRGT)
fs.setIf((eax>>6)&1 == 1, IBSOPCNTEXT)
fs.setIf((eax>>7)&1 == 1, IBSRIPINVALIDCHK)
+ fs.setIf((eax>>8)&1 == 1, IBS_OPFUSE)
+ fs.setIf((eax>>9)&1 == 1, IBS_FETCH_CTLX)
+ fs.setIf((eax>>10)&1 == 1, IBS_OPDATA4) // Doc says "Fixed,0. IBS op data 4 MSR supported", but assuming they mean 1.
+ fs.setIf((eax>>11)&1 == 1, IBS_ZEN4)
+ }
+
+ if maxExtendedFunction() >= 0x8000001f && vend == AMD {
+ a, _, _, _ := cpuid(0x8000001f)
+ fs.setIf((a>>0)&1 == 1, SME)
+ fs.setIf((a>>1)&1 == 1, SEV)
+ fs.setIf((a>>2)&1 == 1, MSR_PAGEFLUSH)
+ fs.setIf((a>>3)&1 == 1, SEV_ES)
+ fs.setIf((a>>4)&1 == 1, SEV_SNP)
+ fs.setIf((a>>5)&1 == 1, VMPL)
+ fs.setIf((a>>10)&1 == 1, SME_COHERENT)
+ fs.setIf((a>>11)&1 == 1, SEV_64BIT)
+ fs.setIf((a>>12)&1 == 1, SEV_RESTRICTED)
+ fs.setIf((a>>13)&1 == 1, SEV_ALTERNATIVE)
+ fs.setIf((a>>14)&1 == 1, SEV_DEBUGSWAP)
+ fs.setIf((a>>15)&1 == 1, IBS_PREVENTHOST)
+ fs.setIf((a>>16)&1 == 1, VTE)
+ fs.setIf((a>>24)&1 == 1, VMSA_REGPROT)
}
return fs
diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
index 35678d8a..c946824e 100644
--- a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
+++ b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
@@ -24,7 +24,7 @@ func addInfo(c *CPUInfo, safe bool) {
c.maxExFunc = maxExtendedFunction()
c.BrandName = brandName()
c.CacheLine = cacheLine()
- c.Family, c.Model = familyModel()
+ c.Family, c.Model, c.Stepping = familyModel()
c.featureSet = support()
c.SGX = hasSGX(c.featureSet.inSet(SGX), c.featureSet.inSet(SGXLC))
c.ThreadsPerCore = threadsPerCore()
diff --git a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go
index 02fe232a..8b6cd2b7 100644
--- a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go
+++ b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go
@@ -13,137 +13,207 @@ func _() {
_ = x[AMD3DNOW-3]
_ = x[AMD3DNOWEXT-4]
_ = x[AMXBF16-5]
- _ = x[AMXINT8-6]
- _ = x[AMXTILE-7]
- _ = x[AVX-8]
- _ = x[AVX2-9]
- _ = x[AVX512BF16-10]
- _ = x[AVX512BITALG-11]
- _ = x[AVX512BW-12]
- _ = x[AVX512CD-13]
- _ = x[AVX512DQ-14]
- _ = x[AVX512ER-15]
- _ = x[AVX512F-16]
- _ = x[AVX512FP16-17]
- _ = x[AVX512IFMA-18]
- _ = x[AVX512PF-19]
- _ = x[AVX512VBMI-20]
- _ = x[AVX512VBMI2-21]
- _ = x[AVX512VL-22]
- _ = x[AVX512VNNI-23]
- _ = x[AVX512VP2INTERSECT-24]
- _ = x[AVX512VPOPCNTDQ-25]
- _ = x[AVXSLOW-26]
- _ = x[BMI1-27]
- _ = x[BMI2-28]
- _ = x[CETIBT-29]
- _ = x[CETSS-30]
- _ = x[CLDEMOTE-31]
- _ = x[CLMUL-32]
- _ = x[CLZERO-33]
- _ = x[CMOV-34]
- _ = x[CMPXCHG8-35]
- _ = x[CPBOOST-36]
- _ = x[CX16-37]
- _ = x[ENQCMD-38]
- _ = x[ERMS-39]
- _ = x[F16C-40]
- _ = x[FMA3-41]
- _ = x[FMA4-42]
- _ = x[FXSR-43]
- _ = x[FXSROPT-44]
- _ = x[GFNI-45]
- _ = x[HLE-46]
- _ = x[HTT-47]
- _ = x[HWA-48]
- _ = x[HYPERVISOR-49]
- _ = x[IBPB-50]
- _ = x[IBS-51]
- _ = x[IBSBRNTRGT-52]
- _ = x[IBSFETCHSAM-53]
- _ = x[IBSFFV-54]
- _ = x[IBSOPCNT-55]
- _ = x[IBSOPCNTEXT-56]
- _ = x[IBSOPSAM-57]
- _ = x[IBSRDWROPCNT-58]
- _ = x[IBSRIPINVALIDCHK-59]
- _ = x[INT_WBINVD-60]
- _ = x[INVLPGB-61]
- _ = x[LAHF-62]
- _ = x[LZCNT-63]
- _ = x[MCAOVERFLOW-64]
- _ = x[MCOMMIT-65]
- _ = x[MMX-66]
- _ = x[MMXEXT-67]
- _ = x[MOVBE-68]
- _ = x[MOVDIR64B-69]
- _ = x[MOVDIRI-70]
- _ = x[MPX-71]
- _ = x[MSRIRC-72]
- _ = x[NX-73]
- _ = x[OSXSAVE-74]
- _ = x[POPCNT-75]
- _ = x[RDPRU-76]
- _ = x[RDRAND-77]
- _ = x[RDSEED-78]
- _ = x[RDTSCP-79]
- _ = x[RTM-80]
- _ = x[RTM_ALWAYS_ABORT-81]
- _ = x[SCE-82]
- _ = x[SERIALIZE-83]
- _ = x[SGX-84]
- _ = x[SGXLC-85]
- _ = x[SHA-86]
- _ = x[SSE-87]
- _ = x[SSE2-88]
- _ = x[SSE3-89]
- _ = x[SSE4-90]
- _ = x[SSE42-91]
- _ = x[SSE4A-92]
- _ = x[SSSE3-93]
- _ = x[STIBP-94]
- _ = x[SUCCOR-95]
- _ = x[TBM-96]
- _ = x[TSXLDTRK-97]
- _ = x[VAES-98]
- _ = x[VMX-99]
- _ = x[VPCLMULQDQ-100]
- _ = x[WAITPKG-101]
- _ = x[WBNOINVD-102]
- _ = x[X87-103]
- _ = x[XOP-104]
- _ = x[XSAVE-105]
- _ = x[AESARM-106]
- _ = x[ARMCPUID-107]
- _ = x[ASIMD-108]
- _ = x[ASIMDDP-109]
- _ = x[ASIMDHP-110]
- _ = x[ASIMDRDM-111]
- _ = x[ATOMICS-112]
- _ = x[CRC32-113]
- _ = x[DCPOP-114]
- _ = x[EVTSTRM-115]
- _ = x[FCMA-116]
- _ = x[FP-117]
- _ = x[FPHP-118]
- _ = x[GPA-119]
- _ = x[JSCVT-120]
- _ = x[LRCPC-121]
- _ = x[PMULL-122]
- _ = x[SHA1-123]
- _ = x[SHA2-124]
- _ = x[SHA3-125]
- _ = x[SHA512-126]
- _ = x[SM3-127]
- _ = x[SM4-128]
- _ = x[SVE-129]
- _ = x[lastID-130]
+ _ = x[AMXFP16-6]
+ _ = x[AMXINT8-7]
+ _ = x[AMXTILE-8]
+ _ = x[AVX-9]
+ _ = x[AVX2-10]
+ _ = x[AVX512BF16-11]
+ _ = x[AVX512BITALG-12]
+ _ = x[AVX512BW-13]
+ _ = x[AVX512CD-14]
+ _ = x[AVX512DQ-15]
+ _ = x[AVX512ER-16]
+ _ = x[AVX512F-17]
+ _ = x[AVX512FP16-18]
+ _ = x[AVX512IFMA-19]
+ _ = x[AVX512PF-20]
+ _ = x[AVX512VBMI-21]
+ _ = x[AVX512VBMI2-22]
+ _ = x[AVX512VL-23]
+ _ = x[AVX512VNNI-24]
+ _ = x[AVX512VP2INTERSECT-25]
+ _ = x[AVX512VPOPCNTDQ-26]
+ _ = x[AVXIFMA-27]
+ _ = x[AVXNECONVERT-28]
+ _ = x[AVXSLOW-29]
+ _ = x[AVXVNNI-30]
+ _ = x[AVXVNNIINT8-31]
+ _ = x[BMI1-32]
+ _ = x[BMI2-33]
+ _ = x[CETIBT-34]
+ _ = x[CETSS-35]
+ _ = x[CLDEMOTE-36]
+ _ = x[CLMUL-37]
+ _ = x[CLZERO-38]
+ _ = x[CMOV-39]
+ _ = x[CMPCCXADD-40]
+ _ = x[CMPSB_SCADBS_SHORT-41]
+ _ = x[CMPXCHG8-42]
+ _ = x[CPBOOST-43]
+ _ = x[CPPC-44]
+ _ = x[CX16-45]
+ _ = x[EFER_LMSLE_UNS-46]
+ _ = x[ENQCMD-47]
+ _ = x[ERMS-48]
+ _ = x[F16C-49]
+ _ = x[FLUSH_L1D-50]
+ _ = x[FMA3-51]
+ _ = x[FMA4-52]
+ _ = x[FP128-53]
+ _ = x[FP256-54]
+ _ = x[FSRM-55]
+ _ = x[FXSR-56]
+ _ = x[FXSROPT-57]
+ _ = x[GFNI-58]
+ _ = x[HLE-59]
+ _ = x[HRESET-60]
+ _ = x[HTT-61]
+ _ = x[HWA-62]
+ _ = x[HYBRID_CPU-63]
+ _ = x[HYPERVISOR-64]
+ _ = x[IA32_ARCH_CAP-65]
+ _ = x[IA32_CORE_CAP-66]
+ _ = x[IBPB-67]
+ _ = x[IBRS-68]
+ _ = x[IBRS_PREFERRED-69]
+ _ = x[IBRS_PROVIDES_SMP-70]
+ _ = x[IBS-71]
+ _ = x[IBSBRNTRGT-72]
+ _ = x[IBSFETCHSAM-73]
+ _ = x[IBSFFV-74]
+ _ = x[IBSOPCNT-75]
+ _ = x[IBSOPCNTEXT-76]
+ _ = x[IBSOPSAM-77]
+ _ = x[IBSRDWROPCNT-78]
+ _ = x[IBSRIPINVALIDCHK-79]
+ _ = x[IBS_FETCH_CTLX-80]
+ _ = x[IBS_OPDATA4-81]
+ _ = x[IBS_OPFUSE-82]
+ _ = x[IBS_PREVENTHOST-83]
+ _ = x[IBS_ZEN4-84]
+ _ = x[INT_WBINVD-85]
+ _ = x[INVLPGB-86]
+ _ = x[LAHF-87]
+ _ = x[LAM-88]
+ _ = x[LBRVIRT-89]
+ _ = x[LZCNT-90]
+ _ = x[MCAOVERFLOW-91]
+ _ = x[MCDT_NO-92]
+ _ = x[MCOMMIT-93]
+ _ = x[MD_CLEAR-94]
+ _ = x[MMX-95]
+ _ = x[MMXEXT-96]
+ _ = x[MOVBE-97]
+ _ = x[MOVDIR64B-98]
+ _ = x[MOVDIRI-99]
+ _ = x[MOVSB_ZL-100]
+ _ = x[MOVU-101]
+ _ = x[MPX-102]
+ _ = x[MSRIRC-103]
+ _ = x[MSR_PAGEFLUSH-104]
+ _ = x[NRIPS-105]
+ _ = x[NX-106]
+ _ = x[OSXSAVE-107]
+ _ = x[PCONFIG-108]
+ _ = x[POPCNT-109]
+ _ = x[PPIN-110]
+ _ = x[PREFETCHI-111]
+ _ = x[PSFD-112]
+ _ = x[RDPRU-113]
+ _ = x[RDRAND-114]
+ _ = x[RDSEED-115]
+ _ = x[RDTSCP-116]
+ _ = x[RTM-117]
+ _ = x[RTM_ALWAYS_ABORT-118]
+ _ = x[SERIALIZE-119]
+ _ = x[SEV-120]
+ _ = x[SEV_64BIT-121]
+ _ = x[SEV_ALTERNATIVE-122]
+ _ = x[SEV_DEBUGSWAP-123]
+ _ = x[SEV_ES-124]
+ _ = x[SEV_RESTRICTED-125]
+ _ = x[SEV_SNP-126]
+ _ = x[SGX-127]
+ _ = x[SGXLC-128]
+ _ = x[SHA-129]
+ _ = x[SME-130]
+ _ = x[SME_COHERENT-131]
+ _ = x[SPEC_CTRL_SSBD-132]
+ _ = x[SRBDS_CTRL-133]
+ _ = x[SSE-134]
+ _ = x[SSE2-135]
+ _ = x[SSE3-136]
+ _ = x[SSE4-137]
+ _ = x[SSE42-138]
+ _ = x[SSE4A-139]
+ _ = x[SSSE3-140]
+ _ = x[STIBP-141]
+ _ = x[STIBP_ALWAYSON-142]
+ _ = x[STOSB_SHORT-143]
+ _ = x[SUCCOR-144]
+ _ = x[SVM-145]
+ _ = x[SVMDA-146]
+ _ = x[SVMFBASID-147]
+ _ = x[SVML-148]
+ _ = x[SVMNP-149]
+ _ = x[SVMPF-150]
+ _ = x[SVMPFT-151]
+ _ = x[SYSCALL-152]
+ _ = x[SYSEE-153]
+ _ = x[TBM-154]
+ _ = x[TLB_FLUSH_NESTED-155]
+ _ = x[TME-156]
+ _ = x[TOPEXT-157]
+ _ = x[TSCRATEMSR-158]
+ _ = x[TSXLDTRK-159]
+ _ = x[VAES-160]
+ _ = x[VMCBCLEAN-161]
+ _ = x[VMPL-162]
+ _ = x[VMSA_REGPROT-163]
+ _ = x[VMX-164]
+ _ = x[VPCLMULQDQ-165]
+ _ = x[VTE-166]
+ _ = x[WAITPKG-167]
+ _ = x[WBNOINVD-168]
+ _ = x[X87-169]
+ _ = x[XGETBV1-170]
+ _ = x[XOP-171]
+ _ = x[XSAVE-172]
+ _ = x[XSAVEC-173]
+ _ = x[XSAVEOPT-174]
+ _ = x[XSAVES-175]
+ _ = x[AESARM-176]
+ _ = x[ARMCPUID-177]
+ _ = x[ASIMD-178]
+ _ = x[ASIMDDP-179]
+ _ = x[ASIMDHP-180]
+ _ = x[ASIMDRDM-181]
+ _ = x[ATOMICS-182]
+ _ = x[CRC32-183]
+ _ = x[DCPOP-184]
+ _ = x[EVTSTRM-185]
+ _ = x[FCMA-186]
+ _ = x[FP-187]
+ _ = x[FPHP-188]
+ _ = x[GPA-189]
+ _ = x[JSCVT-190]
+ _ = x[LRCPC-191]
+ _ = x[PMULL-192]
+ _ = x[SHA1-193]
+ _ = x[SHA2-194]
+ _ = x[SHA3-195]
+ _ = x[SHA512-196]
+ _ = x[SM3-197]
+ _ = x[SM4-198]
+ _ = x[SVE-199]
+ _ = x[lastID-200]
_ = x[firstID-0]
}
-const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXINT8AMXTILEAVXAVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXSLOWBMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPXCHG8CPBOOSTCX16ENQCMDERMSF16CFMA3FMA4FXSRFXSROPTGFNIHLEHTTHWAHYPERVISORIBPBIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKINT_WBINVDINVLPGBLAHFLZCNTMCAOVERFLOWMCOMMITMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMPXMSRIRCNXOSXSAVEPOPCNTRDPRURDRANDRDSEEDRDTSCPRTMRTM_ALWAYS_ABORTSCESERIALIZESGXSGXLCSHASSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSUCCORTBMTSXLDTRKVAESVMXVPCLMULQDQWAITPKGWBNOINVDX87XOPXSAVEAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID"
+const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXFP16AMXINT8AMXTILEAVXAVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXIFMAAVXNECONVERTAVXSLOWAVXVNNIAVXVNNIINT8BMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPCCXADDCMPSB_SCADBS_SHORTCMPXCHG8CPBOOSTCPPCCX16EFER_LMSLE_UNSENQCMDERMSF16CFLUSH_L1DFMA3FMA4FP128FP256FSRMFXSRFXSROPTGFNIHLEHRESETHTTHWAHYBRID_CPUHYPERVISORIA32_ARCH_CAPIA32_CORE_CAPIBPBIBRSIBRS_PREFERREDIBRS_PROVIDES_SMPIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKIBS_FETCH_CTLXIBS_OPDATA4IBS_OPFUSEIBS_PREVENTHOSTIBS_ZEN4INT_WBINVDINVLPGBLAHFLAMLBRVIRTLZCNTMCAOVERFLOWMCDT_NOMCOMMITMD_CLEARMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMOVSB_ZLMOVUMPXMSRIRCMSR_PAGEFLUSHNRIPSNXOSXSAVEPCONFIGPOPCNTPPINPREFETCHIPSFDRDPRURDRANDRDSEEDRDTSCPRTMRTM_ALWAYS_ABORTSERIALIZESEVSEV_64BITSEV_ALTERNATIVESEV_DEBUGSWAPSEV_ESSEV_RESTRICTEDSEV_SNPSGXSGXLCSHASMESME_COHERENTSPEC_CTRL_SSBDSRBDS_CTRLSSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSTIBP_ALWAYSONSTOSB_SHORTSUCCORSVMSVMDASVMFBASIDSVMLSVMNPSVMPFSVMPFTSYSCALLSYSEETBMTLB_FLUSH_NESTEDTMETOPEXTTSCRATEMSRTSXLDTRKVAESVMCBCLEANVMPLVMSA_REGPROTVMXVPCLMULQDQVTEWAITPKGWBNOINVDX87XGETBV1XOPXSAVEXSAVECXSAVEOPTXSAVESAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID"
-var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 58, 62, 72, 84, 92, 100, 108, 116, 123, 133, 143, 151, 161, 172, 180, 190, 208, 223, 230, 234, 238, 244, 249, 257, 262, 268, 272, 280, 287, 291, 297, 301, 305, 309, 313, 317, 324, 328, 331, 334, 337, 347, 351, 354, 364, 375, 381, 389, 400, 408, 420, 436, 446, 453, 457, 462, 473, 480, 483, 489, 494, 503, 510, 513, 519, 521, 528, 534, 539, 545, 551, 557, 560, 576, 579, 588, 591, 596, 599, 602, 606, 610, 614, 619, 624, 629, 634, 640, 643, 651, 655, 658, 668, 675, 683, 686, 689, 694, 700, 708, 713, 720, 727, 735, 742, 747, 752, 759, 763, 765, 769, 772, 777, 782, 787, 791, 795, 799, 805, 808, 811, 814, 820}
+var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 62, 65, 69, 79, 91, 99, 107, 115, 123, 130, 140, 150, 158, 168, 179, 187, 197, 215, 230, 237, 249, 256, 263, 274, 278, 282, 288, 293, 301, 306, 312, 316, 325, 343, 351, 358, 362, 366, 380, 386, 390, 394, 403, 407, 411, 416, 421, 425, 429, 436, 440, 443, 449, 452, 455, 465, 475, 488, 501, 505, 509, 523, 540, 543, 553, 564, 570, 578, 589, 597, 609, 625, 639, 650, 660, 675, 683, 693, 700, 704, 707, 714, 719, 730, 737, 744, 752, 755, 761, 766, 775, 782, 790, 794, 797, 803, 816, 821, 823, 830, 837, 843, 847, 856, 860, 865, 871, 877, 883, 886, 902, 911, 914, 923, 938, 951, 957, 971, 978, 981, 986, 989, 992, 1004, 1018, 1028, 1031, 1035, 1039, 1043, 1048, 1053, 1058, 1063, 1077, 1088, 1094, 1097, 1102, 1111, 1115, 1120, 1125, 1131, 1138, 1143, 1146, 1162, 1165, 1171, 1181, 1189, 1193, 1202, 1206, 1218, 1221, 1231, 1234, 1241, 1249, 1252, 1259, 1262, 1267, 1273, 1281, 1287, 1293, 1301, 1306, 1313, 1320, 1328, 1335, 1340, 1345, 1352, 1356, 1358, 1362, 1365, 1370, 1375, 1380, 1384, 1388, 1392, 1398, 1401, 1404, 1407, 1413}
func (i FeatureID) String() string {
if i < 0 || i >= FeatureID(len(_FeatureID_index)-1) {
diff --git a/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go
index 8d2cb036..84b1acd2 100644
--- a/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go
+++ b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go
@@ -2,18 +2,120 @@
package cpuid
-import "runtime"
+import (
+ "runtime"
+ "strings"
+
+ "golang.org/x/sys/unix"
+)
+// detectOS populates c on darwin/arm64. On non-iOS targets it first calls
+// tryToFillCPUInfoFomSysctl(c), then passes runtime.GOOS != "ios" to setIf for
+// the baseline AESARM, PMULL, SHA1 and SHA2 features. It always returns true.
func detectOS(c *CPUInfo) bool {
+ if runtime.GOOS != "ios" {
+ tryToFillCPUInfoFomSysctl(c)
+ }
// There are no hw.optional sysctl values for the below features on Mac OS 11.0
// to detect their supported state dynamically. Assume the CPU features that
// Apple Silicon M1 supports to be available as a minimal set of features
// to all Go programs running on darwin/arm64.
// TODO: Add more if we know them.
c.featureSet.setIf(runtime.GOOS != "ios", AESARM, PMULL, SHA1, SHA2)
- c.PhysicalCores = runtime.NumCPU()
- // For now assuming 1 thread per core...
- c.ThreadsPerCore = 1
- c.LogicalCores = c.PhysicalCores
+
return true
}
+
+// sysctlGetBool reports whether the named sysctl can be read as a uint32 and
+// is non-zero. A lookup error is treated the same as a zero value.
+func sysctlGetBool(name string) bool {
+	v, err := unix.SysctlUint32(name)
+	return err == nil && v != 0
+}
+
+// sysctlGetString returns the string value of the named sysctl, or "" when
+// the lookup fails.
+func sysctlGetString(name string) string {
+	if s, err := unix.Sysctl(name); err == nil {
+		return s
+	}
+	return ""
+}
+
+// sysctlGetInt tries each name in order as a uint32 sysctl and returns the
+// first value that is both readable and non-zero, converted to int. If no
+// name yields such a value, unknown is returned.
+func sysctlGetInt(unknown int, names ...string) int {
+	for i := range names {
+		if v, err := unix.SysctlUint32(names[i]); err == nil && v != 0 {
+			return int(v)
+		}
+	}
+	return unknown
+}
+
+// sysctlGetInt64 tries each name in order as a uint64 sysctl and returns the
+// first readable value whose int conversion differs from unknown. If no name
+// yields such a value, unknown is returned.
+func sysctlGetInt64(unknown int, names ...string) int {
+	for i := range names {
+		raw, err := unix.SysctlUint64(names[i])
+		if err != nil {
+			continue
+		}
+		if n := int(raw); n != unknown {
+			return n
+		}
+	}
+	return unknown
+}
+
+// setFeature looks up the named sysctl with sysctlGetBool and hands the
+// result, together with feature, to c.featureSet.setIf.
+func setFeature(c *CPUInfo, name string, feature FeatureID) {
+	enabled := sysctlGetBool(name)
+	c.featureSet.setIf(enabled, feature)
+}
+// tryToFillCPUInfoFomSysctl fills the brand/vendor strings, core counts,
+// family/model, cache sizes and ARM feature flags of c from sysctl values.
+// Lookups that fail fall back to the defaults passed to the sysctlGet*
+// helpers, so the function never reports an error.
+func tryToFillCPUInfoFomSysctl(c *CPUInfo) {
+	c.BrandName = sysctlGetString("machdep.cpu.brand_string")
+
+	// The vendor is taken to be the first whitespace-separated word of the
+	// brand string. Guard on the split result rather than the raw length: a
+	// whitespace-only brand string yields no fields and indexing [0] would
+	// panic.
+	if fields := strings.Fields(c.BrandName); len(fields) > 0 {
+		c.VendorString = fields[0]
+	}
+
+	c.PhysicalCores = sysctlGetInt(runtime.NumCPU(), "hw.physicalcpu")
+	// The divisor cannot be zero: sysctlGetInt only returns non-zero sysctl
+	// values or the supplied default of 1.
+	c.ThreadsPerCore = sysctlGetInt(1, "machdep.cpu.thread_count", "kern.num_threads") /
+		sysctlGetInt(1, "hw.physicalcpu")
+	c.LogicalCores = sysctlGetInt(runtime.NumCPU(), "machdep.cpu.core_count")
+	c.Family = sysctlGetInt(0, "machdep.cpu.family", "hw.cpufamily")
+	c.Model = sysctlGetInt(0, "machdep.cpu.model")
+	c.CacheLine = sysctlGetInt64(0, "hw.cachelinesize")
+	c.Cache.L1I = sysctlGetInt64(-1, "hw.l1icachesize")
+	c.Cache.L1D = sysctlGetInt64(-1, "hw.l1dcachesize")
+	c.Cache.L2 = sysctlGetInt64(-1, "hw.l2cachesize")
+	c.Cache.L3 = sysctlGetInt64(-1, "hw.l3cachesize")
+
+	// from https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile
+	setFeature(c, "hw.optional.arm.FEAT_AES", AESARM)
+	setFeature(c, "hw.optional.AdvSIMD", ASIMD)
+	setFeature(c, "hw.optional.arm.FEAT_DotProd", ASIMDDP)
+	setFeature(c, "hw.optional.arm.FEAT_RDM", ASIMDRDM)
+	setFeature(c, "hw.optional.FEAT_CRC32", CRC32)
+	setFeature(c, "hw.optional.arm.FEAT_DPB", DCPOP)
+	// setFeature(c, "", EVTSTRM)
+	setFeature(c, "hw.optional.arm.FEAT_FCMA", FCMA)
+	setFeature(c, "hw.optional.arm.FEAT_FP", FP)
+	setFeature(c, "hw.optional.arm.FEAT_FP16", FPHP)
+	setFeature(c, "hw.optional.arm.FEAT_PAuth", GPA)
+	setFeature(c, "hw.optional.arm.FEAT_JSCVT", JSCVT)
+	setFeature(c, "hw.optional.arm.FEAT_LRCPC", LRCPC)
+	setFeature(c, "hw.optional.arm.FEAT_PMULL", PMULL)
+	setFeature(c, "hw.optional.arm.FEAT_SHA1", SHA1)
+	setFeature(c, "hw.optional.arm.FEAT_SHA256", SHA2)
+	setFeature(c, "hw.optional.arm.FEAT_SHA3", SHA3)
+	setFeature(c, "hw.optional.arm.FEAT_SHA512", SHA512)
+	// setFeature(c, "", SM3)
+	// setFeature(c, "", SM4)
+	setFeature(c, "hw.optional.arm.FEAT_SVE", SVE)
+
+	// from empirical observation
+	setFeature(c, "hw.optional.AdvSIMD_HPFPCvt", ASIMDHP)
+	setFeature(c, "hw.optional.armv8_1_atomics", ATOMICS)
+	setFeature(c, "hw.optional.floatingpoint", FP)
+	setFeature(c, "hw.optional.armv8_2_sha3", SHA3)
+	setFeature(c, "hw.optional.armv8_2_sha512", SHA512)
+	setFeature(c, "hw.optional.armv8_3_compnum", FCMA)
+	setFeature(c, "hw.optional.armv8_crc32", CRC32)
+}