summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/s2
diff options
context:
space:
mode:
authordependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>2022-01-18 20:24:14 +0100
committerGitHub <noreply@github.com>2022-01-18 20:24:14 +0100
commitaad60c882e16cd2c8769a49e6d9f87a040590d62 (patch)
tree3bfe1f8953b40f9beb39c69db3a7647ea6de54d2 /vendor/github.com/klauspost/compress/s2
parentfecca575078a21dedb0cab213dde7fd97161c0fa (diff)
downloadmatterbridge-msglm-aad60c882e16cd2c8769a49e6d9f87a040590d62.tar.gz
matterbridge-msglm-aad60c882e16cd2c8769a49e6d9f87a040590d62.tar.bz2
matterbridge-msglm-aad60c882e16cd2c8769a49e6d9f87a040590d62.zip
Bump github.com/mattermost/mattermost-server/v6 from 6.1.0 to 6.3.0 (#1686)
Bumps [github.com/mattermost/mattermost-server/v6](https://github.com/mattermost/mattermost-server) from 6.1.0 to 6.3.0. - [Release notes](https://github.com/mattermost/mattermost-server/releases) - [Changelog](https://github.com/mattermost/mattermost-server/blob/master/CHANGELOG.md) - [Commits](https://github.com/mattermost/mattermost-server/compare/v6.1.0...v6.3.0) --- updated-dependencies: - dependency-name: github.com/mattermost/mattermost-server/v6 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2')
-rw-r--r--vendor/github.com/klauspost/compress/s2/.gitignore15
-rw-r--r--vendor/github.com/klauspost/compress/s2/LICENSE28
-rw-r--r--vendor/github.com/klauspost/compress/s2/README.md717
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode.go565
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_amd64.s568
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_arm64.s574
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_asm.go17
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_other.go267
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode.go1172
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_all.go456
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_amd64.go142
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_best.go604
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_better.go431
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_go.go298
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go189
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s15678
-rw-r--r--vendor/github.com/klauspost/compress/s2/s2.go139
17 files changed, 21860 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/.gitignore b/vendor/github.com/klauspost/compress/s2/.gitignore
new file mode 100644
index 00000000..3a89c6e3
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/.gitignore
@@ -0,0 +1,15 @@
+testdata/bench
+
+# These explicitly listed benchmark data files are for an obsolete version of
+# snappy_test.go.
+testdata/alice29.txt
+testdata/asyoulik.txt
+testdata/fireworks.jpeg
+testdata/geo.protodata
+testdata/html
+testdata/html_x_4
+testdata/kppkn.gtb
+testdata/lcet10.txt
+testdata/paper-100k.pdf
+testdata/plrabn12.txt
+testdata/urls.10K
diff --git a/vendor/github.com/klauspost/compress/s2/LICENSE b/vendor/github.com/klauspost/compress/s2/LICENSE
new file mode 100644
index 00000000..1d2d645b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
+Copyright (c) 2019 Klaus Post. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md
new file mode 100644
index 00000000..81fad652
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/README.md
@@ -0,0 +1,717 @@
+# S2 Compression
+
+S2 is an extension of [Snappy](https://github.com/google/snappy).
+
+S2 is aimed for high throughput, which is why it features concurrent compression for bigger payloads.
+
+Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy.
+This means that S2 can seamlessly replace Snappy without converting compressed content.
+
+S2 can produce Snappy compatible output, faster and better than Snappy.
+If you want full benefit of the changes you should use s2 without Snappy compatibility.
+
+S2 is designed to have high throughput on content that cannot be compressed.
+This is important, so you don't have to worry about spending CPU cycles on already compressed data.
+
+## Benefits over Snappy
+
+* Better compression
+* Adjustable compression (3 levels)
+* Concurrent stream compression
+* Faster decompression, even for Snappy compatible content
+* Ability to quickly skip forward in compressed stream
+* Compatible with reading Snappy compressed content
+* Smaller block size overhead on incompressible blocks
+* Block concatenation
+* Uncompressed stream mode
+* Automatic stream size padding
+* Snappy compatible block compression
+
+## Drawbacks over Snappy
+
+* Not optimized for 32 bit systems.
+* Streams use slightly more memory due to larger blocks and concurrency (configurable).
+
+# Usage
+
+Installation: `go get -u github.com/klauspost/compress/s2`
+
+Full package documentation:
+
+[![godoc][1]][2]
+
+[1]: https://godoc.org/github.com/klauspost/compress?status.svg
+[2]: https://godoc.org/github.com/klauspost/compress/s2
+
+## Compression
+
+```Go
+func EncodeStream(src io.Reader, dst io.Writer) error {
+ enc := s2.NewWriter(dst)
+ _, err := io.Copy(enc, src)
+ if err != nil {
+ enc.Close()
+ return err
+ }
+ // Blocks until compression is done.
+ return enc.Close()
+}
+```
+
+You should always call `enc.Close()`, otherwise you will leak resources and your encode will be incomplete.
+
+For the best throughput, you should attempt to reuse the `Writer` using the `Reset()` method.
+
+The Writer in S2 is always buffered, therefore `NewBufferedWriter` in Snappy can be replaced with `NewWriter` in S2.
+It is possible to flush any buffered data using the `Flush()` method.
+This will block until all data sent to the encoder has been written to the output.
+
+S2 also supports the `io.ReaderFrom` interface, which will consume all input from a reader.
+
+As a final method to compress data, if you have a single block of data you would like to have encoded as a stream,
+a slightly more efficient method is to use the `EncodeBuffer` method.
+This will take ownership of the buffer until the stream is closed.
+
+```Go
+func EncodeStream(src []byte, dst io.Writer) error {
+ enc := s2.NewWriter(dst)
+ // The encoder owns the buffer until Flush or Close is called.
+ err := enc.EncodeBuffer(src)
+ if err != nil {
+ enc.Close()
+ return err
+ }
+ // Blocks until compression is done.
+ return enc.Close()
+}
+```
+
+Each call to `EncodeBuffer` will result in discrete blocks being created without buffering,
+so it should only be used a single time per stream.
+If you need to write several blocks, you should use the regular io.Writer interface.
+
+
+## Decompression
+
+```Go
+func DecodeStream(src io.Reader, dst io.Writer) error {
+ dec := s2.NewReader(src)
+ _, err := io.Copy(dst, dec)
+ return err
+}
+```
+
+Similar to the Writer, a Reader can be reused using the `Reset` method.
+
+For the best possible throughput, there is an `EncodeBuffer(buf []byte)` function available.
+However, it requires that the provided buffer isn't used after it is handed over to S2 and until the stream is flushed or closed.
+
+For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`.
+Do however note that these functions (similar to Snappy) do not provide validation of data,
+so data corruption may be undetected. Stream encoding provides CRC checks of data.
+
+It is possible to efficiently skip forward in a compressed stream using the `Skip()` method.
+For big skips the decompressor is able to skip blocks without decompressing them.
+
+## Single Blocks
+
+Similar to Snappy, S2 offers single block compression.
+Blocks do not offer the same flexibility and safety as streams,
+but may be preferable for very small payloads, less than 100K.
+
+Using a simple `dst := s2.Encode(nil, src)` will compress `src` and return the compressed result.
+It is possible to provide a destination buffer.
+If the buffer has a capacity of `s2.MaxEncodedLen(len(src))` it will be used.
+If not, a new one will be allocated.
+
+Alternatively `EncodeBetter`/`EncodeBest` can also be used for better, but slightly slower compression.
+
+Similarly to decompress a block you can use `dst, err := s2.Decode(nil, src)`.
+Again an optional destination buffer can be supplied.
+The `s2.DecodedLen(src)` can be used to get the minimum capacity needed.
+If that is not satisfied a new buffer will be allocated.
+
+Block functions always operate on a single goroutine since they should only be used for small payloads.
+
+# Commandline tools
+
+Some very simple command-line tools are provided; `s2c` for compression and `s2d` for decompression.
+
+Binaries can be downloaded on the [Releases Page](https://github.com/klauspost/compress/releases).
+
+Installing them requires Go to be installed. To install them, use:
+
+`go install github.com/klauspost/compress/s2/cmd/s2c && go install github.com/klauspost/compress/s2/cmd/s2d`
+
+To build binaries to the current folder use:
+
+`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d`
+
+
+## s2c
+
+```
+Usage: s2c [options] file1 file2
+
+Compresses all files supplied as input separately.
+Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'.
+By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and compressed.
+Only http response code 200 is accepted.
+
+Options:
+ -bench int
+ Run benchmark n times. No output will be written
+ -blocksize string
+ Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M")
+ -c Write all output to stdout. Multiple input files will be concatenated
+ -cpu int
+ Compress using this amount of threads (default 32)
+ -faster
+ Compress faster, but with a minor compression loss
+ -help
+ Display help
+ -o string
+ Write output to another file. Single input file only
+ -pad string
+ Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1")
+ -q Don't write any output to terminal, except errors
+ -rm
+ Delete source file(s) after successful compression
+ -safe
+ Do not overwrite output files
+ -slower
+ Compress more, but a lot slower
+ -snappy
+ Generate Snappy compatible output stream
+ -verify
+ Verify written files
+
+```
+
+## s2d
+
+```
+Usage: s2d [options] file1 file2
+
+Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
+Output file names have the extension removed. By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and decompressed.
+Extensions on downloaded files are ignored. Only http response code 200 is accepted.
+
+Options:
+ -bench int
+ Run benchmark n times. No output will be written
+ -c Write all output to stdout. Multiple input files will be concatenated
+ -help
+ Display help
+ -o string
+ Write output to another file. Single input file only
+ -q Don't write any output to terminal, except errors
+ -rm
+ Delete source file(s) after successful decompression
+ -safe
+ Do not overwrite output files
+ -verify
+ Verify files, but do not write output
+```
+
+## s2sx: self-extracting archives
+
+s2sx allows creating self-extracting archives with no dependencies.
+
+By default, executables are created for the same platforms as the host os,
+but this can be overridden with `-os` and `-arch` parameters.
+
+Extracted files have 0666 permissions, except when the untar option is used.
+
+```
+Usage: s2sx [options] file1 file2
+
+Compresses all files supplied as input separately.
+If files have '.s2' extension they are assumed to be compressed already.
+Output files are written as 'filename.s2sx' and with '.exe' for windows targets.
+If output is big, an additional file with ".more" is written. This must be included as well.
+By default output files will be overwritten.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+Options:
+ -arch string
+ Destination architecture (default "amd64")
+ -c Write all output to stdout. Multiple input files will be concatenated
+ -cpu int
+ Compress using this amount of threads (default 32)
+ -help
+ Display help
+ -max string
+ Maximum executable size. Rest will be written to another file. (default "1G")
+ -os string
+ Destination operating system (default "windows")
+ -q Don't write any output to terminal, except errors
+ -rm
+ Delete source file(s) after successful compression
+ -safe
+ Do not overwrite output files
+ -untar
+ Untar on destination
+```
+
+Available platforms are:
+
+ * darwin-amd64
+ * darwin-arm64
+ * linux-amd64
+ * linux-arm
+ * linux-arm64
+ * linux-mips64
+ * linux-ppc64le
+ * windows-386
+ * windows-amd64
+
+By default, there is a size limit of 1GB for the output executable.
+
+When this is exceeded the remaining file content is written to a file called
+output+`.more`. This file must be included and placed alongside the
+executable for a successful extraction.
+
+This file *must* have the same name as the executable, so if the executable is renamed,
+so must the `.more` file.
+
+This functionality is disabled with stdin/stdout.
+
+### Self-extracting TAR files
+
+If you wrap a TAR file you can specify `-untar` to make it untar on the destination host.
+
+Files are extracted to the current folder with the path specified in the tar file.
+
+Note that tar files are not validated before they are wrapped.
+
+For security reasons files that move below the root folder are not allowed.
+
+# Performance
+
+This section will focus on comparisons to Snappy.
+This package is solely aimed at replacing Snappy as a high speed compression package.
+If you are mainly looking for better compression [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd)
+gives better compression, but typically at speeds slightly below "better" mode in this package.
+
+Compression is increased compared to Snappy, mostly around 5-20% and the throughput is typically 25-40% increased (single threaded) compared to the Snappy Go implementation.
+
+Streams are concurrently compressed. The stream will be distributed among all available CPU cores for the best possible throughput.
+
+A "better" compression mode is also available. This allows to trade a bit of speed for a minor compression gain.
+The content compressed in this mode is fully compatible with the standard decoder.
+
+Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU):
+
+| File | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
+|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
+| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 12.70x | 10556 MB/s | 7.35% | 4.15x | 3455 MB/s | 12.79% |
+| (1 CPU) | 1.14x | 948 MB/s | - | 0.42x | 349 MB/s | - |
+| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x | 14484 MB/s | 31.60% | 10.09x | 8533 MB/s | 37.71% |
+| (1 CPU) | 1.33x | 1127 MB/s | - | 0.70x | 589 MB/s | - |
+| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12000 MB/s | -5.79% | 6.59x | 5223 MB/s | 5.80% |
+| (1 CPU) | 1.11x | 877 MB/s | - | 0.47x | 370 MB/s | - |
+| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 14.62x | 12116 MB/s | 15.90% | 5.35x | 4430 MB/s | 16.08% |
+| (1 CPU) | 1.38x | 1146 MB/s | - | 0.38x | 312 MB/s | - |
+| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 8.83x | 17579 MB/s | 43.86% | 6.54x | 13011 MB/s | 47.23% |
+| (1 CPU) | 1.14x | 2259 MB/s | - | 0.74x | 1475 MB/s | - |
+| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 16.72x | 14019 MB/s | 24.02% | 10.11x | 8477 MB/s | 30.48% |
+| (1 CPU) | 1.24x | 1043 MB/s | - | 0.70x | 586 MB/s | - |
+| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9254 MB/s | 1.84% | 6.75x | 4686 MB/s | 6.72% |
+| (1 CPU) | 0.97x | 672 MB/s | - | 0.53x | 366 MB/s | - |
+| sharnd.out.2gb | 2.11x | 12639 MB/s | 0.01% | 1.98x | 11833 MB/s | 0.01% |
+| (1 CPU) | 0.93x | 5594 MB/s | - | 1.34x | 8030 MB/s | - |
+| [enwik9](http://mattmahoney.net/dc/textdata.html) | 19.34x | 8220 MB/s | 3.98% | 7.87x | 3345 MB/s | 15.82% |
+| (1 CPU) | 1.06x | 452 MB/s | - | 0.50x | 213 MB/s | - |
+| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 10.48x | 6124 MB/s | 5.67% | 3.76x | 2197 MB/s | 12.60% |
+| (1 CPU) | 0.97x | 568 MB/s | - | 0.46x | 271 MB/s | - |
+| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 21.07x | 9020 MB/s | 6.36% | 6.91x | 2959 MB/s | 16.95% |
+| (1 CPU) | 1.07x | 460 MB/s | - | 0.51x | 220 MB/s | - |
+
+### Legend
+
+* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
+* `S2 throughput`: Throughput of S2 in MB/s.
+* `S2 % smaller`: How many percent of the Snappy output size is S2 better.
+* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy.
+* `"better" throughput`: Throughput of S2 in MB/s when "better" compression mode is enabled.
+* `"better" % smaller`: How many percent of the Snappy output size is S2 better when using "better" compression.
+
+There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads.
+
+Machine generated data gets by far the biggest compression boost, with size being reduced by up to 45% of Snappy size.
+
+The "better" compression mode sees a good improvement in all cases, but usually at a performance cost.
+
+Incompressible content (`sharnd.out.2gb`, 2GB random data) sees the smallest speedup.
+This is likely dominated by synchronization overhead, which is confirmed by the fact that single threaded performance is higher (see above).
+
+## Decompression
+
+S2 attempts to create content that is also fast to decompress, except in "better" mode where the smallest representation is used.
+
+S2 vs Snappy **decompression** speed. Both operating on single core:
+
+| File | S2 Throughput | vs. Snappy | Better Throughput | vs. Snappy |
+|-----------------------------------------------------------------------------------------------------|---------------|------------|-------------------|------------|
+| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 2117 MB/s | 1.14x | 1738 MB/s | 0.94x |
+| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 2401 MB/s | 1.25x | 2307 MB/s | 1.20x |
+| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 2075 MB/s | 0.98x | 1764 MB/s | 0.83x |
+| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 2967 MB/s | 1.05x | 2885 MB/s | 1.02x |
+| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 4141 MB/s | 1.07x | 4184 MB/s | 1.08x |
+| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 2264 MB/s | 1.12x | 2185 MB/s | 1.08x |
+| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 1525 MB/s | 1.03x | 1347 MB/s | 0.91x |
+| sharnd.out.2gb | 3813 MB/s | 0.79x | 3900 MB/s | 0.81x |
+| [enwik9](http://mattmahoney.net/dc/textdata.html) | 1246 MB/s | 1.29x | 967 MB/s | 1.00x |
+| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 1433 MB/s | 1.12x | 1203 MB/s | 0.94x |
+| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 1284 MB/s | 1.32x | 1010 MB/s | 1.04x |
+
+### Legend
+
+* `S2 Throughput`: Decompression speed of S2 encoded content.
+* `Better Throughput`: Decompression speed of S2 "better" encoded content.
+* `vs Snappy`: Decompression speed of S2 "better" mode compared to Snappy and absolute speed.
+
+
+While the decompression code hasn't changed, there is a significant speedup in decompression speed.
+S2 prefers longer matches and will typically only find matches that are 6 bytes or longer.
+While this reduces compression a bit, it improves decompression speed.
+
+The "better" compression mode will actively look for shorter matches, which is why it has a decompression speed quite similar to Snappy.
+
+Without assembly decompression is also very fast; single goroutine decompression speed. No assembly:
+
+| File | S2 Throughput | S2 throughput |
+|--------------------------------|--------------|---------------|
+| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s |
+| 10gb.tar.s2 | 1.30x | 867.07 MB/s |
+| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s |
+| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s |
+| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s |
+| enwik9.s2 | 1.67x | 681.53 MB/s |
+| adresser.json.s2 | 3.41x | 4230.53 MB/s |
+| silesia.tar.s2 | 1.52x | 811.58 MB/s |
+
+Even though S2 typically compresses better than Snappy, decompression speed is always better.
+
+## Block compression
+
+
+When compressing blocks, no concurrent compression is performed, just as with Snappy.
+This is because blocks are for smaller payloads and generally will not benefit from concurrent compression.
+
+An important change is that incompressible blocks will not be more than at most 10 bytes bigger than the input.
+In rare, worst case scenario Snappy blocks could be significantly bigger than the input.
+
+### Mixed content blocks
+
+The most reliable benchmark is a wide dataset.
+For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
+53927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
+
+| * | Input | Output | Reduction | MB/s |
+|-------------------|------------|------------|-----------|--------|
+| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** |
+| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 |
+| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 |
+| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 |
+| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 |
+| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 |
+
+S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".
+"Better" mode provides the same compression speed as LZ4 with better compression ratio.
+
+When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression.
+
+As can be seen from the other benchmarks decompression should also be easier on the S2 generated output.
+
+Though they cannot be compared due to different decompression speeds here are the speed/size comparisons for
+other Go compressors:
+
+| * | Input | Output | Reduction | MB/s |
+|-------------------|------------|------------|-----------|--------|
+| Zstd Fastest (Go) | 4014735833 | 794608518 | 80.21% | 236.04 |
+| Zstd Best (Go) | 4014735833 | 704603356 | 82.45% | 35.63 |
+| Deflate (Go) l1 | 4014735833 | 871294239 | 78.30% | 214.04 |
+| Deflate (Go) l9 | 4014735833 | 730389060 | 81.81% | 41.17 |
+
+### Standard block compression
+
+Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
+So individual benchmarks should only be seen as a guideline and the overall picture is more important.
+
+These micro-benchmarks are with data in cache and trained branch predictors. For a more realistic benchmark see the mixed content above.
+
+Block compression. Parallel benchmark running on 16 cores, 16 goroutines.
+
+AMD64 assembly is used for both S2 and Snappy.
+
+| Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec |
+|-----------------------|-------------|---------|--------------|-------------|-------------|-------------|
+| html | 22843 | 21111 | 16246 MB/s | 17438 MB/s | 40972 MB/s | 49263 MB/s |
+| urls.10K | 335492 | 287326 | 7943 MB/s | 9693 MB/s | 22523 MB/s | 26484 MB/s |
+| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 273889 MB/s | 718321 MB/s | 827552 MB/s |
+| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 17773 MB/s | 33691 MB/s | 52421 MB/s |
+| paper-100k.pdf | 85304 | 84459 | 167546 MB/s | 101263 MB/s | 326905 MB/s | 291944 MB/s |
+| html_x_4 | 92234 | 21113 | 15194 MB/s | 50670 MB/s | 30843 MB/s | 32217 MB/s |
+| alice29.txt | 88034 | 85975 | 5936 MB/s | 6139 MB/s | 12882 MB/s | 20044 MB/s |
+| asyoulik.txt | 77503 | 79650 | 5517 MB/s | 6366 MB/s | 12735 MB/s | 22806 MB/s |
+| lcet10.txt | 234661 | 220670 | 6235 MB/s | 6067 MB/s | 14519 MB/s | 18697 MB/s |
+| plrabn12.txt | 319267 | 317985 | 5159 MB/s | 5726 MB/s | 11923 MB/s | 19901 MB/s |
+| geo.protodata | 23335 | 18690 | 21220 MB/s | 26529 MB/s | 56271 MB/s | 62540 MB/s |
+| kppkn.gtb | 69526 | 65312 | 9732 MB/s | 8559 MB/s | 18491 MB/s | 18969 MB/s |
+| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 15489 MB/s | 31883 MB/s | 38874 MB/s |
+| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13000 MB/s | 48056 MB/s | 52341 MB/s |
+| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12806 MB/s | 32378 MB/s | 46322 MB/s |
+| alice29.txt (20000B) | 12686 | 13574 | 7733 MB/s | 11210 MB/s | 30566 MB/s | 58969 MB/s |
+
+
+| Relative Perf | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed |
+|-----------------------|-------------|------------------|----------|--------------|
+| html | 22.31% | 7.58% | 1.07x | 1.20x |
+| urls.10K | 47.78% | 14.36% | 1.22x | 1.18x |
+| fireworks.jpeg | 99.95% | -0.05% | 0.78x | 1.15x |
+| fireworks.jpeg (200B) | 73.00% | -6.16% | 2.00x | 1.56x |
+| paper-100k.pdf | 83.30% | 0.99% | 0.60x | 0.89x |
+| html_x_4 | 22.52% | 77.11% | 3.33x | 1.04x |
+| alice29.txt | 57.88% | 2.34% | 1.03x | 1.56x |
+| asyoulik.txt | 61.91% | -2.77% | 1.15x | 1.79x |
+| lcet10.txt | 54.99% | 5.96% | 0.97x | 1.29x |
+| plrabn12.txt | 66.26% | 0.40% | 1.11x | 1.67x |
+| geo.protodata | 19.68% | 19.91% | 1.25x | 1.11x |
+| kppkn.gtb | 37.72% | 6.06% | 0.88x | 1.03x |
+| alice29.txt (128B) | 62.50% | -2.50% | 2.31x | 1.22x |
+| alice29.txt (1000B) | 77.40% | 0.00% | 1.07x | 1.09x |
+| alice29.txt (10000B) | 66.48% | -4.29% | 1.27x | 1.43x |
+| alice29.txt (20000B) | 63.43% | -7.00% | 1.45x | 1.93x |
+
+Speed is generally at or above Snappy. Small blocks get a significant speedup, although at the expense of size.
+
+Decompression speed is better than Snappy, except in one case.
+
+Since payloads are very small the variance in terms of size is rather big, so they should only be seen as a general guideline.
+
+Size is on average around Snappy, but varies on content type.
+In cases where compression is worse, it usually is compensated by a speed boost.
+
+
+### Better compression
+
+Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
+So individual benchmarks should only be seen as a guideline and the overall picture is more important.
+
+| Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec |
+|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------|
+| html | 22843 | 19833 | 16246 MB/s | 7731 MB/s | 40972 MB/s | 40292 MB/s |
+| urls.10K | 335492 | 253529 | 7943 MB/s | 3980 MB/s | 22523 MB/s | 20981 MB/s |
+| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 9760 MB/s | 718321 MB/s | 823698 MB/s |
+| fireworks.jpeg (200B) | 146 | 142 | 8869 MB/s | 594 MB/s | 33691 MB/s | 30101 MB/s |
+| paper-100k.pdf | 85304 | 82915 | 167546 MB/s | 7470 MB/s | 326905 MB/s | 198869 MB/s |
+| html_x_4 | 92234 | 19841 | 15194 MB/s | 23403 MB/s | 30843 MB/s | 30937 MB/s |
+| alice29.txt | 88034 | 73218 | 5936 MB/s | 2945 MB/s | 12882 MB/s | 16611 MB/s |
+| asyoulik.txt | 77503 | 66844 | 5517 MB/s | 2739 MB/s | 12735 MB/s | 14975 MB/s |
+| lcet10.txt | 234661 | 190589 | 6235 MB/s | 3099 MB/s | 14519 MB/s | 16634 MB/s |
+| plrabn12.txt | 319267 | 270828 | 5159 MB/s | 2600 MB/s | 11923 MB/s | 13382 MB/s |
+| geo.protodata | 23335 | 18278 | 21220 MB/s | 11208 MB/s | 56271 MB/s | 57961 MB/s |
+| kppkn.gtb | 69526 | 61851 | 9732 MB/s | 4556 MB/s | 18491 MB/s | 16524 MB/s |
+| alice29.txt (128B) | 80 | 81 | 6691 MB/s | 529 MB/s | 31883 MB/s | 34225 MB/s |
+| alice29.txt (1000B) | 774 | 748 | 12204 MB/s | 1943 MB/s | 48056 MB/s | 42068 MB/s |
+| alice29.txt (10000B) | 6648 | 6234 | 10044 MB/s | 2949 MB/s | 32378 MB/s | 28813 MB/s |
+| alice29.txt (20000B) | 12686 | 11584 | 7733 MB/s | 2822 MB/s | 30566 MB/s | 27315 MB/s |
+
+
+| Relative Perf | Snappy size | Better size | Better Speed | Better dec |
+|-----------------------|-------------|-------------|--------------|------------|
+| html | 22.31% | 13.18% | 0.48x | 0.98x |
+| urls.10K | 47.78% | 24.43% | 0.50x | 0.93x |
+| fireworks.jpeg | 99.95% | -0.05% | 0.03x | 1.15x |
+| fireworks.jpeg (200B) | 73.00% | 2.74% | 0.07x | 0.89x |
+| paper-100k.pdf | 83.30% | 2.80% | 0.04x | 0.61x |
+| html_x_4 | 22.52% | 78.49% | 1.54x | 1.00x |
+| alice29.txt | 57.88% | 16.83% | 0.50x | 1.29x |
+| asyoulik.txt | 61.91% | 13.75% | 0.50x | 1.18x |
+| lcet10.txt | 54.99% | 18.78% | 0.50x | 1.15x |
+| plrabn12.txt | 66.26% | 15.17% | 0.50x | 1.12x |
+| geo.protodata | 19.68% | 21.67% | 0.50x | 1.03x |
+| kppkn.gtb | 37.72% | 11.04% | 0.53x | 0.89x |
+| alice29.txt (128B) | 62.50% | -1.25% | 0.47x | 1.07x |
+| alice29.txt (1000B) | 77.40% | 3.36% | 0.08x | 0.88x |
+| alice29.txt (10000B) | 66.48% | 6.23% | 0.16x | 0.89x |
+| alice29.txt (20000B) | 63.43% | 8.69% | 0.29x | 0.89x |
+
+Except for the mostly incompressible JPEG image, compression is better and usually in the
+double digits in terms of percentage reduction over Snappy.
+
+The PDF sample shows a significant slowdown compared to Snappy, as this mode tries harder
+to compress the data. Very small blocks are also not favorable for better compression, so throughput is way down.
+
+This mode aims to provide better compression at the expense of performance and achieves that
+without a huge performance penalty, except on very small blocks.
+
+Decompression speed suffers a little compared to the regular S2 mode,
+but still manages to be close to Snappy in spite of increased compression.
+
+# Best compression mode
+
+S2 offers a "best" compression mode.
+
+This will compress as much as possible with little regard to CPU usage.
+
+Mainly for offline compression, but where decompression speed should still
+be high and compatible with other S2 compressed data.
+
+Some examples compared on 16 core CPU, amd64 assembly used:
+
+```
+* enwik10
+Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
+Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
+Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s
+
+* github-june-2days-2019.json
+Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
+Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
+Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s
+
+* nyc-taxi-data-10M.csv
+Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
+Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
+Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s
+
+* 10gb.tar
+Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
+Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
+Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s
+
+* consensus.db.10gb
+Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
+Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
+Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
+```
+
+Decompression speed should be around the same as using the 'better' compression mode.
+
+# Snappy Compatibility
+
+S2 now offers full compatibility with Snappy.
+
+This means that the efficient encoders of S2 can be used to generate fully Snappy compatible output.
+
+There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by
+simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`.
+This uses "better" mode for all operations.
+If you would like more control, you can use the s2 package as described below:
+
+## Blocks
+
+Snappy compatible blocks can be generated with the S2 encoder.
+Compression and speed are typically a bit better, and `MaxEncodedLen` is also smaller, which reduces memory usage. Replace:
+
+| Snappy | S2 replacement |
+|----------------------------|-------------------------|
+| snappy.Encode(...) | s2.EncodeSnappy(...) |
+| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
+
+`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output.
+
+`s2.ConcatBlocks` is compatible with snappy blocks.
+
+Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
+53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
+
+| Encoder | Size | MB/s | Reduction |
+|-----------------------|------------|--------|------------|
+| snappy.Encode | 1128706759 | 725.59 | 71.89% |
+| s2.EncodeSnappy | 1093823291 | 899.16 | 72.75% |
+| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% |
+| s2.EncodeSnappyBest | 944507998 | 66.00 | 76.47% |
+
+## Streams
+
+For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`.
+All other options are available, but note that block size limit is different for snappy.
+
+Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput:
+
+| File | snappy.NewWriter | S2 Snappy | S2 Snappy, Better | S2 Snappy, Best |
+|-----------------------------|--------------------------|---------------------------|--------------------------|-------------------------|
+| nyc-taxi-data-10M.csv | 1316042016 - 517.54MB/s | 1307003093 - 8406.29MB/s | 1174534014 - 4984.35MB/s | 1115904679 - 177.81MB/s |
+| enwik10 | 5088294643 - 433.45MB/s | 5175840939 - 8454.52MB/s | 4560784526 - 4403.10MB/s | 4340299103 - 159.71MB/s |
+| 10gb.tar | 6056946612 - 703.25MB/s | 6208571995 - 9035.75MB/s | 5741646126 - 2402.08MB/s | 5548973895 - 171.17MB/s |
+| github-june-2days-2019.json | 1525176492 - 908.11MB/s | 1476519054 - 12625.93MB/s | 1400547532 - 6163.61MB/s | 1321887137 - 200.71MB/s |
+| consensus.db.10gb | 5412897703 - 1054.38MB/s | 5354073487 - 12634.82MB/s | 5335069899 - 2472.23MB/s | 5201000954 - 166.32MB/s |
+
+# Decompression
+
+All decompression functions map directly to equivalent s2 functions.
+
+| Snappy | S2 replacement |
+|------------------------|--------------------|
+| snappy.Decode(...) | s2.Decode(...) |
+| snappy.DecodedLen(...) | s2.DecodedLen(...) |
+| snappy.NewReader(...) | s2.NewReader(...) |
+
+Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip)
+are also available for Snappy streams.
+
+If you know you are only decompressing snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize)
+on your Reader will reduce memory consumption.
+
+# Concatenating blocks and streams.
+
+Concatenating streams will concatenate the output of both without recompressing them.
+While this is inefficient in terms of compression it might be usable in certain scenarios.
+The 10 byte 'stream identifier' of the second stream can optionally be stripped, but it is not a requirement.
+
+Blocks can be concatenated using the `ConcatBlocks` function.
+
+Snappy blocks/streams can safely be concatenated with S2 blocks and streams.
+
+# Format Extensions
+
+* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
+* [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB).
+* Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset.
+
+Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0.
+
+The length is determined by reading the 3-bit length stored in the tag and decoding it using this table:
+
+| Length | Actual Length |
+|--------|----------------------|
+| 0 | 4 |
+| 1 | 5 |
+| 2 | 6 |
+| 3 | 7 |
+| 4 | 8 |
+| 5 | 8 + read 1 byte |
+| 6 | 260 + read 2 bytes |
+| 7 | 65540 + read 3 bytes |
+
+This allows any repeat offset + length to be represented by 2 to 5 bytes.
+
+Lengths are stored as little endian values.
+
+The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams.
+
+Default streaming block size is 1MB.
+
+# LICENSE
+
+This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation.
+
+Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go
new file mode 100644
index 00000000..d0ae5304
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode.go
@@ -0,0 +1,565 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+ "encoding/binary"
+ "errors"
+ "io"
+)
+
+// Sentinel errors reported by the block decoder and by Reader.
+var (
+ // ErrCorrupt reports that the input is invalid.
+ ErrCorrupt = errors.New("s2: corrupt input")
+ // ErrCRC reports that the input failed CRC validation (streams only).
+ // Reader.Read returns it when a decoded chunk does not match its checksum.
+ ErrCRC = errors.New("s2: corrupt input, crc mismatch")
+ // ErrTooLarge reports that the uncompressed length is too large.
+ // decodedLen returns it on 32-bit platforms for lengths above 0x7fffffff.
+ ErrTooLarge = errors.New("s2: decoded block is too large")
+ // ErrUnsupported reports that the input isn't supported, for example a
+ // stream containing a reserved unskippable chunk type.
+ ErrUnsupported = errors.New("s2: unsupported input")
+)
+
+// DecodedLen reports the uncompressed size recorded in the length header of
+// the block src. The error from parsing the header is returned unchanged.
+func DecodedLen(src []byte) (int, error) {
+	blockLen, _, err := decodedLen(src)
+	return blockLen, err
+}
+
+// decodedLen parses the uvarint length header at the start of src.
+// It returns the decoded block length and the number of header bytes used.
+// ErrCorrupt is returned when the header is missing, malformed or larger than
+// 32 bits; ErrTooLarge when the length does not fit in an int on a 32-bit
+// platform.
+func decodedLen(src []byte) (blockLen, headerLen int, err error) {
+	// wordSize evaluates to 32 or 64, matching the width of uint.
+	const wordSize = 32 << (^uint(0) >> 32 & 1)
+
+	size, used := binary.Uvarint(src)
+	switch {
+	case used <= 0, size > 0xffffffff:
+		return 0, 0, ErrCorrupt
+	case wordSize == 32 && size > 0x7fffffff:
+		return 0, 0, ErrTooLarge
+	}
+	return int(size), used, nil
+}
+
+const (
+ // decodeErrCodeCorrupt is a non-zero decoder status code.
+ // NOTE(review): presumably the value s2Decode returns for corrupt input;
+ // Decode only tests for a non-zero result - confirm against the s2Decode
+ // implementations.
+ decodeErrCodeCorrupt = 1
+)
+
+// Decode decompresses the block src and returns the result.
+//
+// If dst has enough capacity for the whole decoded block, the returned slice
+// is a sub-slice of dst; otherwise a new slice is allocated. dst may be nil.
+// dst and src must not overlap.
+//
+// Errors from parsing the length header are returned as-is; any failure of
+// the block decoder is reported as ErrCorrupt.
+func Decode(dst, src []byte) ([]byte, error) {
+	size, hdr, err := decodedLen(src)
+	if err != nil {
+		return nil, err
+	}
+	// Reuse the caller's buffer when it has room, otherwise allocate.
+	if size > cap(dst) {
+		dst = make([]byte, size)
+	}
+	dst = dst[:size]
+	if status := s2Decode(dst, src[hdr:]); status != 0 {
+		return nil, ErrCorrupt
+	}
+	return dst, nil
+}
+
+// NewReader returns a new Reader that decompresses from r, using the framing
+// format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt with S2 changes.
+//
+// Options are applied in order. If one of them fails, its error is stored in
+// the Reader, which is returned immediately without allocating buffers.
+func NewReader(r io.Reader, opts ...ReaderOption) *Reader {
+	rd := &Reader{
+		r:        r,
+		maxBlock: maxBlockSize,
+	}
+	for _, apply := range opts {
+		if err := apply(rd); err != nil {
+			rd.err = err
+			return rd
+		}
+	}
+	rd.maxBufSize = MaxEncodedLen(rd.maxBlock) + checksumSize
+
+	// Size the initial input buffer for lazyBuf when it is set, otherwise for
+	// the default block size.
+	initial := defaultBlockSize
+	if rd.lazyBuf > 0 {
+		initial = rd.lazyBuf
+	}
+	rd.buf = make([]byte, MaxEncodedLen(initial)+checksumSize)
+	rd.paramsOK = true
+	return rd
+}
+
+// ReaderOption is an option for creating a decoder.
+// NewReader calls each option with the Reader under construction; a non-nil
+// error aborts construction and is stored in the Reader.
+type ReaderOption func(*Reader) error
+
+// ReaderMaxBlockSize limits the size of blocks the Reader will accept, which
+// bounds its allocations. It is useful when the stream is known to have been
+// compressed with a smaller WriterBlockSize, or with the default 1MB.
+// Blocks larger than this are rejected by the Reader with an error.
+//
+// For streams compressed with Snappy this can safely be set to 64KB (64 << 10).
+//
+// blockSize must be > 0 and at most 4MB, which is also the default.
+// If no initial allocation size has been set and blockSize is below the
+// default block size, the initial allocation is reduced to match.
+func ReaderMaxBlockSize(blockSize int) ReaderOption {
+	return func(r *Reader) error {
+		if blockSize <= 0 || blockSize > maxBlockSize {
+			return errors.New("s2: block size too large. Must be <= 4MB and > 0")
+		}
+		r.maxBlock = blockSize
+		if r.lazyBuf == 0 && blockSize < defaultBlockSize {
+			r.lazyBuf = blockSize
+		}
+		return nil
+	}
+}
+
+// ReaderAllocBlock sets the block size the Reader allocates for up front.
+// If frames bigger than this are seen, a bigger buffer will be allocated.
+//
+// blockSize must be between 1024 bytes and 4MB.
+// Default is 1MB, which is default output size.
+func ReaderAllocBlock(blockSize int) ReaderOption {
+	return func(r *Reader) error {
+		if blockSize < 1024 || blockSize > maxBlockSize {
+			return errors.New("s2: invalid ReaderAllocBlock. Must be <= 4MB and >= 1024")
+		}
+		r.lazyBuf = blockSize
+		return nil
+	}
+}
+
+// Reader is an io.Reader that decompresses framed streams. Both the S2 and
+// the Snappy stream identifiers are accepted.
+// Create one with NewReader.
+type Reader struct {
+ // r is the underlying source of framed data.
+ r io.Reader
+ // err is sticky: once set, Read, Skip and ReadByte return it.
+ err error
+ // decoded holds the most recently decoded block.
+ decoded []byte
+ // buf is scratch space for chunk headers and raw chunk payloads.
+ buf []byte
+ // decoded[i:j] contains decoded bytes that have not yet been passed on.
+ i, j int
+ // maximum block size allowed.
+ maxBlock int
+ // maximum expected buffer size.
+ maxBufSize int
+ // alloc a buffer this size if > 0.
+ lazyBuf int
+ // readHeader is set once the first chunk has been seen to be a stream
+ // identifier.
+ readHeader bool
+ // paramsOK is set by NewReader when all options were applied
+ // successfully; Reset is a no-op while it is false.
+ paramsOK bool
+ // snappyFrame is true when the last stream identifier read by Read was
+ // the Snappy one, which enables the Snappy block size limit.
+ snappyFrame bool
+}
+
+// ensureBufferSize makes sure r.buf can hold at least n bytes, replacing it
+// with a larger buffer when needed.
+// It returns false, with r.err set to ErrCorrupt, if n is above the maximum
+// allowed buffer size.
+func (r *Reader) ensureBufferSize(n int) bool {
+	switch {
+	case n <= len(r.buf):
+		return true
+	case n > r.maxBufSize:
+		r.err = ErrCorrupt
+		return false
+	}
+	// Too small but within limits: replace the buffer.
+	r.buf = make([]byte, n)
+	return true
+}
+
+// Reset discards any buffered data, clears the stored error and switches the
+// Reader to read from reader, so a Reader can be reused instead of allocating
+// a new one. The next read expects a stream identifier again.
+//
+// Reset does nothing if the Reader was created with invalid options.
+func (r *Reader) Reset(reader io.Reader) {
+	if r.paramsOK {
+		r.r, r.err = reader, nil
+		r.i, r.j = 0, 0
+		r.readHeader = false
+	}
+}
+
+// readFull fills p completely from the underlying reader and reports whether
+// that succeeded. On failure the error is stored in r.err.
+// io.ErrUnexpectedEOF is always converted to ErrCorrupt; io.EOF is converted
+// to ErrCorrupt unless allowEOF is set.
+func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
+	_, r.err = io.ReadFull(r.r, p)
+	switch {
+	case r.err == nil:
+		return true
+	case r.err == io.ErrUnexpectedEOF, r.err == io.EOF && !allowEOF:
+		r.err = ErrCorrupt
+	}
+	return false
+}
+
+// skipN will skip n bytes of the underlying stream and reports whether that
+// succeeded.
+// If the supplied reader supports seeking, a relative seek is tried first.
+// When the seek fails with an error that is not classified as corruption,
+// the bytes are read and discarded instead.
+// tmp is used as a temporary buffer for reading.
+// The supplied slice does not need to be the size of the read.
+// io.ErrUnexpectedEOF is always reported as ErrCorrupt, and io.EOF is reported
+// as ErrCorrupt unless allowEOF is set; other read errors are stored in r.err
+// unchanged.
+func (r *Reader) skipN(tmp []byte, n int, allowEOF bool) (ok bool) {
+	if rs, ok := r.r.(io.ReadSeeker); ok {
+		_, err := rs.Seek(int64(n), io.SeekCurrent)
+		if err == nil {
+			return true
+		}
+		// Classify the error returned by Seek itself. r.err has not been
+		// assigned on this path, so it must not be consulted here.
+		if err == io.ErrUnexpectedEOF || (err == io.EOF && !allowEOF) {
+			r.err = ErrCorrupt
+			return false
+		}
+	}
+	for n > 0 {
+		if n < len(tmp) {
+			tmp = tmp[:n]
+		}
+		if _, r.err = io.ReadFull(r.r, tmp); r.err != nil {
+			if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
+				r.err = ErrCorrupt
+			}
+			return false
+		}
+		n -= len(tmp)
+	}
+	return true
+}
+
+// Read satisfies the io.Reader interface.
+//
+// Buffered decoded bytes are returned first. Otherwise chunks are read from
+// the underlying reader until one yields data: each chunk starts with a
+// 4-byte header holding the chunk type and a 24-bit little-endian length.
+// The first chunk of a stream must be a stream identifier.
+// Any error is stored in the Reader and returned by all later calls.
+func (r *Reader) Read(p []byte) (int, error) {
+ if r.err != nil {
+ return 0, r.err
+ }
+ for {
+ // Hand out what is left of the current decoded block first.
+ if r.i < r.j {
+ n := copy(p, r.decoded[r.i:r.j])
+ r.i += n
+ return n, nil
+ }
+ // A clean EOF is allowed here, between chunks.
+ if !r.readFull(r.buf[:4], true) {
+ return 0, r.err
+ }
+ chunkType := r.buf[0]
+ if !r.readHeader {
+ if chunkType != chunkTypeStreamIdentifier {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ r.readHeader = true
+ }
+ chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
+
+ // The chunk types are specified at
+ // https://github.com/google/snappy/blob/master/framing_format.txt
+ switch chunkType {
+ case chunkTypeCompressedData:
+ // Section 4.2. Compressed data (chunk type 0x00).
+ if chunkLen < checksumSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if !r.ensureBufferSize(chunkLen) {
+ if r.err == nil {
+ r.err = ErrUnsupported
+ }
+ return 0, r.err
+ }
+ buf := r.buf[:chunkLen]
+ if !r.readFull(buf, false) {
+ return 0, r.err
+ }
+ // The chunk payload starts with a little-endian checksum.
+ checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+ buf = buf[checksumSize:]
+
+ n, err := DecodedLen(buf)
+ if err != nil {
+ r.err = err
+ return 0, r.err
+ }
+ if r.snappyFrame && n > maxSnappyBlockSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+
+ // Grow the output buffer only when needed and within the limit.
+ if n > len(r.decoded) {
+ if n > r.maxBlock {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ r.decoded = make([]byte, n)
+ }
+ if _, err := Decode(r.decoded, buf); err != nil {
+ r.err = err
+ return 0, r.err
+ }
+ if crc(r.decoded[:n]) != checksum {
+ r.err = ErrCRC
+ return 0, r.err
+ }
+ r.i, r.j = 0, n
+ continue
+
+ case chunkTypeUncompressedData:
+ // Section 4.3. Uncompressed data (chunk type 0x01).
+ if chunkLen < checksumSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if !r.ensureBufferSize(chunkLen) {
+ if r.err == nil {
+ r.err = ErrUnsupported
+ }
+ return 0, r.err
+ }
+ buf := r.buf[:checksumSize]
+ if !r.readFull(buf, false) {
+ return 0, r.err
+ }
+ checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+ // Read directly into r.decoded instead of via r.buf.
+ n := chunkLen - checksumSize
+ if r.snappyFrame && n > maxSnappyBlockSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if n > len(r.decoded) {
+ if n > r.maxBlock {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ r.decoded = make([]byte, n)
+ }
+ if !r.readFull(r.decoded[:n], false) {
+ return 0, r.err
+ }
+ if crc(r.decoded[:n]) != checksum {
+ r.err = ErrCRC
+ return 0, r.err
+ }
+ r.i, r.j = 0, n
+ continue
+
+ case chunkTypeStreamIdentifier:
+ // Section 4.1. Stream identifier (chunk type 0xff).
+ if chunkLen != len(magicBody) {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if !r.readFull(r.buf[:len(magicBody)], false) {
+ return 0, r.err
+ }
+ // Accept the S2 identifier or the Snappy one, and remember which
+ // was seen so the Snappy block size limit can be applied.
+ if string(r.buf[:len(magicBody)]) != magicBody {
+ if string(r.buf[:len(magicBody)]) != magicBodySnappy {
+ r.err = ErrCorrupt
+ return 0, r.err
+ } else {
+ r.snappyFrame = true
+ }
+ } else {
+ r.snappyFrame = false
+ }
+ continue
+ }
+
+ if chunkType <= 0x7f {
+ // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+ // Section 4.4 Padding (chunk type 0xfe).
+ // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+ if chunkLen > maxBlockSize {
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+
+ if !r.skipN(r.buf, chunkLen, false) {
+ return 0, r.err
+ }
+ }
+}
+
+// Skip will skip n bytes forward in the decompressed output.
+// For larger skips this consumes less CPU and is faster than reading output and discarding it.
+// CRC is not checked on skipped blocks. It is checked on the block in which
+// the skip ends, because the remainder of that block is passed on to
+// subsequent reads.
+// io.ErrUnexpectedEOF is returned if the stream ends before all bytes have been skipped.
+// If a decoding error is encountered subsequent calls to Read will also fail.
+// A negative n is rejected with an error and leaves the Reader untouched.
+func (r *Reader) Skip(n int64) error {
+	if n < 0 {
+		return errors.New("attempted negative skip")
+	}
+	if r.err != nil {
+		return r.err
+	}
+
+	for n > 0 {
+		if r.i < r.j {
+			// Skip in buffer.
+			// decoded[i:j] contains decoded bytes that have not yet been passed on.
+			left := int64(r.j - r.i)
+			if left >= n {
+				r.i += int(n)
+				return nil
+			}
+			n -= left
+			r.i, r.j = 0, 0
+		}
+
+		// Buffer empty; read blocks until we have content.
+		if !r.readFull(r.buf[:4], true) {
+			// Running out of input before n bytes were skipped is an error
+			// for the caller, even at a chunk boundary.
+			if r.err == io.EOF {
+				r.err = io.ErrUnexpectedEOF
+			}
+			return r.err
+		}
+		chunkType := r.buf[0]
+		if !r.readHeader {
+			if chunkType != chunkTypeStreamIdentifier {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			r.readHeader = true
+		}
+		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
+
+		// The chunk types are specified at
+		// https://github.com/google/snappy/blob/master/framing_format.txt
+		switch chunkType {
+		case chunkTypeCompressedData:
+			// Section 4.2. Compressed data (chunk type 0x00).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			if !r.ensureBufferSize(chunkLen) {
+				if r.err == nil {
+					r.err = ErrUnsupported
+				}
+				return r.err
+			}
+			buf := r.buf[:chunkLen]
+			if !r.readFull(buf, false) {
+				return r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			buf = buf[checksumSize:]
+
+			dLen, err := DecodedLen(buf)
+			if err != nil {
+				r.err = err
+				return r.err
+			}
+			if dLen > r.maxBlock {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			// Check if destination is within this block
+			if int64(dLen) > n {
+				if len(r.decoded) < dLen {
+					r.decoded = make([]byte, dLen)
+				}
+				if _, err := Decode(r.decoded, buf); err != nil {
+					r.err = err
+					return r.err
+				}
+				// NOTE(review): Read reports a checksum mismatch as ErrCRC;
+				// ErrCorrupt is kept here so callers of Skip see the same
+				// error as before.
+				if crc(r.decoded[:dLen]) != checksum {
+					r.err = ErrCorrupt
+					return r.err
+				}
+			} else {
+				// Skip block completely
+				n -= int64(dLen)
+				dLen = 0
+			}
+			r.i, r.j = 0, dLen
+			continue
+		case chunkTypeUncompressedData:
+			// Section 4.3. Uncompressed data (chunk type 0x01).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			if !r.ensureBufferSize(chunkLen) {
+				// Only fill in a default error when none was recorded, as
+				// every other call site does.
+				if r.err == nil {
+					r.err = ErrUnsupported
+				}
+				return r.err
+			}
+			buf := r.buf[:checksumSize]
+			if !r.readFull(buf, false) {
+				return r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			// Read directly into r.decoded instead of via r.buf.
+			n2 := chunkLen - checksumSize
+			if n2 > len(r.decoded) {
+				if n2 > r.maxBlock {
+					r.err = ErrCorrupt
+					return r.err
+				}
+				r.decoded = make([]byte, n2)
+			}
+			if !r.readFull(r.decoded[:n2], false) {
+				return r.err
+			}
+			// Verify the checksum only when the skip ends inside this block,
+			// i.e. when part of it will be handed to later reads. Blocks
+			// that are skipped entirely are not checked.
+			if int64(n2) > n {
+				if crc(r.decoded[:n2]) != checksum {
+					r.err = ErrCorrupt
+					return r.err
+				}
+			}
+			r.i, r.j = 0, n2
+			continue
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			if !r.readFull(r.buf[:len(magicBody)], false) {
+				return r.err
+			}
+			// Track the stream flavour exactly as Read does, so the Snappy
+			// block size limit applies to reads that follow this skip.
+			switch string(r.buf[:len(magicBody)]) {
+			case magicBody:
+				r.snappyFrame = false
+			case magicBodySnappy:
+				r.snappyFrame = true
+			default:
+				r.err = ErrCorrupt
+				return r.err
+			}
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			r.err = ErrUnsupported
+			return r.err
+		}
+		if chunkLen > maxBlockSize {
+			r.err = ErrUnsupported
+			return r.err
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if !r.skipN(r.buf, chunkLen, false) {
+			return r.err
+		}
+	}
+	return nil
+}
+
+// ReadByte satisfies the io.ByteReader interface.
+//
+// A byte is taken straight from the decoded buffer when one is available.
+// Otherwise Read is called with a one-byte buffer, at most 10 times;
+// io.ErrNoProgress is returned if none of those calls yields a byte or an
+// error.
+func (r *Reader) ReadByte() (byte, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+	if r.i < r.j {
+		b := r.decoded[r.i]
+		r.i++
+		return b, nil
+	}
+	var one [1]byte
+	for attempt := 0; attempt < 10; attempt++ {
+		switch n, err := r.Read(one[:]); {
+		case err != nil:
+			return 0, err
+		case n == 1:
+			return one[0], nil
+		}
+	}
+	return 0, io.ErrNoProgress
+}
diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
new file mode 100644
index 00000000..9b105e03
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
@@ -0,0 +1,568 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+#define R_TMP0 AX
+#define R_TMP1 BX
+#define R_LEN CX
+#define R_OFF DX
+#define R_SRC SI
+#define R_DST DI
+#define R_DBASE R8
+#define R_DLEN R9
+#define R_DEND R10
+#define R_SBASE R11
+#define R_SLEN R12
+#define R_SEND R13
+#define R_TMP2 R14
+#define R_TMP3 R15
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func decode(dst, src []byte) int
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+// - R_TMP0 scratch
+// - R_TMP1 scratch
+// - R_LEN length or x (shared)
+// - R_OFF offset
+// - R_SRC &src[s]
+// - R_DST &dst[d]
+// + R_DBASE dst_base
+// + R_DLEN dst_len
+// + R_DEND dst_base + dst_len
+// + R_SBASE src_base
+// + R_SLEN src_len
+// + R_SEND src_base + src_len
+// - R_TMP2 used by doCopy
+// - R_TMP3 used by doCopy
+//
+// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
+// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
+TEXT ·s2Decode(SB), NOSPLIT, $48-56
+ // Initialize R_SRC, R_DST and R_DBASE-R_SEND.
+ MOVQ dst_base+0(FP), R_DBASE
+ MOVQ dst_len+8(FP), R_DLEN
+ MOVQ R_DBASE, R_DST
+ MOVQ R_DBASE, R_DEND
+ ADDQ R_DLEN, R_DEND
+ MOVQ src_base+24(FP), R_SBASE
+ MOVQ src_len+32(FP), R_SLEN
+ MOVQ R_SBASE, R_SRC
+ MOVQ R_SBASE, R_SEND
+ ADDQ R_SLEN, R_SEND
+ XORQ R_OFF, R_OFF
+
+loop:
+ // for s < len(src)
+ CMPQ R_SRC, R_SEND
+ JEQ end
+
+ // R_LEN = uint32(src[s])
+ //
+ // switch src[s] & 0x03
+ MOVBLZX (R_SRC), R_LEN
+ MOVL R_LEN, R_TMP1
+ ANDL $3, R_TMP1
+ CMPL R_TMP1, $1
+ JAE tagCopy
+
+ // ----------------------------------------
+ // The code below handles literal tags.
+
+ // case tagLiteral:
+ // x := uint32(src[s] >> 2)
+ // switch
+ SHRL $2, R_LEN
+ CMPL R_LEN, $60
+ JAE tagLit60Plus
+
+ // case x < 60:
+ // s++
+ INCQ R_SRC
+
+doLit:
+ // This is the end of the inner "switch", when we have a literal tag.
+ //
+ // We assume that R_LEN == x and x fits in a uint32, where x is the variable
+ // used in the pure Go decode_other.go code.
+
+ // length = int(x) + 1
+ //
+ // Unlike the pure Go code, we don't need to check if length <= 0 because
+ // R_LEN can hold 64 bits, so the increment cannot overflow.
+ INCQ R_LEN
+
+ // Prepare to check if copying length bytes will run past the end of dst or
+ // src.
+ //
+ // R_TMP0 = len(dst) - d
+ // R_TMP1 = len(src) - s
+ MOVQ R_DEND, R_TMP0
+ SUBQ R_DST, R_TMP0
+ MOVQ R_SEND, R_TMP1
+ SUBQ R_SRC, R_TMP1
+
+ // !!! Try a faster technique for short (16 or fewer bytes) copies.
+ //
+ // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+ // goto callMemmove // Fall back on calling runtime·memmove.
+ // }
+ //
+ // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+ // against 21 instead of 16, because it cannot assume that all of its input
+ // is contiguous in memory and so it needs to leave enough source bytes to
+ // read the next tag without refilling buffers, but Go's Decode assumes
+ // contiguousness (the src argument is a []byte).
+ CMPQ R_LEN, $16
+ JGT callMemmove
+ CMPQ R_TMP0, $16
+ JLT callMemmove
+ CMPQ R_TMP1, $16
+ JLT callMemmove
+
+ // !!! Implement the copy from src to dst as a 16-byte load and store.
+ // (Decode's documentation says that dst and src must not overlap.)
+ //
+ // This always copies 16 bytes, instead of only length bytes, but that's
+ // OK. If the input is a valid Snappy encoding then subsequent iterations
+ // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+ // non-nil error), so the overrun will be ignored.
+ //
+ // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+ // 16-byte loads and stores. This technique probably wouldn't be as
+ // effective on architectures that are fussier about alignment.
+ MOVOU 0(R_SRC), X0
+ MOVOU X0, 0(R_DST)
+
+ // d += length
+ // s += length
+ ADDQ R_LEN, R_DST
+ ADDQ R_LEN, R_SRC
+ JMP loop
+
+callMemmove:
+ // if length > len(dst)-d || length > len(src)-s { etc }
+ CMPQ R_LEN, R_TMP0
+ JGT errCorrupt
+ CMPQ R_LEN, R_TMP1
+ JGT errCorrupt
+
+ // copy(dst[d:], src[s:s+length])
+ //
+ // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+ // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
+ // three registers to the stack, to save local variables across the CALL.
+ MOVQ R_DST, 0(SP)
+ MOVQ R_SRC, 8(SP)
+ MOVQ R_LEN, 16(SP)
+ MOVQ R_DST, 24(SP)
+ MOVQ R_SRC, 32(SP)
+ MOVQ R_LEN, 40(SP)
+ MOVQ R_OFF, 48(SP)
+ CALL runtime·memmove(SB)
+
+ // Restore local variables: unspill registers from the stack and
+ // re-calculate R_DBASE-R_SEND.
+ MOVQ 24(SP), R_DST
+ MOVQ 32(SP), R_SRC
+ MOVQ 40(SP), R_LEN
+ MOVQ 48(SP), R_OFF
+ MOVQ dst_base+0(FP), R_DBASE
+ MOVQ dst_len+8(FP), R_DLEN
+ MOVQ R_DBASE, R_DEND
+ ADDQ R_DLEN, R_DEND
+ MOVQ src_base+24(FP), R_SBASE
+ MOVQ src_len+32(FP), R_SLEN
+ MOVQ R_SBASE, R_SEND
+ ADDQ R_SLEN, R_SEND
+
+ // d += length
+ // s += length
+ ADDQ R_LEN, R_DST
+ ADDQ R_LEN, R_SRC
+ JMP loop
+
+tagLit60Plus:
+ // !!! This fragment does the
+ //
+ // s += x - 58; if uint(s) > uint(len(src)) { etc }
+ //
+ // checks. In the asm version, we code it once instead of once per switch case.
+ ADDQ R_LEN, R_SRC
+ SUBQ $58, R_SRC
+ CMPQ R_SRC, R_SEND
+ JA errCorrupt
+
+ // case x == 60:
+ CMPL R_LEN, $61
+ JEQ tagLit61
+ JA tagLit62Plus
+
+ // x = uint32(src[s-1])
+ MOVBLZX -1(R_SRC), R_LEN
+ JMP doLit
+
+tagLit61:
+ // case x == 61:
+ // x = uint32(src[s-2]) | uint32(src[s-1])<<8
+ MOVWLZX -2(R_SRC), R_LEN
+ JMP doLit
+
+tagLit62Plus:
+ CMPL R_LEN, $62
+ JA tagLit63
+
+ // case x == 62:
+ // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+ // We read one byte, safe to read one back, since we are just reading tag.
+ // x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
+ MOVL -4(R_SRC), R_LEN
+ SHRL $8, R_LEN
+ JMP doLit
+
+tagLit63:
+ // case x == 63:
+ // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+ MOVL -4(R_SRC), R_LEN
+ JMP doLit
+
+// The code above handles literal tags.
+// ----------------------------------------
+// The code below handles copy tags.
+
+tagCopy4:
+ // case tagCopy4:
+ // s += 5
+ ADDQ $5, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ CMPQ R_SRC, R_SEND
+ JA errCorrupt
+
+ // length = 1 + int(src[s-5])>>2
+ SHRQ $2, R_LEN
+ INCQ R_LEN
+
+ // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+ MOVLQZX -4(R_SRC), R_OFF
+ JMP doCopy
+
+tagCopy2:
+ // case tagCopy2:
+ // s += 3
+ ADDQ $3, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ CMPQ R_SRC, R_SEND
+ JA errCorrupt
+
+ // length = 1 + int(src[s-3])>>2
+ SHRQ $2, R_LEN
+ INCQ R_LEN
+
+ // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+ MOVWQZX -2(R_SRC), R_OFF
+ JMP doCopy
+
+tagCopy:
+ // We have a copy tag. We assume that:
+ // - R_TMP1 == src[s] & 0x03
+ // - R_LEN == src[s]
+ CMPQ R_TMP1, $2
+ JEQ tagCopy2
+ JA tagCopy4
+
+ // case tagCopy1:
+ // s += 2
+ ADDQ $2, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ CMPQ R_SRC, R_SEND
+ JA errCorrupt
+
+ // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+ // length = 4 + int(src[s-2])>>2&0x7
+ MOVBQZX -1(R_SRC), R_TMP1
+ MOVQ R_LEN, R_TMP0
+ SHRQ $2, R_LEN
+ ANDQ $0xe0, R_TMP0
+ ANDQ $7, R_LEN
+ SHLQ $3, R_TMP0
+ ADDQ $4, R_LEN
+ ORQ R_TMP1, R_TMP0
+
+ // check if repeat code, ZF set by ORQ.
+ JZ repeatCode
+
+ // This is a regular copy, transfer our temporary value to R_OFF (offset)
+ MOVQ R_TMP0, R_OFF
+ JMP doCopy
+
+// This is a repeat code.
+repeatCode:
+ // If length < 9, reuse last offset, with the length already calculated.
+ CMPQ R_LEN, $9
+ JL doCopyRepeat
+
+ // Read additional bytes for length.
+ JE repeatLen1
+
+ // Rare, so the extra branch shouldn't hurt too much.
+ CMPQ R_LEN, $10
+ JE repeatLen2
+ JMP repeatLen3
+
+// Read repeat lengths.
+repeatLen1:
+ // s ++
+ ADDQ $1, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ CMPQ R_SRC, R_SEND
+ JA errCorrupt
+
+ // length = src[s-1] + 8
+ MOVBQZX -1(R_SRC), R_LEN
+ ADDL $8, R_LEN
+ JMP doCopyRepeat
+
+repeatLen2:
+ // s +=2
+ ADDQ $2, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ CMPQ R_SRC, R_SEND
+ JA errCorrupt
+
+ // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
+ MOVWQZX -2(R_SRC), R_LEN
+ ADDL $260, R_LEN
+ JMP doCopyRepeat
+
+repeatLen3:
+ // s +=3
+ ADDQ $3, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ CMPQ R_SRC, R_SEND
+ JA errCorrupt
+
+ // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
+ // Read one byte further back (just part of the tag, shifted out)
+ MOVL -4(R_SRC), R_LEN
+ SHRL $8, R_LEN
+ ADDL $65540, R_LEN
+ JMP doCopyRepeat
+
+doCopy:
+ // This is the end of the outer "switch", when we have a copy tag.
+ //
+ // We assume that:
+ // - R_LEN == length && R_LEN > 0
+ // - R_OFF == offset
+
+ // if d < offset { etc }
+ MOVQ R_DST, R_TMP1
+ SUBQ R_DBASE, R_TMP1
+ CMPQ R_TMP1, R_OFF
+ JLT errCorrupt
+
+ // Repeat values can skip the test above, since any offset > 0 will be in dst.
+doCopyRepeat:
+ // if offset <= 0 { etc }
+ CMPQ R_OFF, $0
+ JLE errCorrupt
+
+ // if length > len(dst)-d { etc }
+ MOVQ R_DEND, R_TMP1
+ SUBQ R_DST, R_TMP1
+ CMPQ R_LEN, R_TMP1
+ JGT errCorrupt
+
+ // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+ //
+ // Set:
+ // - R_TMP2 = len(dst)-d
+ // - R_TMP3 = &dst[d-offset]
+ MOVQ R_DEND, R_TMP2
+ SUBQ R_DST, R_TMP2
+ MOVQ R_DST, R_TMP3
+ SUBQ R_OFF, R_TMP3
+
+ // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+ //
+ // First, try using two 8-byte load/stores, similar to the doLit technique
+ // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+ // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+ // and not one 16-byte load/store, and the first store has to be before the
+ // second load, due to the overlap if offset is in the range [8, 16).
+ //
+ // if length > 16 || offset < 8 || len(dst)-d < 16 {
+ // goto slowForwardCopy
+ // }
+ // copy 16 bytes
+ // d += length
+ CMPQ R_LEN, $16
+ JGT slowForwardCopy
+ CMPQ R_OFF, $8
+ JLT slowForwardCopy
+ CMPQ R_TMP2, $16
+ JLT slowForwardCopy
+ MOVQ 0(R_TMP3), R_TMP0
+ MOVQ R_TMP0, 0(R_DST)
+ MOVQ 8(R_TMP3), R_TMP1
+ MOVQ R_TMP1, 8(R_DST)
+ ADDQ R_LEN, R_DST
+ JMP loop
+
+slowForwardCopy:
+ // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+ // can still try 8-byte load stores, provided we can overrun up to 10 extra
+ // bytes. As above, the overrun will be fixed up by subsequent iterations
+ // of the outermost loop.
+ //
+ // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+ // commentary says:
+ //
+ // ----
+ //
+ // The main part of this loop is a simple copy of eight bytes at a time
+ // until we've copied (at least) the requested amount of bytes. However,
+ // if d and d-offset are less than eight bytes apart (indicating a
+ // repeating pattern of length < 8), we first need to expand the pattern in
+ // order to get the correct results. For instance, if the buffer looks like
+ // this, with the eight-byte <d-offset> and <d> patterns marked as
+ // intervals:
+ //
+ // abxxxxxxxxxxxx
+ // [------] d-offset
+ // [------] d
+ //
+ // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+ // once, after which we can move <d> two bytes without moving <d-offset>:
+ //
+ // ababxxxxxxxxxx
+ // [------] d-offset
+ // [------] d
+ //
+ // and repeat the exercise until the two no longer overlap.
+ //
+ // This allows us to do very well in the special case of one single byte
+ // repeated many times, without taking a big hit for more general cases.
+ //
+ // The worst case of extra writing past the end of the match occurs when
+ // offset == 1 and length == 1; the last copy will read from byte positions
+ // [0..7] and write to [4..11], whereas it was only supposed to write to
+ // position 1. Thus, ten excess bytes.
+ //
+ // ----
+ //
+ // That "10 byte overrun" worst case is confirmed by Go's
+ // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+ // and finishSlowForwardCopy algorithm.
+ //
+ // if length > len(dst)-d-10 {
+ // goto verySlowForwardCopy
+ // }
+ SUBQ $10, R_TMP2
+ CMPQ R_LEN, R_TMP2
+ JGT verySlowForwardCopy
+
+ // We want to keep the offset, so we use R_TMP2 from here.
+ MOVQ R_OFF, R_TMP2
+
+makeOffsetAtLeast8:
+ // !!! As above, expand the pattern so that offset >= 8 and we can use
+ // 8-byte load/stores.
+ //
+ // for offset < 8 {
+ // copy 8 bytes from dst[d-offset:] to dst[d:]
+ // length -= offset
+ // d += offset
+ // offset += offset
+ // // The two previous lines together means that d-offset, and therefore
+ // // R_TMP3, is unchanged.
+ // }
+ CMPQ R_TMP2, $8
+ JGE fixUpSlowForwardCopy
+ MOVQ (R_TMP3), R_TMP1
+ MOVQ R_TMP1, (R_DST)
+ SUBQ R_TMP2, R_LEN
+ ADDQ R_TMP2, R_DST
+ ADDQ R_TMP2, R_TMP2
+ JMP makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+ // !!! Add length (which might be negative now) to d (implied by R_DST being
+ // &dst[d]) so that d ends up at the right place when we jump back to the
+ // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
+ // length is positive, copying the remaining length bytes will write to the
+ // right place.
+ MOVQ R_DST, R_TMP0
+ ADDQ R_LEN, R_DST
+
+finishSlowForwardCopy:
+ // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+ // length means that we overrun, but as above, that will be fixed up by
+ // subsequent iterations of the outermost loop.
+ CMPQ R_LEN, $0
+ JLE loop
+ MOVQ (R_TMP3), R_TMP1
+ MOVQ R_TMP1, (R_TMP0)
+ ADDQ $8, R_TMP3
+ ADDQ $8, R_TMP0
+ SUBQ $8, R_LEN
+ JMP finishSlowForwardCopy
+
+verySlowForwardCopy:
+ // verySlowForwardCopy is a simple implementation of forward copy. In C
+ // parlance, this is a do/while loop instead of a while loop, since we know
+ // that length > 0. In Go syntax:
+ //
+ // for {
+ // dst[d] = dst[d - offset]
+ // d++
+ // length--
+ // if length == 0 {
+ // break
+ // }
+ // }
+ MOVB (R_TMP3), R_TMP1
+ MOVB R_TMP1, (R_DST)
+ INCQ R_TMP3
+ INCQ R_DST
+ DECQ R_LEN
+ JNZ verySlowForwardCopy
+ JMP loop
+
+// The code above handles copy tags.
+// ----------------------------------------
+
+end:
+ // This is the end of the "for s < len(src)".
+ //
+ // if d != len(dst) { etc }
+ CMPQ R_DST, R_DEND
+ JNE errCorrupt
+
+ // return 0
+ MOVQ $0, ret+48(FP)
+ RET
+
+errCorrupt:
+ // return decodeErrCodeCorrupt
+ MOVQ $1, ret+48(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
new file mode 100644
index 00000000..4b63d508
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
@@ -0,0 +1,574 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// Register aliases. The role of each alias is listed in the register
+// allocation table in the comment on s2Decode below.
+#define R_TMP0 R2
+#define R_TMP1 R3
+#define R_LEN R4
+#define R_OFF R5
+#define R_SRC R6
+#define R_DST R7
+#define R_DBASE R8
+#define R_DLEN R9
+#define R_DEND R10
+#define R_SBASE R11
+#define R_SLEN R12
+#define R_SEND R13
+#define R_TMP2 R14
+#define R_TMP3 R15
+
+// TEST_SRC checks that R_SRC <= R_SEND, i.e. that the read position has not
+// moved past the end of src, and branches to errCorrupt otherwise.
+#define TEST_SRC() \
+ CMP R_SEND, R_SRC \
+ BGT errCorrupt
+
+// MOVD R_SRC, R_TMP1
+// SUB R_SBASE, R_TMP1, R_TMP1
+// CMP R_SLEN, R_TMP1
+// BGT errCorrupt
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func s2Decode(dst, src []byte) int
+//
+// Decodes the block in src into dst. Returns 0 on success and 1
+// (decodeErrCodeCorrupt) when the input is corrupt.
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+// - R_TMP0 scratch
+// - R_TMP1 scratch
+// - R_LEN length or x
+// - R_OFF offset
+// - R_SRC &src[s]
+// - R_DST &dst[d]
+// + R_DBASE dst_base
+// + R_DLEN dst_len
+// + R_DEND dst_base + dst_len
+// + R_SBASE src_base
+// + R_SLEN src_len
+// + R_SEND src_base + src_len
+// - R_TMP2 used by doCopy
+// - R_TMP3 used by doCopy
+// - R1 scratch for immediate constants (used directly, it has no R_ alias)
+//
+// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// R_OFF is cleared on entry and keeps the offset of the most recent copy
+// between loop iterations, so that repeat codes can reuse it.
+//
+// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
+// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
+//
+// Reading aid: throughout this function "CMP a, b" followed by a conditional
+// branch tests b against a (e.g. "CMP $16, R_LEN; BGT x" branches when
+// R_LEN > 16).
+TEXT ·s2Decode(SB), NOSPLIT, $56-64
+ // Initialize R_SRC, R_DST and R_DBASE-R_SEND.
+ MOVD dst_base+0(FP), R_DBASE
+ MOVD dst_len+8(FP), R_DLEN
+ MOVD R_DBASE, R_DST
+ MOVD R_DBASE, R_DEND
+ ADD R_DLEN, R_DEND, R_DEND
+ MOVD src_base+24(FP), R_SBASE
+ MOVD src_len+32(FP), R_SLEN
+ MOVD R_SBASE, R_SRC
+ MOVD R_SBASE, R_SEND
+ ADD R_SLEN, R_SEND, R_SEND
+ // No previous offset exists yet; a repeat code seen before any regular
+ // copy is rejected by the "offset <= 0" check in doCopyRepeat.
+ MOVD $0, R_OFF
+
+loop:
+ // for s < len(src)
+ CMP R_SEND, R_SRC
+ BEQ end
+
+ // R_LEN = uint32(src[s])
+ //
+ // switch src[s] & 0x03
+ //
+ // Tag bits >= 1 select one of the copy tags; otherwise fall through to
+ // the literal handling.
+ MOVBU (R_SRC), R_LEN
+ MOVW R_LEN, R_TMP1
+ ANDW $3, R_TMP1
+ MOVW $1, R1
+ CMPW R1, R_TMP1
+ BGE tagCopy
+
+ // ----------------------------------------
+ // The code below handles literal tags.
+
+ // case tagLiteral:
+ // x := uint32(src[s] >> 2)
+ // switch
+ //
+ // Branches to tagLit60Plus when x >= 60 (unsigned compare of 60 against x).
+ MOVW $60, R1
+ LSRW $2, R_LEN, R_LEN
+ CMPW R_LEN, R1
+ BLS tagLit60Plus
+
+ // case x < 60:
+ // s++
+ ADD $1, R_SRC, R_SRC
+
+doLit:
+ // This is the end of the inner "switch", when we have a literal tag.
+ //
+ // We assume that R_LEN == x and x fits in a uint32, where x is the variable
+ // used in the pure Go decode_other.go code.
+
+ // length = int(x) + 1
+ //
+ // Unlike the pure Go code, we don't need to check if length <= 0 because
+ // R_LEN can hold 64 bits, so the increment cannot overflow.
+ ADD $1, R_LEN, R_LEN
+
+ // Prepare to check if copying length bytes will run past the end of dst or
+ // src.
+ //
+ // R_TMP0 = len(dst) - d
+ // R_TMP1 = len(src) - s
+ MOVD R_DEND, R_TMP0
+ SUB R_DST, R_TMP0, R_TMP0
+ MOVD R_SEND, R_TMP1
+ SUB R_SRC, R_TMP1, R_TMP1
+
+ // !!! Try a faster technique for short (16 or fewer bytes) copies.
+ //
+ // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+ // goto callMemmove // Fall back on calling runtime·memmove.
+ // }
+ //
+ // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+ // against 21 instead of 16, because it cannot assume that all of its input
+ // is contiguous in memory and so it needs to leave enough source bytes to
+ // read the next tag without refilling buffers, but Go's Decode assumes
+ // contiguousness (the src argument is a []byte).
+ CMP $16, R_LEN
+ BGT callMemmove
+ CMP $16, R_TMP0
+ BLT callMemmove
+ CMP $16, R_TMP1
+ BLT callMemmove
+
+ // !!! Implement the copy from src to dst as a 16-byte load and store.
+ // (Decode's documentation says that dst and src must not overlap.)
+ //
+ // This always copies 16 bytes, instead of only length bytes, but that's
+ // OK. If the input is a valid Snappy encoding then subsequent iterations
+ // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+ // non-nil error), so the overrun will be ignored.
+ //
+ // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
+ // 16-byte loads and stores. This technique probably wouldn't be as
+ // effective on architectures that are fussier about alignment.
+ LDP 0(R_SRC), (R_TMP2, R_TMP3)
+ STP (R_TMP2, R_TMP3), 0(R_DST)
+
+ // d += length
+ // s += length
+ ADD R_LEN, R_DST, R_DST
+ ADD R_LEN, R_SRC, R_SRC
+ B loop
+
+callMemmove:
+ // if length > len(dst)-d || length > len(src)-s { etc }
+ CMP R_TMP0, R_LEN
+ BGT errCorrupt
+ CMP R_TMP1, R_LEN
+ BGT errCorrupt
+
+ // copy(dst[d:], src[s:s+length])
+ //
+ // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+ // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
+ // three registers to the stack, to save local variables across the CALL.
+ //
+ // Stack layout used here:
+ // - 8(RSP), 16(RSP), 24(RSP): the three memmove arguments
+ // - 32(RSP), 40(RSP), 48(RSP): spilled R_DST, R_SRC, R_LEN
+ // - 56(RSP): spilled R_OFF, so the last offset survives the CALL
+ MOVD R_DST, 8(RSP)
+ MOVD R_SRC, 16(RSP)
+ MOVD R_LEN, 24(RSP)
+ MOVD R_DST, 32(RSP)
+ MOVD R_SRC, 40(RSP)
+ MOVD R_LEN, 48(RSP)
+ MOVD R_OFF, 56(RSP)
+ CALL runtime·memmove(SB)
+
+ // Restore local variables: unspill registers from the stack and
+ // re-calculate R_DBASE-R_SEND.
+ MOVD 32(RSP), R_DST
+ MOVD 40(RSP), R_SRC
+ MOVD 48(RSP), R_LEN
+ MOVD 56(RSP), R_OFF
+ MOVD dst_base+0(FP), R_DBASE
+ MOVD dst_len+8(FP), R_DLEN
+ MOVD R_DBASE, R_DEND
+ ADD R_DLEN, R_DEND, R_DEND
+ MOVD src_base+24(FP), R_SBASE
+ MOVD src_len+32(FP), R_SLEN
+ MOVD R_SBASE, R_SEND
+ ADD R_SLEN, R_SEND, R_SEND
+
+ // d += length
+ // s += length
+ ADD R_LEN, R_DST, R_DST
+ ADD R_LEN, R_SRC, R_SRC
+ B loop
+
+tagLit60Plus:
+ // !!! This fragment does the
+ //
+ // s += x - 58; if uint(s) > uint(len(src)) { etc }
+ //
+ // checks. In the asm version, we code it once instead of once per switch case.
+ //
+ // x is in the range 60..63 here, so s advances by 2..5 bytes: the tag
+ // byte plus the 1..4 bytes that hold the literal length.
+ ADD R_LEN, R_SRC, R_SRC
+ SUB $58, R_SRC, R_SRC
+ TEST_SRC()
+
+ // case x == 60:
+ // (x == 61 and x >= 62 are dispatched first; x == 60 falls through.)
+ MOVW $61, R1
+ CMPW R1, R_LEN
+ BEQ tagLit61
+ BGT tagLit62Plus
+
+ // x = uint32(src[s-1])
+ MOVBU -1(R_SRC), R_LEN
+ B doLit
+
+tagLit61:
+ // case x == 61:
+ // x = uint32(src[s-2]) | uint32(src[s-1])<<8
+ MOVHU -2(R_SRC), R_LEN
+ B doLit
+
+tagLit62Plus:
+ CMPW $62, R_LEN
+ BHI tagLit63
+
+ // case x == 62:
+ // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+ MOVHU -3(R_SRC), R_LEN
+ MOVBU -1(R_SRC), R_TMP1
+ ORR R_TMP1<<16, R_LEN
+ B doLit
+
+tagLit63:
+ // case x == 63:
+ // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+ MOVWU -4(R_SRC), R_LEN
+ B doLit
+
+ // The code above handles literal tags.
+ // ----------------------------------------
+ // The code below handles copy tags.
+
+tagCopy4:
+ // case tagCopy4:
+ // s += 5
+ ADD $5, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ //
+ // Unlike the other cases this does not use TEST_SRC; it compares
+ // s (R_SRC - R_SBASE) against len(src) (R_SLEN) instead.
+ MOVD R_SRC, R_TMP1
+ SUB R_SBASE, R_TMP1, R_TMP1
+ CMP R_SLEN, R_TMP1
+ BGT errCorrupt
+
+ // length = 1 + int(src[s-5])>>2
+ MOVD $1, R1
+ ADD R_LEN>>2, R1, R_LEN
+
+ // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+ MOVWU -4(R_SRC), R_OFF
+ B doCopy
+
+tagCopy2:
+ // case tagCopy2:
+ // s += 3
+ ADD $3, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // length = 1 + int(src[s-3])>>2
+ MOVD $1, R1
+ ADD R_LEN>>2, R1, R_LEN
+
+ // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+ MOVHU -2(R_SRC), R_OFF
+ B doCopy
+
+tagCopy:
+ // We have a copy tag. We assume that:
+ // - R_TMP1 == src[s] & 0x03
+ // - R_LEN == src[s]
+ //
+ // Tag bits == 2 go to tagCopy2, > 2 go to tagCopy4, and the remaining
+ // value (1) falls through to tagCopy1.
+ CMP $2, R_TMP1
+ BEQ tagCopy2
+ BGT tagCopy4
+
+ // case tagCopy1:
+ // s += 2
+ ADD $2, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+ // Calculate offset in R_TMP0 in case it is a repeat.
+ // (R_OFF must keep the previous offset until we know this is not a repeat.)
+ MOVD R_LEN, R_TMP0
+ AND $0xe0, R_TMP0
+ MOVBU -1(R_SRC), R_TMP1
+ ORR R_TMP0<<3, R_TMP1, R_TMP0
+
+ // length = 4 + int(src[s-2])>>2&0x7
+ MOVD $7, R1
+ AND R_LEN>>2, R1, R_LEN
+ ADD $4, R_LEN, R_LEN
+
+ // check if repeat code with offset 0.
+ CMP $0, R_TMP0
+ BEQ repeatCode
+
+ // This is a regular copy, transfer our temporary value to R_OFF (offset)
+ MOVD R_TMP0, R_OFF
+ B doCopy
+
+ // This is a repeat code.
+repeatCode:
+ // If length < 9, reuse last offset, with the length already calculated.
+ //
+ // R_LEN is in the range 4..11 here. The values 9, 10 and 11 mean that the
+ // real length follows in 1, 2 or 3 extra bytes respectively; 11 falls
+ // through to repeatLen3.
+ CMP $9, R_LEN
+ BLT doCopyRepeat
+ BEQ repeatLen1
+ CMP $10, R_LEN
+ BEQ repeatLen2
+
+repeatLen3:
+ // s +=3
+ ADD $3, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
+ // (65540 is the (1 << 16) base of the pure Go code plus its final "length += 4".)
+ MOVBU -1(R_SRC), R_TMP0
+ MOVHU -3(R_SRC), R_LEN
+ ORR R_TMP0<<16, R_LEN, R_LEN
+ ADD $65540, R_LEN, R_LEN
+ B doCopyRepeat
+
+repeatLen2:
+ // s +=2
+ ADD $2, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
+ // (260 is the (1 << 8) base of the pure Go code plus its final "length += 4".)
+ MOVHU -2(R_SRC), R_LEN
+ ADD $260, R_LEN, R_LEN
+ B doCopyRepeat
+
+repeatLen1:
+ // s +=1
+ ADD $1, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // length = src[s-1] + 8
+ // (8 is the base of 4 in the pure Go code plus its final "length += 4".)
+ MOVBU -1(R_SRC), R_LEN
+ ADD $8, R_LEN, R_LEN
+ B doCopyRepeat
+
+doCopy:
+ // This is the end of the outer "switch", when we have a copy tag.
+ //
+ // We assume that:
+ // - R_LEN == length && R_LEN > 0
+ // - R_OFF == offset
+
+ // if d < offset { etc }
+ MOVD R_DST, R_TMP1
+ SUB R_DBASE, R_TMP1, R_TMP1
+ CMP R_OFF, R_TMP1
+ BLT errCorrupt
+
+ // Repeat values can skip the test above, since any offset > 0 will be in dst.
+doCopyRepeat:
+
+ // if offset <= 0 { etc }
+ CMP $0, R_OFF
+ BLE errCorrupt
+
+ // if length > len(dst)-d { etc }
+ MOVD R_DEND, R_TMP1
+ SUB R_DST, R_TMP1, R_TMP1
+ CMP R_TMP1, R_LEN
+ BGT errCorrupt
+
+ // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+ //
+ // Set:
+ // - R_TMP2 = len(dst)-d
+ // - R_TMP3 = &dst[d-offset]
+ MOVD R_DEND, R_TMP2
+ SUB R_DST, R_TMP2, R_TMP2
+ MOVD R_DST, R_TMP3
+ SUB R_OFF, R_TMP3, R_TMP3
+
+ // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+ //
+ // First, try using two 8-byte load/stores, similar to the doLit technique
+ // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+ // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+ // and not one 16-byte load/store, and the first store has to be before the
+ // second load, due to the overlap if offset is in the range [8, 16).
+ //
+ // if length > 16 || offset < 8 || len(dst)-d < 16 {
+ // goto slowForwardCopy
+ // }
+ // copy 16 bytes
+ // d += length
+ CMP $16, R_LEN
+ BGT slowForwardCopy
+ CMP $8, R_OFF
+ BLT slowForwardCopy
+ CMP $16, R_TMP2
+ BLT slowForwardCopy
+ MOVD 0(R_TMP3), R_TMP0
+ MOVD R_TMP0, 0(R_DST)
+ MOVD 8(R_TMP3), R_TMP1
+ MOVD R_TMP1, 8(R_DST)
+ ADD R_LEN, R_DST, R_DST
+ B loop
+
+slowForwardCopy:
+ // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+ // can still try 8-byte load stores, provided we can overrun up to 10 extra
+ // bytes. As above, the overrun will be fixed up by subsequent iterations
+ // of the outermost loop.
+ //
+ // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+ // commentary says:
+ //
+ // ----
+ //
+ // The main part of this loop is a simple copy of eight bytes at a time
+ // until we've copied (at least) the requested amount of bytes. However,
+ // if d and d-offset are less than eight bytes apart (indicating a
+ // repeating pattern of length < 8), we first need to expand the pattern in
+ // order to get the correct results. For instance, if the buffer looks like
+ // this, with the eight-byte <d-offset> and <d> patterns marked as
+ // intervals:
+ //
+ // abxxxxxxxxxxxx
+ // [------] d-offset
+ // [------] d
+ //
+ // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+ // once, after which we can move <d> two bytes without moving <d-offset>:
+ //
+ // ababxxxxxxxxxx
+ // [------] d-offset
+ // [------] d
+ //
+ // and repeat the exercise until the two no longer overlap.
+ //
+ // This allows us to do very well in the special case of one single byte
+ // repeated many times, without taking a big hit for more general cases.
+ //
+ // The worst case of extra writing past the end of the match occurs when
+ // offset == 1 and length == 1; the last copy will read from byte positions
+ // [0..7] and write to [4..11], whereas it was only supposed to write to
+ // position 1. Thus, ten excess bytes.
+ //
+ // ----
+ //
+ // That "10 byte overrun" worst case is confirmed by Go's
+ // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+ // and finishSlowForwardCopy algorithm.
+ //
+ // if length > len(dst)-d-10 {
+ // goto verySlowForwardCopy
+ // }
+ SUB $10, R_TMP2, R_TMP2
+ CMP R_TMP2, R_LEN
+ BGT verySlowForwardCopy
+
+ // We want to keep the offset, so we use R_TMP2 from here.
+ // (R_OFF itself must stay intact for a later repeat code.)
+ MOVD R_OFF, R_TMP2
+
+makeOffsetAtLeast8:
+ // !!! As above, expand the pattern so that offset >= 8 and we can use
+ // 8-byte load/stores.
+ //
+ // for offset < 8 {
+ // copy 8 bytes from dst[d-offset:] to dst[d:]
+ // length -= offset
+ // d += offset
+ // offset += offset
+ // // The two previous lines together means that d-offset, and therefore
+ // // R_TMP3, is unchanged.
+ // }
+ CMP $8, R_TMP2
+ BGE fixUpSlowForwardCopy
+ MOVD (R_TMP3), R_TMP1
+ MOVD R_TMP1, (R_DST)
+ SUB R_TMP2, R_LEN, R_LEN
+ ADD R_TMP2, R_DST, R_DST
+ ADD R_TMP2, R_TMP2, R_TMP2
+ B makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+ // !!! Add length (which might be negative now) to d (implied by R_DST being
+ // &dst[d]) so that d ends up at the right place when we jump back to the
+ // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
+ // length is positive, copying the remaining length bytes will write to the
+ // right place.
+ MOVD R_DST, R_TMP0
+ ADD R_LEN, R_DST, R_DST
+
+finishSlowForwardCopy:
+ // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+ // length means that we overrun, but as above, that will be fixed up by
+ // subsequent iterations of the outermost loop.
+ MOVD $0, R1
+ CMP R1, R_LEN
+ BLE loop
+ MOVD (R_TMP3), R_TMP1
+ MOVD R_TMP1, (R_TMP0)
+ ADD $8, R_TMP3, R_TMP3
+ ADD $8, R_TMP0, R_TMP0
+ SUB $8, R_LEN, R_LEN
+ B finishSlowForwardCopy
+
+verySlowForwardCopy:
+ // verySlowForwardCopy is a simple implementation of forward copy. In C
+ // parlance, this is a do/while loop instead of a while loop, since we know
+ // that length > 0. In Go syntax:
+ //
+ // for {
+ // dst[d] = dst[d - offset]
+ // d++
+ // length--
+ // if length == 0 {
+ // break
+ // }
+ // }
+ MOVB (R_TMP3), R_TMP1
+ MOVB R_TMP1, (R_DST)
+ ADD $1, R_TMP3, R_TMP3
+ ADD $1, R_DST, R_DST
+ SUB $1, R_LEN, R_LEN
+ CBNZ R_LEN, verySlowForwardCopy
+ B loop
+
+ // The code above handles copy tags.
+ // ----------------------------------------
+
+end:
+ // This is the end of the "for s < len(src)".
+ //
+ // if d != len(dst) { etc }
+ CMP R_DEND, R_DST
+ BNE errCorrupt
+
+ // return 0
+ MOVD $0, ret+48(FP)
+ RET
+
+errCorrupt:
+ // return decodeErrCodeCorrupt
+ MOVD $1, R_TMP0
+ MOVD R_TMP0, ret+48(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/s2/decode_asm.go b/vendor/github.com/klauspost/compress/s2/decode_asm.go
new file mode 100644
index 00000000..cb3576ed
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_asm.go
@@ -0,0 +1,17 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (amd64 || arm64) && !appengine && gc && !noasm
+// +build amd64 arm64
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package s2
+
+// s2Decode has the same semantics as in decode_other.go: it decodes src into
+// dst and returns 0 on success or a decodeErrCodeXxx error code on failure.
+//
+// This declaration has no body; with the build constraints of this file
+// (amd64 or arm64, gc, no appengine/noasm) the implementation is the
+// assembly TEXT ·s2Decode routine.
+//
+//go:noescape
+func s2Decode(dst, src []byte) int
diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go
new file mode 100644
index 00000000..1074ebd2
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_other.go
@@ -0,0 +1,267 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (!amd64 && !arm64) || appengine || !gc || noasm
+// +build !amd64,!arm64 appengine !gc noasm
+
+package s2
+
+import (
+ "fmt"
+ "strconv"
+)
+
+// s2Decode decodes the block held in src into dst. The caller must already
+// have consumed the varint-encoded length of the decompressed data and must
+// pass a dst whose length equals that value.
+//
+// The result is 0 when decoding succeeds and a decodeErrCodeXxx error code
+// when it does not.
+func s2Decode(dst, src []byte) int {
+	const debug = false
+	if debug {
+		fmt.Println("Starting decode, dst len:", len(dst))
+	}
+	var (
+		d, s   int // current positions in dst and src
+		length int // length of the literal or copy being processed
+		offset int // copy distance; survives iterations so repeat codes can reuse it
+	)
+
+	// Fast loop: while more than 5 bytes remain, a tag and all of its extra
+	// bytes can be read without checking s against len(src).
+	for s < len(src)-5 {
+		tag := src[s]
+		switch tag & 0x03 {
+		case tagLiteral:
+			x := uint32(tag >> 2)
+			switch x {
+			case 60:
+				x = uint32(src[s+1])
+				s += 2
+			case 61:
+				x = uint32(src[s+1]) | uint32(src[s+2])<<8
+				s += 3
+			case 62:
+				x = uint32(src[s+1]) | uint32(src[s+2])<<8 | uint32(src[s+3])<<16
+				s += 4
+			case 63:
+				x = uint32(src[s+1]) | uint32(src[s+2])<<8 | uint32(src[s+3])<<16 | uint32(src[s+4])<<24
+				s += 5
+			default:
+				// x < 60: the tag byte itself carries the length.
+				s++
+			}
+			length = int(x) + 1
+			if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
+				return decodeErrCodeCorrupt
+			}
+			if debug {
+				fmt.Println("literals, length:", length, "d-after:", d+length)
+			}
+
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			continue
+
+		case tagCopy1:
+			length = int(tag) >> 2 & 0x7
+			toffset := int(uint32(tag)&0xe0<<3 | uint32(src[s+1]))
+			s += 2
+			if toffset != 0 {
+				offset = toffset
+			} else {
+				// A zero offset marks a repeat: the previous offset is kept
+				// and lengths 5, 6 and 7 pull the real length from 1, 2 or 3
+				// extra bytes.
+				if debug {
+					fmt.Print("(repeat) ")
+				}
+				switch length {
+				case 5:
+					length = int(uint32(src[s])) + 4
+					s++
+				case 6:
+					length = int(uint32(src[s])|(uint32(src[s+1])<<8)) + (1 << 8)
+					s += 2
+				case 7:
+					length = int(uint32(src[s])|(uint32(src[s+1])<<8)|(uint32(src[s+2])<<16)) + (1 << 16)
+					s += 3
+				}
+			}
+			length += 4
+
+		case tagCopy2:
+			length = 1 + int(tag)>>2
+			offset = int(uint32(src[s+1]) | uint32(src[s+2])<<8)
+			s += 3
+
+		case tagCopy4:
+			length = 1 + int(tag)>>2
+			offset = int(uint32(src[s+1]) | uint32(src[s+2])<<8 | uint32(src[s+3])<<16 | uint32(src[s+4])<<24)
+			s += 5
+		}
+
+		if offset <= 0 || offset > d || len(dst)-d < length {
+			return decodeErrCodeCorrupt
+		}
+
+		if debug {
+			fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
+		}
+
+		end := d + length
+
+		// Source and destination do not overlap: the built-in copy is safe.
+		if offset > length {
+			copy(dst[d:end], dst[d-offset:])
+			d = end
+			continue
+		}
+
+		// Overlapping ranges must be copied strictly front to back, one byte
+		// at a time, which the built-in copy does not guarantee. Both slices
+		// are given the same length so the loop needs no bounds checks.
+		out := dst[d:end]
+		in := dst[d-offset:]
+		in = in[:len(out)]
+		for i := range out {
+			out[i] = in[i]
+		}
+		d = end
+	}
+
+	// Tail loop: fewer bytes remain, so every advance of s is validated
+	// before the bytes behind it are read.
+	for s < len(src) {
+		tag := src[s]
+		switch tag & 0x03 {
+		case tagLiteral:
+			x := uint32(tag >> 2)
+			if x < 60 {
+				s++
+			} else {
+				// x is 60..63, meaning 1..4 length bytes follow the tag.
+				s += int(x) - 58
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				switch x {
+				case 60:
+					x = uint32(src[s-1])
+				case 61:
+					x = uint32(src[s-2]) | uint32(src[s-1])<<8
+				case 62:
+					x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+				case 63:
+					x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+				}
+			}
+			length = int(x) + 1
+			if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
+				return decodeErrCodeCorrupt
+			}
+			if debug {
+				fmt.Println("literals, length:", length, "d-after:", d+length)
+			}
+
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			continue
+
+		case tagCopy1:
+			s += 2
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = int(tag) >> 2 & 0x7
+			toffset := int(uint32(tag)&0xe0<<3 | uint32(src[s-1]))
+			if toffset != 0 {
+				offset = toffset
+			} else {
+				// Repeat: keep the previous offset. Lengths 5, 6 and 7 are
+				// followed by 1, 2 or 3 extra length bytes.
+				if debug {
+					fmt.Print("(repeat) ")
+				}
+				if extra := length - 4; extra > 0 {
+					s += extra
+					if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+						return decodeErrCodeCorrupt
+					}
+					switch extra {
+					case 1:
+						length = int(uint32(src[s-1])) + 4
+					case 2:
+						length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
+					case 3:
+						length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
+					}
+				}
+			}
+			length += 4
+
+		case tagCopy2:
+			s += 3
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(tag)>>2
+			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+
+		case tagCopy4:
+			s += 5
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(tag)>>2
+			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+		}
+
+		if offset <= 0 || offset > d || len(dst)-d < length {
+			return decodeErrCodeCorrupt
+		}
+
+		if debug {
+			fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
+		}
+
+		end := d + length
+
+		// Source and destination do not overlap: the built-in copy is safe.
+		if offset > length {
+			copy(dst[d:end], dst[d-offset:])
+			d = end
+			continue
+		}
+
+		// Overlapping ranges must be copied strictly front to back, one byte
+		// at a time, which the built-in copy does not guarantee. Both slices
+		// are given the same length so the loop needs no bounds checks.
+		out := dst[d:end]
+		in := dst[d-offset:]
+		in = in[:len(out)]
+		for i := range out {
+			out[i] = in[i]
+		}
+		d = end
+	}
+
+	// A valid block fills dst exactly.
+	if d != len(dst) {
+		return decodeErrCodeCorrupt
+	}
+	return 0
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go
new file mode 100644
index 00000000..aa8b108d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode.go
@@ -0,0 +1,1172 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+ "crypto/rand"
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "io"
+ "math"
+ "math/bits"
+ "runtime"
+ "sync"
+)
+
+// Encode returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// The blocks will require the same amount of memory to decode as encoding,
+// and does not make for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may be undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func Encode(dst, src []byte) []byte {
+ if n := MaxEncodedLen(len(src)); n < 0 {
+ panic(ErrTooLarge)
+ } else if cap(dst) < n {
+ dst = make([]byte, n)
+ } else {
+ dst = dst[:n]
+ }
+
+ // The block starts with the varint-encoded length of the decompressed bytes.
+ d := binary.PutUvarint(dst, uint64(len(src)))
+
+ if len(src) == 0 {
+ return dst[:d]
+ }
+ if len(src) < minNonLiteralBlockSize {
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+ }
+ n := encodeBlock(dst[d:], src)
+ if n > 0 {
+ d += n
+ return dst[:d]
+ }
+ // Not compressible
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+}
+
+// EncodeBetter returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// EncodeBetter compresses better than Encode but typically with a
+// 10-40% speed decrease on both compression and decompression.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// The blocks will require the same amount of memory to decode as encoding,
+// and does not make for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may be undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func EncodeBetter(dst, src []byte) []byte {
+ if n := MaxEncodedLen(len(src)); n < 0 {
+ panic(ErrTooLarge)
+ } else if len(dst) < n {
+ dst = make([]byte, n)
+ }
+
+ // The block starts with the varint-encoded length of the decompressed bytes.
+ d := binary.PutUvarint(dst, uint64(len(src)))
+
+ if len(src) == 0 {
+ return dst[:d]
+ }
+ if len(src) < minNonLiteralBlockSize {
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+ }
+ n := encodeBlockBetter(dst[d:], src)
+ if n > 0 {
+ d += n
+ return dst[:d]
+ }
+ // Not compressible
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+}
+
+// EncodeBest returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// EncodeBest compresses as good as reasonably possible but with a
+// big speed decrease.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// The blocks will require the same amount of memory to decode as encoding,
+// and does not make for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may be undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func EncodeBest(dst, src []byte) []byte {
+ if n := MaxEncodedLen(len(src)); n < 0 {
+ panic(ErrTooLarge)
+ } else if len(dst) < n {
+ dst = make([]byte, n)
+ }
+
+ // The block starts with the varint-encoded length of the decompressed bytes.
+ d := binary.PutUvarint(dst, uint64(len(src)))
+
+ if len(src) == 0 {
+ return dst[:d]
+ }
+ if len(src) < minNonLiteralBlockSize {
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+ }
+ n := encodeBlockBest(dst[d:], src)
+ if n > 0 {
+ d += n
+ return dst[:d]
+ }
+ // Not compressible
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+}
+
+// EncodeSnappy returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The output is Snappy compatible and will likely decompress faster.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// The blocks will require the same amount of memory to decode as encoding,
+// and does not make for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may be undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func EncodeSnappy(dst, src []byte) []byte {
+ if n := MaxEncodedLen(len(src)); n < 0 {
+ panic(ErrTooLarge)
+ } else if cap(dst) < n {
+ dst = make([]byte, n)
+ } else {
+ dst = dst[:n]
+ }
+
+ // The block starts with the varint-encoded length of the decompressed bytes.
+ d := binary.PutUvarint(dst, uint64(len(src)))
+
+ if len(src) == 0 {
+ return dst[:d]
+ }
+ if len(src) < minNonLiteralBlockSize {
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+ }
+
+ n := encodeBlockSnappy(dst[d:], src)
+ if n > 0 {
+ d += n
+ return dst[:d]
+ }
+ // Not compressible
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+}
+
+// EncodeSnappyBetter returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The output is Snappy compatible and will likely decompress faster.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// The blocks will require the same amount of memory to decode as encoding,
+// and does not make for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may be undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func EncodeSnappyBetter(dst, src []byte) []byte {
+ if n := MaxEncodedLen(len(src)); n < 0 {
+ panic(ErrTooLarge)
+ } else if cap(dst) < n {
+ dst = make([]byte, n)
+ } else {
+ dst = dst[:n]
+ }
+
+ // The block starts with the varint-encoded length of the decompressed bytes.
+ d := binary.PutUvarint(dst, uint64(len(src)))
+
+ if len(src) == 0 {
+ return dst[:d]
+ }
+ if len(src) < minNonLiteralBlockSize {
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+ }
+
+ n := encodeBlockBetterSnappy(dst[d:], src)
+ if n > 0 {
+ d += n
+ return dst[:d]
+ }
+ // Not compressible
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+}
+
+// EncodeSnappyBest returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The output is Snappy compatible and will likely decompress faster.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// The blocks will require the same amount of memory to decode as encoding,
+// and does not make for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may be undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func EncodeSnappyBest(dst, src []byte) []byte {
+ if n := MaxEncodedLen(len(src)); n < 0 {
+ panic(ErrTooLarge)
+ } else if cap(dst) < n {
+ dst = make([]byte, n)
+ } else {
+ dst = dst[:n]
+ }
+
+ // The block starts with the varint-encoded length of the decompressed bytes.
+ d := binary.PutUvarint(dst, uint64(len(src)))
+
+ if len(src) == 0 {
+ return dst[:d]
+ }
+ if len(src) < minNonLiteralBlockSize {
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+ }
+
+ n := encodeBlockBestSnappy(dst[d:], src)
+ if n > 0 {
+ d += n
+ return dst[:d]
+ }
+ // Not compressible
+ d += emitLiteral(dst[d:], src)
+ return dst[:d]
+}
+
+// ConcatBlocks will concatenate the supplied blocks and append them to the supplied destination.
+// If the destination is nil or too small, a new will be allocated.
+// The blocks are not validated, so garbage in = garbage out.
+// dst may not overlap block data.
+// Any data in dst is preserved as is, so it will not be considered a block.
+func ConcatBlocks(dst []byte, blocks ...[]byte) ([]byte, error) {
+ totalSize := uint64(0)
+ compSize := 0
+ for _, b := range blocks {
+ l, hdr, err := decodedLen(b)
+ if err != nil {
+ return nil, err
+ }
+ totalSize += uint64(l)
+ compSize += len(b) - hdr
+ }
+ if totalSize == 0 {
+ dst = append(dst, 0)
+ return dst, nil
+ }
+ if totalSize > math.MaxUint32 {
+ return nil, ErrTooLarge
+ }
+ var tmp [binary.MaxVarintLen32]byte
+ hdrSize := binary.PutUvarint(tmp[:], totalSize)
+ wantSize := hdrSize + compSize
+
+ if cap(dst)-len(dst) < wantSize {
+ dst = append(make([]byte, 0, wantSize+len(dst)), dst...)
+ }
+ dst = append(dst, tmp[:hdrSize]...)
+ for _, b := range blocks {
+ _, hdr, err := decodedLen(b)
+ if err != nil {
+ return nil, err
+ }
+ dst = append(dst, b[hdr:]...)
+ }
+ return dst, nil
+}
+
const (
	// inputMargin is the minimum number of extra input bytes to keep inside
	// encodeBlock's inner loop. On some architectures this margin allows a
	// fast path for emitLiteral, where a short literal is copied with a
	// single wide load and store. Such a copy may write more bytes than the
	// literal holds; later iterations of the encoding loop fix up the
	// overrun, and the margin ensures the dst and src buffers are not
	// overrun.
	inputMargin = 8

	// minNonLiteralBlockSize is the minimum size of the input to encodeBlock
	// that will be accepted by the encoder.
	minNonLiteralBlockSize = 32

	// MaxBlockSize is the maximum value where MaxEncodedLen will return a
	// valid block size. Blocks this big are highly discouraged, though.
	MaxBlockSize = math.MaxUint32 - binary.MaxVarintLen32 - 5
)
+
+// MaxEncodedLen returns the maximum length of a snappy block, given its
+// uncompressed length.
+//
+// It will return a negative value if srcLen is too large to encode.
+// 32 bit platforms will have lower thresholds for rejecting big content.
+func MaxEncodedLen(srcLen int) int {
+ n := uint64(srcLen)
+ if n > 0xffffffff {
+ // Also includes negative.
+ return -1
+ }
+ // Size of the varint encoded block size.
+ n = n + uint64((bits.Len64(n)+7)/7)
+
+ // Add maximum size of encoding block as literals.
+ n += uint64(literalExtraSize(int64(srcLen)))
+ if n > 0xffffffff {
+ return -1
+ }
+ return int(n)
+}
+
// errClosed is recorded as the Writer's error state by Close, so later
// operations on the Writer fail with it.
var errClosed = errors.New("s2: Writer is closed")
+
+// NewWriter returns a new Writer that compresses to w, using the
+// framing format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+//
+// Users must call Close to guarantee all data has been forwarded to
+// the underlying io.Writer and that resources are released.
+// They may also call Flush zero or more times before calling Close.
+func NewWriter(w io.Writer, opts ...WriterOption) *Writer {
+ w2 := Writer{
+ blockSize: defaultBlockSize,
+ concurrency: runtime.GOMAXPROCS(0),
+ randSrc: rand.Reader,
+ level: levelFast,
+ }
+ for _, opt := range opts {
+ if err := opt(&w2); err != nil {
+ w2.errState = err
+ return &w2
+ }
+ }
+ w2.obufLen = obufHeaderLen + MaxEncodedLen(w2.blockSize)
+ w2.paramsOK = true
+ w2.ibuf = make([]byte, 0, w2.blockSize)
+ w2.buffers.New = func() interface{} {
+ return make([]byte, w2.obufLen)
+ }
+ w2.Reset(w)
+ return &w2
+}
+
// Writer is an io.Writer that can write Snappy-compressed bytes.
type Writer struct {
	// errMu guards errState; see the err method.
	errMu sync.Mutex
	// errState is the first error recorded; once set it is returned by
	// subsequent operations.
	errState error

	// ibuf is a buffer for the incoming (uncompressed) bytes.
	ibuf []byte

	// blockSize is the maximum number of uncompressed bytes per block.
	blockSize int
	// obufLen is obufHeaderLen + MaxEncodedLen(blockSize); pooled buffers
	// are allocated with this length.
	obufLen int
	// concurrency of 1 makes all writes synchronous (no writer goroutine).
	concurrency int
	// written counts bytes written to the underlying writer.
	written int64
	// output queues one result channel per pending chunk; the writer
	// goroutine started by Reset drains it in order.
	output chan chan result
	// buffers is a pool of []byte scratch buffers.
	buffers sync.Pool
	// pad, when > 0, is the multiple the output is padded to on Close.
	pad int

	// writer is the destination for the compressed stream.
	writer io.Writer
	// randSrc supplies the bytes used to fill padding frames.
	randSrc io.Reader
	// writerWg waits for the writer goroutine to finish.
	writerWg sync.WaitGroup

	// wroteStreamHeader is whether we have written the stream header.
	wroteStreamHeader bool
	// paramsOK is set by NewWriter once all options applied cleanly;
	// Reset does nothing while it is false.
	paramsOK bool
	// snappy selects Snappy-compatible output.
	snappy bool
	// flushOnWrite makes Write compress its input immediately instead of
	// buffering it.
	flushOnWrite bool
	// level is one of the level* constants.
	level uint8
}
+
// Compression levels stored in Writer.level.
const (
	// levelUncompressed is not matched by Writer.encodeBlock, which then
	// returns 0 so the chunk is stored uncompressed.
	levelUncompressed = iota + 1
	// levelFast is the default set by NewWriter.
	levelFast
	// levelBetter is selected by WriterBetterCompression.
	levelBetter
	// levelBest is selected by WriterBestCompression.
	levelBest
)
+
// result is a finished chunk (or the stream header) handed to the writer
// goroutine; an empty result is used by Flush purely for synchronization.
type result []byte
+
+// err returns the previously set error.
+// If no error has been set it is set to err if not nil.
+func (w *Writer) err(err error) error {
+ w.errMu.Lock()
+ errSet := w.errState
+ if errSet == nil && err != nil {
+ w.errState = err
+ errSet = err
+ }
+ w.errMu.Unlock()
+ return errSet
+}
+
+// Reset discards the writer's state and switches the Snappy writer to write to w.
+// This permits reusing a Writer rather than allocating a new one.
+func (w *Writer) Reset(writer io.Writer) {
+ if !w.paramsOK {
+ return
+ }
+ // Close previous writer, if any.
+ if w.output != nil {
+ close(w.output)
+ w.writerWg.Wait()
+ w.output = nil
+ }
+ w.errState = nil
+ w.ibuf = w.ibuf[:0]
+ w.wroteStreamHeader = false
+ w.written = 0
+ w.writer = writer
+ // If we didn't get a writer, stop here.
+ if writer == nil {
+ return
+ }
+ // If no concurrency requested, don't spin up writer goroutine.
+ if w.concurrency == 1 {
+ return
+ }
+
+ toWrite := make(chan chan result, w.concurrency)
+ w.output = toWrite
+ w.writerWg.Add(1)
+
+ // Start a writer goroutine that will write all output in order.
+ go func() {
+ defer w.writerWg.Done()
+
+ // Get a queued write.
+ for write := range toWrite {
+ // Wait for the data to be available.
+ in := <-write
+ if len(in) > 0 {
+ if w.err(nil) == nil {
+ // Don't expose data from previous buffers.
+ toWrite := in[:len(in):len(in)]
+ // Write to output.
+ n, err := writer.Write(toWrite)
+ if err == nil && n != len(toWrite) {
+ err = io.ErrShortBuffer
+ }
+ _ = w.err(err)
+ w.written += int64(n)
+ }
+ }
+ if cap(in) >= w.obufLen {
+ w.buffers.Put([]byte(in))
+ }
+ // close the incoming write request.
+ // This can be used for synchronizing flushes.
+ close(write)
+ }
+ }()
+}
+
+// Write satisfies the io.Writer interface.
+func (w *Writer) Write(p []byte) (nRet int, errRet error) {
+ if w.flushOnWrite {
+ return w.write(p)
+ }
+ // If we exceed the input buffer size, start writing
+ for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err(nil) == nil {
+ var n int
+ if len(w.ibuf) == 0 {
+ // Large write, empty buffer.
+ // Write directly from p to avoid copy.
+ n, _ = w.write(p)
+ } else {
+ n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+ w.ibuf = w.ibuf[:len(w.ibuf)+n]
+ w.write(w.ibuf)
+ w.ibuf = w.ibuf[:0]
+ }
+ nRet += n
+ p = p[n:]
+ }
+ if err := w.err(nil); err != nil {
+ return nRet, err
+ }
+ // p should always be able to fit into w.ibuf now.
+ n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+ w.ibuf = w.ibuf[:len(w.ibuf)+n]
+ nRet += n
+ return nRet, nil
+}
+
+// ReadFrom implements the io.ReaderFrom interface.
+// Using this is typically more efficient since it avoids a memory copy.
+// ReadFrom reads data from r until EOF or error.
+// The return value n is the number of bytes read.
+// Any error except io.EOF encountered during the read is also returned.
+func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) {
+ if len(w.ibuf) > 0 {
+ err := w.Flush()
+ if err != nil {
+ return 0, err
+ }
+ }
+ if br, ok := r.(byter); ok {
+ buf := br.Bytes()
+ if err := w.EncodeBuffer(buf); err != nil {
+ return 0, err
+ }
+ return int64(len(buf)), w.Flush()
+ }
+ for {
+ inbuf := w.buffers.Get().([]byte)[:w.blockSize+obufHeaderLen]
+ n2, err := io.ReadFull(r, inbuf[obufHeaderLen:])
+ if err != nil {
+ if err == io.ErrUnexpectedEOF {
+ err = io.EOF
+ }
+ if err != io.EOF {
+ return n, w.err(err)
+ }
+ }
+ if n2 == 0 {
+ break
+ }
+ n += int64(n2)
+ err2 := w.writeFull(inbuf[:n2+obufHeaderLen])
+ if w.err(err2) != nil {
+ break
+ }
+
+ if err != nil {
+ // We got EOF and wrote everything
+ break
+ }
+ }
+
+ return n, w.err(nil)
+}
+
// EncodeBuffer will add a buffer to the stream.
// This is the fastest way to encode a stream,
// but the input buffer cannot be written to by the caller
// until Flush or Close has been called when concurrency != 1.
//
// If you cannot control that, use the regular Write function.
//
// Note that input is not buffered.
// This means that each write will result in discrete blocks being created.
// For buffered writes, use the regular Write function.
//
// Any data already buffered by Write is flushed before buf is encoded.
func (w *Writer) EncodeBuffer(buf []byte) (err error) {
	if err := w.err(nil); err != nil {
		return err
	}

	if w.flushOnWrite {
		_, err := w.write(buf)
		return err
	}
	// Flush queued data first.
	if len(w.ibuf) > 0 {
		err := w.Flush()
		if err != nil {
			return err
		}
	}
	if w.concurrency == 1 {
		_, err := w.writeSync(buf)
		return err
	}

	// Spawn goroutine and write block to output channel.
	if !w.wroteStreamHeader {
		w.wroteStreamHeader = true
		// The stream header goes through the same ordered queue as the chunks.
		hWriter := make(chan result)
		w.output <- hWriter
		if w.snappy {
			hWriter <- []byte(magicChunkSnappy)
		} else {
			hWriter <- []byte(magicChunk)
		}
	}

	for len(buf) > 0 {
		// Cut input.
		uncompressed := buf
		if len(uncompressed) > w.blockSize {
			uncompressed = uncompressed[:w.blockSize]
		}
		buf = buf[len(uncompressed):]
		// Get an output buffer.
		// It is sized for the chunk header plus the uncompressed data.
		obuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen]
		output := make(chan result)
		// Queue output now, so we keep order.
		w.output <- output
		go func() {
			// The goroutine reads the caller's slice directly (no copy is made).
			checksum := crc(uncompressed)

			// Set to uncompressed.
			chunkType := uint8(chunkTypeUncompressedData)
			chunkLen := 4 + len(uncompressed)

			// Attempt compressing.
			n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
			n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)

			// Check if we should use this, or store as uncompressed instead.
			if n2 > 0 {
				chunkType = uint8(chunkTypeCompressedData)
				chunkLen = 4 + n + n2
				obuf = obuf[:obufHeaderLen+n+n2]
			} else {
				// copy uncompressed
				copy(obuf[obufHeaderLen:], uncompressed)
			}

			// Fill in the per-chunk header that comes before the body.
			// Byte 0 is the chunk type, bytes 1-3 the little-endian chunk
			// length and bytes 4-7 the little-endian checksum.
			obuf[0] = chunkType
			obuf[1] = uint8(chunkLen >> 0)
			obuf[2] = uint8(chunkLen >> 8)
			obuf[3] = uint8(chunkLen >> 16)
			obuf[4] = uint8(checksum >> 0)
			obuf[5] = uint8(checksum >> 8)
			obuf[6] = uint8(checksum >> 16)
			obuf[7] = uint8(checksum >> 24)

			// Queue final output.
			output <- obuf
		}()
	}
	return nil
}
+
+func (w *Writer) encodeBlock(obuf, uncompressed []byte) int {
+ if w.snappy {
+ switch w.level {
+ case levelFast:
+ return encodeBlockSnappy(obuf, uncompressed)
+ case levelBetter:
+ return encodeBlockBetterSnappy(obuf, uncompressed)
+ case levelBest:
+ return encodeBlockBestSnappy(obuf, uncompressed)
+ }
+ return 0
+ }
+ switch w.level {
+ case levelFast:
+ return encodeBlock(obuf, uncompressed)
+ case levelBetter:
+ return encodeBlockBetter(obuf, uncompressed)
+ case levelBest:
+ return encodeBlockBest(obuf, uncompressed)
+ }
+ return 0
+}
+
// write splits p into blocks of at most blockSize bytes and queues each one
// for compression. With concurrency 1 it delegates to writeSync. Otherwise
// every block is copied into a pooled buffer, so p is not referenced by the
// spawned goroutines. It returns the number of bytes of p that were queued.
func (w *Writer) write(p []byte) (nRet int, errRet error) {
	if err := w.err(nil); err != nil {
		return 0, err
	}
	if w.concurrency == 1 {
		return w.writeSync(p)
	}

	// Spawn goroutine and write block to output channel.
	for len(p) > 0 {
		if !w.wroteStreamHeader {
			w.wroteStreamHeader = true
			// The stream header goes through the same ordered queue as the chunks.
			hWriter := make(chan result)
			w.output <- hWriter
			if w.snappy {
				hWriter <- []byte(magicChunkSnappy)
			} else {
				hWriter <- []byte(magicChunk)
			}
		}

		var uncompressed []byte
		if len(p) > w.blockSize {
			uncompressed, p = p[:w.blockSize], p[w.blockSize:]
		} else {
			uncompressed, p = p, nil
		}

		// Copy input.
		// If the block is incompressible, this is used for the result.
		inbuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen]
		obuf := w.buffers.Get().([]byte)[:w.obufLen]
		copy(inbuf[obufHeaderLen:], uncompressed)
		uncompressed = inbuf[obufHeaderLen:]

		output := make(chan result)
		// Queue output now, so we keep order.
		w.output <- output
		go func() {
			checksum := crc(uncompressed)

			// Set to uncompressed.
			chunkType := uint8(chunkTypeUncompressedData)
			chunkLen := 4 + len(uncompressed)

			// Attempt compressing.
			n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
			n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)

			// Check if we should use this, or store as uncompressed instead.
			if n2 > 0 {
				chunkType = uint8(chunkTypeCompressedData)
				chunkLen = 4 + n + n2
				obuf = obuf[:obufHeaderLen+n+n2]
			} else {
				// Use input as output.
				// After the swap, inbuf names the buffer that is not sent.
				obuf, inbuf = inbuf, obuf
			}

			// Fill in the per-chunk header that comes before the body.
			// Byte 0 is the chunk type, bytes 1-3 the little-endian chunk
			// length and bytes 4-7 the little-endian checksum.
			obuf[0] = chunkType
			obuf[1] = uint8(chunkLen >> 0)
			obuf[2] = uint8(chunkLen >> 8)
			obuf[3] = uint8(chunkLen >> 16)
			obuf[4] = uint8(checksum >> 0)
			obuf[5] = uint8(checksum >> 8)
			obuf[6] = uint8(checksum >> 16)
			obuf[7] = uint8(checksum >> 24)

			// Queue final output.
			output <- obuf

			// Put unused buffer back in pool.
			w.buffers.Put(inbuf)
		}()
		nRet += len(uncompressed)
	}
	return nRet, nil
}
+
+// writeFull is a special version of write that will always write the full buffer.
+// Data to be compressed should start at offset obufHeaderLen and fill the remainder of the buffer.
+// The data will be written as a single block.
+// The caller is not allowed to use inbuf after this function has been called.
+func (w *Writer) writeFull(inbuf []byte) (errRet error) {
+ if err := w.err(nil); err != nil {
+ return err
+ }
+
+ if w.concurrency == 1 {
+ _, err := w.writeSync(inbuf[obufHeaderLen:])
+ return err
+ }
+
+ // Spawn goroutine and write block to output channel.
+ if !w.wroteStreamHeader {
+ w.wroteStreamHeader = true
+ hWriter := make(chan result)
+ w.output <- hWriter
+ if w.snappy {
+ hWriter <- []byte(magicChunkSnappy)
+ } else {
+ hWriter <- []byte(magicChunk)
+ }
+ }
+
+ // Get an output buffer.
+ obuf := w.buffers.Get().([]byte)[:w.obufLen]
+ uncompressed := inbuf[obufHeaderLen:]
+
+ output := make(chan result)
+ // Queue output now, so we keep order.
+ w.output <- output
+ go func() {
+ checksum := crc(uncompressed)
+
+ // Set to uncompressed.
+ chunkType := uint8(chunkTypeUncompressedData)
+ chunkLen := 4 + len(uncompressed)
+
+ // Attempt compressing.
+ n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
+ n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
+
+ // Check if we should use this, or store as uncompressed instead.
+ if n2 > 0 {
+ chunkType = uint8(chunkTypeCompressedData)
+ chunkLen = 4 + n + n2
+ obuf = obuf[:obufHeaderLen+n+n2]
+ } else {
+ // Use input as output.
+ obuf, inbuf = inbuf, obuf
+ }
+
+ // Fill in the per-chunk header that comes before the body.
+ obuf[0] = chunkType
+ obuf[1] = uint8(chunkLen >> 0)
+ obuf[2] = uint8(chunkLen >> 8)
+ obuf[3] = uint8(chunkLen >> 16)
+ obuf[4] = uint8(checksum >> 0)
+ obuf[5] = uint8(checksum >> 8)
+ obuf[6] = uint8(checksum >> 16)
+ obuf[7] = uint8(checksum >> 24)
+
+ // Queue final output.
+ output <- obuf
+
+ // Put unused buffer back in pool.
+ w.buffers.Put(inbuf)
+ }()
+ return nil
+}
+
+func (w *Writer) writeSync(p []byte) (nRet int, errRet error) {
+ if err := w.err(nil); err != nil {
+ return 0, err
+ }
+ if !w.wroteStreamHeader {
+ w.wroteStreamHeader = true
+ var n int
+ var err error
+ if w.snappy {
+ n, err = w.writer.Write([]byte(magicChunkSnappy))
+ } else {
+ n, err = w.writer.Write([]byte(magicChunk))
+ }
+ if err != nil {
+ return 0, w.err(err)
+ }
+ if n != len(magicChunk) {
+ return 0, w.err(io.ErrShortWrite)
+ }
+ w.written += int64(n)
+ }
+
+ for len(p) > 0 {
+ var uncompressed []byte
+ if len(p) > w.blockSize {
+ uncompressed, p = p[:w.blockSize], p[w.blockSize:]
+ } else {
+ uncompressed, p = p, nil
+ }
+
+ obuf := w.buffers.Get().([]byte)[:w.obufLen]
+ checksum := crc(uncompressed)
+
+ // Set to uncompressed.
+ chunkType := uint8(chunkTypeUncompressedData)
+ chunkLen := 4 + len(uncompressed)
+
+ // Attempt compressing.
+ n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
+ n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
+
+ if n2 > 0 {
+ chunkType = uint8(chunkTypeCompressedData)
+ chunkLen = 4 + n + n2
+ obuf = obuf[:obufHeaderLen+n+n2]
+ } else {
+ obuf = obuf[:8]
+ }
+
+ // Fill in the per-chunk header that comes before the body.
+ obuf[0] = chunkType
+ obuf[1] = uint8(chunkLen >> 0)
+ obuf[2] = uint8(chunkLen >> 8)
+ obuf[3] = uint8(chunkLen >> 16)
+ obuf[4] = uint8(checksum >> 0)
+ obuf[5] = uint8(checksum >> 8)
+ obuf[6] = uint8(checksum >> 16)
+ obuf[7] = uint8(checksum >> 24)
+
+ n, err := w.writer.Write(obuf)
+ if err != nil {
+ return 0, w.err(err)
+ }
+ if n != len(obuf) {
+ return 0, w.err(io.ErrShortWrite)
+ }
+ w.written += int64(n)
+ if chunkType == chunkTypeUncompressedData {
+ // Write uncompressed data.
+ n, err := w.writer.Write(uncompressed)
+ if err != nil {
+ return 0, w.err(err)
+ }
+ if n != len(uncompressed) {
+ return 0, w.err(io.ErrShortWrite)
+ }
+ w.written += int64(n)
+ }
+ w.buffers.Put(obuf)
+ // Queue final output.
+ nRet += len(uncompressed)
+ }
+ return nRet, nil
+}
+
+// Flush flushes the Writer to its underlying io.Writer.
+// This does not apply padding.
+func (w *Writer) Flush() error {
+ if err := w.err(nil); err != nil {
+ return err
+ }
+
+ // Queue any data still in input buffer.
+ if len(w.ibuf) != 0 {
+ if !w.wroteStreamHeader {
+ _, err := w.writeSync(w.ibuf)
+ w.ibuf = w.ibuf[:0]
+ return w.err(err)
+ } else {
+ _, err := w.write(w.ibuf)
+ w.ibuf = w.ibuf[:0]
+ err = w.err(err)
+ if err != nil {
+ return err
+ }
+ }
+ }
+ if w.output == nil {
+ return w.err(nil)
+ }
+
+ // Send empty buffer
+ res := make(chan result)
+ w.output <- res
+ // Block until this has been picked up.
+ res <- nil
+ // When it is closed, we have flushed.
+ <-res
+ return w.err(nil)
+}
+
+// Close calls Flush and then closes the Writer.
+// Calling Close multiple times is ok.
+func (w *Writer) Close() error {
+ err := w.Flush()
+ if w.output != nil {
+ close(w.output)
+ w.writerWg.Wait()
+ w.output = nil
+ }
+ if w.err(nil) == nil && w.writer != nil && w.pad > 0 {
+ add := calcSkippableFrame(w.written, int64(w.pad))
+ frame, err := skippableFrame(w.ibuf[:0], add, w.randSrc)
+ if err = w.err(err); err != nil {
+ return err
+ }
+ _, err2 := w.writer.Write(frame)
+ _ = w.err(err2)
+ }
+ _ = w.err(errClosed)
+ if err == errClosed {
+ return nil
+ }
+ return err
+}
+
// skippableFrameHeader is the size in bytes of a skippable frame's header
// (one chunk type byte followed by a three byte length).
const skippableFrameHeader = 4

// calcSkippableFrame returns the total number of bytes to add so that
// written becomes divisible by wantMultiple.
// The result is 0 when written is already a multiple; otherwise it is at
// least skippableFrameHeader, adding further multiples when the remaining
// gap is too small to hold a frame header.
// The function will panic if written < 0 or wantMultiple <= 0.
func calcSkippableFrame(written, wantMultiple int64) int {
	switch {
	case wantMultiple <= 0:
		panic("wantMultiple <= 0")
	case written < 0:
		panic("written < 0")
	}

	rem := written % wantMultiple
	if rem == 0 {
		return 0
	}
	pad := wantMultiple - rem
	for pad < skippableFrameHeader {
		pad += wantMultiple
	}
	return int(pad)
}
+
+// skippableFrame will add a skippable frame with a total size of bytes.
+// total should be >= skippableFrameHeader and < maxBlockSize + skippableFrameHeader
+func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) {
+ if total == 0 {
+ return dst, nil
+ }
+ if total < skippableFrameHeader {
+ return dst, fmt.Errorf("s2: requested skippable frame (%d) < 4", total)
+ }
+ if int64(total) >= maxBlockSize+skippableFrameHeader {
+ return dst, fmt.Errorf("s2: requested skippable frame (%d) >= max 1<<24", total)
+ }
+ // Chunk type 0xfe "Section 4.4 Padding (chunk type 0xfe)"
+ dst = append(dst, chunkTypePadding)
+ f := uint32(total - skippableFrameHeader)
+ // Add chunk length.
+ dst = append(dst, uint8(f), uint8(f>>8), uint8(f>>16))
+ // Add data
+ start := len(dst)
+ dst = append(dst, make([]byte, f)...)
+ _, err := io.ReadFull(r, dst[start:])
+ return dst, err
+}
+
// WriterOption is an option for creating an encoder.
// Options are applied by NewWriter; a non-nil error aborts setup.
type WriterOption func(*Writer) error
+
+// WriterConcurrency will set the concurrency,
+// meaning the maximum number of decoders to run concurrently.
+// The value supplied must be at least 1.
+// By default this will be set to GOMAXPROCS.
+func WriterConcurrency(n int) WriterOption {
+ return func(w *Writer) error {
+ if n <= 0 {
+ return errors.New("concurrency must be at least 1")
+ }
+ w.concurrency = n
+ return nil
+ }
+}
+
+// WriterBetterCompression will enable better compression.
+// EncodeBetter compresses better than Encode but typically with a
+// 10-40% speed decrease on both compression and decompression.
+func WriterBetterCompression() WriterOption {
+ return func(w *Writer) error {
+ w.level = levelBetter
+ return nil
+ }
+}
+
+// WriterBestCompression will enable better compression.
+// EncodeBetter compresses better than Encode but typically with a
+// big speed decrease on compression.
+func WriterBestCompression() WriterOption {
+ return func(w *Writer) error {
+ w.level = levelBest
+ return nil
+ }
+}
+
+// WriterUncompressed will bypass compression.
+// The stream will be written as uncompressed blocks only.
+// If concurrency is > 1 CRC and output will still be done async.
+func WriterUncompressed() WriterOption {
+ return func(w *Writer) error {
+ w.level = levelUncompressed
+ return nil
+ }
+}
+
+// WriterBlockSize allows to override the default block size.
+// Blocks will be this size or smaller.
+// Minimum size is 4KB and and maximum size is 4MB.
+//
+// Bigger blocks may give bigger throughput on systems with many cores,
+// and will increase compression slightly, but it will limit the possible
+// concurrency for smaller payloads for both encoding and decoding.
+// Default block size is 1MB.
+//
+// When writing Snappy compatible output using WriterSnappyCompat,
+// the maximum block size is 64KB.
+func WriterBlockSize(n int) WriterOption {
+ return func(w *Writer) error {
+ if w.snappy && n > maxSnappyBlockSize || n < minBlockSize {
+ return errors.New("s2: block size too large. Must be <= 64K and >=4KB on for snappy compatible output")
+ }
+ if n > maxBlockSize || n < minBlockSize {
+ return errors.New("s2: block size too large. Must be <= 4MB and >=4KB")
+ }
+ w.blockSize = n
+ return nil
+ }
+}
+
+// WriterPadding will add padding to all output so the size will be a multiple of n.
+// This can be used to obfuscate the exact output size or make blocks of a certain size.
+// The contents will be a skippable frame, so it will be invisible by the decoder.
+// n must be > 0 and <= 4MB.
+// The padded area will be filled with data from crypto/rand.Reader.
+// The padding will be applied whenever Close is called on the writer.
+func WriterPadding(n int) WriterOption {
+ return func(w *Writer) error {
+ if n <= 0 {
+ return fmt.Errorf("s2: padding must be at least 1")
+ }
+ // No need to waste our time.
+ if n == 1 {
+ w.pad = 0
+ }
+ if n > maxBlockSize {
+ return fmt.Errorf("s2: padding must less than 4MB")
+ }
+ w.pad = n
+ return nil
+ }
+}
+
+// WriterPaddingSrc will get random data for padding from the supplied source.
+// By default crypto/rand is used.
+func WriterPaddingSrc(reader io.Reader) WriterOption {
+ return func(w *Writer) error {
+ w.randSrc = reader
+ return nil
+ }
+}
+
+// WriterSnappyCompat will write snappy compatible output.
+// The output can be decompressed using either snappy or s2.
+// If block size is more than 64KB it is set to that.
+func WriterSnappyCompat() WriterOption {
+ return func(w *Writer) error {
+ w.snappy = true
+ if w.blockSize > 64<<10 {
+ // We choose 8 bytes less than 64K, since that will make literal emits slightly more effective.
+ // And allows us to skip some size checks.
+ w.blockSize = (64 << 10) - 8
+ }
+ return nil
+ }
+}
+
+// WriterFlushOnWrite will compress blocks on each call to the Write function.
+//
+// This is quite inefficient as blocks size will depend on the write size.
+//
+// Use WriterConcurrency(1) to also make sure that output is flushed.
+// When Write calls return, otherwise they will be written when compression is done.
+func WriterFlushOnWrite() WriterOption {
+ return func(w *Writer) error {
+ w.flushOnWrite = true
+ return nil
+ }
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go
new file mode 100644
index 00000000..8b16c38a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_all.go
@@ -0,0 +1,456 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+ "bytes"
+ "encoding/binary"
+ "math/bits"
+)
+
+// load32 reads a little-endian uint32 from b starting at index i.
+func load32(b []byte, i int) uint32 {
+	tail := b[i:]
+	return binary.LittleEndian.Uint32(tail)
+}
+
+// load64 reads a little-endian uint64 from b starting at index i.
+func load64(b []byte, i int) uint64 {
+	tail := b[i:]
+	return binary.LittleEndian.Uint64(tail)
+}
+
+// hash6 hashes the lowest 6 bytes of u into a value that fits a hash table
+// with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash6(u uint64, h uint8) uint32 {
+	const prime6bytes = 227718039650203
+	// Shifting left by 16 drops the two highest bytes, so only the lowest
+	// 6 bytes of u contribute to the product.
+	low6 := u << (64 - 48)
+	shift := (64 - h) & 63
+	return uint32((low6 * prime6bytes) >> shift)
+}
+
+// encodeGo returns the encoded form of src. dst is used as the output buffer
+// when it is large enough; otherwise a new buffer of MaxEncodedLen(len(src))
+// bytes is allocated. It panics with ErrTooLarge when MaxEncodedLen reports a
+// negative size.
+func encodeGo(dst, src []byte) []byte {
+	need := MaxEncodedLen(len(src))
+	if need < 0 {
+		panic(ErrTooLarge)
+	}
+	if len(dst) < need {
+		dst = make([]byte, need)
+	}
+
+	// The block starts with the varint-encoded length of the decompressed bytes.
+	d := binary.PutUvarint(dst, uint64(len(src)))
+
+	switch {
+	case len(src) == 0:
+		return dst[:d]
+	case len(src) >= minNonLiteralBlockSize:
+		if written := encodeBlockGo(dst[d:], src); written > 0 {
+			return dst[:d+written]
+		}
+	}
+
+	// Either too short to attempt compression or not compressible:
+	// store the input as a single literal.
+	d += emitLiteral(dst[d:], src)
+	return dst[:d]
+}
+
+// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+// len(dst) >= MaxEncodedLen(len(src)) &&
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//
+// It returns the number of bytes written to dst, or 0 when the output would
+// exceed dstLimit (computed below), i.e. the block did not compress enough.
+func encodeBlockGo(dst, src []byte) (d int) {
+ // Initialize the hash table.
+ const (
+ tableBits = 14
+ maxTableSize = 1 << tableBits
+
+ // debug enables the extra match validation blocks below.
+ debug = false
+ )
+
+ // table maps a hash6 value to the most recent src position stored for it.
+ var table [maxTableSize]uint32
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+
+ // Bail if we can't compress to at least this.
+ // Shift binds tighter than subtraction: len(src) - (len(src)>>5) - 5.
+ dstLimit := len(src) - len(src)>>5 - 5
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ cv := load64(src, s)
+
+ // We search for a repeat at -1, but don't output repeats when nextEmit == 0
+ repeat := 1
+
+ for {
+ candidate := 0
+ for {
+ // Next src position to check
+ // The step is 4 plus one for every 64 bytes since the last emit.
+ nextS := s + (s-nextEmit)>>6 + 4
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ // hash0, hash1 and hash2 hash the bytes at s, s+1 and s+2.
+ hash0 := hash6(cv, tableBits)
+ hash1 := hash6(cv>>8, tableBits)
+ candidate = int(table[hash0])
+ candidate2 := int(table[hash1])
+ table[hash0] = uint32(s)
+ table[hash1] = uint32(s + 1)
+ hash2 := hash6(cv>>16, tableBits)
+
+ // Check repeat at offset checkRep.
+ // Compares 4 bytes at s+checkRep with the 4 bytes `repeat` bytes before them.
+ const checkRep = 1
+ if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+ base := s + checkRep
+ // Extend back
+ for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+
+ // Extend forward
+ // This candidate shadows the outer one for the rest of this branch.
+ candidate := s - repeat + 4 + checkRep
+ s += 4 + checkRep
+ for s <= sLimit {
+ // XOR of 8 bytes; the trailing zero bits count the matching low bytes.
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+ if debug {
+ // Validate match.
+ if s <= candidate {
+ panic("s <= candidate")
+ }
+ a := src[base:s]
+ b := src[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+ if nextEmit > 0 {
+ // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+ d += emitRepeat(dst[d:], repeat, s-base)
+ } else {
+ // First match, cannot be repeat.
+ d += emitCopy(dst[d:], repeat, s-base)
+ }
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ cv = load64(src, s)
+ continue
+ }
+
+ // Table candidate for s: 4 bytes match, stop searching.
+ if uint32(cv) == load32(src, candidate) {
+ break
+ }
+ candidate = int(table[hash2])
+ // Table candidate for s+1.
+ if uint32(cv>>8) == load32(src, candidate2) {
+ table[hash2] = uint32(s + 2)
+ candidate = candidate2
+ s++
+ break
+ }
+ table[hash2] = uint32(s + 2)
+ // Table candidate for s+2.
+ if uint32(cv>>16) == load32(src, candidate) {
+ s += 2
+ break
+ }
+
+ // No match at s, s+1 or s+2: skip ahead.
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards.
+ // The top bytes will be rechecked to get the full match.
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+ candidate--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+
+ d += emitLiteral(dst[d:], src[nextEmit:s])
+
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+ // Remember the match distance; the repeat check above uses it.
+ repeat = base - candidate
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidate += 4
+ for s <= len(src)-8 {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+
+ d += emitCopy(dst[d:], repeat, s-base)
+ if debug {
+ // Validate match.
+ if s <= candidate {
+ panic("s <= candidate")
+ }
+ a := src[base:s]
+ b := src[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+ // Check for an immediate match, otherwise start search at s+1
+ // x holds the 8 bytes starting at s-2, so x>>16 starts at s.
+ // Both positions s-2 and s are stored in the table.
+ x := load64(src, s-2)
+ m2Hash := hash6(x, tableBits)
+ currHash := hash6(x>>16, tableBits)
+ candidate = int(table[currHash])
+ table[m2Hash] = uint32(s - 2)
+ table[currHash] = uint32(s)
+ if debug && s == candidate {
+ panic("s == candidate")
+ }
+ if uint32(x>>16) != load32(src, candidate) {
+ cv = load64(src, s+1)
+ s++
+ break
+ }
+ }
+ }
+
+emitRemainder:
+ // Emit whatever is left after the last match as one literal.
+ if nextEmit < len(src) {
+ // Bail if we exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:])
+ }
+ return d
+}
+
+// encodeBlockSnappyGo encodes src to dst with the same search loop as
+// encodeBlockGo, but emits every match with emitCopyNoRepeat instead of
+// emitCopy/emitRepeat.
+//
+// It returns the number of bytes written to dst, or 0 when the output would
+// exceed dstLimit (computed below).
+//
+// NOTE(review): presumably the same preconditions as encodeBlockGo apply
+// (dst large enough, len(src) >= minNonLiteralBlockSize) - confirm with callers.
+func encodeBlockSnappyGo(dst, src []byte) (d int) {
+ // Initialize the hash table.
+ const (
+ tableBits = 14
+ maxTableSize = 1 << tableBits
+ )
+
+ // table maps a hash6 value to the most recent src position stored for it.
+ var table [maxTableSize]uint32
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+
+ // Bail if we can't compress to at least this.
+ // Shift binds tighter than subtraction: len(src) - (len(src)>>5) - 5.
+ dstLimit := len(src) - len(src)>>5 - 5
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ cv := load64(src, s)
+
+ // We search for a repeat at -1, but don't output repeats when nextEmit == 0
+ repeat := 1
+
+ for {
+ candidate := 0
+ for {
+ // Next src position to check
+ // The step is 4 plus one for every 64 bytes since the last emit.
+ nextS := s + (s-nextEmit)>>6 + 4
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ // hash0, hash1 and hash2 hash the bytes at s, s+1 and s+2.
+ hash0 := hash6(cv, tableBits)
+ hash1 := hash6(cv>>8, tableBits)
+ candidate = int(table[hash0])
+ candidate2 := int(table[hash1])
+ table[hash0] = uint32(s)
+ table[hash1] = uint32(s + 1)
+ hash2 := hash6(cv>>16, tableBits)
+
+ // Check repeat at offset checkRep.
+ // Compares 4 bytes at s+checkRep with the 4 bytes `repeat` bytes before them.
+ const checkRep = 1
+ if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+ base := s + checkRep
+ // Extend back
+ for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+
+ // Extend forward
+ // This candidate shadows the outer one for the rest of this branch.
+ candidate := s - repeat + 4 + checkRep
+ s += 4 + checkRep
+ for s <= sLimit {
+ // XOR of 8 bytes; the trailing zero bits count the matching low bytes.
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+
+ // The match is always written as a regular copy here.
+ d += emitCopyNoRepeat(dst[d:], repeat, s-base)
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ cv = load64(src, s)
+ continue
+ }
+
+ // Table candidate for s: 4 bytes match, stop searching.
+ if uint32(cv) == load32(src, candidate) {
+ break
+ }
+ candidate = int(table[hash2])
+ // Table candidate for s+1.
+ if uint32(cv>>8) == load32(src, candidate2) {
+ table[hash2] = uint32(s + 2)
+ candidate = candidate2
+ s++
+ break
+ }
+ table[hash2] = uint32(s + 2)
+ // Table candidate for s+2.
+ if uint32(cv>>16) == load32(src, candidate) {
+ s += 2
+ break
+ }
+
+ // No match at s, s+1 or s+2: skip ahead.
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+ candidate--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+
+ d += emitLiteral(dst[d:], src[nextEmit:s])
+
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+ // Remember the match distance; the repeat check above uses it.
+ repeat = base - candidate
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidate += 4
+ for s <= len(src)-8 {
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+
+ d += emitCopyNoRepeat(dst[d:], repeat, s-base)
+ // Disabled validation of the match that was just emitted.
+ if false {
+ // Validate match.
+ a := src[base:s]
+ b := src[base-repeat : base-repeat+(s-base)]
+ if !bytes.Equal(a, b) {
+ panic("mismatch")
+ }
+ }
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+ // Check for an immediate match, otherwise start search at s+1
+ // x holds the 8 bytes starting at s-2, so x>>16 starts at s.
+ // Both positions s-2 and s are stored in the table.
+ x := load64(src, s-2)
+ m2Hash := hash6(x, tableBits)
+ currHash := hash6(x>>16, tableBits)
+ candidate = int(table[currHash])
+ table[m2Hash] = uint32(s - 2)
+ table[currHash] = uint32(s)
+ if uint32(x>>16) != load32(src, candidate) {
+ cv = load64(src, s+1)
+ s++
+ break
+ }
+ }
+ }
+
+emitRemainder:
+ // Emit whatever is left after the last match as one literal.
+ if nextEmit < len(src) {
+ // Bail if we exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:])
+ }
+ return d
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
new file mode 100644
index 00000000..e612225f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
@@ -0,0 +1,142 @@
+//go:build !appengine && !noasm && gc
+// +build !appengine,!noasm,gc
+
+package s2
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst by
+// dispatching to the assembly routine selected by len(src). It assumes that
+// the varint-encoded length of the decompressed bytes has already been
+// written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//
+// It returns 0 without encoding when len(src) < minNonLiteralBlockSize.
+func encodeBlock(dst, src []byte) (d int) {
+	const (
+		// Use 12 bit table when less than...
+		limit12B = 16 << 10
+		// Use 10 bit table when less than...
+		limit10B = 4 << 10
+		// Use 8 bit table when less than...
+		limit8B = 512
+	)
+
+	// Cases are ordered from the largest threshold down; the first hit wins.
+	switch n := len(src); {
+	case n >= 4<<20:
+		return encodeBlockAsm(dst, src)
+	case n >= limit12B:
+		return encodeBlockAsm4MB(dst, src)
+	case n >= limit10B:
+		return encodeBlockAsm12B(dst, src)
+	case n >= limit8B:
+		return encodeBlockAsm10B(dst, src)
+	case n < minNonLiteralBlockSize:
+		return 0
+	default:
+		return encodeBlockAsm8B(dst, src)
+	}
+}
+
+// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst
+// by dispatching to the "better" assembly routine selected by len(src). It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//
+// It returns 0 without encoding when len(src) < minNonLiteralBlockSize.
+func encodeBlockBetter(dst, src []byte) (d int) {
+	const (
+		// Use 12 bit table when less than...
+		limit12B = 16 << 10
+		// Use 10 bit table when less than...
+		limit10B = 4 << 10
+		// Use 8 bit table when less than...
+		limit8B = 512
+	)
+
+	// Cases are ordered from the largest threshold down; the first hit wins.
+	// Note that the top threshold is strict (>), so exactly 4MB uses the 4MB routine.
+	switch n := len(src); {
+	case n > 4<<20:
+		return encodeBetterBlockAsm(dst, src)
+	case n >= limit12B:
+		return encodeBetterBlockAsm4MB(dst, src)
+	case n >= limit10B:
+		return encodeBetterBlockAsm12B(dst, src)
+	case n >= limit8B:
+		return encodeBetterBlockAsm10B(dst, src)
+	case n < minNonLiteralBlockSize:
+		return 0
+	default:
+		return encodeBetterBlockAsm8B(dst, src)
+	}
+}
+
+// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst
+// by dispatching to the snappy assembly routine selected by len(src). It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//
+// It returns 0 without encoding when len(src) < minNonLiteralBlockSize.
+func encodeBlockSnappy(dst, src []byte) (d int) {
+	const (
+		// Use 12 bit table when less than...
+		limit12B = 16 << 10
+		// Use 10 bit table when less than...
+		limit10B = 4 << 10
+		// Use 8 bit table when less than...
+		limit8B = 512
+	)
+
+	// Cases are ordered from the largest threshold down; the first hit wins.
+	switch n := len(src); {
+	case n >= 64<<10:
+		return encodeSnappyBlockAsm(dst, src)
+	case n >= limit12B:
+		return encodeSnappyBlockAsm64K(dst, src)
+	case n >= limit10B:
+		return encodeSnappyBlockAsm12B(dst, src)
+	case n >= limit8B:
+		return encodeSnappyBlockAsm10B(dst, src)
+	case n < minNonLiteralBlockSize:
+		return 0
+	default:
+		return encodeSnappyBlockAsm8B(dst, src)
+	}
+}
+
+// encodeBlockBetterSnappy encodes a non-empty src to a guaranteed-large-enough
+// dst by dispatching to the snappy "better" assembly routine selected by
+// len(src). It assumes that the varint-encoded length of the decompressed
+// bytes has already been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//
+// It returns 0 without encoding when len(src) < minNonLiteralBlockSize.
+func encodeBlockBetterSnappy(dst, src []byte) (d int) {
+	const (
+		// Use 12 bit table when less than...
+		limit12B = 16 << 10
+		// Use 10 bit table when less than...
+		limit10B = 4 << 10
+		// Use 8 bit table when less than...
+		limit8B = 512
+	)
+
+	// Cases are ordered from the largest threshold down; the first hit wins.
+	switch n := len(src); {
+	case n >= 64<<10:
+		return encodeSnappyBetterBlockAsm(dst, src)
+	case n >= limit12B:
+		return encodeSnappyBetterBlockAsm64K(dst, src)
+	case n >= limit10B:
+		return encodeSnappyBetterBlockAsm12B(dst, src)
+	case n >= limit8B:
+		return encodeSnappyBetterBlockAsm10B(dst, src)
+	case n < minNonLiteralBlockSize:
+		return 0
+	default:
+		return encodeSnappyBetterBlockAsm8B(dst, src)
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go
new file mode 100644
index 00000000..44803477
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_best.go
@@ -0,0 +1,604 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+ "fmt"
+ "math/bits"
+)
+
+// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+// len(dst) >= MaxEncodedLen(len(src)) &&
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//
+// It returns the number of bytes written to dst, or 0 when
+// len(src) < minNonLiteralBlockSize or when the output would exceed dstLimit
+// (computed below).
+func encodeBlockBest(dst, src []byte) (d int) {
+ // Initialize the hash tables.
+ const (
+ // Long hash matches.
+ lTableBits = 19
+ maxLTableSize = 1 << lTableBits
+
+ // Short hash matches.
+ sTableBits = 16
+ maxSTableSize = 1 << sTableBits
+
+ // Local margin used to compute sLimit in this function.
+ inputMargin = 8 + 2
+ )
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+ if len(src) < minNonLiteralBlockSize {
+ return 0
+ }
+
+ // Each table entry packs two src positions: the most recent one in the low
+ // 32 bits and the one stored before it in the high 32 bits (see getCur,
+ // getPrev and the table updates below).
+ var lTable [maxLTableSize]uint64
+ var sTable [maxSTableSize]uint64
+
+ // Bail if we can't compress to at least this.
+ dstLimit := len(src) - 5
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ cv := load64(src, s)
+
+ // We search for a repeat at -1, but don't output repeats when nextEmit == 0
+ repeat := 1
+ const lowbitMask = 0xffffffff
+ // getCur returns the position stored in the low 32 bits of a table entry.
+ getCur := func(x uint64) int {
+ return int(x & lowbitMask)
+ }
+ // getPrev returns the position stored in the high 32 bits of a table entry.
+ getPrev := func(x uint64) int {
+ return int(x >> 32)
+ }
+ // maxSkip caps how far nextS may be ahead of s.
+ const maxSkip = 64
+
+ for {
+ // match describes one candidate match:
+ // offset is the src position of the earlier data, s is the src position
+ // where the match starts, length is the number of matching bytes (0 means
+ // no usable match), score is the value computed by the score closure and
+ // rep is the flag passed to matchAt for repeat-offset candidates.
+ type match struct {
+ offset int
+ s int
+ length int
+ score int
+ rep bool
+ }
+ var best match
+ for {
+ // Next src position to check
+ // The step is 1 plus one for every 256 bytes since the last emit,
+ // capped at maxSkip.
+ nextS := (s-nextEmit)>>8 + 1
+ if nextS > maxSkip {
+ nextS = s + maxSkip
+ } else {
+ nextS += s
+ }
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ hashL := hash8(cv, lTableBits)
+ hashS := hash4(cv, sTableBits)
+ candidateL := lTable[hashL]
+ candidateS := sTable[hashS]
+
+ // score rates a match as its length minus its start position minus
+ // the encoded size reported by emitRepeatSize/emitCopySize.
+ score := func(m match) int {
+ // Matches that are longer forward are penalized since we must emit it as a literal.
+ score := m.length - m.s
+ if nextEmit == m.s {
+ // If we do not have to emit literals, we save 1 byte
+ score++
+ }
+ offset := m.s - m.offset
+ if m.rep {
+ return score - emitRepeatSize(offset, m.length)
+ }
+ return score - emitCopySize(offset, m.length)
+ }
+
+ // matchAt checks whether the 4 bytes at src[offset:] equal first and,
+ // if so, extends the match forward from s. A returned match with
+ // length 0 means "no usable match".
+ matchAt := func(offset, s int, first uint32, rep bool) match {
+ if best.length != 0 && best.s-best.offset == s-offset {
+ // Don't retest if we have the same offset.
+ return match{offset: offset, s: s}
+ }
+ if load32(src, offset) != first {
+ return match{offset: offset, s: s}
+ }
+ // While extending, m.length holds the absolute candidate position
+ // (starting at offset+4); it is turned into a length further down.
+ m := match{offset: offset, s: s, length: 4 + offset, rep: rep}
+ s += 4
+ for s <= sLimit {
+ if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
+ m.length += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ m.length += 8
+ }
+ m.length -= offset
+ m.score = score(m)
+ if m.score <= -m.s {
+ // Eliminate if no savings, we might find a better one.
+ m.length = 0
+ }
+ return m
+ }
+
+ // bestOf picks between two matches. A match with length 0 always
+ // loses; otherwise each score is offset by the other match's start
+ // position before comparing, and a wins ties.
+ bestOf := func(a, b match) match {
+ if b.length == 0 {
+ return a
+ }
+ if a.length == 0 {
+ return b
+ }
+ as := a.score + b.s
+ bs := b.score + a.s
+ if as >= bs {
+ return a
+ }
+ return b
+ }
+
+ // Try the current and previous positions from both tables at s.
+ best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false))
+
+ {
+ // Try the repeat offset one byte ahead, at s+1.
+ best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
+ if best.length > 0 {
+ // s+1
+ // s and cv are shadowed inside this block; the outer s and cv
+ // are not modified by the lookahead below.
+ nextShort := sTable[hash4(cv>>8, sTableBits)]
+ s := s + 1
+ cv := load64(src, s)
+ nextLong := lTable[hash8(cv, lTableBits)]
+ best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
+ // Repeat at + 2
+ best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
+
+ // s+2
+ if true {
+ nextShort = sTable[hash4(cv>>8, sTableBits)]
+ s++
+ cv = load64(src, s)
+ nextLong = lTable[hash8(cv, lTableBits)]
+ best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
+ best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
+ }
+ // Search for a match at best match end, see if that is better.
+ if sAt := best.s + best.length; sAt < sLimit {
+ sBack := best.s
+ backL := best.length
+ // Load initial values
+ cv = load64(src, sBack)
+ // Search for mismatch
+ next := lTable[hash8(load64(src, sAt), lTableBits)]
+ //next := sTable[hash4(load64(src, sAt), sTableBits)]
+
+ // Step the table positions back by the current match length so
+ // they line up with sBack.
+ if checkAt := getCur(next) - backL; checkAt > 0 {
+ best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+ }
+ if checkAt := getPrev(next) - backL; checkAt > 0 {
+ best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+ }
+ }
+ }
+ }
+
+ // Update table
+ // s becomes the current position; the old current position moves to the
+ // high 32 bits and the old previous position is shifted out.
+ lTable[hashL] = uint64(s) | candidateL<<32
+ sTable[hashS] = uint64(s) | candidateS<<32
+
+ if best.length > 0 {
+ break
+ }
+
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards, not needed for repeats...
+ s = best.s
+ if !best.rep {
+ for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
+ best.offset--
+ best.length++
+ s--
+ }
+ }
+ // Disabled sanity check.
+ if false && best.offset >= s {
+ panic(fmt.Errorf("t %d >= s %d", best.offset, s))
+ }
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // base is where the match starts; offset is the distance back to the
+ // earlier data.
+ base := s
+ offset := s - best.offset
+
+ s += best.length
+
+ if offset > 65535 && s-base <= 5 && !best.rep {
+ // Bail if the match is equal or worse to the encoding.
+ s = best.s + 1
+ if s >= sLimit {
+ goto emitRemainder
+ }
+ cv = load64(src, s)
+ continue
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ if best.rep {
+ if nextEmit > 0 {
+ // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+ d += emitRepeat(dst[d:], offset, best.length)
+ } else {
+ // First match, cannot be repeat.
+ d += emitCopy(dst[d:], offset, best.length)
+ }
+ } else {
+ d += emitCopy(dst[d:], offset, best.length)
+ }
+ repeat = offset
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+ // Fill tables...
+ // Store every position from best.s+1 up to, but not including, s.
+ for i := best.s + 1; i < s; i++ {
+ cv0 := load64(src, i)
+ long0 := hash8(cv0, lTableBits)
+ short0 := hash4(cv0, sTableBits)
+ lTable[long0] = uint64(i) | lTable[long0]<<32
+ sTable[short0] = uint64(i) | sTable[short0]<<32
+ }
+ cv = load64(src, s)
+ }
+
+emitRemainder:
+ // Emit whatever is left after the last match as one literal.
+ if nextEmit < len(src) {
+ // Bail if we exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:])
+ }
+ return d
+}
+
+// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+// len(dst) >= MaxEncodedLen(len(src)) &&
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//
+// Every match is emitted with emitCopyNoRepeat. It returns the number of bytes
+// written to dst, or 0 when len(src) < minNonLiteralBlockSize or when the
+// output would exceed dstLimit (computed below).
+func encodeBlockBestSnappy(dst, src []byte) (d int) {
+ // Initialize the hash tables.
+ const (
+ // Long hash matches.
+ lTableBits = 19
+ maxLTableSize = 1 << lTableBits
+
+ // Short hash matches.
+ sTableBits = 16
+ maxSTableSize = 1 << sTableBits
+
+ // Local margin used to compute sLimit in this function.
+ inputMargin = 8 + 2
+ )
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+ if len(src) < minNonLiteralBlockSize {
+ return 0
+ }
+
+ // Each table entry packs two src positions: the most recent one in the low
+ // 32 bits and the one stored before it in the high 32 bits (see getCur,
+ // getPrev and the table updates below).
+ var lTable [maxLTableSize]uint64
+ var sTable [maxSTableSize]uint64
+
+ // Bail if we can't compress to at least this.
+ dstLimit := len(src) - 5
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ cv := load64(src, s)
+
+ // We search for a repeat at -1, but don't output repeats when nextEmit == 0
+ repeat := 1
+ const lowbitMask = 0xffffffff
+ // getCur returns the position stored in the low 32 bits of a table entry.
+ getCur := func(x uint64) int {
+ return int(x & lowbitMask)
+ }
+ // getPrev returns the position stored in the high 32 bits of a table entry.
+ getPrev := func(x uint64) int {
+ return int(x >> 32)
+ }
+ // maxSkip caps how far nextS may be ahead of s.
+ const maxSkip = 64
+
+ for {
+ // match describes one candidate match:
+ // offset is the src position of the earlier data, s is the src position
+ // where the match starts, length is the number of matching bytes (0 means
+ // no usable match) and score is the value computed by the score closure.
+ type match struct {
+ offset int
+ s int
+ length int
+ score int
+ }
+ var best match
+ for {
+ // Next src position to check
+ // The step is 1 plus one for every 256 bytes since the last emit,
+ // capped at maxSkip.
+ nextS := (s-nextEmit)>>8 + 1
+ if nextS > maxSkip {
+ nextS = s + maxSkip
+ } else {
+ nextS += s
+ }
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ hashL := hash8(cv, lTableBits)
+ hashS := hash4(cv, sTableBits)
+ candidateL := lTable[hashL]
+ candidateS := sTable[hashS]
+
+ // score rates a match as its length minus its start position minus
+ // the encoded size reported by emitCopySize.
+ score := func(m match) int {
+ // Matches that are longer forward are penalized since we must emit it as a literal.
+ score := m.length - m.s
+ if nextEmit == m.s {
+ // If we do not have to emit literals, we save 1 byte
+ score++
+ }
+ offset := m.s - m.offset
+
+ return score - emitCopySize(offset, m.length)
+ }
+
+ // matchAt checks whether the 4 bytes at src[offset:] equal first and,
+ // if so, extends the match forward from s. A returned match with
+ // length 0 means "no usable match".
+ matchAt := func(offset, s int, first uint32) match {
+ if best.length != 0 && best.s-best.offset == s-offset {
+ // Don't retest if we have the same offset.
+ return match{offset: offset, s: s}
+ }
+ if load32(src, offset) != first {
+ return match{offset: offset, s: s}
+ }
+ // While extending, m.length holds the absolute candidate position
+ // (starting at offset+4); it is turned into a length further down.
+ m := match{offset: offset, s: s, length: 4 + offset}
+ s += 4
+ for s <= sLimit {
+ if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
+ m.length += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ m.length += 8
+ }
+ m.length -= offset
+ m.score = score(m)
+ if m.score <= -m.s {
+ // Eliminate if no savings, we might find a better one.
+ m.length = 0
+ }
+ return m
+ }
+
+ // bestOf picks between two matches. A match with length 0 always
+ // loses; otherwise each score is offset by the other match's start
+ // position before comparing, and a wins ties.
+ bestOf := func(a, b match) match {
+ if b.length == 0 {
+ return a
+ }
+ if a.length == 0 {
+ return b
+ }
+ as := a.score + b.s
+ bs := b.score + a.s
+ if as >= bs {
+ return a
+ }
+ return b
+ }
+
+ // Try the current and previous positions from both tables at s.
+ best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv)))
+ best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv)))
+ best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv)))
+
+ {
+ // Try the last used offset one byte ahead, at s+1.
+ best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8)))
+ if best.length > 0 {
+ // s+1
+ // s and cv are shadowed inside this block; the outer s and cv
+ // are not modified by the lookahead below.
+ nextShort := sTable[hash4(cv>>8, sTableBits)]
+ s := s + 1
+ cv := load64(src, s)
+ nextLong := lTable[hash8(cv, lTableBits)]
+ best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
+ best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
+ best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
+ best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
+ // Repeat at + 2
+ best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8)))
+
+ // s+2
+ if true {
+ nextShort = sTable[hash4(cv>>8, sTableBits)]
+ s++
+ cv = load64(src, s)
+ nextLong = lTable[hash8(cv, lTableBits)]
+ best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
+ best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
+ best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
+ best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
+ }
+ // Search for a match at best match end, see if that is better.
+ if sAt := best.s + best.length; sAt < sLimit {
+ sBack := best.s
+ backL := best.length
+ // Load initial values
+ cv = load64(src, sBack)
+ // Search for mismatch
+ next := lTable[hash8(load64(src, sAt), lTableBits)]
+ //next := sTable[hash4(load64(src, sAt), sTableBits)]
+
+ // Step the table positions back by the current match length so
+ // they line up with sBack.
+ if checkAt := getCur(next) - backL; checkAt > 0 {
+ best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
+ }
+ if checkAt := getPrev(next) - backL; checkAt > 0 {
+ best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
+ }
+ }
+ }
+ }
+
+ // Update table
+ // s becomes the current position; the old current position moves to the
+ // high 32 bits and the old previous position is shifted out.
+ lTable[hashL] = uint64(s) | candidateL<<32
+ sTable[hashS] = uint64(s) | candidateS<<32
+
+ if best.length > 0 {
+ break
+ }
+
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards, not needed for repeats...
+ // This variant has no rep flag on match, so the extension always runs.
+ s = best.s
+ if true {
+ for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
+ best.offset--
+ best.length++
+ s--
+ }
+ }
+ // Disabled sanity check.
+ if false && best.offset >= s {
+ panic(fmt.Errorf("t %d >= s %d", best.offset, s))
+ }
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // base is where the match starts; offset is the distance back to the
+ // earlier data.
+ base := s
+ offset := s - best.offset
+
+ s += best.length
+
+ if offset > 65535 && s-base <= 5 {
+ // Bail if the match is equal or worse to the encoding.
+ s = best.s + 1
+ if s >= sLimit {
+ goto emitRemainder
+ }
+ cv = load64(src, s)
+ continue
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ d += emitCopyNoRepeat(dst[d:], offset, best.length)
+ repeat = offset
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+ // Fill tables...
+ // Store every position from best.s+1 up to, but not including, s.
+ for i := best.s + 1; i < s; i++ {
+ cv0 := load64(src, i)
+ long0 := hash8(cv0, lTableBits)
+ short0 := hash4(cv0, sTableBits)
+ lTable[long0] = uint64(i) | lTable[long0]<<32
+ sTable[short0] = uint64(i) | sTable[short0]<<32
+ }
+ cv = load64(src, s)
+ }
+
+emitRemainder:
+ // Emit whatever is left after the last match as one literal.
+ if nextEmit < len(src) {
+ // Bail if we exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:])
+ }
+ return d
+}
+
+// emitCopySize reports how many bytes are needed to encode a copy with the
+// given offset and length.
+//
+// It assumes that:
+//	1 <= offset && offset <= math.MaxUint32
+//	4 <= length && length <= 1 << 24
+func emitCopySize(offset, length int) int {
+	if offset < 65536 {
+		// Offset no more than 2 bytes.
+		switch {
+		case length > 64:
+			// Emit remaining as repeats, at least 4 bytes remain.
+			return 3 + emitRepeatSize(offset, length-60)
+		case length >= 12 || offset >= 2048:
+			return 3
+		default:
+			// Emit the remaining copy, encoded as 2 bytes.
+			return 2
+		}
+	}
+
+	// Long offsets cost 5 bytes per emitted copy.
+	if length <= 64 {
+		if length == 0 {
+			return 0
+		}
+		return 5
+	}
+
+	// The first 64 bytes take one 5-byte copy.
+	rest := length - 64
+	if rest >= 4 {
+		// Emit remaining as repeats
+		return 5 + emitRepeatSize(offset, rest)
+	}
+	// 1 to 3 bytes remain: counted as a second 5-byte copy.
+	return 5 + 5
+}
+
+// emitRepeatSize reports how many bytes are needed to encode a repeat of the
+// given length; offset only matters for the shortest encoding.
+// Length must be at least 4 and < 1<<24
+func emitRepeatSize(offset, length int) int {
+	// Repeat offset, make length cheaper
+	switch {
+	case length <= 4+4, length < 8+4 && offset < 2048:
+		return 2
+	case length < (1<<8)+4+4:
+		return 3
+	case length < (1<<16)+(1<<8)+4:
+		return 4
+	}
+
+	const maxRepeat = (1 << 24) - 1
+	length -= (1 << 16) - 4
+	if length <= maxRepeat {
+		return 5
+	}
+	// More than one repeat can hold: count the rest as further repeats.
+	left := length - maxRepeat + 4
+	return 5 + emitRepeatSize(offset, left)
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go
new file mode 100644
index 00000000..943215b8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_better.go
@@ -0,0 +1,431 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+ "math/bits"
+)
+
+ // hash4 hashes the lowest 4 bytes of u down to an h-bit table index.
+ // Preferably h should be a constant and should always be <32.
+ func hash4(u uint64, h uint8) uint32 {
+ 	const prime4bytes = 2654435761
+ 	// Keep the top h bits of the 32 bit product.
+ 	shift := (32 - h) & 31
+ 	product := uint32(u) * prime4bytes
+ 	return product >> shift
+ }
+
+ // hash5 hashes the lowest 5 bytes of u down to an h-bit table index.
+ // Preferably h should be a constant and should always be <64.
+ func hash5(u uint64, h uint8) uint32 {
+ 	const prime5bytes = 889523592379
+ 	// Shifting left by 64-40 bits discards everything above the low 5 bytes.
+ 	low5 := u << (64 - 40)
+ 	shift := (64 - h) & 63
+ 	return uint32((low5 * prime5bytes) >> shift)
+ }
+
+ // hash7 hashes the lowest 7 bytes of u down to an h-bit table index.
+ // Preferably h should be a constant and should always be <64.
+ func hash7(u uint64, h uint8) uint32 {
+ 	const prime7bytes = 58295818150454627
+ 	// Shifting left by 64-56 bits discards the top byte of u.
+ 	low7 := u << (64 - 56)
+ 	shift := (64 - h) & 63
+ 	return uint32((low7 * prime7bytes) >> shift)
+ }
+
+ // hash8 hashes all 8 bytes of u down to an h-bit table index.
+ // Preferably h should be a constant and should always be <64.
+ func hash8(u uint64, h uint8) uint32 {
+ 	const prime8bytes = 0xcf1bbcdcb7a56463
+ 	// Keep the top h bits of the 64 bit product.
+ 	shift := (64 - h) & 63
+ 	product := u * prime8bytes
+ 	return uint32(product >> shift)
+ }
+
+ // encodeBlockBetterGo encodes a non-empty src to a guaranteed-large-enough dst. It
+ // assumes that the varint-encoded length of the decompressed bytes has already
+ // been written.
+ //
+ // It returns the number of bytes written to dst, or 0 when src is shorter than
+ // minNonLiteralBlockSize or when the output would grow past dstLimit.
+ //
+ // It also assumes that:
+ // len(dst) >= MaxEncodedLen(len(src)) &&
+ // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+ func encodeBlockBetterGo(dst, src []byte) (d int) {
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+ if len(src) < minNonLiteralBlockSize {
+ return 0
+ }
+
+ // Initialize the hash tables.
+ const (
+ // Long hash matches.
+ lTableBits = 16
+ maxLTableSize = 1 << lTableBits
+
+ // Short hash matches.
+ sTableBits = 14
+ maxSTableSize = 1 << sTableBits
+ )
+
+ // Both tables map a hash to a position in src (stored as uint32).
+ // lTable is indexed with hash7, sTable with hash4.
+ var lTable [maxLTableSize]uint32
+ var sTable [maxSTableSize]uint32
+
+ // Bail if we can't compress to at least this.
+ // That is len(src) minus 1/32 of len(src) minus 6 bytes.
+ dstLimit := len(src) - len(src)>>5 - 6
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ cv := load64(src, s)
+
+ // We initialize repeat to 0, so we never match on first attempt
+ repeat := 0
+
+ // Each pass of the outer loop finds one match, emits the pending literals
+ // and the match, then indexes a few positions inside the match.
+ for {
+ candidateL := 0
+ nextS := 0
+ // Inner loop: advance s until a long or short candidate matches.
+ for {
+ // Next src position to check
+ // The step grows by one for every 128 bytes since the last emit.
+ nextS = s + (s-nextEmit)>>7 + 1
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ hashL := hash7(cv, lTableBits)
+ hashS := hash4(cv, sTableBits)
+ candidateL = int(lTable[hashL])
+ candidateS := int(sTable[hashS])
+ lTable[hashL] = uint32(s)
+ sTable[hashS] = uint32(s)
+
+ // Check repeat at offset checkRep.
+ // NOTE: the leading `false` operand disables this branch, so the
+ // repeat check below never runs.
+ const checkRep = 1
+ if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+ base := s + checkRep
+ // Extend back
+ for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+ i--
+ base--
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+
+ // Extend forward
+ candidate := s - repeat + 4 + checkRep
+ s += 4 + checkRep
+ for s < len(src) {
+ if len(src)-s < 8 {
+ if src[s] == src[candidate] {
+ s++
+ candidate++
+ continue
+ }
+ break
+ }
+ if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidate += 8
+ }
+ if nextEmit > 0 {
+ // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+ d += emitRepeat(dst[d:], repeat, s-base)
+ } else {
+ // First match, cannot be repeat.
+ d += emitCopy(dst[d:], repeat, s-base)
+ }
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ cv = load64(src, s)
+ continue
+ }
+
+ // Check our long candidate
+ if uint32(cv) == load32(src, candidateL) {
+ break
+ }
+
+ // Check our short candidate
+ if uint32(cv) == load32(src, candidateS) {
+ // Try a long candidate at s+1
+ hashL = hash7(cv>>8, lTableBits)
+ candidateL = int(lTable[hashL])
+ lTable[hashL] = uint32(s + 1)
+ if uint32(cv>>8) == load32(src, candidateL) {
+ s++
+ break
+ }
+ // Use our short candidate.
+ candidateL = candidateS
+ break
+ }
+
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards
+ for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
+ candidateL--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // base is the start of the match in src, offset its distance back to
+ // the candidate.
+ base := s
+ offset := base - candidateL
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidateL += 4
+ for s < len(src) {
+ // Fewer than 8 bytes left: compare one byte at a time.
+ if len(src)-s < 8 {
+ if src[s] == src[candidateL] {
+ s++
+ candidateL++
+ continue
+ }
+ break
+ }
+ // Compare 8 bytes at once; the trailing zero count of the XOR,
+ // divided by 8, is the number of matching bytes.
+ if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidateL += 8
+ }
+
+ if offset > 65535 && s-base <= 5 && repeat != offset {
+ // Bail if the match is equal or worse to the encoding.
+ s = nextS + 1
+ if s >= sLimit {
+ goto emitRemainder
+ }
+ cv = load64(src, s)
+ continue
+ }
+
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ // A match at the previous offset is emitted as a repeat; otherwise a
+ // copy is emitted and its offset remembered.
+ if repeat == offset {
+ d += emitRepeat(dst[d:], offset, s-base)
+ } else {
+ d += emitCopy(dst[d:], offset, s-base)
+ repeat = offset
+ }
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+ // Index match start+1 (long) and start+2 (short)
+ index0 := base + 1
+ // Index match end-2 (long) and end-1 (short)
+ index1 := s - 2
+
+ cv0 := load64(src, index0)
+ cv1 := load64(src, index1)
+ cv = load64(src, s)
+ lTable[hash7(cv0, lTableBits)] = uint32(index0)
+ lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
+ lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
+ sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+ sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+ sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ }
+
+ // Whatever follows the last match is emitted as one final literal.
+ emitRemainder:
+ if nextEmit < len(src) {
+ // Bail if we exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:])
+ }
+ return d
+ }
+
+ // encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
+ // assumes that the varint-encoded length of the decompressed bytes has already
+ // been written.
+ //
+ // Matches are always written with emitCopyNoRepeat. It returns the number of
+ // bytes written to dst, or 0 when src is shorter than minNonLiteralBlockSize
+ // or when the output would grow past dstLimit.
+ //
+ // It also assumes that:
+ // len(dst) >= MaxEncodedLen(len(src)) &&
+ // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+ if len(src) < minNonLiteralBlockSize {
+ return 0
+ }
+
+ // Initialize the hash tables.
+ const (
+ // Long hash matches.
+ lTableBits = 16
+ maxLTableSize = 1 << lTableBits
+
+ // Short hash matches.
+ sTableBits = 14
+ maxSTableSize = 1 << sTableBits
+ )
+
+ // Both tables map a hash to a position in src (stored as uint32).
+ // lTable is indexed with hash7, sTable with hash4.
+ var lTable [maxLTableSize]uint32
+ var sTable [maxSTableSize]uint32
+
+ // Bail if we can't compress to at least this.
+ // That is len(src) minus 1/32 of len(src) minus 6 bytes.
+ dstLimit := len(src) - len(src)>>5 - 6
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ cv := load64(src, s)
+
+ // We initialize repeat to 0, so we never match on first attempt
+ // Here repeat is only read by the `repeat != offset` test further down.
+ repeat := 0
+ // maxSkip caps how far a single search step may advance s.
+ const maxSkip = 100
+
+ // Each pass of the outer loop finds one match, emits the pending literals
+ // and the match, then indexes a few positions inside the match.
+ for {
+ candidateL := 0
+ nextS := 0
+ // Inner loop: advance s until a long or short candidate matches.
+ for {
+ // Next src position to check
+ // The step grows by one for every 128 bytes since the last emit,
+ // limited to maxSkip.
+ nextS = (s-nextEmit)>>7 + 1
+ if nextS > maxSkip {
+ nextS = s + maxSkip
+ } else {
+ nextS += s
+ }
+
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ hashL := hash7(cv, lTableBits)
+ hashS := hash4(cv, sTableBits)
+ candidateL = int(lTable[hashL])
+ candidateS := int(sTable[hashS])
+ lTable[hashL] = uint32(s)
+ sTable[hashS] = uint32(s)
+
+ // Check our long candidate
+ if uint32(cv) == load32(src, candidateL) {
+ break
+ }
+
+ // Check our short candidate
+ if uint32(cv) == load32(src, candidateS) {
+ // Try a long candidate at s+1
+ hashL = hash7(cv>>8, lTableBits)
+ candidateL = int(lTable[hashL])
+ lTable[hashL] = uint32(s + 1)
+ if uint32(cv>>8) == load32(src, candidateL) {
+ s++
+ break
+ }
+ // Use our short candidate.
+ candidateL = candidateS
+ break
+ }
+
+ cv = load64(src, nextS)
+ s = nextS
+ }
+
+ // Extend backwards
+ for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
+ candidateL--
+ s--
+ }
+
+ // Bail if we exceed the maximum size.
+ if d+(s-nextEmit) > dstLimit {
+ return 0
+ }
+
+ // base is the start of the match in src, offset its distance back to
+ // the candidate.
+ base := s
+ offset := base - candidateL
+
+ // Extend the 4-byte match as long as possible.
+ s += 4
+ candidateL += 4
+ for s < len(src) {
+ // Fewer than 8 bytes left: compare one byte at a time.
+ if len(src)-s < 8 {
+ if src[s] == src[candidateL] {
+ s++
+ candidateL++
+ continue
+ }
+ break
+ }
+ // Compare 8 bytes at once; the trailing zero count of the XOR,
+ // divided by 8, is the number of matching bytes.
+ if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
+ s += bits.TrailingZeros64(diff) >> 3
+ break
+ }
+ s += 8
+ candidateL += 8
+ }
+
+ if offset > 65535 && s-base <= 5 && repeat != offset {
+ // Bail if the match is equal or worse to the encoding.
+ s = nextS + 1
+ if s >= sLimit {
+ goto emitRemainder
+ }
+ cv = load64(src, s)
+ continue
+ }
+
+ d += emitLiteral(dst[d:], src[nextEmit:base])
+ d += emitCopyNoRepeat(dst[d:], offset, s-base)
+ repeat = offset
+
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ if d > dstLimit {
+ // Do we have space for more, if not bail.
+ return 0
+ }
+ // Index match start+1 (long) and start+2 (short)
+ index0 := base + 1
+ // Index match end-2 (long) and end-1 (short)
+ index1 := s - 2
+
+ cv0 := load64(src, index0)
+ cv1 := load64(src, index1)
+ cv = load64(src, s)
+ lTable[hash7(cv0, lTableBits)] = uint32(index0)
+ lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
+ lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
+ sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+ sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+ sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ }
+
+ // Whatever follows the last match is emitted as one final literal.
+ emitRemainder:
+ if nextEmit < len(src) {
+ // Bail if we exceed the maximum size.
+ if d+len(src)-nextEmit > dstLimit {
+ return 0
+ }
+ d += emitLiteral(dst[d:], src[nextEmit:])
+ }
+ return d
+ }
diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go
new file mode 100644
index 00000000..43d43534
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_go.go
@@ -0,0 +1,298 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package s2
+
+import (
+ "math/bits"
+)
+
+ // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+ // assumes that the varint-encoded length of the decompressed bytes has already
+ // been written.
+ //
+ // Inputs shorter than minNonLiteralBlockSize are not encoded and yield 0;
+ // everything else is handed to encodeBlockGo.
+ //
+ // It also assumes that:
+ //   len(dst) >= MaxEncodedLen(len(src))
+ func encodeBlock(dst, src []byte) (d int) {
+ 	if len(src) >= minNonLiteralBlockSize {
+ 		d = encodeBlockGo(dst, src)
+ 	}
+ 	return d
+ }
+
+ // encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
+ // assumes that the varint-encoded length of the decompressed bytes has already
+ // been written.
+ //
+ // It forwards to encodeBlockBetterGo and returns its result unchanged.
+ //
+ // It also assumes that:
+ // len(dst) >= MaxEncodedLen(len(src))
+ func encodeBlockBetter(dst, src []byte) (d int) {
+ return encodeBlockBetterGo(dst, src)
+ }
+
+ // encodeBlockBetterSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
+ // assumes that the varint-encoded length of the decompressed bytes has already
+ // been written.
+ //
+ // It forwards to encodeBlockBetterSnappyGo and returns its result unchanged.
+ //
+ // It also assumes that:
+ // len(dst) >= MaxEncodedLen(len(src))
+ func encodeBlockBetterSnappy(dst, src []byte) (d int) {
+ return encodeBlockBetterSnappyGo(dst, src)
+ }
+
+ // encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst.
+ // It assumes that the varint-encoded length of the decompressed bytes has
+ // already been written.
+ //
+ // Inputs shorter than minNonLiteralBlockSize are not encoded and yield 0;
+ // everything else is handed to encodeBlockSnappyGo.
+ //
+ // It also assumes that:
+ //   len(dst) >= MaxEncodedLen(len(src))
+ func encodeBlockSnappy(dst, src []byte) (d int) {
+ 	if len(src) >= minNonLiteralBlockSize {
+ 		d = encodeBlockSnappyGo(dst, src)
+ 	}
+ 	return d
+ }
+
+ // emitLiteral writes a literal chunk and returns the number of bytes written.
+ //
+ // The header takes 1 to 5 bytes. With n = len(lit)-1, values of n below 60 are
+ // stored in the upper six bits of the tag byte; otherwise the tag byte holds
+ // 60, 61, 62 or 63 and n follows in 1, 2, 3 or 4 little-endian bytes. The
+ // bytes of lit are copied right after the header. An empty lit writes nothing
+ // and returns 0.
+ //
+ // It assumes that:
+ //   dst is long enough to hold the encoded bytes
+ //   0 <= len(lit) && len(lit) <= math.MaxUint32
+ func emitLiteral(dst, lit []byte) int {
+ 	if len(lit) == 0 {
+ 		return 0
+ 	}
+ 	// i is the header size, n the stored length. In every case the highest
+ 	// header index is written first.
+ 	i, n := 0, uint(len(lit)-1)
+ 	switch {
+ 	case n < 60:
+ 		dst[0] = uint8(n)<<2 | tagLiteral
+ 		i = 1
+ 	case n < 1<<8:
+ 		dst[1] = uint8(n)
+ 		dst[0] = 60<<2 | tagLiteral
+ 		i = 2
+ 	case n < 1<<16:
+ 		dst[2] = uint8(n >> 8)
+ 		dst[1] = uint8(n)
+ 		dst[0] = 61<<2 | tagLiteral
+ 		i = 3
+ 	case n < 1<<24:
+ 		dst[3] = uint8(n >> 16)
+ 		dst[2] = uint8(n >> 8)
+ 		dst[1] = uint8(n)
+ 		dst[0] = 62<<2 | tagLiteral
+ 		i = 4
+ 	default:
+ 		dst[4] = uint8(n >> 24)
+ 		dst[3] = uint8(n >> 16)
+ 		dst[2] = uint8(n >> 8)
+ 		dst[1] = uint8(n)
+ 		dst[0] = 63<<2 | tagLiteral
+ 		i = 5
+ 	}
+ 	return i + copy(dst[i:], lit)
+ }
+
+ // emitRepeat writes a repeat chunk and returns the number of bytes written.
+ // Length must be at least 4 and < 1<<24
+ //
+ // The chunk takes 2 to 5 bytes depending on length. Only the second 2 byte
+ // form stores offset; all other forms write 0 into dst[1].
+ func emitRepeat(dst []byte, offset, length int) int {
+ 	// Repeat offset, make length cheaper
+ 	rem := length - 4
+ 	switch {
+ 	case rem <= 4:
+ 		dst[0] = uint8(rem)<<2 | tagCopy1
+ 		dst[1] = 0
+ 		return 2
+ 	case rem < 8 && offset < 2048:
+ 		// Encode WITH offset
+ 		dst[1] = uint8(offset)
+ 		dst[0] = uint8(offset>>8)<<5 | uint8(rem)<<2 | tagCopy1
+ 		return 2
+ 	case rem < (1<<8)+4:
+ 		// One extra length byte.
+ 		dst[2] = uint8(rem - 4)
+ 		dst[1] = 0
+ 		dst[0] = 5<<2 | tagCopy1
+ 		return 3
+ 	case rem < (1<<16)+(1<<8):
+ 		// Two extra length bytes.
+ 		rem -= 1 << 8
+ 		dst[3] = uint8(rem >> 8)
+ 		dst[2] = uint8(rem)
+ 		dst[1] = 0
+ 		dst[0] = 6<<2 | tagCopy1
+ 		return 4
+ 	}
+
+ 	// Three extra length bytes. Anything above maxRepeat is carried over
+ 	// into a further repeat chunk.
+ 	const maxRepeat = (1 << 24) - 1
+ 	rem -= 1 << 16
+ 	carry := 0
+ 	if rem > maxRepeat {
+ 		carry = rem - maxRepeat + 4
+ 		rem = maxRepeat - 4
+ 	}
+ 	dst[4] = uint8(rem >> 16)
+ 	dst[3] = uint8(rem >> 8)
+ 	dst[2] = uint8(rem)
+ 	dst[1] = 0
+ 	dst[0] = 7<<2 | tagCopy1
+ 	written := 5
+ 	if carry > 0 {
+ 		written += emitRepeat(dst[5:], offset, carry)
+ 	}
+ 	return written
+ }
+
+ // emitCopy writes a copy chunk and returns the number of bytes written.
+ // Length beyond the first copy chunk is handed to emitRepeat.
+ //
+ // It assumes that:
+ //   dst is long enough to hold the encoded bytes
+ //   1 <= offset && offset <= math.MaxUint32
+ //   4 <= length && length <= 1 << 24
+ func emitCopy(dst []byte, offset, length int) int {
+ 	if offset < 65536 {
+ 		// Offset no more than 2 bytes.
+ 		switch {
+ 		case length > 64:
+ 			// Emit a length 60 copy, encoded as 3 bytes, then the
+ 			// remainder (more than 4 bytes) as repeats.
+ 			dst[2] = uint8(offset >> 8)
+ 			dst[1] = uint8(offset)
+ 			dst[0] = 59<<2 | tagCopy2
+ 			return 3 + emitRepeat(dst[3:], offset, length-60)
+ 		case length >= 12 || offset >= 2048:
+ 			// Emit the copy encoded as 3 bytes.
+ 			dst[2] = uint8(offset >> 8)
+ 			dst[1] = uint8(offset)
+ 			dst[0] = uint8(length-1)<<2 | tagCopy2
+ 			return 3
+ 		}
+ 		// Emit the copy encoded as 2 bytes.
+ 		dst[1] = uint8(offset)
+ 		dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+ 		return 2
+ 	}
+
+ 	// Offset needs 4 bytes; every copy chunk here is 5 bytes long.
+ 	pos := 0
+ 	if length > 64 {
+ 		// Emit a length 64 copy, encoded as 5 bytes.
+ 		dst[4] = uint8(offset >> 24)
+ 		dst[3] = uint8(offset >> 16)
+ 		dst[2] = uint8(offset >> 8)
+ 		dst[1] = uint8(offset)
+ 		dst[0] = 63<<2 | tagCopy4
+ 		length -= 64
+ 		if length >= 4 {
+ 			// Emit remaining as repeats
+ 			return 5 + emitRepeat(dst[5:], offset, length)
+ 		}
+ 		// Fewer than 4 bytes remain: they go out as one more copy below.
+ 		pos = 5
+ 	}
+ 	if length == 0 {
+ 		return pos
+ 	}
+ 	// Emit a copy, offset encoded as 4 bytes.
+ 	dst[pos+0] = uint8(length-1)<<2 | tagCopy4
+ 	dst[pos+1] = uint8(offset)
+ 	dst[pos+2] = uint8(offset >> 8)
+ 	dst[pos+3] = uint8(offset >> 16)
+ 	dst[pos+4] = uint8(offset >> 24)
+ 	return pos + 5
+ }
+
+ // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
+ // Lengths above 64 are split into several copy chunks; no repeat chunk is
+ // ever written.
+ //
+ // It assumes that:
+ //   dst is long enough to hold the encoded bytes
+ //   1 <= offset && offset <= math.MaxUint32
+ //   4 <= length && length <= 1 << 24
+ func emitCopyNoRepeat(dst []byte, offset, length int) int {
+ 	// n is the number of bytes written so far.
+ 	n := 0
+
+ 	if offset >= 65536 {
+ 		// Offset needs 4 bytes: peel off length 64 copies, 5 bytes each.
+ 		for length > 64 {
+ 			dst[n+4] = uint8(offset >> 24)
+ 			dst[n+3] = uint8(offset >> 16)
+ 			dst[n+2] = uint8(offset >> 8)
+ 			dst[n+1] = uint8(offset)
+ 			dst[n] = 63<<2 | tagCopy4
+ 			length -= 64
+ 			n += 5
+ 		}
+ 		if length == 0 {
+ 			return n
+ 		}
+ 		// Emit the final copy, offset encoded as 4 bytes.
+ 		dst[n+0] = uint8(length-1)<<2 | tagCopy4
+ 		dst[n+1] = uint8(offset)
+ 		dst[n+2] = uint8(offset >> 8)
+ 		dst[n+3] = uint8(offset >> 16)
+ 		dst[n+4] = uint8(offset >> 24)
+ 		return n + 5
+ 	}
+
+ 	// Offset no more than 2 bytes: peel off length 60 copies, 3 bytes each,
+ 	// so that more than 4 bytes are left for the final copy.
+ 	for length > 64 {
+ 		dst[n+2] = uint8(offset >> 8)
+ 		dst[n+1] = uint8(offset)
+ 		dst[n] = 59<<2 | tagCopy2
+ 		length -= 60
+ 		n += 3
+ 	}
+ 	if length >= 12 || offset >= 2048 {
+ 		// Emit the remaining copy, encoded as 3 bytes.
+ 		dst[n+2] = uint8(offset >> 8)
+ 		dst[n+1] = uint8(offset)
+ 		dst[n] = uint8(length-1)<<2 | tagCopy2
+ 		return n + 3
+ 	}
+ 	// Emit the remaining copy, encoded as 2 bytes.
+ 	dst[n+1] = uint8(offset)
+ 	dst[n] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+ 	return n + 2
+ }
+
+ // matchLen returns how many bytes match in a and b, counted from the start.
+ //
+ // It assumes that:
+ //   len(a) <= len(b)
+ func matchLen(a []byte, b []byte) int {
+ 	b = b[:len(a)]
+ 	// n counts the bytes already known to be equal.
+ 	n := 0
+ 	if len(a) > 4 {
+ 		// Try 4 bytes first
+ 		x := load32(a, 0) ^ load32(b, 0)
+ 		if x != 0 {
+ 			return bits.TrailingZeros32(x) >> 3
+ 		}
+ 		// Switch to 8 byte matching.
+ 		n, a, b = 4, a[4:], b[4:]
+ 		for ; len(a) >= 8; n, a, b = n+8, a[8:], b[8:] {
+ 			if x := load64(a, 0) ^ load64(b[:len(a)], 0); x != 0 {
+ 				return n + bits.TrailingZeros64(x)>>3
+ 			}
+ 		}
+ 	}
+ 	// Compare whatever is left one byte at a time.
+ 	b = b[:len(a)]
+ 	for i, c := range a {
+ 		if c != b[i] {
+ 			return n + i
+ 		}
+ 	}
+ 	return n + len(a)
+ }
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
new file mode 100644
index 00000000..c8cf7b69
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
@@ -0,0 +1,189 @@
+// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc
+// +build !appengine,!noasm,gc
+
+package s2
+
+// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4294967295 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm(dst []byte, src []byte) int
+
+// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4194304 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm4MB(dst []byte, src []byte) int
+
+// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 16383 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm12B(dst []byte, src []byte) int
+
+// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4095 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm10B(dst []byte, src []byte) int
+
+// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 511 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm8B(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4294967295 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4194304 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 16383 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm12B(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4095 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm10B(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 511 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm8B(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4294967295 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 65535 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 16383 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4095 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 511 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4294967295 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 65535 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 16383 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4095 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 511 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
+
+// emitLiteral writes a literal chunk and returns the number of bytes written.
+//
+// It assumes that:
+// dst is long enough to hold the encoded bytes with margin of 0 bytes
+// 0 <= len(lit) && len(lit) <= math.MaxUint32
+//
+//go:noescape
+func emitLiteral(dst []byte, lit []byte) int
+
+// emitRepeat writes a repeat chunk and returns the number of bytes written.
+// Length must be at least 4 and < 1<<32
+//
+//go:noescape
+func emitRepeat(dst []byte, offset int, length int) int
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+// dst is long enough to hold the encoded bytes
+// 1 <= offset && offset <= math.MaxUint32
+// 4 <= length && length <= 1 << 24
+//
+//go:noescape
+func emitCopy(dst []byte, offset int, length int) int
+
+// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+// dst is long enough to hold the encoded bytes
+// 1 <= offset && offset <= math.MaxUint32
+// 4 <= length && length <= 1 << 24
+//
+//go:noescape
+func emitCopyNoRepeat(dst []byte, offset int, length int) int
+
+// matchLen returns how many bytes match in a and b
+//
+// It assumes that:
+// len(a) <= len(b)
+//
+//go:noescape
+func matchLen(a []byte, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
new file mode 100644
index 00000000..1ac65a0e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -0,0 +1,15678 @@
+// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
+
+// +build !appengine
+// +build !noasm
+// +build gc
+
+#include "textflag.h"
+
+// func encodeBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm(SB), $65560-56 // encodes src into dst; returns bytes written, or 0 if the output would reach the limit kept at (SP)
+ MOVQ dst_base+0(FP), AX // AX = output cursor, starts at the base of dst
+ MOVQ $0x00000200, CX // 512 iterations x 128 bytes clear the 65536-byte table
+ LEAQ 24(SP), DX // the table lives at 24(SP)
+ PXOR X0, X0
+// Locals: (SP) dst limit | 8(SP) len(src)-8 | 12(SP) src index of the first byte not yet emitted | 16(SP) repeat offset | 20(SP) next search index | 24(SP).. table of 32-bit src indexes
+zero_loop_encodeBlockAsm:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm
+ MOVL $0x00000000, 12(SP) // nothing has been emitted yet
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = len(src) - 8
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = len(src) - 9 - len(src)>>5
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst + that amount; the routine returns 0 once output would reach it
+ MOVL $0x00000001, CX // CX = s, the current src index; searching starts at 1
+ MOVL CX, 16(SP) // initial repeat offset is 1
+ MOVQ src_base+24(FP), DX // DX = base of src from here on
+
+search_loop_encodeBlockAsm:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI
+ LEAL 4(CX)(SI*1), SI // SI = s + 4 + (s - 12(SP))>>6: the step grows with the distance since the last emit
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm // next index would reach len(src)-8: stop searching
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes of src at s
+ MOVL SI, 20(SP) // remember the next search index
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // hash multiplier
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11 // R11 = bytes starting at s+1
+ SHLQ $0x10, R10 // discard the top 2 bytes so 6 bytes are hashed
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // keep the top 14 bits: table index for s
+ SHLQ $0x10, R11
+ IMULQ R9, R11
+ SHRQ $0x32, R11 // table index for s+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate index for s
+ MOVL 24(SP)(R11*4), R8 // R8 = candidate index for s+1
+ MOVL CX, 24(SP)(R10*4) // table[hash(s)] = s
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4) // table[hash(s+1)] = s+1
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // R10 = table index for s+2 (used in no_repeat_found)
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11 // R11 = 4 bytes at src[s+1-repeat]
+ MOVQ DI, R9
+ SHRQ $0x08, R9 // R9 = bytes at s+1
+ CMPL R9, R11
+ JNE no_repeat_found_encodeBlockAsm // no 4-byte match at the repeat offset
+ LEAL 1(CX), DI // DI = s+1, start of the repeat match
+ MOVL 12(SP), R8 // R8 = pending-literal start; also consulted after the forward extension
+ MOVL DI, SI
+ SUBL 16(SP), SI // SI = match start - repeat offset
+ JZ repeat_extend_back_end_encodeBlockAsm
+// Extend the repeat match backwards over equal bytes, never below the pending-literal start in R8.
+repeat_extend_back_loop_encodeBlockAsm:
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm
+ LEAL -1(DI), DI
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm
+// Emit the pending literals src[12(SP):DI] ahead of the repeat (skipped when there are none).
+repeat_extend_back_end_encodeBlockAsm:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm
+ MOVL DI, R9
+ MOVL DI, 12(SP) // literals up to DI are now accounted for
+ LEAQ (DX)(SI*1), R10 // R10 = literal source pointer
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length-1, the value stored in the header
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm // < 60: value fits in the tag byte
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm
+ CMPL SI, $0x00010000
+ JLT three_bytes_repeat_emit_encodeBlockAsm
+ CMPL SI, $0x01000000
+ JLT four_bytes_repeat_emit_encodeBlockAsm
+ MOVB $0xfc, (AX) // 5-byte header: tag 0xfc + 32-bit length-1
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm
+
+four_bytes_repeat_emit_encodeBlockAsm:
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX) // 4-byte header: tag 0xf8 + 24-bit length-1
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm
+
+three_bytes_repeat_emit_encodeBlockAsm:
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 + 16-bit length-1
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm
+
+two_bytes_repeat_emit_encodeBlockAsm:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 + 8-bit length-1
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm // at most 64 bytes: short copy
+ JMP memmove_long_repeat_emit_encodeBlockAsm
+
+one_byte_repeat_emit_encodeBlockAsm:
+ SHLB $0x02, SI // 1-byte header: (length-1)<<2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 (at most 64) bytes from (R10) to (AX) using overlapping fixed-size moves
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
+ MOVQ (R10), R11 // always moves a full 8 bytes; the cursor still advances by R9 only
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm
+
+memmove_long_repeat_emit_encodeBlockAsm:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+ // genMemMoveLong: copy R9 bytes from (R10) to (AX); head and tail are preloaded, the middle goes in 32-byte steps
+ MOVOU (R10), X0 // X0,X1 = first 32 bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // X2,X3 = last 32 bytes
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12 // R12 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = dst & 31
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13 // R13 = 64 - (dst & 31): running offset for the chunk loop
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1) // MOVOA: aligned store into dst
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the preloaded head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+// Extend the repeat match forwards: s skips the 4 bytes compared at s+1, then equal bytes are counted.
+emit_literal_done_repeat_emit_encodeBlockAsm:
+ ADDL $0x05, CX // s += 5
+ MOVL CX, SI
+ SUBL 16(SP), SI // SI = s - repeat offset
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9 // R9 = bytes left in src from s
+ LEAQ (DX)(CX*1), R10
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen: R12 = number of equal bytes at (R10) and (SI), at most R9
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm
+
+matchlen_loopback_repeat_extend_encodeBlockAsm:
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11 // non-zero bits mark differing bytes
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // bit index / 8 = number of equal bytes in this word
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm
+
+matchlen_loop_repeat_extend_encodeBlockAsm:
+ LEAL -8(R9), R9
+ LEAL 8(R12), R12
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm
+
+matchlen_single_repeat_extend_encodeBlockAsm:
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm:
+ MOVB (R10)(R12*1), R11 // fewer than 8 bytes left: compare one byte at a time
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm
+
+repeat_extend_forward_end_encodeBlockAsm:
+ ADDL R12, CX // s now points just past the match
+ MOVL CX, SI
+ SUBL DI, SI // SI = match length (DI is the match start)
+ MOVL 16(SP), DI // DI = offset
+ TESTL R8, R8 // R8 = 12(SP) as read before the literals were emitted
+ JZ repeat_as_copy_encodeBlockAsm // it was 0: encode as an ordinary copy (presumably no earlier offset exists for a repeat to reuse)
+
+ // emitRepeat: encode a repeat of length SI; DI (offset) is only used by the short two-byte-offset form
+emit_repeat_again_match_repeat_encodeBlockAsm:
+ MOVL SI, R8 // R8 = length
+ LEAL -4(SI), SI // SI = length - 4
+ CMPL R8, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_match_repeat_encodeBlockAsm // length < 12 and offset < 2048
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm:
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm
+ CMPL SI, $0x00010100
+ JLT repeat_four_match_repeat_encodeBlockAsm
+ CMPL SI, $0x0100ffff
+ JLT repeat_five_match_repeat_encodeBlockAsm
+ LEAL -16842747(SI), SI // too long for one chunk: emit the largest 5-byte repeat and loop for the rest
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_repeat_encodeBlockAsm
+
+repeat_five_match_repeat_encodeBlockAsm:
+ LEAL -65536(SI), SI
+ MOVL SI, DI
+ MOVW $0x001d, (AX) // 5 bytes: 0x1d, 0x00, then 24 bits of length-4-65536
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_four_match_repeat_encodeBlockAsm:
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX) // 4 bytes: 0x19, 0x00, then 16 bits of length-4-256
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_three_match_repeat_encodeBlockAsm:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX) // 3 bytes: 0x15, 0x00, then 8 bits of length-8
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_match_repeat_encodeBlockAsm:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX) // 2 bytes: (length-4)<<2|1 as a 16-bit word
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_match_repeat_encodeBlockAsm:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI // tag = 1 + (length-4)*4
+ MOVB DI, 1(AX) // low 8 bits of the offset
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI // high offset bits go into tag bits 5 and up
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_as_copy_encodeBlockAsm:
+ // emitCopy: encode a copy of length SI at offset DI
+ CMPL DI, $0x00010000
+ JL two_byte_offset_repeat_as_copy_encodeBlockAsm // offset fits in 16 bits
+
+four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
+ CMPL SI, $0x40
+ JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm
+ MOVB $0xff, (AX) // 5 bytes: tag 0xff + 32-bit offset, accounting for 64 bytes
+ MOVL DI, 1(AX)
+ LEAL -64(SI), SI
+ ADDQ $0x05, AX
+ CMPL SI, $0x04
+ JL four_bytes_remain_repeat_as_copy_encodeBlockAsm
+
+ // emitRepeat: the remaining length is encoded as a repeat (same layout as the emitRepeat sequence above)
+emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
+ CMPL SI, $0x0100ffff
+ JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
+ LEAL -16842747(SI), SI
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
+
+repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
+ LEAL -65536(SI), SI
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+ JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm // unreachable: directly follows an unconditional JMP
+
+four_bytes_remain_repeat_as_copy_encodeBlockAsm:
+ TESTL SI, SI
+ JZ repeat_end_emit_encodeBlockAsm // nothing left to encode
+ MOVB $0x03, BL
+ LEAL -4(BX)(SI*4), SI // tag = 3 + (length-1)*4; only the low byte is stored
+ MOVB SI, (AX)
+ MOVL DI, 1(AX) // 32-bit offset
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+two_byte_offset_repeat_as_copy_encodeBlockAsm:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
+ MOVB $0xee, (AX) // 3 bytes: tag 0xee + 16-bit offset, accounting for 60 bytes
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+
+ // emitRepeat: the remaining length is encoded as a repeat (same layout as the emitRepeat sequence above)
+emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
+ CMPL SI, $0x0100ffff
+ JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
+ LEAL -16842747(SI), SI
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
+
+repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ LEAL -65536(SI), SI
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+ JMP two_byte_offset_repeat_as_copy_encodeBlockAsm // unreachable: directly follows an unconditional JMP
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
+ MOVB $0x01, BL // length < 12 and offset < 2048: 2-byte copy
+ LEAL -16(BX)(SI*4), SI // tag = 1 + (length-4)*4; only the low byte is stored
+ MOVB DI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI // high offset bits go into tag bits 5 and up
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI // tag = 2 + (length-1)*4; only the low byte is stored
+ MOVB SI, (AX)
+ MOVW DI, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm:
+ MOVL CX, 12(SP) // everything before s has been emitted
+ JMP search_loop_encodeBlockAsm
+// No repeat: test the three table candidates, for s, s+1 and s+2, against the 4 bytes at those positions.
+no_repeat_found_encodeBlockAsm:
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate for s vs 4 bytes at s
+ JEQ candidate_match_encodeBlockAsm
+ SHRQ $0x08, DI // DI = bytes at s+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate for s+2
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI // 4 bytes at the candidate for s+1 vs 4 bytes at s+1
+ JEQ candidate2_match_encodeBlockAsm
+ MOVL R9, 24(SP)(R10*4) // table[hash(s+2)] = s+2
+ SHRQ $0x08, DI // DI = bytes at s+2
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeBlockAsm
+ MOVL 20(SP), CX // no match: continue at the saved next search index
+ JMP search_loop_encodeBlockAsm
+
+candidate3_match_encodeBlockAsm:
+ ADDL $0x02, CX // the match starts at s+2
+ JMP candidate_match_encodeBlockAsm
+
+candidate2_match_encodeBlockAsm:
+ MOVL R9, 24(SP)(R10*4) // table[hash(s+2)] = s+2
+ INCL CX // the match starts at s+1
+ MOVL R8, SI
+// Here CX = match start in src and SI = candidate index. Extend backwards over equal bytes, never below 12(SP).
+candidate_match_encodeBlockAsm:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm
+
+match_extend_back_loop_encodeBlockAsm:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm
+ JMP match_extend_back_loop_encodeBlockAsm
+
+match_extend_back_end_encodeBlockAsm:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = number of pending literal bytes
+ LEAQ 5(AX)(DI*1), DI // output cursor + literals + 5 bytes
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+// Emit the pending literals src[12(SP):CX] ahead of the copy (skipped when there are none).
+match_dst_size_check_encodeBlockAsm:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI // DI = literal source pointer
+ SUBL R8, R9 // R9 = literal length
+ LEAL -1(R9), R8 // R8 = length-1, the value stored in the header
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm
+ CMPL R8, $0x00010000
+ JLT three_bytes_match_emit_encodeBlockAsm
+ CMPL R8, $0x01000000
+ JLT four_bytes_match_emit_encodeBlockAsm
+ MOVB $0xfc, (AX) // 5-byte header: tag 0xfc + 32-bit length-1
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_encodeBlockAsm
+
+four_bytes_match_emit_encodeBlockAsm:
+ MOVL R8, R10
+ SHRL $0x10, R10
+ MOVB $0xf8, (AX) // 4-byte header: tag 0xf8 + 24-bit length-1
+ MOVW R8, 1(AX)
+ MOVB R10, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeBlockAsm
+
+three_bytes_match_emit_encodeBlockAsm:
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 + 16-bit length-1
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm
+
+two_bytes_match_emit_encodeBlockAsm:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 + 8-bit length-1
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm // at most 64 bytes: short copy
+ JMP memmove_long_match_emit_encodeBlockAsm
+
+one_byte_match_emit_encodeBlockAsm:
+ SHLB $0x02, R8 // 1-byte header: (length-1)<<2
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm:
+ LEAQ (AX)(R9*1), R8 // R8 = output cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 (at most 64) bytes from (DI) to (AX) using overlapping fixed-size moves
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
+ MOVQ (DI), R10 // always moves a full 8 bytes; the cursor still advances by R9 only
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeBlockAsm
+
+memmove_long_match_emit_encodeBlockAsm:
+ LEAQ (AX)(R9*1), R8 // R8 = output cursor after the literal bytes
+
+ // genMemMoveLong: copy R9 bytes from (DI) to (AX); head and tail are preloaded, the middle goes in 32-byte steps
+ MOVOU (DI), X0 // X0,X1 = first 32 bytes
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2 // X2,X3 = last 32 bytes
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10 // R10 = dst & 31
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (dst & 31): running offset for the chunk loop
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1) // MOVOA: aligned store into dst
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the preloaded head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+// Encode the match: CX = match start, SI = candidate index; the first 4 bytes are already known to be equal.
+emit_literal_done_match_emit_encodeBlockAsm:
+match_nolit_loop_encodeBlockAsm:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP) // repeat offset = s - candidate
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = bytes left in src from s
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen: R10 = number of equal bytes at (R8) and (SI), at most DI
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm
+
+matchlen_loopback_match_nolit_encodeBlockAsm:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9 // non-zero bits mark differing bytes
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm
+ BSFQ R9, R9 // index of the lowest differing bit
+ SARQ $0x03, R9 // bit index / 8 = number of equal bytes in this word
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm
+
+matchlen_loop_match_nolit_encodeBlockAsm:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm
+
+matchlen_single_match_nolit_encodeBlockAsm:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm:
+ MOVB (R8)(R10*1), R9 // fewer than 8 bytes left: compare one byte at a time
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm
+
+match_nolit_end_encodeBlockAsm:
+ ADDL R10, CX // s now points just past the match
+ MOVL 16(SP), SI // SI = offset
+ ADDL $0x04, R10 // R10 = total match length, including the first 4 bytes
+ MOVL CX, 12(SP) // everything before s counts as emitted once the copy below is written
+
+ // emitCopy: encode a copy of length R10 at offset SI
+ CMPL SI, $0x00010000
+ JL two_byte_offset_match_nolit_encodeBlockAsm // offset fits in 16 bits
+
+four_bytes_loop_back_match_nolit_encodeBlockAsm:
+ CMPL R10, $0x40
+ JLE four_bytes_remain_match_nolit_encodeBlockAsm
+ MOVB $0xff, (AX) // 5 bytes: tag 0xff + 32-bit offset, accounting for 64 bytes
+ MOVL SI, 1(AX)
+ LEAL -64(R10), R10
+ ADDQ $0x05, AX
+ CMPL R10, $0x04
+ JL four_bytes_remain_match_nolit_encodeBlockAsm
+
+ // emitRepeat: the remaining length R10 is encoded as a repeat; SI (offset) is only used by the two-byte-offset form
+emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
+ MOVL R10, DI // DI = length
+ LEAL -4(R10), R10 // R10 = length - 4
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy
+ LEAL -16842747(R10), R10 // too long for one chunk: emit the largest 5-byte repeat and loop for the rest
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
+
+repeat_five_match_nolit_encodeBlockAsm_emit_copy:
+ LEAL -65536(R10), R10
+ MOVL R10, SI
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_four_match_nolit_encodeBlockAsm_emit_copy:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_three_match_nolit_encodeBlockAsm_emit_copy:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_match_nolit_encodeBlockAsm_emit_copy:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+ JMP four_bytes_loop_back_match_nolit_encodeBlockAsm // unreachable: directly follows an unconditional JMP
+
+four_bytes_remain_match_nolit_encodeBlockAsm:
+ TESTL R10, R10
+ JZ match_nolit_emitcopy_end_encodeBlockAsm // nothing left to encode
+ MOVB $0x03, BL
+ LEAL -4(BX)(R10*4), R10 // tag = 3 + (length-1)*4; only the low byte is stored
+ MOVB R10, (AX)
+ MOVL SI, 1(AX) // 32-bit offset
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+two_byte_offset_match_nolit_encodeBlockAsm:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm
+ MOVB $0xee, (AX) // 3 bytes: tag 0xee + 16-bit offset, accounting for 60 bytes
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+
+ // emitRepeat: the remaining length R10 is encoded as a repeat; SI (offset) is only used by the two-byte-offset form
+emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
+ LEAL -16842747(R10), R10
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
+
+repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
+ LEAL -65536(R10), R10
+ MOVL R10, SI
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+ JMP two_byte_offset_match_nolit_encodeBlockAsm // unreachable: directly follows an unconditional JMP
+
+two_byte_offset_short_match_nolit_encodeBlockAsm:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBlockAsm
+ MOVB $0x01, BL // length < 12 and offset < 2048: 2-byte copy
+ LEAL -16(BX)(R10*4), R10 // tag = 1 + (length-4)*4; only the low byte is stored
+ MOVB SI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10 // high offset bits go into tag bits 5 and up
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+emit_copy_three_match_nolit_encodeBlockAsm:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10 // tag = 2 + (length-1)*4; only the low byte is stored
+ MOVB R10, (AX)
+ MOVW SI, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+// After a copy: stop near the end of src or at the dst limit, else index s-2 and s and test for an immediate match at s.
+match_nolit_emitcopy_end_encodeBlockAsm:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm
+ MOVQ -2(DX)(CX*1), DI // DI = 8 bytes of src at s-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // output reached the dst limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeBlockAsm:
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // hash multiplier
+ MOVQ DI, R8
+ SHRQ $0x10, DI // DI = bytes at s
+ MOVQ DI, SI
+ SHLQ $0x10, R8
+ IMULQ R9, R8
+ SHRQ $0x32, R8 // R8 = table index for s-2
+ SHLQ $0x10, SI
+ IMULQ R9, SI
+ SHRQ $0x32, SI // SI = table index for s
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10 // R10 = address of the table slot for s
+ MOVL (R10), SI // SI = candidate for s
+ MOVL R9, 24(SP)(R8*4) // table[hash(s-2)] = s-2
+ MOVL CX, (R10) // table[hash(s)] = s
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeBlockAsm // 4 bytes match: encode another copy with no literals in between
+ INCL CX
+ JMP search_loop_encodeBlockAsm
+// Emit src[12(SP):len(src)] as a final literal chunk, then return the number of bytes written.
+emit_remainder_encodeBlockAsm:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = number of bytes still to emit
+ LEAQ 5(AX)(CX*1), CX // output cursor + remaining bytes + 5
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeBlockAsm:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = literal source pointer
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length-1 (the src base in DX is no longer needed)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeBlockAsm
+ CMPL DX, $0x01000000
+ JLT four_bytes_emit_remainder_encodeBlockAsm
+ MOVB $0xfc, (AX) // 5-byte header: tag 0xfc + 32-bit length-1
+ MOVL DX, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm
+
+four_bytes_emit_remainder_encodeBlockAsm:
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX) // 4-byte header: tag 0xf8 + 24-bit length-1
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm
+
+three_bytes_emit_remainder_encodeBlockAsm:
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 + 16-bit length-1
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm
+
+two_bytes_emit_remainder_encodeBlockAsm:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 + 8-bit length-1
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm // at most 64 bytes: short copy
+ JMP memmove_long_emit_remainder_encodeBlockAsm
+
+one_byte_emit_remainder_encodeBlockAsm:
+ SHLB $0x02, DL // 1-byte header: (length-1)<<2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm:
+ LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: copy BX (at most 64) bytes from (CX) to (AX) using overlapping fixed-size moves
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8:
+ MOVQ (CX), SI // always moves a full 8 bytes; the cursor still advances by BX only
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm
+
+memmove_long_emit_remainder_encodeBlockAsm:
+ LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: copy BX bytes from (CX) to (AX); head and tail are preloaded, the middle goes in 32-byte steps
+ MOVOU (CX), X0 // X0,X1 = first 32 bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // X2,X3 = last 32 bytes
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = dst & 31
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (dst & 31): running offset for the chunk loop
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1) // MOVOA: aligned store into dst
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the preloaded head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // result = output cursor - dst base = bytes written
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBlockAsm4MB(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm4MB(SB), $65560-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000200, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBlockAsm4MB:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm4MB
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBlockAsm4MB:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm4MB
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ SHLQ $0x10, R11
+ IMULQ R9, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeBlockAsm4MB
+ LEAL 1(CX), DI
+ MOVL 12(SP), R8
+ MOVL DI, SI
+ SUBL 16(SP), SI
+ JZ repeat_extend_back_end_encodeBlockAsm4MB
+
+repeat_extend_back_loop_encodeBlockAsm4MB:
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm4MB
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm4MB
+ LEAL -1(DI), DI
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm4MB
+
+repeat_extend_back_end_encodeBlockAsm4MB:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm4MB
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm4MB
+ CMPL SI, $0x00010000
+ JLT three_bytes_repeat_emit_encodeBlockAsm4MB
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm4MB
+
+three_bytes_repeat_emit_encodeBlockAsm4MB:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm4MB
+
+two_bytes_repeat_emit_encodeBlockAsm4MB:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm4MB
+ JMP memmove_long_repeat_emit_encodeBlockAsm4MB
+
+one_byte_repeat_emit_encodeBlockAsm4MB:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB
+
+memmove_long_repeat_emit_encodeBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm4MB:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9
+ LEAQ (DX)(CX*1), R10
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm4MB
+
+matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm4MB
+
+matchlen_loop_repeat_extend_encodeBlockAsm4MB:
+ LEAL -8(R9), R9
+ LEAL 8(R12), R12
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB
+
+matchlen_single_repeat_extend_encodeBlockAsm4MB:
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm4MB
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB:
+ MOVB (R10)(R12*1), R11
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm4MB
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB
+
+repeat_extend_forward_end_encodeBlockAsm4MB:
+ ADDL R12, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+ TESTL R8, R8
+ JZ repeat_as_copy_encodeBlockAsm4MB
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm4MB
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm4MB
+ CMPL SI, $0x00010100
+ JLT repeat_four_match_repeat_encodeBlockAsm4MB
+ LEAL -65536(SI), SI
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_match_repeat_encodeBlockAsm4MB:
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_match_repeat_encodeBlockAsm4MB:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_match_repeat_encodeBlockAsm4MB:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_match_repeat_encodeBlockAsm4MB:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_as_copy_encodeBlockAsm4MB:
+ // emitCopy
+ CMPL DI, $0x00010000
+ JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
+
+four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB:
+ CMPL SI, $0x40
+ JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
+ MOVB $0xff, (AX)
+ MOVL DI, 1(AX)
+ LEAL -64(SI), SI
+ ADDQ $0x05, AX
+ CMPL SI, $0x04
+ JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+ LEAL -65536(SI), SI
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+ JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB
+
+four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
+ TESTL SI, SI
+ JZ repeat_end_emit_encodeBlockAsm4MB
+ MOVB $0x03, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVL DI, 1(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+ LEAL -65536(SI), SI
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+ JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm4MB:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeBlockAsm4MB
+
+no_repeat_found_encodeBlockAsm4MB:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBlockAsm4MB
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeBlockAsm4MB
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeBlockAsm4MB
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBlockAsm4MB
+
+candidate3_match_encodeBlockAsm4MB:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeBlockAsm4MB
+
+candidate2_match_encodeBlockAsm4MB:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeBlockAsm4MB:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm4MB
+
+match_extend_back_loop_encodeBlockAsm4MB:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm4MB
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm4MB
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm4MB
+ JMP match_extend_back_loop_encodeBlockAsm4MB
+
+match_extend_back_end_encodeBlockAsm4MB:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 4(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBlockAsm4MB:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm4MB
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm4MB
+ CMPL R8, $0x00010000
+ JLT three_bytes_match_emit_encodeBlockAsm4MB
+ MOVL R8, R10
+ SHRL $0x10, R10
+ MOVB $0xf8, (AX)
+ MOVW R8, 1(AX)
+ MOVB R10, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeBlockAsm4MB
+
+three_bytes_match_emit_encodeBlockAsm4MB:
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm4MB
+
+two_bytes_match_emit_encodeBlockAsm4MB:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm4MB
+ JMP memmove_long_match_emit_encodeBlockAsm4MB
+
+one_byte_match_emit_encodeBlockAsm4MB:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm4MB:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm4MB:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeBlockAsm4MB
+
+memmove_long_match_emit_encodeBlockAsm4MB:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm4MB:
+match_nolit_loop_encodeBlockAsm4MB:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm4MB
+
+matchlen_loopback_match_nolit_encodeBlockAsm4MB:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm4MB
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm4MB
+
+matchlen_loop_match_nolit_encodeBlockAsm4MB:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB
+
+matchlen_single_match_nolit_encodeBlockAsm4MB:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm4MB
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm4MB:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm4MB
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB
+
+match_nolit_end_encodeBlockAsm4MB:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+ CMPL SI, $0x00010000
+ JL two_byte_offset_match_nolit_encodeBlockAsm4MB
+
+four_bytes_loop_back_match_nolit_encodeBlockAsm4MB:
+ CMPL R10, $0x40
+ JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB
+ MOVB $0xff, (AX)
+ MOVL SI, 1(AX)
+ LEAL -64(R10), R10
+ ADDQ $0x05, AX
+ CMPL R10, $0x04
+ JL four_bytes_remain_match_nolit_encodeBlockAsm4MB
+
+ // emitRepeat
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
+ LEAL -65536(R10), R10
+ MOVL R10, SI
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+ JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB
+
+four_bytes_remain_match_nolit_encodeBlockAsm4MB:
+ TESTL R10, R10
+ JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
+ MOVB $0x03, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+two_byte_offset_match_nolit_encodeBlockAsm4MB:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
+ LEAL -65536(R10), R10
+ MOVL R10, SI
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+ JMP two_byte_offset_match_nolit_encodeBlockAsm4MB
+
+two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+emit_copy_three_match_nolit_encodeBlockAsm4MB:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm4MB:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm4MB
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBlockAsm4MB:
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x10, R8
+ IMULQ R9, R8
+ SHRQ $0x32, R8
+ SHLQ $0x10, SI
+ IMULQ R9, SI
+ SHRQ $0x32, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeBlockAsm4MB
+ INCL CX
+ JMP search_loop_encodeBlockAsm4MB
+
+emit_remainder_encodeBlockAsm4MB:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 4(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBlockAsm4MB:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm4MB
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm4MB
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeBlockAsm4MB
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX)
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm4MB
+
+three_bytes_emit_remainder_encodeBlockAsm4MB:
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm4MB
+
+two_bytes_emit_remainder_encodeBlockAsm4MB:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm4MB
+ JMP memmove_long_emit_remainder_encodeBlockAsm4MB
+
+one_byte_emit_remainder_encodeBlockAsm4MB:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm4MB:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB
+
+memmove_long_emit_remainder_encodeBlockAsm4MB:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm4MB:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm12B(SB), $16408-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000080, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBlockAsm12B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm12B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBlockAsm12B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm12B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x000000cf1bbcdcbb, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x18, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ SHLQ $0x18, R11
+ IMULQ R9, R11
+ SHRQ $0x34, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x18, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeBlockAsm12B
+ LEAL 1(CX), DI
+ MOVL 12(SP), R8
+ MOVL DI, SI
+ SUBL 16(SP), SI
+ JZ repeat_extend_back_end_encodeBlockAsm12B
+
+repeat_extend_back_loop_encodeBlockAsm12B:
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm12B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm12B
+ LEAL -1(DI), DI
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm12B
+
+repeat_extend_back_end_encodeBlockAsm12B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm12B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm12B
+
+two_bytes_repeat_emit_encodeBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm12B
+ JMP memmove_long_repeat_emit_encodeBlockAsm12B
+
+one_byte_repeat_emit_encodeBlockAsm12B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm12B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm12B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm12B
+
+memmove_long_repeat_emit_encodeBlockAsm12B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm12B:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9
+ LEAQ (DX)(CX*1), R10
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm12B
+
+matchlen_loopback_repeat_extend_encodeBlockAsm12B:
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm12B
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm12B
+
+matchlen_loop_repeat_extend_encodeBlockAsm12B:
+ LEAL -8(R9), R9
+ LEAL 8(R12), R12
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B
+
+matchlen_single_repeat_extend_encodeBlockAsm12B:
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm12B
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm12B:
+ MOVB (R10)(R12*1), R11
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm12B
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B
+
+repeat_extend_forward_end_encodeBlockAsm12B:
+ ADDL R12, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+ TESTL R8, R8
+ JZ repeat_as_copy_encodeBlockAsm12B
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm12B
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_match_repeat_encodeBlockAsm12B
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm12B
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_three_match_repeat_encodeBlockAsm12B:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_match_repeat_encodeBlockAsm12B:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_offset_match_repeat_encodeBlockAsm12B:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_as_copy_encodeBlockAsm12B:
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+ JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm12B:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeBlockAsm12B
+
+no_repeat_found_encodeBlockAsm12B:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBlockAsm12B
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeBlockAsm12B
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeBlockAsm12B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBlockAsm12B
+
+candidate3_match_encodeBlockAsm12B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeBlockAsm12B
+
+candidate2_match_encodeBlockAsm12B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeBlockAsm12B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm12B
+
+match_extend_back_loop_encodeBlockAsm12B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm12B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm12B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm12B
+ JMP match_extend_back_loop_encodeBlockAsm12B
+
+match_extend_back_end_encodeBlockAsm12B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBlockAsm12B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm12B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm12B
+
+two_bytes_match_emit_encodeBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm12B
+ JMP memmove_long_match_emit_encodeBlockAsm12B
+
+one_byte_match_emit_encodeBlockAsm12B:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm12B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm12B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeBlockAsm12B
+
+memmove_long_match_emit_encodeBlockAsm12B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm12B:
+match_nolit_loop_encodeBlockAsm12B:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeBlockAsm12B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm12B
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm12B
+
+matchlen_loop_match_nolit_encodeBlockAsm12B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm12B
+
+matchlen_single_match_nolit_encodeBlockAsm12B:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm12B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm12B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B
+
+match_nolit_end_encodeBlockAsm12B:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeBlockAsm12B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+ JMP two_byte_offset_match_nolit_encodeBlockAsm12B
+
+two_byte_offset_short_match_nolit_encodeBlockAsm12B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm12B
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+emit_copy_three_match_nolit_encodeBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm12B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm12B
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBlockAsm12B:
+ MOVQ $0x000000cf1bbcdcbb, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x18, R8
+ IMULQ R9, R8
+ SHRQ $0x34, R8
+ SHLQ $0x18, SI
+ IMULQ R9, SI
+ SHRQ $0x34, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeBlockAsm12B
+ INCL CX
+ JMP search_loop_encodeBlockAsm12B
+
+emit_remainder_encodeBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm12B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm12B
+
+two_bytes_emit_remainder_encodeBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm12B
+ JMP memmove_long_emit_remainder_encodeBlockAsm12B
+
+one_byte_emit_remainder_encodeBlockAsm12B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm12B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm12B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm12B
+
+memmove_long_emit_remainder_encodeBlockAsm12B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm12B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm10B(SB), $4120-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000020, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBlockAsm10B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm10B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBlockAsm10B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm10B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10
+ SHLQ $0x20, R11
+ IMULQ R9, R11
+ SHRQ $0x36, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeBlockAsm10B
+ LEAL 1(CX), DI
+ MOVL 12(SP), R8
+ MOVL DI, SI
+ SUBL 16(SP), SI
+ JZ repeat_extend_back_end_encodeBlockAsm10B
+
+repeat_extend_back_loop_encodeBlockAsm10B:
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm10B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm10B
+ LEAL -1(DI), DI
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm10B
+
+repeat_extend_back_end_encodeBlockAsm10B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm10B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm10B
+
+two_bytes_repeat_emit_encodeBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm10B
+ JMP memmove_long_repeat_emit_encodeBlockAsm10B
+
+one_byte_repeat_emit_encodeBlockAsm10B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm10B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm10B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
+
+memmove_long_repeat_emit_encodeBlockAsm10B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm10B:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9
+ LEAQ (DX)(CX*1), R10
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm10B
+
+matchlen_loopback_repeat_extend_encodeBlockAsm10B:
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm10B
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm10B
+
+matchlen_loop_repeat_extend_encodeBlockAsm10B:
+ LEAL -8(R9), R9
+ LEAL 8(R12), R12
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B
+
+matchlen_single_repeat_extend_encodeBlockAsm10B:
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm10B
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm10B:
+ MOVB (R10)(R12*1), R11
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm10B
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B
+
+repeat_extend_forward_end_encodeBlockAsm10B:
+ ADDL R12, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+ TESTL R8, R8
+ JZ repeat_as_copy_encodeBlockAsm10B
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm10B
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_match_repeat_encodeBlockAsm10B
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm10B
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_three_match_repeat_encodeBlockAsm10B:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_match_repeat_encodeBlockAsm10B:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_offset_match_repeat_encodeBlockAsm10B:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_as_copy_encodeBlockAsm10B:
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+ JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm10B:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeBlockAsm10B
+
+no_repeat_found_encodeBlockAsm10B:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBlockAsm10B
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeBlockAsm10B
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeBlockAsm10B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBlockAsm10B
+
+candidate3_match_encodeBlockAsm10B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeBlockAsm10B
+
+candidate2_match_encodeBlockAsm10B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeBlockAsm10B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm10B
+
+match_extend_back_loop_encodeBlockAsm10B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm10B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm10B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm10B
+ JMP match_extend_back_loop_encodeBlockAsm10B
+
+match_extend_back_end_encodeBlockAsm10B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBlockAsm10B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm10B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm10B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm10B
+
+two_bytes_match_emit_encodeBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm10B
+ JMP memmove_long_match_emit_encodeBlockAsm10B
+
+one_byte_match_emit_encodeBlockAsm10B:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm10B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm10B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeBlockAsm10B
+
+memmove_long_match_emit_encodeBlockAsm10B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm10B:
+match_nolit_loop_encodeBlockAsm10B:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeBlockAsm10B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm10B
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm10B
+
+matchlen_loop_match_nolit_encodeBlockAsm10B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm10B
+
+matchlen_single_match_nolit_encodeBlockAsm10B:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm10B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm10B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B
+
+match_nolit_end_encodeBlockAsm10B:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeBlockAsm10B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+ JMP two_byte_offset_match_nolit_encodeBlockAsm10B
+
+two_byte_offset_short_match_nolit_encodeBlockAsm10B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm10B
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBlockAsm10B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+emit_copy_three_match_nolit_encodeBlockAsm10B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm10B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm10B
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBlockAsm10B:
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x20, R8
+ IMULQ R9, R8
+ SHRQ $0x36, R8
+ SHLQ $0x20, SI
+ IMULQ R9, SI
+ SHRQ $0x36, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeBlockAsm10B
+ INCL CX
+ JMP search_loop_encodeBlockAsm10B
+
+emit_remainder_encodeBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm10B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm10B
+
+two_bytes_emit_remainder_encodeBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm10B
+ JMP memmove_long_emit_remainder_encodeBlockAsm10B
+
+one_byte_emit_remainder_encodeBlockAsm10B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm10B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm10B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm10B
+
+memmove_long_emit_remainder_encodeBlockAsm10B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm10B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst as literal, copy and repeat records and returns the number of bytes written, or 0 when the output cursor would reach the limit stored at (SP).
+TEXT ·encodeBlockAsm8B(SB), $1048-56 // frame: scratch slots at 0..23(SP), 1024-byte position table at 24(SP)
+ MOVQ dst_base+0(FP), AX // AX = output write cursor
+ MOVQ $0x00000008, CX // 8 iterations x 128 bytes clears the 1024-byte table
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBlockAsm8B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm8B
+ MOVL $0x00000000, 12(SP) // 12(SP) = src offset of the first byte not yet emitted
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = src_len-8: searching stops once the position reaches this
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst_base + src_len - 9 - src_len/32: output limit
+ MOVL $0x00000001, CX // CX = current src position, starting at 1
+ MOVL CX, 16(SP) // 16(SP) = current repeat offset, initially 1
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeBlockAsm8B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x04, SI
+ LEAL 4(CX)(SI*1), SI // SI = CX + 4 + (CX - pending start)/16: next search position
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm8B
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes of src starting at CX
+ MOVL SI, 20(SP) // 20(SP) = next search position
+ MOVQ $0x9e3779b1, R9 // hash multiplier
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x38, R10 // R10 = 8-bit hash of the 4 bytes at CX
+ SHLQ $0x20, R11
+ IMULQ R9, R11
+ SHRQ $0x38, R11 // R11 = 8-bit hash of the 4 bytes at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for CX
+ MOVL 24(SP)(R11*4), R8 // R8 = candidate position for CX+1
+ MOVL CX, 24(SP)(R10*4) // record CX and CX+1 in the table
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x38, R10 // R10 = 8-bit hash of the 4 bytes at CX+2
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11 // R11 = 4 bytes at (CX+1) minus the repeat offset
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11 // do the 4 bytes at CX+1 match at the repeat offset?
+ JNE no_repeat_found_encodeBlockAsm8B
+ LEAL 1(CX), DI // DI = start of the repeat match
+ MOVL 12(SP), R8 // R8 = pending start before this match; tested at repeat_extend_forward_end
+ MOVL DI, SI
+ SUBL 16(SP), SI // SI = earlier position the match refers to
+ JZ repeat_extend_back_end_encodeBlockAsm8B
+
+repeat_extend_back_loop_encodeBlockAsm8B: // extend the match backwards while the preceding bytes agree
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm8B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm8B
+ LEAL -1(DI), DI
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm8B
+
+repeat_extend_back_end_encodeBlockAsm8B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = address of the pending literals
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = literal length - 1
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm8B
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit length-1
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm8B
+
+two_bytes_repeat_emit_encodeBlockAsm8B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit length-1
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm8B
+ JMP memmove_long_repeat_emit_encodeBlockAsm8B
+
+one_byte_repeat_emit_encodeBlockAsm8B:
+ SHLB $0x02, SI // 1-byte header: (length-1)<<2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm8B:
+ LEAQ (AX)(R9*1), SI // SI = write cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 bytes (at most 64 on this path) from R10 to AX
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
+ MOVQ (R10), R11 // always moves a full 8 bytes, even when R9 is smaller
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
+ MOVQ (R10), R11 // first 8 and last 8 bytes; the two stores may overlap
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm8B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
+
+memmove_long_repeat_emit_encodeBlockAsm8B:
+ LEAQ (AX)(R9*1), SI // SI = write cursor after the literal bytes
+
+ // genMemMoveLong: copy R9 bytes from R10 to AX in 32-byte steps; first and last 32 bytes are held in X0-X3 and stored last
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13 // R13 = 64 - (AX mod 32), so AX+R13-32 is a multiple of 32 for the MOVOA stores
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm8B:
+ ADDL $0x05, CX // CX = (CX+1)+4: just past the 4 bytes already compared
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9 // R9 = bytes remaining in src from CX
+ LEAQ (DX)(CX*1), R10 // R10 = address of src[CX]
+ LEAQ (DX)(SI*1), SI // SI = address of src[CX - repeat offset]
+
+ // matchLen: R12 = number of equal bytes at R10 and SI, at most R9
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm8B
+
+matchlen_loopback_repeat_extend_encodeBlockAsm8B:
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11 // non-zero bits mark differing bytes
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm8B
+ BSFQ R11, R11
+ SARQ $0x03, R11 // lowest differing bit index / 8 = matching bytes in this word
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm8B
+
+matchlen_loop_repeat_extend_encodeBlockAsm8B:
+ LEAL -8(R9), R9
+ LEAL 8(R12), R12
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B
+
+matchlen_single_repeat_extend_encodeBlockAsm8B: // fewer than 8 bytes left: compare one byte at a time
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm8B
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm8B:
+ MOVB (R10)(R12*1), R11
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm8B
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B
+
+repeat_extend_forward_end_encodeBlockAsm8B:
+ ADDL R12, CX // CX = end of the match
+ MOVL CX, SI
+ SUBL DI, SI // SI = total match length (DI = back-extended start)
+ MOVL 16(SP), DI // DI = repeat offset
+ TESTL R8, R8
+ JZ repeat_as_copy_encodeBlockAsm8B // pending start was 0 before this match: emit a copy record instead
+
+ // emitRepeat: encode a repeat of length SI (no offset is written)
+ MOVL SI, DI
+ LEAL -4(SI), SI
+ CMPL DI, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm8B
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B // target is the next instruction, so both outcomes continue there
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm8B
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX) // 4-byte form: 0x19, 0x00, then 16-bit length-260
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_three_match_repeat_encodeBlockAsm8B:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX) // 3-byte form: 0x15, 0x00, then 8-bit length-8
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_two_match_repeat_encodeBlockAsm8B:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX) // 2-byte form: 16-bit store of ((length-4)<<2)|1
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+ XORQ R8, R8 // from here to the next label: follows an unconditional JMP and has no label, so it is never executed
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_as_copy_encodeBlockAsm8B:
+ // emitCopy: encode a copy of length SI at offset DI
+two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
+ MOVB $0xee, (AX) // 3-byte record: tag 0xee, then 16-bit offset
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI // 60 bytes are covered by that record; the rest is emitted as a repeat
+ ADDQ $0x03, AX
+
+ // emitRepeat: encode the remaining length SI as a repeat
+ MOVL SI, DI
+ LEAL -4(SI), SI
+ CMPL DI, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short // target is the next instruction
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+ XORQ R8, R8 // from here to the next label: follows an unconditional JMP and has no label, so it is never executed
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+ JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI // low byte = 1 + (length-4)*4; only the low byte is stored
+ MOVB DI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI // offset bits above the low 8 go into the tag from bit 5 up
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI // low byte = 2 + (length-1)*4
+ MOVB SI, (AX)
+ MOVW DI, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm8B:
+ MOVL CX, 12(SP) // everything before CX has now been emitted
+ JMP search_loop_encodeBlockAsm8B
+
+no_repeat_found_encodeBlockAsm8B:
+ CMPL (DX)(SI*1), DI // do the 4 bytes at the CX candidate match?
+ JEQ candidate_match_encodeBlockAsm8B
+ SHRQ $0x08, DI // DI now starts with the bytes at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for CX+2
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI // do the 4 bytes at the CX+1 candidate match?
+ JEQ candidate2_match_encodeBlockAsm8B
+ MOVL R9, 24(SP)(R10*4) // record CX+2 in the table
+ SHRQ $0x08, DI // DI now starts with the bytes at CX+2
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeBlockAsm8B
+ MOVL 20(SP), CX // no match: continue at the next search position
+ JMP search_loop_encodeBlockAsm8B
+
+candidate3_match_encodeBlockAsm8B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeBlockAsm8B
+
+candidate2_match_encodeBlockAsm8B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeBlockAsm8B: // CX = match position, SI = candidate position
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm8B
+
+match_extend_back_loop_encodeBlockAsm8B: // extend the match backwards while the preceding bytes agree
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm8B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm8B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm8B
+ JMP match_extend_back_loop_encodeBlockAsm8B
+
+match_extend_back_end_encodeBlockAsm8B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI // cursor + pending literal length + 3
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+ RET
+
+match_dst_size_check_encodeBlockAsm8B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm8B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI // DI = address of the pending literals
+ SUBL R8, R9 // R9 = literal length
+ LEAL -1(R9), R8 // R8 = literal length - 1
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm8B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm8B
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit length-1
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm8B
+
+two_bytes_match_emit_encodeBlockAsm8B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit length-1
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm8B
+ JMP memmove_long_match_emit_encodeBlockAsm8B
+
+one_byte_match_emit_encodeBlockAsm8B:
+ SHLB $0x02, R8 // 1-byte header: (length-1)<<2
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm8B:
+ LEAQ (AX)(R9*1), R8 // R8 = write cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 bytes (at most 64 on this path) from DI to AX
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
+ MOVQ (DI), R10 // always moves a full 8 bytes, even when R9 is smaller
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm8B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeBlockAsm8B
+
+memmove_long_match_emit_encodeBlockAsm8B:
+ LEAQ (AX)(R9*1), R8 // R8 = write cursor after the literal bytes
+
+ // genMemMoveLong: copy R9 bytes from DI to AX in 32-byte steps; first and last 32 bytes are held in X0-X3 and stored last
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (AX mod 32), so AX+R12-32 is a multiple of 32 for the MOVOA stores
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm8B:
+match_nolit_loop_encodeBlockAsm8B: // CX = match position, SI = candidate position, no literals pending
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP) // the match distance becomes the new repeat offset
+ ADDL $0x04, CX // step both positions past the first 4 bytes
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = bytes remaining in src from CX
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen: R10 = number of equal bytes at R8 and SI, at most DI
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeBlockAsm8B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9 // non-zero bits mark differing bytes
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm8B
+ BSFQ R9, R9
+ SARQ $0x03, R9 // lowest differing bit index / 8 = matching bytes in this word
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm8B
+
+matchlen_loop_match_nolit_encodeBlockAsm8B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm8B
+
+matchlen_single_match_nolit_encodeBlockAsm8B: // fewer than 8 bytes left: compare one byte at a time
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm8B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm8B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B
+
+match_nolit_end_encodeBlockAsm8B:
+ ADDL R10, CX // CX = end of the match
+ MOVL 16(SP), SI // SI = match offset
+ ADDL $0x04, R10 // R10 = total match length, including the first 4 bytes
+ MOVL CX, 12(SP)
+
+ // emitCopy: encode a copy of length R10 at offset SI
+two_byte_offset_match_nolit_encodeBlockAsm8B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B
+ MOVB $0xee, (AX) // 3-byte record: tag 0xee, then 16-bit offset
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10 // 60 bytes are covered by that record; the rest is emitted as a repeat
+ ADDQ $0x03, AX
+
+ // emitRepeat: encode the remaining length R10 as a repeat
+ MOVL R10, SI
+ LEAL -4(R10), R10
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short // target is the next instruction
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX) // 4-byte form: 0x19, 0x00, then 16-bit length-260
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX) // 3-byte form: 0x15, 0x00, then 8-bit length-8
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX) // 2-byte form: 16-bit store of ((length-4)<<2)|1
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+ XORQ DI, DI // from here to the next label: follows an unconditional JMP and has no label, so it is never executed
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+ JMP two_byte_offset_match_nolit_encodeBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeBlockAsm8B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10 // low byte = 1 + (length-4)*4; only the low byte is stored
+ MOVB SI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10 // offset bits above the low 8 go into the tag from bit 5 up
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+emit_copy_three_match_nolit_encodeBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = 2 + (length-1)*4
+ MOVB R10, (AX)
+ MOVW SI, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm8B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm8B
+ MOVQ -2(DX)(CX*1), DI // DI = 8 bytes of src starting at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP) // cursor reached the output limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeBlockAsm8B:
+ MOVQ $0x9e3779b1, R9 // hash multiplier
+ MOVQ DI, R8
+ SHRQ $0x10, DI // DI now starts with the bytes at CX
+ MOVQ DI, SI
+ SHLQ $0x20, R8
+ IMULQ R9, R8
+ SHRQ $0x38, R8 // R8 = 8-bit hash of the 4 bytes at CX-2
+ SHLQ $0x20, SI
+ IMULQ R9, SI
+ SHRQ $0x38, SI // SI = 8-bit hash of the 4 bytes at CX
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI // SI = previous table entry for the hash at CX
+ MOVL R9, 24(SP)(R8*4) // record CX-2 and CX in the table
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI // another match right at CX?
+ JEQ match_nolit_loop_encodeBlockAsm8B
+ INCL CX
+ JMP search_loop_encodeBlockAsm8B
+
+emit_remainder_encodeBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX // cursor + remaining literal length + 3
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+ RET
+
+emit_remainder_ok_encodeBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = address of the remaining literals
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX is reused for length-1; the src base is no longer held in it
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm8B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm8B
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit length-1
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm8B
+
+two_bytes_emit_remainder_encodeBlockAsm8B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit length-1
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm8B
+ JMP memmove_long_emit_remainder_encodeBlockAsm8B
+
+one_byte_emit_remainder_encodeBlockAsm8B:
+ SHLB $0x02, DL // 1-byte header: (length-1)<<2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm8B:
+ LEAQ (AX)(SI*1), DX // DX = write cursor after the literal bytes
+ MOVL SI, BX
+
+ // genMemMoveShort: copy BX bytes (at most 64 on this path) from CX to AX
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8:
+ MOVQ (CX), SI // always moves a full 8 bytes, even when BX is smaller
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm8B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm8B
+
+memmove_long_emit_remainder_encodeBlockAsm8B:
+ LEAQ (AX)(SI*1), DX // DX = write cursor after the literal bytes
+ MOVL SI, BX
+
+ // genMemMoveLong: copy BX bytes from CX to AX in 32-byte steps; first and last 32 bytes are held in X0-X3 and stored last
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX+R8-32 is a multiple of 32 for the MOVOA stores
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm8B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // return value = write cursor - dst_base = bytes written
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm(SB), $327704-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000a00, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBetterBlockAsm:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x07, SI
+ CMPL SI, $0x63
+ JLE check_maxskip_ok_encodeBetterBlockAsm
+ LEAL 100(CX), SI
+ JMP check_maxskip_cont_encodeBetterBlockAsm
+
+check_maxskip_ok_encodeBetterBlockAsm:
+ LEAL 1(CX)(SI*1), SI
+
+check_maxskip_cont_encodeBetterBlockAsm:
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 262168(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 262168(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeBetterBlockAsm
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm
+
+candidateS_match_encodeBetterBlockAsm:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeBetterBlockAsm:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm
+
+match_extend_back_loop_encodeBetterBlockAsm:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm
+ JMP match_extend_back_loop_encodeBetterBlockAsm
+
+match_extend_back_end_encodeBetterBlockAsm:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 5(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBetterBlockAsm:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm
+
+matchlen_single_match_nolit_encodeBetterBlockAsm:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm
+
+match_nolit_end_encodeBetterBlockAsm:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm
+ CMPL R12, $0x01
+ JG match_length_ok_encodeBetterBlockAsm
+ CMPL R8, $0x0000ffff
+ JLE match_length_ok_encodeBetterBlockAsm
+ MOVL 20(SP), CX
+ INCL CX
+ JMP search_loop_encodeBetterBlockAsm
+
+match_length_ok_encodeBetterBlockAsm:
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_encodeBetterBlockAsm
+ CMPL SI, $0x01000000
+ JLT four_bytes_match_emit_encodeBetterBlockAsm
+ MOVB $0xfc, (AX)
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+four_bytes_match_emit_encodeBetterBlockAsm:
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+three_bytes_match_emit_encodeBetterBlockAsm:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+two_bytes_match_emit_encodeBetterBlockAsm:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeBetterBlockAsm
+ JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+one_byte_match_emit_encodeBetterBlockAsm:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm
+
+memmove_long_match_emit_encodeBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+ CMPL R8, $0x00010000
+ JL two_byte_offset_match_nolit_encodeBetterBlockAsm
+
+four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
+ CMPL R12, $0x40
+ JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm
+ MOVB $0xff, (AX)
+ MOVL R8, 1(AX)
+ LEAL -64(R12), R12
+ ADDQ $0x05, AX
+ CMPL R12, $0x04
+ JL four_bytes_remain_match_nolit_encodeBetterBlockAsm
+
+ // emitRepeat
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
+ CMPL R12, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
+ LEAL -16842747(R12), R12
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+ JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm
+
+four_bytes_remain_match_nolit_encodeBetterBlockAsm:
+ TESTL R12, R12
+ JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
+ MOVB $0x03, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+two_byte_offset_match_nolit_encodeBetterBlockAsm:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+
+ // emitRepeat
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ CMPL R12, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ LEAL -16842747(R12), R12
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+match_is_repeat_encodeBetterBlockAsm:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm
+ CMPL SI, $0x01000000
+ JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm
+ MOVB $0xfc, (AX)
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+four_bytes_match_emit_repeat_encodeBetterBlockAsm:
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+three_bytes_match_emit_repeat_encodeBetterBlockAsm:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitRepeat
+emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm
+ CMPL R12, $0x0100ffff
+ JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm
+ LEAL -16842747(R12), R12
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
+
+repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm:
+ MOVQ $0x00cf1bbcdcbfa563, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x32, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 262168(SP)(R11*4)
+ MOVL R15, 262168(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 262168(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeBetterBlockAsm
+
+emit_remainder_encodeBetterBlockAsm:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 5(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeBetterBlockAsm
+ CMPL DX, $0x01000000
+ JLT four_bytes_emit_remainder_encodeBetterBlockAsm
+ MOVB $0xfc, (AX)
+ MOVL DX, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+four_bytes_emit_remainder_encodeBetterBlockAsm:
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX)
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+three_bytes_emit_remainder_encodeBetterBlockAsm:
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+two_bytes_emit_remainder_encodeBetterBlockAsm:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBetterBlockAsm
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+one_byte_emit_remainder_encodeBetterBlockAsm:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4:
+ MOVL (CX), SI
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
+
+memmove_long_emit_remainder_encodeBetterBlockAsm:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000a00, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm4MB:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm4MB
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBetterBlockAsm4MB:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x07, SI
+ CMPL SI, $0x63
+ JLE check_maxskip_ok_encodeBetterBlockAsm4MB
+ LEAL 100(CX), SI
+ JMP check_maxskip_cont_encodeBetterBlockAsm4MB
+
+check_maxskip_ok_encodeBetterBlockAsm4MB:
+ LEAL 1(CX)(SI*1), SI
+
+check_maxskip_cont_encodeBetterBlockAsm4MB:
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm4MB
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 262168(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 262168(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm4MB
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeBetterBlockAsm4MB
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm4MB
+
+candidateS_match_encodeBetterBlockAsm4MB:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm4MB
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeBetterBlockAsm4MB:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm4MB
+
+match_extend_back_loop_encodeBetterBlockAsm4MB:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm4MB
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm4MB
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm4MB
+ JMP match_extend_back_loop_encodeBetterBlockAsm4MB
+
+match_extend_back_end_encodeBetterBlockAsm4MB:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 4(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBetterBlockAsm4MB:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB
+
+matchlen_single_match_nolit_encodeBetterBlockAsm4MB:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm4MB
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB
+
+match_nolit_end_encodeBetterBlockAsm4MB:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm4MB
+ CMPL R12, $0x01
+ JG match_length_ok_encodeBetterBlockAsm4MB
+ CMPL R8, $0x0000ffff
+ JLE match_length_ok_encodeBetterBlockAsm4MB
+ MOVL 20(SP), CX
+ INCL CX
+ JMP search_loop_encodeBetterBlockAsm4MB
+
+match_length_ok_encodeBetterBlockAsm4MB:
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm4MB
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm4MB
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_encodeBetterBlockAsm4MB
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+three_bytes_match_emit_encodeBetterBlockAsm4MB:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+two_bytes_match_emit_encodeBetterBlockAsm4MB:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeBetterBlockAsm4MB
+ JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+one_byte_match_emit_encodeBetterBlockAsm4MB:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
+
+memmove_long_match_emit_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+ CMPL R8, $0x00010000
+ JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
+
+four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
+ CMPL R12, $0x40
+ JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
+ MOVB $0xff, (AX)
+ MOVL R8, 1(AX)
+ LEAL -64(R12), R12
+ ADDQ $0x05, AX
+ CMPL R12, $0x04
+ JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+ JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB
+
+four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
+ TESTL R12, R12
+ JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+ MOVB $0x03, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+match_is_repeat_encodeBetterBlockAsm4MB:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm4MB
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm4MB:
+ MOVQ $0x00cf1bbcdcbfa563, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x32, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 262168(SP)(R11*4)
+ MOVL R15, 262168(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 262168(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeBetterBlockAsm4MB
+
+emit_remainder_encodeBetterBlockAsm4MB:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 4(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm4MB:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX)
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBetterBlockAsm4MB
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+one_byte_emit_remainder_encodeBetterBlockAsm4MB:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4:
+ MOVL (CX), SI
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
+
+memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst and returns the number of bytes written, or 0 when the output would reach the limit pointer kept at (SP).
+TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 // frame: 24 bytes of scalars followed by 81920 bytes of lookup tables
+ MOVQ dst_base+0(FP), AX // AX = output write pointer, starts at dst
+ MOVQ $0x00000280, CX // 0x280 iterations x 128 bytes = 81920 bytes to clear
+ LEAQ 24(SP), DX // the tables start at 24(SP)
+ PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm12B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm12B
+ MOVL $0x00000000, 12(SP) // 12(SP) = src index of the first byte not yet emitted
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = len(src)-8: positions at or beyond this are not searched
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst + len(src) - 6 - len(src)>>5: output limit pointer
+ MOVL $0x00000001, CX // CX = current src position, starts at 1
+ MOVL $0x00000000, 16(SP) // 16(SP) = offset of the most recent match, initially 0
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+ // Main loop: hash the bytes at CX, probe both tables, and step further ahead the longer no match is found.
+search_loop_encodeBetterBlockAsm12B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI
+ LEAL 1(CX)(SI*1), SI // SI = CX + 1 + (CX - 12(SP))>>6: next position to try
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm12B
+ MOVQ (DX)(CX*1), DI // DI = 8 source bytes at CX
+ MOVL SI, 20(SP) // 20(SP) = next position, reloaded when nothing matches
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
+ MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10 // discard the top 2 bytes: hash the low 6 bytes
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // keep 14 bits: index into the table at 24(SP)
+ SHLQ $0x20, R11 // discard the top 4 bytes: hash the low 4 bytes
+ IMULQ SI, R11
+ SHRQ $0x34, R11 // keep 12 bits: index into the table at 65560(SP)
+ MOVL 24(SP)(R10*4), SI // SI = candidate from the 6-byte table
+ MOVL 65560(SP)(R11*4), R8 // R8 = candidate from the 4-byte table
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 65560(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // compare 4 bytes at the candidate with the 4 bytes at CX
+ JEQ candidate_match_encodeBetterBlockAsm12B
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeBetterBlockAsm12B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm12B
+ // The 4-byte table matched: first probe the 6-byte table for position CX+1, otherwise fall back to R8.
+candidateS_match_encodeBetterBlockAsm12B:
+ SHRQ $0x08, DI // DI = source bytes starting at CX+1
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm12B
+ DECL CX // no match at CX+1: restore CX
+ MOVL R8, SI // and use the 4-byte-table candidate
+
+candidate_match_encodeBetterBlockAsm12B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm12B
+ // Extend the match backwards while the preceding bytes agree, CX > 12(SP) and SI > 0.
+match_extend_back_loop_encodeBetterBlockAsm12B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm12B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm12B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm12B
+ JMP match_extend_back_loop_encodeBetterBlockAsm12B
+
+match_extend_back_end_encodeBetterBlockAsm12B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI // AX + pending literal bytes + 3
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // output limit reached: return 0
+ RET
+
+match_dst_size_check_encodeBetterBlockAsm12B:
+ MOVL CX, DI // DI = start of the match in src
+ ADDL $0x04, CX // skip the 4 bytes already compared equal
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = source bytes remaining after CX
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen: R12 = number of equal bytes at R9 and R10, limited to R8
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // divided by 8 = number of equal leading bytes
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm12B
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B
+
+matchlen_single_match_nolit_encodeBetterBlockAsm12B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm12B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B
+
+match_nolit_end_encodeBetterBlockAsm12B:
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset (distance from CX back to the candidate)
+
+ // Check if repeat: same offset as the one saved in 16(SP)
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm12B
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = first pending literal byte
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length-1, the value written into the header
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm12B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm12B
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4, then length-1 as 16 bits
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm12B
+
+two_bytes_match_emit_encodeBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0, then length-1 as 8 bits
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40 // the short copy below handles at most 64 bytes
+ JL memmove_match_emit_encodeBetterBlockAsm12B
+ JMP memmove_long_match_emit_encodeBetterBlockAsm12B
+
+one_byte_match_emit_encodeBetterBlockAsm12B:
+ SHLB $0x02, SI // 1-byte header: (length-1)<<2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = value of AX after the literals
+
+ // genMemMoveShort: copy R9 bytes from R10 to AX; each size class moves a head and a tail chunk that may overlap
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
+ MOVL (R10), R11 // always moves 4 bytes, even when R9 < 4
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
+
+memmove_long_match_emit_encodeBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = value of AX after the literals
+
+ // genMemMoveLong: copy R9 bytes from R10 to AX; the first and last 32 bytes are loaded up front and stored last
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX & 31), so AX+R14-32 is a multiple of 32
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm12B:
+ ADDL R12, CX // advance CX past the whole match
+ ADDL $0x04, R12 // R12 = full match length, including the first 4 bytes
+ MOVL CX, 12(SP) // everything before CX is now accounted for
+
+ // emitCopy: R12 = length, R8 = offset
+two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
+ MOVB $0xee, (AX) // length > 64: write tag 0xee plus the 16-bit offset
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12 // 60 bytes of the length are covered by that element
+ ADDQ $0x03, AX
+
+ // emitRepeat: encode the remaining length in R12
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX) // 4-byte form: bytes 0x19 0x00, then a 16-bit length value
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX) // 3-byte form: bytes 0x15 0x00, then an 8-bit length value
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX) // 2-byte form: (length-4)<<2 | 1 stored as 16 bits
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX) // low 8 bits of the offset
+ SARL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8 and up move into tag bits 5 and up
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B // unreachable: directly follows an unconditional JMP
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12 // low byte = 1 + (length-4)<<2; only the low byte is stored below
+ MOVB R8, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8..10 move into tag bits 5..7
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = 2 + (length-1)<<2; only the low byte is stored below
+ MOVB R12, (AX)
+ MOVW R8, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+ // The offset equals the previous one: emit pending literals, then emitRepeat instead of emitCopy.
+match_is_repeat_encodeBetterBlockAsm12B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = first pending literal byte
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length-1, the value written into the header
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4, then length-1 as 16 bits
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0, then length-1 as 8 bits
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40 // the short copy below handles at most 64 bytes
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm12B
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
+ SHLB $0x02, SI // 1-byte header: (length-1)<<2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = value of AX after the literals
+
+ // genMemMoveShort: copy R9 bytes from R10 to AX; each size class moves a head and a tail chunk that may overlap
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
+ MOVL (R10), R11 // always moves 4 bytes, even when R9 < 4
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = value of AX after the literals
+
+ // genMemMoveLong: copy R9 bytes from R10 to AX; the first and last 32 bytes are loaded up front and stored last
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX & 31), so AX+R14-32 is a multiple of 32
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
+ ADDL R12, CX // advance CX past the whole match
+ ADDL $0x04, R12 // R12 = full match length, including the first 4 bytes
+ MOVL CX, 12(SP) // everything before CX is now accounted for
+
+ // emitRepeat: R12 = length, R8 = offset (only used by the two-byte offset form)
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX) // 4-byte form: bytes 0x19 0x00, then a 16-bit length value
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX) // 3-byte form: bytes 0x15 0x00, then an 8-bit length value
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX) // 2-byte form: (length-4)<<2 | 1 stored as 16 bits
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX) // low 8 bits of the offset
+ SARL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8 and up move into tag bits 5 and up
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm12B
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // output limit reached: return 0
+ RET
+ // Re-index positions just after the match start (DI+1..DI+3) and just before its end (CX-2, CX-1), then resume searching.
+match_nolit_dst_ok_encodeBetterBlockAsm12B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI // DI = match start + 1
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // bytes at match start + 2
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // bytes at match start + 3
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9 // R9 = 8 source bytes at CX-2
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x32, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x34, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x34, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 65560(SP)(R11*4)
+ MOVL R15, 65560(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // bytes at CX-1
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x34, R11
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x32, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 65560(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeBetterBlockAsm12B
+ // Searching is over: emit everything from 12(SP) to the end of src as one literal run.
+emit_remainder_encodeBetterBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX // AX + remaining literal bytes + 3
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // output limit reached: return 0
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = first remaining literal byte
+ SUBL BX, SI // SI = remaining literal length
+ LEAL -1(SI), DX // DX = length-1; the src base pointer is overwritten here
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm12B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4, then length-1 as 16 bits
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
+
+two_bytes_emit_remainder_encodeBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0, then length-1 as 8 bits
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40 // the short copy below handles at most 64 bytes
+ JL memmove_emit_remainder_encodeBetterBlockAsm12B
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
+
+one_byte_emit_remainder_encodeBetterBlockAsm12B:
+ SHLB $0x02, DL // 1-byte header: (length-1)<<2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm12B:
+ LEAQ (AX)(SI*1), DX // DX = value of AX after the literals
+ MOVL SI, BX
+
+ // genMemMoveShort: copy BX bytes from CX to AX; each size class moves a head and a tail chunk that may overlap
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4:
+ MOVL (CX), SI // always moves 4 bytes, even when BX < 4
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
+
+memmove_long_emit_remainder_encodeBetterBlockAsm12B:
+ LEAQ (AX)(SI*1), DX // DX = value of AX after the literals
+ MOVL SI, BX
+
+ // genMemMoveLong: copy BX bytes from CX to AX; the first and last 32 bytes are loaded up front and stored last
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX & 31), so AX+R8-32 is a multiple of 32
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // bytes written = write pointer - dst base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2
+//
+// encodeBetterBlockAsm10B encodes src into dst and returns the number of
+// bytes written (final write cursor minus dst_base), or 0 when the output
+// would reach the dst limit computed below.
+//
+// Stack frame layout, as used by the code in this function:
+//   0(SP)      dst limit pointer (8 bytes)
+//   8(SP)      search limit, src_len-8 (4 bytes)
+//   12(SP)     src offset of the first byte not yet emitted (4 bytes)
+//   16(SP)     offset of the previous match, for repeat detection (4 bytes)
+//   20(SP)     next search position (4 bytes)
+//   24(SP)     table indexed by the 12-bit hash of 6 bytes, 4096 x 4 bytes
+//   16408(SP)  table indexed by the 10-bit hash of 4 bytes, 1024 x 4 bytes
+TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ $0x000000a0, CX // 160 iterations x 128 bytes = 20480 bytes to clear
+ LEAQ 24(SP), DX // DX = start of the two hash tables
+ PXOR X0, X0 // X0 = all zero
+
+zero_loop_encodeBetterBlockAsm10B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm10B
+ MOVL $0x00000000, 12(SP) // nothing emitted yet
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // search limit = src_len - 8
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = src_len - 6 - src_len/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // dst limit = dst + src_len - 6 - src_len/32
+ MOVL $0x00000001, CX // CX = current src position, starting at 1
+ MOVL $0x00000000, 16(SP) // no previous match offset yet
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+ // Main search loop: hash the bytes at CX, look up and update both tables,
+ // and test the two candidates for a 4-byte match against src at CX.
+search_loop_encodeBetterBlockAsm10B:
+ MOVL CX, SI
+ SUBL 12(SP), SI // SI = distance from the first unemitted byte
+ SHRL $0x05, SI // extra skip: 1 for every 32 bytes without a match
+ LEAL 1(CX)(SI*1), SI // SI = next position to try = CX + 1 + skip
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm10B // next position is at or past the search limit
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes of src at CX
+ MOVL SI, 20(SP) // remember the next position
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
+ MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10 // discard the top 2 bytes so only 6 bytes contribute
+ IMULQ R9, R10
+ SHRQ $0x34, R10 // R10 = 12-bit hash of 6 bytes
+ SHLQ $0x20, R11 // keep only the low 4 bytes
+ IMULQ SI, R11
+ SHRQ $0x36, R11 // R11 = 10-bit hash of 4 bytes
+ MOVL 24(SP)(R10*4), SI // SI = candidate from the 6-byte-hash table
+ MOVL 16408(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
+ MOVL CX, 24(SP)(R10*4) // both table slots now point at CX
+ MOVL CX, 16408(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // 4 bytes at the 6-byte-hash candidate vs. 4 bytes at CX
+ JEQ candidate_match_encodeBetterBlockAsm10B
+ CMPL (DX)(R8*1), DI // 4 bytes at the 4-byte-hash candidate vs. 4 bytes at CX
+ JEQ candidateS_match_encodeBetterBlockAsm10B
+ MOVL 20(SP), CX // no match: advance to the saved next position
+ JMP search_loop_encodeBetterBlockAsm10B
+
+ // The 4-byte-hash candidate matched. Probe the 6-byte-hash table at CX+1
+ // and use that candidate instead if its 4 bytes also match.
+candidateS_match_encodeBetterBlockAsm10B:
+ SHRQ $0x08, DI // DI = src bytes starting at CX+1
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10 // R10 = 12-bit hash of 6 bytes at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate for CX+1
+ INCL CX
+ MOVL CX, 24(SP)(R10*4) // table slot now points at CX+1
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm10B
+ DECL CX // no match at CX+1: go back to the original position ...
+ MOVL R8, SI // ... and use the 4-byte-hash candidate
+
+ // Extend the match backwards while the preceding bytes agree, without
+ // moving CX below the first unemitted byte (DI) or the candidate below 0.
+candidate_match_encodeBetterBlockAsm10B:
+ MOVL 12(SP), DI // DI = first unemitted src offset
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm10B // candidate is at offset 0: nothing before it
+
+match_extend_back_loop_encodeBetterBlockAsm10B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm10B // reached already-emitted data
+ MOVB -1(DX)(SI*1), BL // byte before the candidate
+ MOVB -1(DX)(CX*1), R8 // byte before the current position
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm10B
+ LEAL -1(CX), CX // step both positions back by one
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm10B
+ JMP match_extend_back_loop_encodeBetterBlockAsm10B
+
+match_extend_back_end_encodeBetterBlockAsm10B:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = number of pending literal bytes
+ LEAQ 3(AX)(DI*1), DI // write cursor after those literals plus 3 bytes
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm10B // still below the dst limit
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+ // Step past the 4 bytes that were compared when the candidate was accepted
+ // and count how many further bytes agree (result in R12).
+match_dst_size_check_encodeBetterBlockAsm10B:
+ MOVL CX, DI // DI = match start
+ ADDL $0x04, CX // advance both positions by 4
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = bytes left in src from CX
+ LEAQ (DX)(CX*1), R9 // R9 = pointer to src at CX
+ LEAQ (DX)(SI*1), R10 // R10 = pointer to src at the candidate
+
+ // matchLen
+ XORL R12, R12 // R12 = matched length so far
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11 // set bits mark positions where the 8 bytes differ
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B // all 8 bytes equal
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // bit index / 8 = number of equal leading bytes
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm10B
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
+ LEAL -8(R8), R8 // 8 fewer bytes remaining
+ LEAL 8(R12), R12 // 8 more bytes matched
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
+
+matchlen_single_match_nolit_encodeBetterBlockAsm10B: // fewer than 8 bytes left: compare byte by byte
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm10B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B
+
+ // Match length is known (R12). Compute the match offset, branch to the
+ // repeat path if it equals the previous offset, otherwise write the header
+ // for the pending literal bytes between 12(SP) and the match start (DI).
+match_nolit_end_encodeBetterBlockAsm10B:
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset (current position minus candidate)
+
+ // Check if repeat
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm10B // same offset as the previous match
+ MOVL R8, 16(SP) // remember this offset
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B // no pending literals
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = source of the literal bytes
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length - 1, the value stored in the header
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm10B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 followed by 16-bit length-1
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm10B
+
+two_bytes_match_emit_encodeBetterBlockAsm10B:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 followed by 8-bit length-1
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeBetterBlockAsm10B // length-1 below 64: short copy
+ JMP memmove_long_match_emit_encodeBetterBlockAsm10B
+
+one_byte_match_emit_encodeBetterBlockAsm10B:
+ SHLB $0x02, SI // 1-byte header: (length-1) shifted left by 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+ // Short literal copy: R9 bytes from R10 to AX. Each size class loads the
+ // head and tail of the range (overlapping where needed) before storing.
+memmove_match_emit_encodeBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI // SI = write cursor after the literals
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
+ MOVL (R10), R11 // always moves 4 bytes, even when R9 is smaller
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
+ MOVL (R10), R11 // first 4 bytes
+ MOVL -4(R10)(R9*1), R10 // last 4 bytes; R10 is reused to hold the data
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
+ MOVQ (R10), R11 // first 8 bytes
+ MOVQ -8(R10)(R9*1), R10 // last 8 bytes; R10 is reused to hold the data
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
+ MOVOU (R10), X0 // first 16 bytes
+ MOVOU -16(R10)(R9*1), X1 // last 16 bytes
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
+ MOVOU (R10), X0 // first 32 bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 bytes
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
+ MOVQ SI, AX // advance the write cursor by the real literal length
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
+
+ // Long literal copy: R9 bytes from R10 to AX in 32-byte steps using
+ // aligned stores. The first and last 32 source bytes are loaded up front
+ // into X0-X3 and stored last, covering the unaligned head and tail.
+memmove_long_match_emit_encodeBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI // SI = write cursor after the literals
+
+ // genMemMoveLong
+ MOVOU (R10), X0 // first 32 source bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 source bytes
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = AX mod 32
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX mod 32), so AX+R14-32 is a multiple of 32
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): JA reads CF and ZF, but DECQ does not update CF; confirm the intended condition against the generator
+ LEAQ -32(R10)(R14*1), R11 // R11 = source pointer for the first aligned 32-byte store
+ LEAQ -32(AX)(R14*1), R15 // R15 = 32-byte aligned destination pointer
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back // NOTE(review): same CF caveat as the JA above
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4 // copy the 32 bytes that end at offset R14
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // continue while the next chunk still ends within the length
+ MOVOU X0, (AX) // finally store the saved head ...
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1) // ... and the saved tail
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX // advance the write cursor past the literals
+
+ // Literals are written. Advance past the match and encode it as a copy
+ // with offset R8 and total length R12.
+emit_literal_done_match_emit_encodeBetterBlockAsm10B:
+ ADDL R12, CX // CX = end of the match (CX was already 4 past the match start)
+ ADDL $0x04, R12 // R12 = total match length
+ MOVL CX, 12(SP) // everything before CX is now emitted
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
+ MOVB $0xee, (AX) // 3-byte copy op: tag 0xee followed by the 16-bit offset
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12 // 60 bytes of the match are accounted for by that op
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R12, SI // SI = remaining length, used for the range checks
+ LEAL -4(R12), R12 // R12 = remaining length - 4
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short // length below 12 and offset below 2048
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX) // 4-byte form: 0x19, 0x00, then a 16-bit length value
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX) // 3-byte form: 0x15, 0x00, then an 8-bit length value
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+ SHLL $0x02, R12
+ ORL $0x01, R12 // 2-byte form: (length value shifted left by 2) OR 1, stored as a 16-bit word
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12 // tag = 1 + length value * 4
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SARL $0x08, R8
+ SHLL $0x05, R8 // offset bits above the low 8 go into the tag starting at bit 5
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B // unreachable: directly follows an unconditional JMP
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
+ MOVB $0x01, BL // only the low byte of BX is set; only the low byte of the result is stored below
+ LEAL -16(BX)(R12*4), R12 // low byte = 1 + (length - 4) * 4
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8 // offset bits above the low 8 go into the tag starting at bit 5
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
+ MOVB $0x02, BL // only the low byte of BX is set; only the low byte of the result is stored below
+ LEAL -4(BX)(R12*4), R12 // low byte = 2 + (length - 1) * 4
+ MOVB R12, (AX)
+ MOVW R8, 1(AX) // followed by the 16-bit offset
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+ // Repeat path: the match offset equals the previous one (16(SP)). Write
+ // the header for the pending literal bytes between 12(SP) and the match
+ // start (DI); the match itself is encoded by emitRepeat afterwards.
+match_is_repeat_encodeBetterBlockAsm10B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B // no pending literals
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = source of the literal bytes
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length - 1, the value stored in the header
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 followed by 16-bit length-1
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 followed by 8-bit length-1
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm10B // length-1 below 64: short copy
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
+ SHLB $0x02, SI // 1-byte header: (length-1) shifted left by 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+ // Short literal copy (repeat path): R9 bytes from R10 to AX. Each size
+ // class loads the head and tail of the range before storing.
+memmove_match_emit_repeat_encodeBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI // SI = write cursor after the literals
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
+ MOVL (R10), R11 // always moves 4 bytes, even when R9 is smaller
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
+ MOVL (R10), R11 // first 4 bytes
+ MOVL -4(R10)(R9*1), R10 // last 4 bytes; R10 is reused to hold the data
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
+ MOVQ (R10), R11 // first 8 bytes
+ MOVQ -8(R10)(R9*1), R10 // last 8 bytes; R10 is reused to hold the data
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
+ MOVOU (R10), X0 // first 16 bytes
+ MOVOU -16(R10)(R9*1), X1 // last 16 bytes
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
+ MOVOU (R10), X0 // first 32 bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 bytes
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
+ MOVQ SI, AX // advance the write cursor by the real literal length
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
+
+ // Long literal copy (repeat path): R9 bytes from R10 to AX in 32-byte
+ // steps using aligned stores. The first and last 32 source bytes are
+ // loaded up front into X0-X3 and stored last.
+memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI // SI = write cursor after the literals
+
+ // genMemMoveLong
+ MOVOU (R10), X0 // first 32 source bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 source bytes
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = AX mod 32
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX mod 32), so AX+R14-32 is a multiple of 32
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): JA reads CF and ZF, but DECQ does not update CF; confirm the intended condition against the generator
+ LEAQ -32(R10)(R14*1), R11 // R11 = source pointer for the first aligned 32-byte store
+ LEAQ -32(AX)(R14*1), R15 // R15 = 32-byte aligned destination pointer
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back // NOTE(review): same CF caveat as the JA above
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4 // copy the 32 bytes that end at offset R14
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // continue while the next chunk still ends within the length
+ MOVOU X0, (AX) // finally store the saved head ...
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1) // ... and the saved tail
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX // advance the write cursor past the literals
+
+ // Literals are written. Advance past the match and encode it as a repeat
+ // of the previous offset, choosing the form from the length (and R8).
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
+ ADDL R12, CX // CX = end of the match (CX was already 4 past the match start)
+ ADDL $0x04, R12 // R12 = total match length
+ MOVL CX, 12(SP) // everything before CX is now emitted
+
+ // emitRepeat
+ MOVL R12, SI // SI = length, used for the range checks
+ LEAL -4(R12), R12 // R12 = length - 4
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B // length below 12 and offset below 2048
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX) // 4-byte form: 0x19, 0x00, then a 16-bit length value
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX) // 3-byte form: 0x15, 0x00, then an 8-bit length value
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
+ SHLL $0x02, R12
+ ORL $0x01, R12 // 2-byte form: (length value shifted left by 2) OR 1, stored as a 16-bit word
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12 // tag = 1 + length value * 4
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SARL $0x08, R8
+ SHLL $0x05, R8 // offset bits above the low 8 go into the tag starting at bit 5
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX // falls through to match_nolit_emitcopy_end
+
+ // The match is encoded. Stop at the search limit or dst limit; otherwise
+ // add positions near the start and end of the match to both tables and
+ // resume searching at CX.
+match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm10B // at or past the search limit
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+ RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm10B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI // multiplier for the 6-byte hash
+ MOVQ $0x9e3779b1, R8 // multiplier for the 4-byte hash
+ INCL DI // DI = match start + 1
+ MOVQ (DX)(DI*1), R9 // 8 bytes at start+1
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // R11 = bytes at start+2
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // R12 = bytes at start+3
+ LEAL 1(DI), R14 // R14 = start+2
+ LEAL 2(DI), R15 // R15 = start+3
+ MOVQ -2(DX)(CX*1), R9 // 8 bytes at CX-2, where CX is the end of the match
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x34, R10 // 6-byte hash at start+1
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x34, R13 // 6-byte hash at start+2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x36, R11 // 4-byte hash at start+2
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x36, R12 // 4-byte hash at start+3
+ MOVL DI, 24(SP)(R10*4) // 6-byte table: start+1
+ MOVL R14, 24(SP)(R13*4) // 6-byte table: start+2
+ MOVL R14, 16408(SP)(R11*4) // 4-byte table: start+2
+ MOVL R15, 16408(SP)(R12*4) // 4-byte table: start+3
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // R11 = bytes at CX-1
+ MOVQ R11, R13
+ LEAL -2(CX), R9 // R9 = CX-2
+ LEAL -1(CX), DI // DI = CX-1
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x34, R10 // 6-byte hash at CX-2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x36, R11 // 4-byte hash at CX-1
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x34, R13 // 6-byte hash at CX-1
+ MOVL R9, 24(SP)(R10*4) // 6-byte table: CX-2
+ MOVL DI, 16408(SP)(R11*4) // 4-byte table: CX-1
+ MOVL DI, 24(SP)(R13*4) // 6-byte table: CX-1
+ JMP search_loop_encodeBetterBlockAsm10B
+
+ // Search is over. Everything from the first unemitted byte (12(SP)) to
+ // the end of src is written as one literal, provided it fits below the
+ // dst limit.
+emit_remainder_encodeBetterBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = number of remaining literal bytes
+ LEAQ 3(AX)(CX*1), CX // write cursor after those literals plus 3 bytes
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = source of the literal bytes
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length - 1; DX no longer holds the src base from here on
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm10B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 followed by 16-bit length-1
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
+
+two_bytes_emit_remainder_encodeBetterBlockAsm10B:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 followed by 8-bit length-1
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBetterBlockAsm10B // length-1 below 64: short copy
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
+
+one_byte_emit_remainder_encodeBetterBlockAsm10B:
+ SHLB $0x02, DL // 1-byte header: (length-1) shifted left by 2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+ // Short literal copy (remainder): SI bytes from CX to AX. Each size class
+ // loads the head and tail of the range before storing.
+memmove_emit_remainder_encodeBetterBlockAsm10B:
+ LEAQ (AX)(SI*1), DX // DX = write cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4:
+ MOVL (CX), SI // always moves 4 bytes, even when BX is smaller
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
+ MOVL (CX), SI // first 4 bytes
+ MOVL -4(CX)(BX*1), CX // last 4 bytes; CX is reused to hold the data
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
+ MOVQ (CX), SI // first 8 bytes
+ MOVQ -8(CX)(BX*1), CX // last 8 bytes; CX is reused to hold the data
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
+ MOVOU (CX), X0 // first 16 bytes
+ MOVOU -16(CX)(BX*1), X1 // last 16 bytes
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
+ MOVOU (CX), X0 // first 32 bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 bytes
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
+ MOVQ DX, AX // advance the write cursor by the real literal length
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
+
+ // Long literal copy (remainder): SI bytes from CX to AX in 32-byte steps
+ // using aligned stores. The first and last 32 source bytes are loaded up
+ // front into X0-X3 and stored last. Then the function returns.
+memmove_long_emit_remainder_encodeBetterBlockAsm10B:
+ LEAQ (AX)(SI*1), DX // DX = write cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong
+ MOVOU (CX), X0 // first 32 source bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 source bytes
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = AX mod 32
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX+R8-32 is a multiple of 32
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): JA reads CF and ZF, but DECQ does not update CF; confirm the intended condition against the generator
+ LEAQ -32(CX)(R8*1), SI // SI = source pointer for the first aligned 32-byte store
+ LEAQ -32(AX)(R8*1), R9 // R9 = 32-byte aligned destination pointer
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back // NOTE(review): same CF caveat as the JA above
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4 // copy the 32 bytes that end at offset R8
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // continue while the next chunk still ends within the length
+ MOVOU X0, (AX) // finally store the saved head ...
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1) // ... and the saved tail
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance the write cursor past the literals
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = number of bytes written to dst
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000028, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm8B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm8B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBetterBlockAsm8B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x04, SI
+ LEAL 1(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm8B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x38, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 4120(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 4120(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm8B
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeBetterBlockAsm8B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm8B
+
+candidateS_match_encodeBetterBlockAsm8B:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm8B
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeBetterBlockAsm8B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm8B
+
+match_extend_back_loop_encodeBetterBlockAsm8B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm8B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm8B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm8B
+ JMP match_extend_back_loop_encodeBetterBlockAsm8B
+
+match_extend_back_end_encodeBetterBlockAsm8B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBetterBlockAsm8B:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm8B
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B
+
+matchlen_single_match_nolit_encodeBetterBlockAsm8B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm8B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B
+
+match_nolit_end_encodeBetterBlockAsm8B:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm8B
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm8B
+
+two_bytes_match_emit_encodeBetterBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeBetterBlockAsm8B
+ JMP memmove_long_match_emit_encodeBetterBlockAsm8B
+
+one_byte_match_emit_encodeBetterBlockAsm8B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm8B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B
+
+memmove_long_match_emit_encodeBetterBlockAsm8B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm8B:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+match_is_repeat_encodeBetterBlockAsm8B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
+ MOVL DI, R8
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R9
+ SUBL SI, R8
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm8B
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm8B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveShort
+ CMPQ R8, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
+ CMPQ R8, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
+ MOVL (R9), R10
+ MOVL R10, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveLong
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm8B
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm8B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x36, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x38, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x38, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 4120(SP)(R11*4)
+ MOVL R15, 4120(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x38, R11
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x36, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 4120(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeBetterBlockAsm8B
+
+emit_remainder_encodeBetterBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm8B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
+
+two_bytes_emit_remainder_encodeBetterBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBetterBlockAsm8B
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
+
+one_byte_emit_remainder_encodeBetterBlockAsm8B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm8B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4:
+ MOVL (CX), SI
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
+
+memmove_long_emit_remainder_encodeBetterBlockAsm8B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst as literal and copy elements; returns the number of bytes written (final dst cursor - dst_base), or 0 when an emit would reach the dst limit kept at (SP).
+TEXT ·encodeSnappyBlockAsm(SB), $65560-56 // frame: scalars at 0..23(SP) plus a 64 KiB table of 32-bit src positions at 24(SP)
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ $0x00000200, CX // 512 iterations x 128 bytes = 64 KiB to clear
+ LEAQ 24(SP), DX // DX = start of the table
+ PXOR X0, X0 // X0 = 0
+
+zero_loop_encodeSnappyBlockAsm:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX // advance 128 bytes
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm
+ MOVL $0x00000000, 12(SP) // 12(SP) = src position where not-yet-emitted literals start
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = src_len - 8: positions at or past this are not searched
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = src_len - 9 - src_len/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst limit pointer used by every size check below
+ MOVL $0x00000001, CX // CX = current src position, starts at 1
+ MOVL CX, 16(SP) // 16(SP) = current match offset (the "repeat" offset), starts at 1
+ MOVQ src_base+24(FP), DX // DX = src base pointer (until emit_remainder reuses DX)
+
+search_loop_encodeSnappyBlockAsm:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI // extra skip: one byte per 64 bytes since the last emit
+ LEAL 4(CX)(SI*1), SI // SI = CX + 4 + skip = position to try after a miss
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm // past the search limit: flush the tail as literals
+ MOVQ (DX)(CX*1), DI // DI = 8 src bytes at CX
+ MOVL SI, 20(SP) // 20(SP) = saved next position
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // hash multiplier
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11 // R11 = bytes starting at CX+1
+ SHLQ $0x10, R10 // drop the top 2 bytes: hash covers 6 bytes
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // keep top 14 bits: table index for CX
+ SHLQ $0x10, R11
+ IMULQ R9, R11
+ SHRQ $0x32, R11 // table index for CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for CX
+ MOVL 24(SP)(R11*4), R8 // R8 = candidate position for CX+1
+ MOVL CX, 24(SP)(R10*4) // table[hash(CX)] = CX
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4) // table[hash(CX+1)] = CX+1
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // R10 = table index for CX+2 (used in no_repeat_found)
+ MOVL CX, R9
+ SUBL 16(SP), R9 // R9 = CX - offset
+ MOVL 1(DX)(R9*1), R11 // R11 = 4 bytes at CX+1-offset
+ MOVQ DI, R9
+ SHRQ $0x08, R9 // R9 = bytes at CX+1
+ CMPL R9, R11 // 4-byte compare: does CX+1 repeat the current offset?
+ JNE no_repeat_found_encodeSnappyBlockAsm
+ LEAL 1(CX), DI // DI = CX+1 = start of the repeat match
+ MOVL 12(SP), SI // SI = lower bound for extending backwards
+ MOVL DI, R8
+ SUBL 16(SP), R8 // R8 = DI - offset = earlier matching position
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm // earlier position is 0: nothing before it
+
+repeat_extend_back_loop_encodeSnappyBlockAsm:
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm // reached the pending-literal start
+ MOVB -1(DX)(R8*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9 // compare the bytes just before both positions
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm
+ LEAL -1(DI), DI
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm
+
+repeat_extend_back_end_encodeSnappyBlockAsm:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm // no pending literals
+ MOVL DI, R8
+ MOVL DI, 12(SP) // literals up to DI are emitted below
+ LEAQ (DX)(SI*1), R9 // R9 = pointer to the literal bytes
+ SUBL SI, R8 // R8 = literal length
+ LEAL -1(R8), SI // SI = length-1, the value encoded in the tag
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm // length-1 < 60 fits in the tag byte
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm
+ CMPL SI, $0x00010000
+ JLT three_bytes_repeat_emit_encodeSnappyBlockAsm
+ CMPL SI, $0x01000000
+ JLT four_bytes_repeat_emit_encodeSnappyBlockAsm
+ MOVB $0xfc, (AX) // tag 0xfc followed by length-1 as 4 bytes
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+four_bytes_repeat_emit_encodeSnappyBlockAsm:
+ MOVL SI, R10
+ SHRL $0x10, R10 // R10 = third byte of length-1
+ MOVB $0xf8, (AX) // tag 0xf8 followed by length-1 as 3 bytes
+ MOVW SI, 1(AX)
+ MOVB R10, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+three_bytes_repeat_emit_encodeSnappyBlockAsm:
+ MOVB $0xf4, (AX) // tag 0xf4 followed by length-1 as 2 bytes
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm:
+ MOVB $0xf0, (AX) // tag 0xf0 followed by length-1 as 1 byte
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm // length <= 64: short copy
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+one_byte_repeat_emit_encodeSnappyBlockAsm:
+ SHLB $0x02, SI // tag = (length-1) << 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm:
+ LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveShort: copy R8 (<= 64) bytes from (R9) to (AX) with overlapping head/tail moves
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
+ MOVQ (R9), R10 // always moves 8 bytes, even when R8 < 8; the cursor still advances by R8 only
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
+ MOVQ (R9), R10 // first 8 bytes
+ MOVQ -8(R9)(R8*1), R9 // last 8 bytes (may overlap the first)
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
+ MOVOU (R9), X0 // first 16 bytes
+ MOVOU -16(R9)(R8*1), X1 // last 16 bytes
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
+ MOVOU (R9), X0 // first 32 bytes in X0,X1
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2 // last 32 bytes in X2,X3
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
+ MOVQ SI, AX // advance dst cursor past the literal
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm:
+ LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveLong: copy R8 bytes from (R9) to (AX); head and tail are kept in X0-X3, the middle is copied in 32-byte chunks
+ MOVOU (R9), X0 // first 32 source bytes
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2 // last 32 source bytes
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10 // R10 = dst misalignment within 32 bytes
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - misalignment, so AX+R12-32 is 32-byte aligned
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF as set by the SUBQ above, so this looks like "taken when R11 != 0" - confirm
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13) // aligned stores to the aligned dst address
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // continue while R12 <= length
+ MOVOU X0, (AX) // finally write the saved head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX // advance dst cursor past the literal
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
+ ADDL $0x05, CX // CX = end of the 4 bytes already verified at CX+1
+ MOVL CX, SI
+ SUBL 16(SP), SI // SI = CX - offset
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = src bytes remaining after CX
+ LEAQ (DX)(CX*1), R9 // R9 = pointer at CX
+ LEAQ (DX)(SI*1), SI // SI = pointer at CX - offset
+
+ // matchLen: R11 = number of equal bytes at (R9) and (SI), limited to R8
+ XORL R11, R11
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10 // nonzero bits mark differing bytes
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm
+ BSFQ R10, R10 // index of the lowest differing bit
+ SARQ $0x03, R10 // bit index -> byte index
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm:
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm:
+ MOVB (R9)(R11*1), R10 // fewer than 8 bytes left: compare one byte at a time
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm
+ LEAL 1(R11), R11
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm
+
+repeat_extend_forward_end_encodeSnappyBlockAsm:
+ ADDL R11, CX // CX = end of the match
+ MOVL CX, SI
+ SUBL DI, SI // SI = match length (DI = match start after back-extension)
+ MOVL 16(SP), DI // DI = match offset
+
+ // emitCopy: encode a copy of length SI at offset DI
+ CMPL DI, $0x00010000
+ JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm // offset fits in 16 bits
+
+four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
+ CMPL SI, $0x40
+ JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
+ MOVB $0xff, (AX) // tag 0xff + 4-byte offset: consumes 64 bytes of length
+ MOVL DI, 1(AX)
+ LEAL -64(SI), SI
+ ADDQ $0x05, AX
+ CMPL SI, $0x04
+ JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
+ JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
+
+four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
+ TESTL SI, SI
+ JZ repeat_end_emit_encodeSnappyBlockAsm
+ MOVB $0x03, BL
+ LEAL -4(BX)(SI*4), SI // low byte = ((length-1) << 2) | 3; only that byte is stored
+ MOVB SI, (AX)
+ MOVL DI, 1(AX) // 4-byte offset
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm
+
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
+ MOVB $0xee, (AX) // tag 0xee + 2-byte offset: consumes 60 bytes of length
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm // length >= 12 needs the 3-byte form
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI // low byte = ((length-4) << 2) | 1
+ MOVB DI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, DI
+ SHLL $0x05, DI // high offset bits go into tag bits 5 and up
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI // low byte = ((length-1) << 2) | 2
+ MOVB SI, (AX)
+ MOVW DI, 1(AX) // 2-byte offset
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm:
+ MOVL CX, 12(SP) // everything before CX has now been emitted
+ JMP search_loop_encodeSnappyBlockAsm
+
+no_repeat_found_encodeSnappyBlockAsm:
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate for CX vs bytes at CX
+ JEQ candidate_match_encodeSnappyBlockAsm
+ SHRQ $0x08, DI // DI = bytes at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for CX+2
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI // candidate for CX+1
+ JEQ candidate2_match_encodeSnappyBlockAsm
+ MOVL R9, 24(SP)(R10*4) // table[hash(CX+2)] = CX+2
+ SHRQ $0x08, DI // DI = bytes at CX+2
+ CMPL (DX)(SI*1), DI // candidate for CX+2
+ JEQ candidate3_match_encodeSnappyBlockAsm
+ MOVL 20(SP), CX // miss: continue at the saved next position
+ JMP search_loop_encodeSnappyBlockAsm
+
+candidate3_match_encodeSnappyBlockAsm:
+ ADDL $0x02, CX // match starts at CX+2
+ JMP candidate_match_encodeSnappyBlockAsm
+
+candidate2_match_encodeSnappyBlockAsm:
+ MOVL R9, 24(SP)(R10*4) // table[hash(CX+2)] = CX+2
+ INCL CX // match starts at CX+1
+ MOVL R8, SI // SI = matching candidate position
+
+candidate_match_encodeSnappyBlockAsm:
+ MOVL 12(SP), DI // DI = lower bound for extending backwards
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm // candidate is position 0: nothing before it
+
+match_extend_back_loop_encodeSnappyBlockAsm:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8 // compare the bytes just before both positions
+ JNE match_extend_back_end_encodeSnappyBlockAsm
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm
+ JMP match_extend_back_loop_encodeSnappyBlockAsm
+
+match_extend_back_end_encodeSnappyBlockAsm:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = pending literal length
+ LEAQ 5(AX)(DI*1), DI // dst cursor + literal length + 5
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm // no pending literals
+ MOVL DI, R9
+ MOVL DI, 12(SP) // literals up to CX are emitted below
+ LEAQ (DX)(R8*1), DI // DI = pointer to the literal bytes
+ SUBL R8, R9 // R9 = literal length
+ LEAL -1(R9), R8 // R8 = length-1, the value encoded in the tag
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm
+ CMPL R8, $0x00010000
+ JLT three_bytes_match_emit_encodeSnappyBlockAsm
+ CMPL R8, $0x01000000
+ JLT four_bytes_match_emit_encodeSnappyBlockAsm
+ MOVB $0xfc, (AX) // tag 0xfc followed by length-1 as 4 bytes
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm
+
+four_bytes_match_emit_encodeSnappyBlockAsm:
+ MOVL R8, R10
+ SHRL $0x10, R10 // R10 = third byte of length-1
+ MOVB $0xf8, (AX) // tag 0xf8 followed by length-1 as 3 bytes
+ MOVW R8, 1(AX)
+ MOVB R10, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm
+
+three_bytes_match_emit_encodeSnappyBlockAsm:
+ MOVB $0xf4, (AX) // tag 0xf4 followed by length-1 as 2 bytes
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm
+
+two_bytes_match_emit_encodeSnappyBlockAsm:
+ MOVB $0xf0, (AX) // tag 0xf0 followed by length-1 as 1 byte
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm // length <= 64: short copy
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm
+
+one_byte_match_emit_encodeSnappyBlockAsm:
+ SHLB $0x02, R8 // tag = (length-1) << 2
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 (<= 64) bytes from (DI) to (AX) with overlapping head/tail moves
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
+ MOVQ (DI), R10 // always moves 8 bytes, even when R9 < 8; the cursor still advances by R9 only
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
+ MOVQ (DI), R10 // first 8 bytes
+ MOVQ -8(DI)(R9*1), DI // last 8 bytes (may overlap the first)
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
+ MOVOU (DI), X0 // first 16 bytes
+ MOVOU -16(DI)(R9*1), X1 // last 16 bytes
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
+ MOVOU (DI), X0 // first 32 bytes in X0,X1
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2 // last 32 bytes in X2,X3
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm:
+ MOVQ R8, AX // advance dst cursor past the literal
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
+
+memmove_long_match_emit_encodeSnappyBlockAsm:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal bytes
+
+ // genMemMoveLong: copy R9 bytes from (DI) to (AX); head and tail are kept in X0-X3, the middle is copied in 32-byte chunks
+ MOVOU (DI), X0 // first 32 source bytes
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2 // last 32 source bytes
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10 // R10 = dst misalignment within 32 bytes
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - misalignment, so AX+R12-32 is 32-byte aligned
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF as set by the SUBQ above, so this looks like "taken when R11 != 0" - confirm
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13) // aligned stores to the aligned dst address
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // continue while R12 <= length
+ MOVOU X0, (AX) // finally write the saved head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX // advance dst cursor past the literal
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm:
+match_nolit_loop_encodeSnappyBlockAsm:
+ MOVL CX, DI
+ SUBL SI, DI // DI = CX - candidate = match offset
+ MOVL DI, 16(SP) // remember it as the current offset
+ ADDL $0x04, CX // skip the 4 bytes already verified
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = src bytes remaining after CX
+ LEAQ (DX)(CX*1), R8 // R8 = pointer at CX
+ LEAQ (DX)(SI*1), SI // SI = pointer at the candidate
+
+ // matchLen: R10 = number of equal bytes at (R8) and (SI), limited to DI
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9 // nonzero bits mark differing bytes
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm
+ BSFQ R9, R9 // index of the lowest differing bit
+ SARQ $0x03, R9 // bit index -> byte index
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm:
+ MOVB (R8)(R10*1), R9 // fewer than 8 bytes left: compare one byte at a time
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm
+
+match_nolit_end_encodeSnappyBlockAsm:
+ ADDL R10, CX // CX = end of the match
+ MOVL 16(SP), SI // SI = match offset
+ ADDL $0x04, R10 // R10 = total match length including the first 4 bytes
+ MOVL CX, 12(SP) // everything before CX is covered once the copy is emitted
+
+ // emitCopy: encode a copy of length R10 at offset SI
+ CMPL SI, $0x00010000
+ JL two_byte_offset_match_nolit_encodeSnappyBlockAsm // offset fits in 16 bits
+
+four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
+ CMPL R10, $0x40
+ JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
+ MOVB $0xff, (AX) // tag 0xff + 4-byte offset: consumes 64 bytes of length
+ MOVL SI, 1(AX)
+ LEAL -64(R10), R10
+ ADDQ $0x05, AX
+ CMPL R10, $0x04
+ JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm
+ JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
+
+four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
+ TESTL R10, R10
+ JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm
+ MOVB $0x03, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = ((length-1) << 2) | 3; only that byte is stored
+ MOVB R10, (AX)
+ MOVL SI, 1(AX) // 4-byte offset
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
+
+two_byte_offset_match_nolit_encodeSnappyBlockAsm:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
+ MOVB $0xee, (AX) // tag 0xee + 2-byte offset: consumes 60 bytes of length
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm // length >= 12 needs the 3-byte form
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10 // low byte = ((length-4) << 2) | 1
+ MOVB SI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, SI
+ SHLL $0x05, SI // high offset bits go into tag bits 5 and up
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = ((length-1) << 2) | 2
+ MOVB R10, (AX)
+ MOVW SI, 1(AX) // 2-byte offset
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm // at or past the search limit
+ MOVQ -2(DX)(CX*1), DI // DI = 8 src bytes at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // dst cursor reached the limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm:
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // hash multiplier
+ MOVQ DI, R8
+ SHRQ $0x10, DI // DI = bytes at CX
+ MOVQ DI, SI
+ SHLQ $0x10, R8
+ IMULQ R9, R8
+ SHRQ $0x32, R8 // R8 = table index for CX-2
+ SHLQ $0x10, SI
+ IMULQ R9, SI
+ SHRQ $0x32, SI // SI = table index for CX
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10 // R10 = address of table[hash(CX)]
+ MOVL (R10), SI // SI = candidate position for CX
+ MOVL R9, 24(SP)(R8*4) // table[hash(CX-2)] = CX-2
+ MOVL CX, (R10) // table[hash(CX)] = CX
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate vs bytes at CX
+ JEQ match_nolit_loop_encodeSnappyBlockAsm // immediate match: emit another copy with no literals
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm
+
+emit_remainder_encodeSnappyBlockAsm:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = number of src bytes not yet emitted
+ LEAQ 5(AX)(CX*1), CX // dst cursor + remaining length + 5
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = pointer to the remaining src bytes
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length-1 (src base in DX is no longer needed)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeSnappyBlockAsm
+ CMPL DX, $0x01000000
+ JLT four_bytes_emit_remainder_encodeSnappyBlockAsm
+ MOVB $0xfc, (AX) // tag 0xfc followed by length-1 as 4 bytes
+ MOVL DX, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+four_bytes_emit_remainder_encodeSnappyBlockAsm:
+ MOVL DX, BX
+ SHRL $0x10, BX // BL = third byte of length-1
+ MOVB $0xf8, (AX) // tag 0xf8 followed by length-1 as 3 bytes
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+three_bytes_emit_remainder_encodeSnappyBlockAsm:
+ MOVB $0xf4, (AX) // tag 0xf4 followed by length-1 as 2 bytes
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm:
+ MOVB $0xf0, (AX) // tag 0xf0 followed by length-1 as 1 byte
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm // length <= 64: short copy
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+one_byte_emit_remainder_encodeSnappyBlockAsm:
+ SHLB $0x02, DL // tag = (length-1) << 2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: copy BX (<= 64) bytes from (CX) to (AX) with overlapping head/tail moves
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8:
+ MOVQ (CX), SI // always moves 8 bytes, even when BX < 8; the cursor still advances by BX only
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
+ MOVQ (CX), SI // first 8 bytes
+ MOVQ -8(CX)(BX*1), CX // last 8 bytes (may overlap the first)
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
+ MOVOU (CX), X0 // first 16 bytes
+ MOVOU -16(CX)(BX*1), X1 // last 16 bytes
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
+ MOVOU (CX), X0 // first 32 bytes in X0,X1
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 bytes in X2,X3
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
+ MOVQ DX, AX // advance dst cursor past the literal
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: copy BX bytes from (CX) to (AX); head and tail are kept in X0-X3, the middle is copied in 32-byte chunks
+ MOVOU (CX), X0 // first 32 source bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 source bytes
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = dst misalignment within 32 bytes
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - misalignment, so AX+R8-32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF as set by the SUBQ above, so this looks like "taken when DI != 0" - confirm
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9) // aligned stores to the aligned dst address
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // continue while R8 <= length
+ MOVOU X0, (AX) // finally write the saved head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance dst cursor past the literal
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = bytes written = dst cursor - dst_base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000200, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm64K:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm64K
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm64K:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm64K
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ SHLQ $0x10, R11
+ IMULQ R9, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeSnappyBlockAsm64K
+ LEAL 1(CX), DI
+ MOVL 12(SP), SI
+ MOVL DI, R8
+ SUBL 16(SP), R8
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
+
+repeat_extend_back_loop_encodeSnappyBlockAsm64K:
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm64K
+ MOVB -1(DX)(R8*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
+ LEAL -1(DI), DI
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
+
+repeat_extend_back_end_encodeSnappyBlockAsm64K:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
+ MOVL DI, R8
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R9
+ SUBL SI, R8
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm64K
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
+
+one_byte_repeat_emit_encodeSnappyBlockAsm64K:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveShort
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveLong
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R11, R11
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
+ BSFQ R10, R10
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm64K:
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K:
+ MOVB (R9)(R11*1), R10
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
+ LEAL 1(R11), R11
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K
+
+repeat_extend_forward_end_encodeSnappyBlockAsm64K:
+ ADDL R11, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm64K
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm64K:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeSnappyBlockAsm64K
+
+no_repeat_found_encodeSnappyBlockAsm64K:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBlockAsm64K
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeSnappyBlockAsm64K
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeSnappyBlockAsm64K
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBlockAsm64K
+
+candidate3_match_encodeSnappyBlockAsm64K:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeSnappyBlockAsm64K
+
+candidate2_match_encodeSnappyBlockAsm64K:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm64K:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm64K
+
+match_extend_back_loop_encodeSnappyBlockAsm64K:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm64K
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBlockAsm64K
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm64K
+ JMP match_extend_back_loop_encodeSnappyBlockAsm64K
+
+match_extend_back_end_encodeSnappyBlockAsm64K:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm64K
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm64K:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm64K
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm64K
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
+
+two_bytes_match_emit_encodeSnappyBlockAsm64K:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm64K
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
+
+one_byte_match_emit_encodeSnappyBlockAsm64K:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
+
+memmove_long_match_emit_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
+match_nolit_loop_encodeSnappyBlockAsm64K:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm64K
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm64K:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm64K
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm64K
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K
+
+match_nolit_end_encodeSnappyBlockAsm64K:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm64K
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm64K
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm64K:
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x10, R8
+ IMULQ R9, R8
+ SHRQ $0x32, R8
+ SHLQ $0x10, SI
+ IMULQ R9, SI
+ SHRQ $0x32, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeSnappyBlockAsm64K
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm64K
+
+emit_remainder_encodeSnappyBlockAsm64K:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm64K
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm64K:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm64K
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
+
+one_byte_emit_remainder_encodeSnappyBlockAsm64K:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000080, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm12B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm12B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm12B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm12B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x000000cf1bbcdcbb, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x18, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ SHLQ $0x18, R11
+ IMULQ R9, R11
+ SHRQ $0x34, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x18, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeSnappyBlockAsm12B
+ LEAL 1(CX), DI
+ MOVL 12(SP), SI
+ MOVL DI, R8
+ SUBL 16(SP), R8
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
+
+repeat_extend_back_loop_encodeSnappyBlockAsm12B:
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm12B
+ MOVB -1(DX)(R8*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
+ LEAL -1(DI), DI
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
+
+repeat_extend_back_end_encodeSnappyBlockAsm12B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
+ MOVL DI, R8
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R9
+ SUBL SI, R8
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm12B
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
+
+one_byte_repeat_emit_encodeSnappyBlockAsm12B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveShort
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveLong
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R11, R11
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
+ BSFQ R10, R10
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm12B:
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B:
+ MOVB (R9)(R11*1), R10
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
+ LEAL 1(R11), R11
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B
+
+repeat_extend_forward_end_encodeSnappyBlockAsm12B:
+ ADDL R11, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm12B
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm12B:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeSnappyBlockAsm12B
+
+no_repeat_found_encodeSnappyBlockAsm12B:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBlockAsm12B
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeSnappyBlockAsm12B
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeSnappyBlockAsm12B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBlockAsm12B
+
+candidate3_match_encodeSnappyBlockAsm12B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeSnappyBlockAsm12B
+
+candidate2_match_encodeSnappyBlockAsm12B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm12B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm12B
+
+match_extend_back_loop_encodeSnappyBlockAsm12B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm12B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBlockAsm12B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm12B
+ JMP match_extend_back_loop_encodeSnappyBlockAsm12B
+
+match_extend_back_end_encodeSnappyBlockAsm12B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm12B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm12B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
+
+two_bytes_match_emit_encodeSnappyBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm12B
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
+
+one_byte_match_emit_encodeSnappyBlockAsm12B:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
+
+memmove_long_match_emit_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
+match_nolit_loop_encodeSnappyBlockAsm12B:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm12B
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm12B:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm12B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B
+
+match_nolit_end_encodeSnappyBlockAsm12B:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm12B
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm12B:
+ MOVQ $0x000000cf1bbcdcbb, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x18, R8
+ IMULQ R9, R8
+ SHRQ $0x34, R8
+ SHLQ $0x18, SI
+ IMULQ R9, SI
+ SHRQ $0x34, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeSnappyBlockAsm12B
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm12B
+
+emit_remainder_encodeSnappyBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm12B
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
+
+one_byte_emit_remainder_encodeSnappyBlockAsm12B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 // Encodes src into dst; returns bytes written, or 0 when the dst limit kept at (SP) would be reached. Frame = 24 bytes of scalars + 4096-byte hash table at 24(SP).
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ $0x00000020, CX // 32 passes x 128 bytes = 4096 bytes to clear
+ LEAQ 24(SP), DX // DX = start of the on-stack hash table
+ PXOR X0, X0 // X0 = all zero
+
+zero_loop_encodeSnappyBlockAsm10B: // clear the hash table, 128 bytes per pass
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm10B
+ MOVL $0x00000000, 12(SP) // 12(SP) = src index where the pending (not yet emitted) literals start
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = src_len - 8: positions at or beyond this go to emit_remainder
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = src_len - 9 - src_len/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst limit pointer = dst + src_len - 9 - src_len/32
+ MOVL $0x00000001, CX // CX = current src position, starting at 1
+ MOVL CX, 16(SP) // 16(SP) = current repeat offset, starting at 1
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBlockAsm10B: // probe the hash table at CX, CX+1 and CX+2 for a 4-byte match
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI
+ LEAL 4(CX)(SI*1), SI // SI = CX + 4 + (CX - pending start)/32: next position if nothing matches here
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm10B
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes of src at CX
+ MOVL SI, 20(SP) // 20(SP) = saved next search position
+ MOVQ $0x9e3779b1, R9 // hash multiplier
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x20, R10 // keep only the low 4 bytes for hashing
+ IMULQ R9, R10
+ SHRQ $0x36, R10 // R10 = 10-bit hash of the 4 bytes at CX
+ SHLQ $0x20, R11
+ IMULQ R9, R11
+ SHRQ $0x36, R11 // R11 = 10-bit hash of the 4 bytes at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for CX
+ MOVL 24(SP)(R11*4), R8 // R8 = candidate position for CX+1
+ MOVL CX, 24(SP)(R10*4) // table[hash(CX)] = CX
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4) // table[hash(CX+1)] = CX+1
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10 // R10 = 10-bit hash of the 4 bytes at CX+2 (used in no_repeat_found)
+ MOVL CX, R9
+ SUBL 16(SP), R9 // R9 = CX - repeat offset
+ MOVL 1(DX)(R9*1), R11 // R11 = 4 bytes at CX + 1 - repeat offset
+ MOVQ DI, R9
+ SHRQ $0x08, R9 // low 32 bits of R9 = 4 bytes at CX+1
+ CMPL R9, R11
+ JNE no_repeat_found_encodeSnappyBlockAsm10B
+ LEAL 1(CX), DI // repeat hit: DI = match start (CX+1)
+ MOVL 12(SP), SI // SI = pending-literal start, lower bound for backward extension
+ MOVL DI, R8
+ SUBL 16(SP), R8 // R8 = position the match refers back to
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm10B // referenced position is 0: nothing before it to compare
+
+repeat_extend_back_loop_encodeSnappyBlockAsm10B: // grow the match backwards while the preceding bytes agree
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm10B
+ MOVB -1(DX)(R8*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
+ LEAL -1(DI), DI
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B
+
+repeat_extend_back_end_encodeSnappyBlockAsm10B: // emit src[12(SP):DI] as a literal, if non-empty
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
+ MOVL DI, R8
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R9 // R9 = source pointer of the literal bytes
+ SUBL SI, R8 // R8 = literal length
+ LEAL -1(R8), SI // SI = length - 1, the value stored in the header
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 followed by 16-bit (length-1)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm10B: // 2-byte header: tag 0xf0 followed by 8-bit (length-1)
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm10B
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
+
+one_byte_repeat_emit_encodeSnappyBlockAsm10B: // 1-byte header: (length-1) << 2
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal
+
+ // genMemMoveShort: copy R8 (at most 64) bytes from (R9) to (AX) using overlapping fixed-size moves
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: // always moves a full 8 bytes, even when R8 < 8 (NOTE(review): presumably relies on slack in src/dst - confirm)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: // first 16 and last 16 bytes, possibly overlapping
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: // first 32 and last 32 bytes, possibly overlapping
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
+ MOVQ SI, AX // advance dst cursor past the literal
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal
+
+ // genMemMoveLong: copy R8 bytes from (R9) to (AX); head and tail are preloaded and stored last
+ MOVOU (R9), X0 // X0,X1 = first 32 source bytes
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2 // X2,X3 = last 32 source bytes
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (dst & 31), so dst + R12 - 32 is 32-byte aligned for the MOVOA stores
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // DECQ leaves CF from the SUBQ untouched; taken when R11 is non-zero after the decrement
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: // 32 bytes per pass through separate src (R10) / dst (R13) pointers
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: // 32 bytes per pass, indexed by R12, while R8 >= R12
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // store the preloaded head and tail with unaligned moves
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX // advance dst cursor past the literal
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: // extend the repeat match forward
+ ADDL $0x05, CX // skip CX+1 plus the 4 bytes already compared
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = bytes left in src from CX
+ LEAQ (DX)(CX*1), R9 // R9 = &src[CX]
+ LEAQ (DX)(SI*1), SI // SI = &src[CX - repeat offset]
+
+ // matchLen: R11 = number of equal bytes at (R9) and (SI), at most R8
+ XORL R11, R11
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: // compare 8 bytes at a time
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
+ BSFQ R10, R10 // index of the lowest differing bit
+ SARQ $0x03, R10 // bit index -> byte index
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: // fewer than 8 bytes left: compare byte by byte
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B:
+ MOVB (R9)(R11*1), R10
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
+ LEAL 1(R11), R11
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B
+
+repeat_extend_forward_end_encodeSnappyBlockAsm10B:
+ ADDL R11, CX // CX = end of the match
+ MOVL CX, SI
+ SUBL DI, SI // SI = match length (DI = match start after backward extension)
+ MOVL 16(SP), DI // DI = offset to encode
+
+ // emitCopy: write copy ops for length SI at offset DI
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: // while length > 64, emit a 3-byte op (tag 0xee, 16-bit offset) and subtract 60
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: // 2-byte form only when length < 12 and offset < 2048
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI // low byte = 1 | (length-4)<<2; only the low byte is stored below
+ MOVB DI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, DI
+ SHLL $0x05, DI // offset bits 8 and up go into the top of the tag byte
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm10B
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: // 3-byte form: tag = 2 | (length-1)<<2, then 16-bit offset
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm10B:
+ MOVL CX, 12(SP) // pending literals now start at the end of the match
+ JMP search_loop_encodeSnappyBlockAsm10B
+
+no_repeat_found_encodeSnappyBlockAsm10B: // test the three hash-table candidates in turn
+ CMPL (DX)(SI*1), DI // candidate for CX vs. the 4 bytes at CX
+ JEQ candidate_match_encodeSnappyBlockAsm10B
+ SHRQ $0x08, DI // DI now starts with the bytes at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for CX+2
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI // candidate for CX+1 vs. the 4 bytes at CX+1
+ JEQ candidate2_match_encodeSnappyBlockAsm10B
+ MOVL R9, 24(SP)(R10*4) // table[hash(CX+2)] = CX+2
+ SHRQ $0x08, DI // DI now starts with the bytes at CX+2
+ CMPL (DX)(SI*1), DI // candidate for CX+2 vs. the 4 bytes at CX+2
+ JEQ candidate3_match_encodeSnappyBlockAsm10B
+ MOVL 20(SP), CX // no match: jump ahead to the saved next position
+ JMP search_loop_encodeSnappyBlockAsm10B
+
+candidate3_match_encodeSnappyBlockAsm10B: // match at CX+2; SI already holds its candidate
+ ADDL $0x02, CX
+ JMP candidate_match_encodeSnappyBlockAsm10B
+
+candidate2_match_encodeSnappyBlockAsm10B: // match at CX+1; candidate is in R8
+ MOVL R9, 24(SP)(R10*4) // table[hash(CX+2)] = CX+2
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm10B: // CX = match position, SI = candidate position
+ MOVL 12(SP), DI // DI = pending-literal start, lower bound for backward extension
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm10B // candidate at 0: nothing before it to compare
+
+match_extend_back_loop_encodeSnappyBlockAsm10B: // grow the match backwards while the preceding bytes agree
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm10B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBlockAsm10B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm10B
+ JMP match_extend_back_loop_encodeSnappyBlockAsm10B
+
+match_extend_back_end_encodeSnappyBlockAsm10B: // return 0 if dst cursor + literal length + 3 reaches the dst limit
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm10B: // emit src[12(SP):CX] as a literal, if non-empty
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI // DI = source pointer of the literal bytes
+ SUBL R8, R9 // R9 = literal length
+ LEAL -1(R9), R8 // R8 = length - 1, the value stored in the header
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm10B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 followed by 16-bit (length-1)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
+
+two_bytes_match_emit_encodeSnappyBlockAsm10B: // 2-byte header: tag 0xf0 followed by 8-bit (length-1)
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm10B
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
+
+one_byte_match_emit_encodeSnappyBlockAsm10B: // 1-byte header: (length-1) << 2
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal
+
+ // genMemMoveShort: copy R9 (at most 64) bytes from (DI) to (AX) using overlapping fixed-size moves
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: // always moves a full 8 bytes, even when R9 < 8 (NOTE(review): presumably relies on slack in src/dst - confirm)
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: // first 16 and last 16 bytes, possibly overlapping
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: // first 32 and last 32 bytes, possibly overlapping
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
+ MOVQ R8, AX // advance dst cursor past the literal
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
+
+memmove_long_match_emit_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal
+
+ // genMemMoveLong: copy R9 bytes from (DI) to (AX); head and tail are preloaded and stored last
+ MOVOU (DI), X0 // X0,X1 = first 32 source bytes
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2 // X2,X3 = last 32 source bytes
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (dst & 31), so dst + R12 - 32 is 32-byte aligned for the MOVOA stores
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // DECQ leaves CF from the SUBQ untouched; taken when R11 is non-zero after the decrement
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: // 32 bytes per pass through separate src (R10) / dst (R13) pointers
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: // 32 bytes per pass, indexed by R12, while R9 >= R12
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // store the preloaded head and tail with unaligned moves
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX // advance dst cursor past the literal
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
+match_nolit_loop_encodeSnappyBlockAsm10B: // CX = match position, SI = candidate position; no literals pending
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP) // 16(SP) = new repeat offset = CX - candidate
+ ADDL $0x04, CX // the first 4 bytes are already known to match
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = bytes left in src from CX
+ LEAQ (DX)(CX*1), R8 // R8 = &src[CX]
+ LEAQ (DX)(SI*1), SI // SI = &src[candidate + 4]
+
+ // matchLen: R10 = number of equal bytes at (R8) and (SI), at most DI
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: // compare 8 bytes at a time
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
+ BSFQ R9, R9 // index of the lowest differing bit
+ SARQ $0x03, R9 // bit index -> byte index
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm10B
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm10B: // fewer than 8 bytes left: compare byte by byte
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm10B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B
+
+match_nolit_end_encodeSnappyBlockAsm10B:
+ ADDL R10, CX // CX = end of the match
+ MOVL 16(SP), SI // SI = offset to encode
+ ADDL $0x04, R10 // R10 = total match length including the first 4 bytes
+ MOVL CX, 12(SP) // pending literals now start at the end of the match
+
+ // emitCopy: write copy ops for length R10 at offset SI
+two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: // while length > 64, emit a 3-byte op (tag 0xee, 16-bit offset) and subtract 60
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: // 2-byte form only when length < 12 and offset < 2048
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10 // low byte = 1 | (length-4)<<2; only the low byte is stored below
+ MOVB SI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, SI
+ SHLL $0x05, SI // offset bits 8 and up go into the top of the tag byte
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: // 3-byte form: tag = 2 | (length-1)<<2, then 16-bit offset
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm10B // past the search limit: flush the tail
+ MOVQ -2(DX)(CX*1), DI // DI = 8 bytes of src starting at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP) // dst cursor reached the limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm10B: // index CX-2 and CX, then test for an immediate match at CX
+ MOVQ $0x9e3779b1, R9 // hash multiplier
+ MOVQ DI, R8
+ SHRQ $0x10, DI // DI now starts with the bytes at CX
+ MOVQ DI, SI
+ SHLQ $0x20, R8
+ IMULQ R9, R8
+ SHRQ $0x36, R8 // R8 = 10-bit hash of the 4 bytes at CX-2
+ SHLQ $0x20, SI
+ IMULQ R9, SI
+ SHRQ $0x36, SI // SI = 10-bit hash of the 4 bytes at CX
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10 // R10 = &table[hash(CX)]
+ MOVL (R10), SI // SI = candidate position for CX
+ MOVL R9, 24(SP)(R8*4) // table[hash(CX-2)] = CX-2
+ MOVL CX, (R10) // table[hash(CX)] = CX
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeSnappyBlockAsm10B // another match right away, with no literals in between
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm10B
+
+emit_remainder_encodeSnappyBlockAsm10B: // return 0 if dst cursor + remaining length + 3 reaches the dst limit
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm10B: // emit src[12(SP):src_len] as a final literal, if non-empty
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = source pointer of the literal bytes
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length - 1 (the src base in DX is no longer needed)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 followed by 16-bit (length-1)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm10B: // 2-byte header: tag 0xf0 followed by 8-bit (length-1)
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm10B
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
+
+one_byte_emit_remainder_encodeSnappyBlockAsm10B: // 1-byte header: (length-1) << 2
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: copy BX (at most 64) bytes from (CX) to (AX) using overlapping fixed-size moves
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: // always moves a full 8 bytes, even when BX < 8 (NOTE(review): this is the tail of src, so the load may extend past src_len - confirm this is safe)
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: // first 16 and last 16 bytes, possibly overlapping
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: // first 32 and last 32 bytes, possibly overlapping
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
+ MOVQ DX, AX // advance dst cursor past the literal
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: copy BX bytes from (CX) to (AX); head and tail are preloaded and stored last
+ MOVOU (CX), X0 // X0,X1 = first 32 source bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // X2,X3 = last 32 source bytes
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (dst & 31), so dst + R8 - 32 is 32-byte aligned for the MOVOA stores
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // DECQ leaves CF from the SUBQ untouched; taken when DI is non-zero after the decrement
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: // 32 bytes per pass through separate src (SI) / dst (R9) pointers
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: // 32 bytes per pass, indexed by R8, while BX >= R8
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // store the preloaded head and tail with unaligned moves
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance dst cursor past the literal
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: // return the number of bytes written: dst cursor - dst base
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000008, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm8B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm8B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm8B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x04, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm8B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x38, R10
+ SHLQ $0x20, R11
+ IMULQ R9, R11
+ SHRQ $0x38, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x38, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeSnappyBlockAsm8B
+ LEAL 1(CX), DI
+ MOVL 12(SP), SI
+ MOVL DI, R8
+ SUBL 16(SP), R8
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
+
+repeat_extend_back_loop_encodeSnappyBlockAsm8B:
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm8B
+ MOVB -1(DX)(R8*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
+ LEAL -1(DI), DI
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
+
+repeat_extend_back_end_encodeSnappyBlockAsm8B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
+ MOVL DI, R8
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R9
+ SUBL SI, R8
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm8B
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
+
+one_byte_repeat_emit_encodeSnappyBlockAsm8B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveShort
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveLong
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R11, R11
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
+ BSFQ R10, R10
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm8B:
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B:
+ MOVB (R9)(R11*1), R10
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
+ LEAL 1(R11), R11
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B
+
+repeat_extend_forward_end_encodeSnappyBlockAsm8B:
+ ADDL R11, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm8B
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm8B:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeSnappyBlockAsm8B
+
+no_repeat_found_encodeSnappyBlockAsm8B:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBlockAsm8B
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeSnappyBlockAsm8B
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeSnappyBlockAsm8B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBlockAsm8B
+
+candidate3_match_encodeSnappyBlockAsm8B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeSnappyBlockAsm8B
+
+candidate2_match_encodeSnappyBlockAsm8B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm8B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm8B
+
+match_extend_back_loop_encodeSnappyBlockAsm8B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm8B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBlockAsm8B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm8B
+ JMP match_extend_back_loop_encodeSnappyBlockAsm8B
+
+match_extend_back_end_encodeSnappyBlockAsm8B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm8B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm8B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
+
+two_bytes_match_emit_encodeSnappyBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm8B
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
+
+one_byte_match_emit_encodeSnappyBlockAsm8B:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
+
+memmove_long_match_emit_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
+match_nolit_loop_encodeSnappyBlockAsm8B:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm8B
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm8B:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm8B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B
+
+match_nolit_end_encodeSnappyBlockAsm8B:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm8B
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm8B:
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x20, R8
+ IMULQ R9, R8
+ SHRQ $0x38, R8
+ SHLQ $0x20, SI
+ IMULQ R9, SI
+ SHRQ $0x38, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeSnappyBlockAsm8B
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm8B
+
+emit_remainder_encodeSnappyBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm8B
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
+
+one_byte_emit_remainder_encodeSnappyBlockAsm8B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2. Returns the number of bytes written to dst, or 0 when the output would reach the limit dst_base + len(src) - 9 - len(src)/32 computed below.
+TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 // frame: 24 bytes of scalars + 327680 bytes of hash tables; 56 bytes of args/result
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor for the whole function
+ MOVQ $0x00000a00, CX // 0xa00 iterations * 128 bytes = 327680 bytes to clear
+ LEAQ 24(SP), DX // DX = start of the table area
+ PXOR X0, X0 // X0 = 16 zero bytes
+
+zero_loop_encodeSnappyBetterBlockAsm: // clears both hash tables, 128 bytes per iteration
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm
+ MOVL $0x00000000, 12(SP) // 12(SP) = src index of the first byte not yet emitted
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = len(src) - 8: searching stops once the position reaches this
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = len(src) - 9 - len(src)/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst limit pointer; reaching it makes the function return 0
+ MOVL $0x00000001, CX // CX = current src position, starting at 1
+ MOVL $0x00000000, 16(SP) // 16(SP) = offset of the last emitted match (written at match_length_ok)
+ MOVQ src_base+24(FP), DX // DX = src base pointer until emit_remainder_ok
+
+search_loop_encodeSnappyBetterBlockAsm: // probe both tables at CX; skip ahead faster the longer no match is found
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x07, SI // SI = (CX - unemitted start) / 128
+ CMPL SI, $0x63
+ JLE check_maxskip_ok_encodeSnappyBetterBlockAsm
+ LEAL 100(CX), SI // step is capped: next position = CX + 100
+ JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
+
+check_maxskip_ok_encodeSnappyBetterBlockAsm:
+ LEAL 1(CX)(SI*1), SI // next position = CX + 1 + skip
+
+check_maxskip_cont_encodeSnappyBetterBlockAsm:
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm // next position past the search limit: flush trailing literals
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes of src at CX
+ MOVL SI, 20(SP) // 20(SP) = next position to try if nothing matches here
+ MOVQ $0x00cf1bbcdcbfa563, R9 // multiplier for the 7-byte hash
+ MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x08, R10 // drop the top byte: hash covers the low 7 bytes
+ IMULQ R9, R10
+ SHRQ $0x30, R10 // keep the top 16 bits: index into the table at 24(SP)
+ SHLQ $0x20, R11 // keep only the low 4 bytes
+ IMULQ SI, R11
+ SHRQ $0x32, R11 // keep the top 14 bits: index into the table at 262168(SP)
+ MOVL 24(SP)(R10*4), SI // SI = candidate from the 7-byte-hash table
+ MOVL 262168(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
+ MOVL CX, 24(SP)(R10*4) // record the current position in both tables
+ MOVL CX, 262168(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // compare 4 bytes at the 7-byte-hash candidate
+ JEQ candidate_match_encodeSnappyBetterBlockAsm
+ CMPL (DX)(R8*1), DI // compare 4 bytes at the 4-byte-hash candidate
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm
+ MOVL 20(SP), CX // no match: advance to the precomputed next position
+ JMP search_loop_encodeSnappyBetterBlockAsm
+
+candidateS_match_encodeSnappyBetterBlockAsm: // 4-byte-hash hit: first try the 7-byte-hash table at CX+1
+ SHRQ $0x08, DI // DI now holds the bytes starting at CX+1
+ MOVQ DI, R10
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4) // record CX+1 in the 7-byte-hash table
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm // use the 7-byte-hash candidate at CX+1
+ DECL CX // otherwise go back to CX and use the 4-byte-hash candidate
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm: // CX = match position, SI = candidate position
+ MOVL 12(SP), DI // DI = lower bound for backward extension
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm // candidate at index 0 cannot move back
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm: // extend the match backwards while the preceding bytes are equal
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
+
+match_extend_back_end_encodeSnappyBetterBlockAsm: // bail out if the pending literals plus 5 bytes would reach the dst limit
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = number of pending literal bytes
+ LEAQ 5(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // return 0
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm:
+ MOVL CX, DI // DI = match start (also the end of the pending literals)
+ ADDL $0x04, CX // skip the 4 bytes already compared
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = bytes left in src after CX
+ LEAQ (DX)(CX*1), R9 // R9 = &src[CX]
+ LEAQ (DX)(SI*1), R10 // R10 = &src[SI]
+
+ // matchLen: R12 = number of equal bytes at R9 and R10, bounded by R8
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: // compare 8 bytes at a time
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11 // non-zero bits mark differing bytes
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // bit index / 8 = number of equal leading bytes
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: // all 8 bytes matched
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm: // fewer than 8 bytes left: compare byte by byte
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm
+
+match_nolit_end_encodeSnappyBetterBlockAsm:
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset (distance back to the candidate)
+
+ // Check if repeat (as generated): rejects a match with at most 1 extra byte when its offset exceeds 65535
+ CMPL R12, $0x01
+ JG match_length_ok_encodeSnappyBetterBlockAsm
+ CMPL R8, $0x0000ffff
+ JLE match_length_ok_encodeSnappyBetterBlockAsm
+ MOVL 20(SP), CX
+ INCL CX // resume searching one past the precomputed next position
+ JMP search_loop_encodeSnappyBetterBlockAsm
+
+match_length_ok_encodeSnappyBetterBlockAsm: // emit pending literals src[12(SP):DI], then the copy
+ MOVL R8, 16(SP) // remember the offset
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm // no pending literals
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = literal source pointer
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length - 1, the value stored in the literal tag
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm
+ CMPL SI, $0x01000000
+ JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm
+ MOVB $0xfc, (AX) // tag 0xfc followed by a 4-byte length-1
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+four_bytes_match_emit_encodeSnappyBetterBlockAsm: // tag 0xf8 followed by a 3-byte length-1
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+three_bytes_match_emit_encodeSnappyBetterBlockAsm: // tag 0xf4 followed by a 2-byte length-1
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm: // tag 0xf0 followed by a 1-byte length-1
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm // at most 64 bytes: short copy
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm: // single tag byte: (length-1) << 2
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 (at most 64) bytes from R10 to AX using overlapping loads/stores
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: // always moves a full 8 bytes; the cursor still advances by R9 only
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: // first 16 and last 16 bytes
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: // first 32 and last 32 bytes
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
+ MOVQ SI, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveLong: head and tail are kept in X0-X3 and stored last; the middle is copied with aligned 32-byte stores
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = AX mod 32
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX mod 32), so AX + R14 - 32 is 32-byte aligned
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: // 32 bytes per pass via explicit src/dst pointers R11/R15
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: // 32 bytes per pass at offset R14-32 until R14 exceeds the length
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the saved first 32 bytes
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1) // and the saved last 32 bytes
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
+ ADDL R12, CX // CX = end of the match in src
+ ADDL $0x04, R12 // R12 = total match length (the 4 compared bytes + the extension)
+ MOVL CX, 12(SP) // everything before CX is now emitted
+
+ // emitCopy: encode a copy of R12 bytes at offset R8
+ CMPL R8, $0x00010000
+ JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
+
+four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: // offset >= 65536: 5-byte copies with a 4-byte offset
+ CMPL R12, $0x40
+ JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
+ MOVB $0xff, (AX) // tag 0xff: the length consumed per chunk is 64 (see the LEAL below)
+ MOVL R8, 1(AX)
+ LEAL -64(R12), R12
+ ADDQ $0x05, AX
+ CMPL R12, $0x04
+ JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
+ JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
+
+four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
+ TESTL R12, R12
+ JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+ MOVB $0x03, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = 3 + ((length-1) << 2); only that byte is stored
+ MOVB R12, (AX)
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: // offset < 65536
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
+ MOVB $0xee, (AX) // tag 0xee = 2 + (59 << 2): a 60-byte copy with a 2-byte offset
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: // the 2-byte form needs length < 12 and offset < 2048
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12 // low byte = 1 + ((length-4) << 2)
+ MOVB R8, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8 // high offset bits go into bits 5-7 of the tag
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: // 3-byte form: tag + 2-byte offset
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = 2 + ((length-1) << 2)
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm // past the search limit: only the tail remains
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm: // index positions near the start and the end of the match before searching again
+ MOVQ $0x00cf1bbcdcbfa563, SI // multiplier for the 7-byte hash
+ MOVQ $0x9e3779b1, R8 // multiplier for the 4-byte hash
+ INCL DI // DI = match start + 1
+ MOVQ (DX)(DI*1), R9 // 8 bytes of src at DI
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // bytes starting at DI+1
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // bytes starting at DI+2
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9 // 8 bytes of src at CX-2, hashed in the second half below
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10 // 7-byte hash at DI
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13 // 7-byte hash at DI+1
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11 // 4-byte hash at DI+1
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x32, R12 // 4-byte hash at DI+2
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 262168(SP)(R11*4)
+ MOVL R15, 262168(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // bytes starting at CX-1
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10 // 7-byte hash at CX-2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11 // 4-byte hash at CX-1
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13 // 7-byte hash at CX-1
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 262168(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm
+
+emit_remainder_encodeSnappyBetterBlockAsm: // emit src[12(SP):] as one final literal run
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = number of trailing literal bytes
+ LEAQ 5(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = literal source pointer
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length - 1 (the src base in DX is no longer needed)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
+ CMPL DX, $0x01000000
+ JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
+ MOVB $0xfc, (AX) // tag 0xfc followed by a 4-byte length-1
+ MOVL DX, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: // tag 0xf8 followed by a 3-byte length-1
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX)
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: // tag 0xf4 followed by a 2-byte length-1
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: // tag 0xf0 followed by a 1-byte length-1
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm // at most 64 bytes: short copy
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm: // single tag byte: (length-1) << 2
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: copy BX (at most 64) bytes from CX to AX using overlapping loads/stores
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: // always moves a full 8 bytes; the cursor still advances by BX only
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: // first 16 and last 16 bytes
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: // first 32 and last 32 bytes
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
+ MOVQ DX, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: head and tail are kept in X0-X3 and stored last; the middle is copied with aligned 32-byte stores
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = AX mod 32
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX + R8 - 32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: // 32 bytes per pass via explicit src/dst pointers SI/R9
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: // 32 bytes per pass at offset R8-32 until R8 exceeds the length
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the saved first 32 bytes
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1) // and the saved last 32 bytes
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // bytes written = dst cursor - dst base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000a00, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm64K:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm64K
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBetterBlockAsm64K:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x07, SI
+ LEAL 1(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm64K
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 262168(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 262168(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm64K
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm64K
+
+candidateS_match_encodeSnappyBetterBlockAsm64K:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm64K
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm64K:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
+
+match_extend_back_end_encodeSnappyBetterBlockAsm64K:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm64K
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm64K:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
+
+match_nolit_end_encodeSnappyBetterBlockAsm64K:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm64K
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm64K:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm64K
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
+ MOVQ $0x00cf1bbcdcbfa563, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x32, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 262168(SP)(R11*4)
+ MOVL R15, 262168(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 262168(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm64K
+
+emit_remainder_encodeSnappyBetterBlockAsm64K:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst as literal and copy elements found via two stack-resident hash tables; returns the number of bytes written, or 0 when an output-size check against the limit computed at entry fails.
+TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 // frame: 24 bytes of scratch slots + 81920 bytes of tables; 56 bytes of arguments and result
+ MOVQ dst_base+0(FP), AX // AX = output cursor, starts at the dst base
+ MOVQ $0x00000280, CX // 640 iterations x 128 bytes = 81920 bytes to clear
+ LEAQ 24(SP), DX // the tables begin at 24(SP)
+ PXOR X0, X0 // X0 = 16 zero bytes
+
+zero_loop_encodeSnappyBetterBlockAsm12B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm12B
+ MOVL $0x00000000, 12(SP) // 12(SP) = source position where the not-yet-emitted literals start
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = src_len - 8, the position at which searching stops
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = src_len - 9 - src_len/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst pointer limit; the size checks below return 0 on reaching it
+ MOVL $0x00000001, CX // CX = current source position, starting at 1
+ MOVL $0x00000000, 16(SP) // 16(SP) = most recent match offset (rewritten at match_nolit_end, not read in this function)
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBetterBlockAsm12B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI // extra skip of 1 for every 64 bytes since the literal start
+ LEAL 1(CX)(SI*1), SI // SI = position to try next if nothing matches here
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm12B
+ MOVQ (DX)(CX*1), DI // DI = 8 source bytes at the current position
+ MOVL SI, 20(SP) // 20(SP) = saved next position
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
+ MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10 // discard the top 2 bytes so only the low 6 bytes are hashed
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // keep the top 14 bits: index into the table at 24(SP)
+ SHLQ $0x20, R11 // discard the top 4 bytes so only the low 4 bytes are hashed
+ IMULQ SI, R11
+ SHRQ $0x34, R11 // keep the top 12 bits: index into the table at 65560(SP)
+ MOVL 24(SP)(R10*4), SI // SI = candidate from the 6-byte-hash table
+ MOVL 65560(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
+ MOVL CX, 24(SP)(R10*4) // both table slots now record the current position
+ MOVL CX, 65560(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // do the 4 bytes at the 6-byte-hash candidate equal the current 4 bytes?
+ JEQ candidate_match_encodeSnappyBetterBlockAsm12B
+ CMPL (DX)(R8*1), DI // same test for the 4-byte-hash candidate
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
+ MOVL 20(SP), CX // no match: move on to the saved next position
+ JMP search_loop_encodeSnappyBetterBlockAsm12B
+
+candidateS_match_encodeSnappyBetterBlockAsm12B:
+ SHRQ $0x08, DI // DI = the loaded bytes starting one position later
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R10*4), SI // 6-byte-hash candidate for position+1
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm12B // take the position+1 match when its 4 bytes agree
+ DECL CX // otherwise return to the original position ...
+ MOVL R8, SI // ... and use the 4-byte-hash candidate
+
+candidate_match_encodeSnappyBetterBlockAsm12B:
+ MOVL 12(SP), DI // DI = literal start; backward extension stops there
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B // a candidate at position 0 has no preceding byte
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B
+ MOVB -1(DX)(SI*1), BL // byte before the candidate
+ MOVB -1(DX)(CX*1), R8 // byte before the current position
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
+ LEAL -1(CX), CX // step both positions back by one (LEAL leaves flags alone)
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B // candidate reached position 0
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
+
+match_extend_back_end_encodeSnappyBetterBlockAsm12B:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = number of pending literal bytes
+ LEAQ 3(AX)(DI*1), DI // output cursor + literal bytes + 3
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm12B:
+ MOVL CX, DI // DI = match start, i.e. the end of the pending literals
+ ADDL $0x04, CX // skip the 4 bytes already compared equal
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = source bytes left after those 4
+ LEAQ (DX)(CX*1), R9 // R9 = pointer into src at the current position
+ LEAQ (DX)(SI*1), R10 // R10 = pointer into src at the candidate
+
+ // matchLen: R12 = count of further equal bytes at R9 and R10, examining at most R8 bytes
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11 // zero when all 8 bytes are equal
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // divide by 8: equal bytes at the start of this word
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
+ MOVB (R9)(R12*1), R11 // fewer than 8 bytes left: compare one byte at a time
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
+
+match_nolit_end_encodeSnappyBetterBlockAsm12B:
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset (current position - candidate position)
+
+ // Check if repeat (no comparison against the previous offset is made here: the offset is recorded, then any pending literals are emitted)
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B // no pending literals
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = pointer to the first literal byte
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length - 1, the value written into the header
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B // below 60: fits in the tag byte itself
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B // below 256: tag plus one length byte
+ MOVB $0xf4, (AX) // tag 0xf4, followed by length-1 as 16 bits
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // tag 0xf0, followed by length-1 as 8 bits
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm12B // length <= 64: short copy
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
+ SHLB $0x02, SI // tag = (length-1) << 2 with the low two bits zero
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = output cursor once the literals are written
+
+ // genMemMoveShort: copy R9 bytes (at most 64 on this path) from R10 to AX using fixed-size, possibly overlapping moves
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
+ MOVQ (R10), R11 // always moves a full 8 bytes, even when R9 < 8; the cursor is corrected from SI afterwards
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (R10), R11 // first 8 bytes
+ MOVQ -8(R10)(R9*1), R10 // last 8 bytes (may overlap the first 8)
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (R10), X0 // first 16 bytes
+ MOVOU -16(R10)(R9*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (R10), X0 // first 32 bytes in X0, X1
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 bytes in X2, X3 (may overlap)
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
+ MOVQ SI, AX // advance the output cursor by the exact literal length
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = output cursor once the literals are written
+
+ // genMemMoveLong: copy R9 bytes from R10 to AX; head and tail are kept in X0-X3 and the middle is copied with aligned stores
+ MOVOU (R10), X0 // first 32 source bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 source bytes
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = destination address modulo 32
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX & 31), so AX+R14-32 is 32-byte aligned
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // DECQ leaves CF as set by the SUBQ; NOTE(review): for the lengths (>= 65) that reach this path the branch appears to be always taken - confirm against the generator
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4 // unaligned loads of the next 32 source bytes
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1) // aligned stores: the destination offset was chosen above to be aligned
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // continue while length >= R14
+ MOVOU X0, (AX) // finally write the saved head ...
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1) // ... and the saved tail
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX // advance the output cursor by the exact literal length
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
+ ADDL R12, CX // CX = end of the match
+ ADDL $0x04, R12 // R12 = full match length, including the first 4 bytes
+ MOVL CX, 12(SP) // the next literals start after the match
+
+ // emitCopy: encode a copy of R12 bytes at offset R8
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
+ MOVB $0xee, (AX) // tag 0xee = ((60-1) << 2) | 2: a 60-byte copy with a 16-bit offset
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12 // 60 bytes of the match are now encoded
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B // repeat until at most 64 bytes remain
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B // length >= 12 uses the 3-byte form
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B // offset >= 2048 uses the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12 // low byte = ((length-4) << 2) | 1; only the low byte is stored below
+ MOVB R8, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8 and up move to tag bits 5 and up
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = ((length-1) << 2) | 2
+ MOVB R12, (AX)
+ MOVW R8, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm12B // at or past the search limit: flush what is left
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // output cursor reached the dst limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI // 6-byte hash multiplier
+ MOVQ $0x9e3779b1, R8 // 4-byte hash multiplier
+ INCL DI // DI = match start + 1
+ MOVQ (DX)(DI*1), R9 // 8 source bytes at match start + 1
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // bytes at match start + 2
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // bytes at match start + 3
+ LEAL 1(DI), R14 // R14 = match start + 2
+ LEAL 2(DI), R15 // R15 = match start + 3
+ MOVQ -2(DX)(CX*1), R9 // 8 source bytes at match end - 2, hashed further down
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x32, R10 // 6-byte hash of match start + 1
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x32, R13 // 6-byte hash of match start + 2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x34, R11 // 4-byte hash of match start + 2
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x34, R12 // 4-byte hash of match start + 3
+ MOVL DI, 24(SP)(R10*4) // record positions just after the match start in both tables
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 65560(SP)(R11*4)
+ MOVL R15, 65560(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // bytes at match end - 1
+ MOVQ R11, R13
+ LEAL -2(CX), R9 // R9 = match end - 2
+ LEAL -1(CX), DI // DI = match end - 1
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x32, R10 // 6-byte hash of match end - 2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x34, R11 // 4-byte hash of match end - 1
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x32, R13 // 6-byte hash of match end - 1
+ MOVL R9, 24(SP)(R10*4) // record positions just before the match end
+ MOVL DI, 65560(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm12B // resume searching at the match end (CX)
+
+emit_remainder_encodeSnappyBetterBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = source bytes not yet emitted
+ LEAQ 3(AX)(CX*1), CX // output cursor + remaining bytes + 3
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = pointer to the first remaining source byte
+ SUBL BX, SI // SI = remaining length
+ LEAL -1(SI), DX // DX = length - 1; DX stops being the src base from here on
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B // below 60: fits in the tag byte itself
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B // below 256: tag plus one length byte
+ MOVB $0xf4, (AX) // tag 0xf4, followed by length-1 as 16 bits
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // tag 0xf0, followed by length-1 as 8 bits
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B // length <= 64: short copy
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ SHLB $0x02, DL // tag = (length-1) << 2 with the low two bits zero
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ LEAQ (AX)(SI*1), DX // DX = output cursor once the literals are written
+ MOVL SI, BX // BX = length to copy
+
+ // genMemMoveShort: copy BX bytes (at most 64 on this path) from CX to AX using fixed-size, possibly overlapping moves
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8:
+ MOVQ (CX), SI // always loads 8 bytes; NOTE(review): with fewer than 8 bytes remaining this reads past src_len - presumably the caller guarantees that is safe, confirm
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (CX), SI // first 8 bytes
+ MOVQ -8(CX)(BX*1), CX // last 8 bytes (may overlap the first 8); CX is overwritten
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (CX), X0 // first 16 bytes
+ MOVOU -16(CX)(BX*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (CX), X0 // first 32 bytes in X0, X1
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 bytes in X2, X3 (may overlap)
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ MOVQ DX, AX // advance the output cursor by the exact literal length
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ LEAQ (AX)(SI*1), DX // DX = output cursor once the literals are written
+ MOVL SI, BX // BX = length to copy
+
+ // genMemMoveLong: copy BX bytes from CX to AX; head and tail are kept in X0-X3 and the middle is copied with aligned stores
+ MOVOU (CX), X0 // first 32 source bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 source bytes
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = destination address modulo 32
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX & 31), so AX+R8-32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // DECQ leaves CF as set by the SUBQ; NOTE(review): for the lengths (>= 65) that reach this path the branch appears to be always taken - confirm against the generator
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4 // unaligned loads of the next 32 source bytes
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1) // aligned stores: the destination offset was chosen above to be aligned
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // continue while length >= R8
+ MOVOU X0, (AX) // finally write the saved head ...
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1) // ... and the saved tail
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance the output cursor by the exact literal length
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = bytes written = output cursor - dst base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x000000a0, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm10B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm10B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBetterBlockAsm10B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI
+ LEAL 1(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm10B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x36, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 16408(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 16408(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm10B
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm10B
+
+candidateS_match_encodeSnappyBetterBlockAsm10B:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm10B
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm10B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
+
+match_extend_back_end_encodeSnappyBetterBlockAsm10B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm10B:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
+
+match_nolit_end_encodeSnappyBetterBlockAsm10B:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm10B
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm10B
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x34, R10
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x34, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x36, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x36, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 16408(SP)(R11*4)
+ MOVL R15, 16408(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x34, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x36, R11
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x34, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 16408(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm10B
+
+emit_remainder_encodeSnappyBetterBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 // returns the number of bytes written to dst, or 0 when the output limit kept in (SP) would be reached
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor for the whole function
+ MOVQ $0x00000028, CX // 40 iterations x 128 bytes = 5120 bytes of table space to clear
+ LEAQ 24(SP), DX // the tables start at 24(SP)
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm8B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm8B
+ MOVL $0x00000000, 12(SP) // 12(SP) = start of the literals not yet emitted
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = len(src)-8, the position at which searching stops
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst + len(src) - 9 - len(src)/32, the output limit
+ MOVL $0x00000001, CX // CX = current src position, starting at 1
+ MOVL $0x00000000, 16(SP) // 16(SP) = offset of the most recent match
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBetterBlockAsm8B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x04, SI
+ LEAL 1(CX)(SI*1), SI // next position = CX + 1 + (CX - literal start)/16
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm8B
+ MOVQ (DX)(CX*1), DI // DI = 8 source bytes at CX
+ MOVL SI, 20(SP) // 20(SP) = position to try next if nothing matches here
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10 // discard the top 2 bytes: this hash covers 6 source bytes
+ IMULQ R9, R10
+ SHRQ $0x36, R10 // keep 10 bits: index into the table at 24(SP)
+ SHLQ $0x20, R11 // this hash covers 4 source bytes
+ IMULQ SI, R11
+ SHRQ $0x38, R11 // keep 8 bits: index into the table at 4120(SP)
+ MOVL 24(SP)(R10*4), SI // SI = candidate from the 6-byte table
+ MOVL 4120(SP)(R11*4), R8 // R8 = candidate from the 4-byte table
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 4120(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // candidates are confirmed on their first 4 bytes only
+ JEQ candidate_match_encodeSnappyBetterBlockAsm8B
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm8B
+
+candidateS_match_encodeSnappyBetterBlockAsm8B:
+ SHRQ $0x08, DI // DI now holds the source bytes starting at CX+1
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI // use a 6-byte-table candidate at CX+1 when it matches
+ JEQ candidate_match_encodeSnappyBetterBlockAsm8B
+ DECL CX // otherwise go back to CX with the 4-byte-table candidate
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm8B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B // do not extend back into already-emitted input
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B // candidate reached position 0
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
+
+match_extend_back_end_encodeSnappyBetterBlockAsm8B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI // output cursor after the pending literals plus 3 bytes
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm8B:
+ MOVL CX, DI // DI = match start in src
+ ADDL $0x04, CX // skip the 4 bytes already compared equal
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = source bytes remaining from CX
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12 // R12 = count of further matching bytes
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11 // set bits mark where the two 8-byte words differ
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
+ BSFQ R11, R11
+ SARQ $0x03, R11 // lowest differing bit / 8 = equal bytes in this word
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
+
+match_nolit_end_encodeSnappyBetterBlockAsm8B:
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset (distance from CX back to the candidate)
+
+ // Check if repeat (this variant only records the offset; no comparison follows)
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B // no literals pending before the match
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = literal source
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // headers encode length-1
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 followed by 16-bit length-1
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 followed by 8-bit length-1
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm8B // up to 64 bytes: short copy
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
+ SHLB $0x02, SI // 1-byte header: (length-1)<<2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm8B:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
+ MOVQ (R10), R11 // always moves 8 bytes even for shorter literals; the cursor still advances by R9 only
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (R10), R11 // first 8 and last 8 bytes; the two moves may overlap
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (R10), X0 // first 16 and last 16 bytes; the two moves may overlap
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (R10), X0 // first 32 and last 32 bytes; the two halves may overlap
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+ // genMemMoveLong
+ MOVOU (R10), X0 // load the first and last 32 bytes up front; they are stored after the loop
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX mod 32), so -32(AX)(R14*1) is 32-byte aligned
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF untouched, so this tests ZF plus the carry from the SUBQ above — confirm against the generator
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4 // unaligned loads from src, aligned stores to dst, 32 bytes per pass
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finish with the saved head and tail, covering the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
+ ADDL R12, CX // CX = end of the match in src
+ ADDL $0x04, R12 // R12 = total match length
+ MOVL CX, 12(SP) // pending literals now start after the match
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
+ MOVB $0xee, (AX) // 3-byte copy op; 60 is taken off the length below
+ MOVW R8, 1(AX) // 16-bit offset
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B // NOTE(review): unlike the standalone emitCopy, no offset < 2048 test guards the 2-byte form — presumably input size bounds the offset; confirm
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12 // (length-4)<<2 | 1; only the low byte is stored below
+ MOVB R8, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12 // offset bits 8 and up go into the tag from bit 5
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12 // (length-1)<<2 | 2; only the low byte is stored below
+ MOVB R12, (AX)
+ MOVW R8, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm8B
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP) // output limit reached: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI // DI = match start + 1
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // bytes at match start + 2
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // bytes at match start + 3
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9 // R9 = 8 source bytes at match end - 2
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x36, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x38, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x38, R12
+ MOVL DI, 24(SP)(R10*4) // 6-byte table: match start + 1
+ MOVL R14, 24(SP)(R13*4) // 6-byte table: match start + 2
+ MOVL R14, 4120(SP)(R11*4) // 4-byte table: match start + 2
+ MOVL R15, 4120(SP)(R12*4) // 4-byte table: match start + 3
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // bytes at match end - 1
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x38, R11
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x36, R13
+ MOVL R9, 24(SP)(R10*4) // 6-byte table: match end - 2
+ MOVL DI, 4120(SP)(R11*4) // 4-byte table: match end - 1
+ MOVL DI, 24(SP)(R13*4) // 6-byte table: match end - 1
+ JMP search_loop_encodeSnappyBetterBlockAsm8B
+
+emit_remainder_encodeSnappyBetterBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = bytes of src still to be emitted as literals
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP) // the remainder would reach the output limit: return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = literal source; DX is reused as scratch from here on
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // headers encode length-1
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 followed by 16-bit length-1
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 followed by 8-bit length-1
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ SHLB $0x02, DL // 1-byte header: (length-1)<<2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8:
+ MOVQ (CX), SI // always moves 8 bytes even for shorter literals; the cursor still advances by BX only
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (CX), SI // first 8 and last 8 bytes; the two moves may overlap
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (CX), X0 // first 16 and last 16 bytes; the two moves may overlap
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (CX), X0 // first 32 and last 32 bytes; the two halves may overlap
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong
+ MOVOU (CX), X0 // load the first and last 32 bytes up front; they are stored after the loop
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX mod 32), so -32(AX)(R8*1) is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF untouched, so this tests ZF plus the carry from the SUBQ above — confirm against the generator
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4 // unaligned loads from src, aligned stores to dst, 32 bytes per pass
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finish with the saved head and tail, covering the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // return value = output cursor - dst base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func emitLiteral(dst []byte, lit []byte) int
+// Requires: SSE2
+TEXT ·emitLiteral(SB), NOSPLIT, $0-56 // writes a length header plus lit's bytes to dst; returns header size + len(lit), or 0 for an empty lit
+ MOVQ lit_len+32(FP), DX // DX = len(lit)
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ lit_base+24(FP), CX // CX = literal source
+ TESTQ DX, DX
+ JZ emit_literal_end_standalone_skip
+ MOVL DX, BX // BX = return value; the header size is added per branch below
+ LEAL -1(DX), SI // headers encode length-1
+ CMPL SI, $0x3c
+ JLT one_byte_standalone
+ CMPL SI, $0x00000100
+ JLT two_bytes_standalone
+ CMPL SI, $0x00010000
+ JLT three_bytes_standalone
+ CMPL SI, $0x01000000
+ JLT four_bytes_standalone
+ MOVB $0xfc, (AX) // 5-byte header: tag 0xfc followed by 32-bit length-1
+ MOVL SI, 1(AX)
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP memmove_long_standalone
+
+four_bytes_standalone:
+ MOVL SI, DI
+ SHRL $0x10, DI
+ MOVB $0xf8, (AX) // 4-byte header: tag 0xf8 followed by 24-bit length-1
+ MOVW SI, 1(AX)
+ MOVB DI, 3(AX)
+ ADDQ $0x04, BX
+ ADDQ $0x04, AX
+ JMP memmove_long_standalone
+
+three_bytes_standalone:
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 followed by 16-bit length-1
+ MOVW SI, 1(AX)
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+ JMP memmove_long_standalone
+
+two_bytes_standalone:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 followed by 8-bit length-1
+ MOVB SI, 1(AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_standalone // up to 64 bytes: short copy
+ JMP memmove_long_standalone
+
+one_byte_standalone:
+ SHLB $0x02, SI // 1-byte header: (length-1)<<2
+ MOVB SI, (AX)
+ ADDQ $0x01, BX
+ ADDQ $0x01, AX
+
+memmove_standalone:
+ // genMemMoveShort: pick a copy routine by len(lit) in DX
+ CMPQ DX, $0x03
+ JB emit_lit_memmove_standalone_memmove_move_1or2
+ JE emit_lit_memmove_standalone_memmove_move_3
+ CMPQ DX, $0x08
+ JB emit_lit_memmove_standalone_memmove_move_4through7
+ CMPQ DX, $0x10
+ JBE emit_lit_memmove_standalone_memmove_move_8through16
+ CMPQ DX, $0x20
+ JBE emit_lit_memmove_standalone_memmove_move_17through32
+ JMP emit_lit_memmove_standalone_memmove_move_33through64
+
+emit_lit_memmove_standalone_memmove_move_1or2:
+ MOVB (CX), SI // first and last byte (the same byte when DX is 1)
+ MOVB -1(CX)(DX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(DX*1)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_3:
+ MOVW (CX), SI // 2 bytes plus 1 byte
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_4through7:
+ MOVL (CX), SI // first 4 and last 4 bytes; the two moves may overlap
+ MOVL -4(CX)(DX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(DX*1)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_8through16:
+ MOVQ (CX), SI // first 8 and last 8 bytes; the two moves may overlap
+ MOVQ -8(CX)(DX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(DX*1)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_17through32:
+ MOVOU (CX), X0 // first 16 and last 16 bytes; the two moves may overlap
+ MOVOU -16(CX)(DX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(DX*1)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_33through64:
+ MOVOU (CX), X0 // first 32 and last 32 bytes; the two halves may overlap
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(DX*1), X2
+ MOVOU -16(CX)(DX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(DX*1)
+ MOVOU X3, -16(AX)(DX*1)
+ JMP emit_literal_end_standalone
+ JMP emit_literal_end_standalone // unreachable: the JMP above is unconditional
+
+memmove_long_standalone:
+ // genMemMoveLong: used for literals longer than 64 bytes
+ MOVOU (CX), X0 // load the first and last 32 bytes up front; they are stored after the loop
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(DX*1), X2
+ MOVOU -16(CX)(DX*1), X3
+ MOVQ DX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX mod 32), so -32(AX)(R8*1) is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF untouched, so this tests ZF plus the carry from the SUBQ above — confirm against the generator
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_standalonelarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_standalonelarge_big_loop_back
+
+emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4 // unaligned loads from lit, aligned stores to dst, 32 bytes per pass
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ DX, R8
+ JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finish with the saved head and tail, covering the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(DX*1)
+ MOVOU X3, -16(AX)(DX*1)
+ JMP emit_literal_end_standalone
+ JMP emit_literal_end_standalone // unreachable: the JMP above is unconditional
+
+emit_literal_end_standalone_skip:
+ XORQ BX, BX // empty literal: nothing written, return 0
+
+emit_literal_end_standalone:
+ MOVQ BX, ret+48(FP)
+ RET
+
+// func emitRepeat(dst []byte, offset int, length int) int
+TEXT ·emitRepeat(SB), NOSPLIT, $0-48 // writes one or more repeat ops covering length to dst; returns the number of bytes written
+ XORQ BX, BX // BX = bytes written so far
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ offset+24(FP), CX // CX = offset
+ MOVQ length+32(FP), DX // DX = length
+
+ // emitRepeat
+emit_repeat_again_standalone:
+ MOVL DX, SI // SI = length as it stood on entry to this pass
+ LEAL -4(DX), DX // DX = length - 4 from here on
+ CMPL SI, $0x08
+ JLE repeat_two_standalone // length <= 8: 2-byte form
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_standalone
+ CMPL CX, $0x00000800
+ JLT repeat_two_offset_standalone // length 9..11 and offset < 2048: 2-byte form that carries the offset
+
+cant_repeat_two_offset_standalone:
+ CMPL DX, $0x00000104
+ JLT repeat_three_standalone
+ CMPL DX, $0x00010100
+ JLT repeat_four_standalone
+ CMPL DX, $0x0100ffff
+ JLT repeat_five_standalone
+ LEAL -16842747(DX), DX // too long for one op: take 0x0100fffb off the length, emit a fixed 5-byte op, go again
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ ADDQ $0x05, BX
+ JMP emit_repeat_again_standalone
+
+repeat_five_standalone:
+ LEAL -65536(DX), DX
+ MOVL DX, CX // CX is reused for the high byte; the offset is not needed on this path
+ MOVW $0x001d, (AX) // 5-byte form: 0x1d 0x00 followed by a 24-bit value
+ MOVW DX, 2(AX)
+ SARL $0x10, CX
+ MOVB CL, 4(AX)
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_repeat_end
+
+repeat_four_standalone:
+ LEAL -256(DX), DX
+ MOVW $0x0019, (AX) // 4-byte form: 0x19 0x00 followed by a 16-bit value
+ MOVW DX, 2(AX)
+ ADDQ $0x04, BX
+ ADDQ $0x04, AX
+ JMP gen_emit_repeat_end
+
+repeat_three_standalone:
+ LEAL -4(DX), DX
+ MOVW $0x0015, (AX) // 3-byte form: 0x15 0x00 followed by an 8-bit value
+ MOVB DL, 2(AX)
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+ JMP gen_emit_repeat_end
+
+repeat_two_standalone:
+ SHLL $0x02, DX
+ ORL $0x01, DX // 2-byte form: (length-4)<<2 | 1 written as a 16-bit word
+ MOVW DX, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_repeat_end
+
+repeat_two_offset_standalone:
+ XORQ SI, SI
+ LEAL 1(SI)(DX*4), DX // (length-4)<<2 | 1
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SARL $0x08, CX
+ SHLL $0x05, CX
+ ORL CX, DX // offset bits 8 and up go into the tag from bit 5
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+
+gen_emit_repeat_end:
+ MOVQ BX, ret+40(FP)
+ RET
+
+// func emitCopy(dst []byte, offset int, length int) int
+TEXT ·emitCopy(SB), NOSPLIT, $0-48 // writes copy op(s), followed by repeat ops for long lengths, to dst; returns the number of bytes written
+ XORQ BX, BX // BX = bytes written so far
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ offset+24(FP), CX // CX = offset
+ MOVQ length+32(FP), DX // DX = length
+
+ // emitCopy
+ CMPL CX, $0x00010000
+ JL two_byte_offset_standalone // offset < 65536: forms with a 1- or 2-byte offset
+
+four_bytes_loop_back_standalone:
+ CMPL DX, $0x40
+ JLE four_bytes_remain_standalone
+ MOVB $0xff, (AX) // 5-byte copy op with a 32-bit offset; 64 is taken off the length below
+ MOVL CX, 1(AX)
+ LEAL -64(DX), DX
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ CMPL DX, $0x04
+ JL four_bytes_remain_standalone // fewer than 4 bytes left: finish with another copy op instead of a repeat
+
+ // emitRepeat: the rest of the length is written as repeat ops
+emit_repeat_again_standalone_emit_copy:
+ MOVL DX, SI // SI = remaining length on entry to this pass
+ LEAL -4(DX), DX // DX = length - 4 from here on
+ CMPL SI, $0x08
+ JLE repeat_two_standalone_emit_copy
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_standalone_emit_copy
+ CMPL CX, $0x00000800
+ JLT repeat_two_offset_standalone_emit_copy
+
+cant_repeat_two_offset_standalone_emit_copy:
+ CMPL DX, $0x00000104
+ JLT repeat_three_standalone_emit_copy
+ CMPL DX, $0x00010100
+ JLT repeat_four_standalone_emit_copy
+ CMPL DX, $0x0100ffff
+ JLT repeat_five_standalone_emit_copy
+ LEAL -16842747(DX), DX // too long for one op: take 0x0100fffb off the length, emit a fixed 5-byte op, go again
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ ADDQ $0x05, BX
+ JMP emit_repeat_again_standalone_emit_copy
+
+repeat_five_standalone_emit_copy:
+ LEAL -65536(DX), DX
+ MOVL DX, CX // CX is reused for the high byte; the offset is not needed on this path
+ MOVW $0x001d, (AX) // 5-byte form: 0x1d 0x00 followed by a 24-bit value
+ MOVW DX, 2(AX)
+ SARL $0x10, CX
+ MOVB CL, 4(AX)
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_copy_end
+
+repeat_four_standalone_emit_copy:
+ LEAL -256(DX), DX
+ MOVW $0x0019, (AX) // 4-byte form: 0x19 0x00 followed by a 16-bit value
+ MOVW DX, 2(AX)
+ ADDQ $0x04, BX
+ ADDQ $0x04, AX
+ JMP gen_emit_copy_end
+
+repeat_three_standalone_emit_copy:
+ LEAL -4(DX), DX
+ MOVW $0x0015, (AX) // 3-byte form: 0x15 0x00 followed by an 8-bit value
+ MOVB DL, 2(AX)
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+ JMP gen_emit_copy_end
+
+repeat_two_standalone_emit_copy:
+ SHLL $0x02, DX
+ ORL $0x01, DX // 2-byte form: (length-4)<<2 | 1 written as a 16-bit word
+ MOVW DX, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+
+repeat_two_offset_standalone_emit_copy:
+ XORQ SI, SI
+ LEAL 1(SI)(DX*4), DX // (length-4)<<2 | 1
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SARL $0x08, CX
+ SHLL $0x05, CX
+ ORL CX, DX // offset bits 8 and up go into the tag from bit 5
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+ JMP four_bytes_loop_back_standalone // unreachable: the JMP above is unconditional
+
+four_bytes_remain_standalone:
+ TESTL DX, DX
+ JZ gen_emit_copy_end
+ MOVB $0x03, SI
+ LEAL -4(SI)(DX*4), DX // (length-1)<<2 | 3; only the low byte is stored below
+ MOVB DL, (AX)
+ MOVL CX, 1(AX) // 32-bit offset
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_copy_end
+
+two_byte_offset_standalone:
+ CMPL DX, $0x40
+ JLE two_byte_offset_short_standalone
+ MOVB $0xee, (AX) // 3-byte copy op with a 16-bit offset; 60 is taken off the length below
+ MOVW CX, 1(AX)
+ LEAL -60(DX), DX
+ ADDQ $0x03, AX
+ ADDQ $0x03, BX
+
+ // emitRepeat: the rest of the length is written as repeat ops
+emit_repeat_again_standalone_emit_copy_short:
+ MOVL DX, SI // SI = remaining length on entry to this pass
+ LEAL -4(DX), DX // DX = length - 4 from here on
+ CMPL SI, $0x08
+ JLE repeat_two_standalone_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_standalone_emit_copy_short
+ CMPL CX, $0x00000800
+ JLT repeat_two_offset_standalone_emit_copy_short
+
+cant_repeat_two_offset_standalone_emit_copy_short:
+ CMPL DX, $0x00000104
+ JLT repeat_three_standalone_emit_copy_short
+ CMPL DX, $0x00010100
+ JLT repeat_four_standalone_emit_copy_short
+ CMPL DX, $0x0100ffff
+ JLT repeat_five_standalone_emit_copy_short
+ LEAL -16842747(DX), DX // too long for one op: take 0x0100fffb off the length, emit a fixed 5-byte op, go again
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ ADDQ $0x05, BX
+ JMP emit_repeat_again_standalone_emit_copy_short
+
+repeat_five_standalone_emit_copy_short:
+ LEAL -65536(DX), DX
+ MOVL DX, CX // CX is reused for the high byte; the offset is not needed on this path
+ MOVW $0x001d, (AX) // 5-byte form: 0x1d 0x00 followed by a 24-bit value
+ MOVW DX, 2(AX)
+ SARL $0x10, CX
+ MOVB CL, 4(AX)
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_copy_end
+
+repeat_four_standalone_emit_copy_short:
+ LEAL -256(DX), DX
+ MOVW $0x0019, (AX) // 4-byte form: 0x19 0x00 followed by a 16-bit value
+ MOVW DX, 2(AX)
+ ADDQ $0x04, BX
+ ADDQ $0x04, AX
+ JMP gen_emit_copy_end
+
+repeat_three_standalone_emit_copy_short:
+ LEAL -4(DX), DX
+ MOVW $0x0015, (AX) // 3-byte form: 0x15 0x00 followed by an 8-bit value
+ MOVB DL, 2(AX)
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+ JMP gen_emit_copy_end
+
+repeat_two_standalone_emit_copy_short:
+ SHLL $0x02, DX
+ ORL $0x01, DX // 2-byte form: (length-4)<<2 | 1 written as a 16-bit word
+ MOVW DX, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+
+repeat_two_offset_standalone_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(DX*4), DX // (length-4)<<2 | 1
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SARL $0x08, CX
+ SHLL $0x05, CX
+ ORL CX, DX // offset bits 8 and up go into the tag from bit 5
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+ JMP two_byte_offset_standalone // unreachable: the JMP above is unconditional
+
+two_byte_offset_short_standalone:
+ CMPL DX, $0x0c
+ JGE emit_copy_three_standalone // length >= 12 needs the 3-byte form
+ CMPL CX, $0x00000800
+ JGE emit_copy_three_standalone // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, SI
+ LEAL -16(SI)(DX*4), DX // (length-4)<<2 | 1; only the low byte is stored below
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, CX
+ SHLL $0x05, CX
+ ORL CX, DX // offset bits 8 and up go into the tag from bit 5
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+
+emit_copy_three_standalone:
+ MOVB $0x02, SI
+ LEAL -4(SI)(DX*4), DX // (length-1)<<2 | 2; only the low byte is stored below
+ MOVB DL, (AX)
+ MOVW CX, 1(AX) // 16-bit offset
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+
+gen_emit_copy_end:
+ MOVQ BX, ret+40(FP)
+ RET
+
+// func emitCopyNoRepeat(dst []byte, offset int, length int) int
+TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 // writes copy ops only (no repeat ops) covering length to dst; returns the number of bytes written
+ XORQ BX, BX // BX = bytes written so far
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ offset+24(FP), CX // CX = offset
+ MOVQ length+32(FP), DX // DX = length
+
+ // emitCopy
+ CMPL CX, $0x00010000
+ JL two_byte_offset_standalone_snappy // offset < 65536: forms with a 1- or 2-byte offset
+
+four_bytes_loop_back_standalone_snappy:
+ CMPL DX, $0x40
+ JLE four_bytes_remain_standalone_snappy
+ MOVB $0xff, (AX) // 5-byte copy op with a 32-bit offset; 64 is taken off the length below
+ MOVL CX, 1(AX)
+ LEAL -64(DX), DX
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ CMPL DX, $0x04
+ JL four_bytes_remain_standalone_snappy
+ JMP four_bytes_loop_back_standalone_snappy // keep emitting full-size ops while more than 64 bytes remain
+
+four_bytes_remain_standalone_snappy:
+ TESTL DX, DX
+ JZ gen_emit_copy_end_snappy
+ MOVB $0x03, SI
+ LEAL -4(SI)(DX*4), DX // (length-1)<<2 | 3; only the low byte is stored below
+ MOVB DL, (AX)
+ MOVL CX, 1(AX) // 32-bit offset
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_copy_end_snappy
+
+two_byte_offset_standalone_snappy:
+ CMPL DX, $0x40
+ JLE two_byte_offset_short_standalone_snappy
+ MOVB $0xee, (AX) // 3-byte copy op with a 16-bit offset; 60 is taken off the length below
+ MOVW CX, 1(AX)
+ LEAL -60(DX), DX
+ ADDQ $0x03, AX
+ ADDQ $0x03, BX
+ JMP two_byte_offset_standalone_snappy
+
+two_byte_offset_short_standalone_snappy:
+ CMPL DX, $0x0c
+ JGE emit_copy_three_standalone_snappy // length >= 12 needs the 3-byte form
+ CMPL CX, $0x00000800
+ JGE emit_copy_three_standalone_snappy // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, SI
+ LEAL -16(SI)(DX*4), DX // (length-4)<<2 | 1; only the low byte is stored below
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, CX
+ SHLL $0x05, CX
+ ORL CX, DX // offset bits 8 and up go into the tag from bit 5
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end_snappy
+
+emit_copy_three_standalone_snappy:
+ MOVB $0x02, SI
+ LEAL -4(SI)(DX*4), DX // (length-1)<<2 | 2; only the low byte is stored below
+ MOVB DL, (AX)
+ MOVW CX, 1(AX) // 16-bit offset
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+
+gen_emit_copy_end_snappy:
+ MOVQ BX, ret+40(FP)
+ RET
+
+// func matchLen(a []byte, b []byte) int
+TEXT ·matchLen(SB), NOSPLIT, $0-56 // returns how many leading bytes of a and b are equal, scanning at most len(a) bytes
+ MOVQ a_base+0(FP), AX // AX = a base pointer
+ MOVQ b_base+24(FP), CX // CX = b base pointer
+ MOVQ a_len+8(FP), DX // DX = len(a); len(b) is never read — presumably the caller guarantees len(b) >= len(a), confirm
+
+ // matchLen: compare 8 bytes at a time, then finish byte by byte
+ XORL SI, SI // SI = number of matching bytes found so far
+ CMPL DX, $0x08
+ JL matchlen_single_standalone
+
+matchlen_loopback_standalone:
+ MOVQ (AX)(SI*1), BX
+ XORQ (CX)(SI*1), BX // set bits mark where the two 8-byte words differ
+ TESTQ BX, BX
+ JZ matchlen_loop_standalone
+ BSFQ BX, BX // index of the lowest differing bit
+ SARQ $0x03, BX // bit index / 8 = equal bytes within this word
+ LEAL (SI)(BX*1), SI
+ JMP gen_match_len_end
+
+matchlen_loop_standalone:
+ LEAL -8(DX), DX // all 8 bytes matched: 8 fewer remain
+ LEAL 8(SI), SI
+ CMPL DX, $0x08
+ JGE matchlen_loopback_standalone
+
+matchlen_single_standalone:
+ TESTL DX, DX
+ JZ gen_match_len_end
+
+matchlen_single_loopback_standalone:
+ MOVB (AX)(SI*1), BL // fewer than 8 bytes remain: compare one byte per pass
+ CMPB (CX)(SI*1), BL
+ JNE gen_match_len_end
+ LEAL 1(SI), SI
+ DECL DX
+ JNZ matchlen_single_loopback_standalone
+
+gen_match_len_end:
+ MOVQ SI, ret+48(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/s2/s2.go b/vendor/github.com/klauspost/compress/s2/s2.go
new file mode 100644
index 00000000..89d69e96
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/s2.go
@@ -0,0 +1,139 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package s2 implements the S2 compression format.
+//
+// S2 is an extension of Snappy. Similar to Snappy, S2 is aimed at high throughput,
+// which is why it features concurrent compression for bigger payloads.
+//
+// Decoding is compatible with Snappy compressed content,
+// but content compressed with S2 cannot be decompressed by Snappy.
+//
+// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
+//
+// There are actually two S2 formats: block and stream. They are related,
+// but different: trying to decompress block-compressed data as a S2 stream
+// will fail, and vice versa. The block format is the Decode and Encode
+// functions and the stream format is the Reader and Writer types.
+//
+// A "better" compression option is available. This will trade some compression
+// speed for a better compression ratio.
+//
+// The block format, the more common case, is used when the complete size (the
+// number of bytes) of the original data is known upfront, at the time
+// compression starts. The stream format, also known as the framing format, is
+// for when that isn't always true.
+//
+// Blocks do not offer much data protection, so it is up to you to
+// add data validation of decompressed blocks.
+//
+// Streams perform CRC validation of the decompressed data.
+// Stream compression will also be performed on multiple CPU cores concurrently
+// significantly improving throughput.
+package s2
+
+import (
+ "bytes"
+ "hash/crc32"
+)
+
+/*
+Each encoded block begins with the varint-encoded length of the decoded data,
+followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
+first byte of each chunk is broken into its 2 least and 6 most significant bits
+called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
+Zero means a literal tag. All other values mean a copy tag.
+
+For literal tags:
+ - If m < 60, the next 1 + m bytes are literal bytes.
+ - Otherwise, let n be the little-endian unsigned integer denoted by the next
+ m - 59 bytes. The next 1 + n bytes after that are literal bytes.
+
+For copy tags, length bytes are copied from offset bytes ago, in the style of
+Lempel-Ziv compression algorithms. In particular:
+ - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
+ The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
+ of the offset. The next byte is bits 0-7 of the offset.
+ - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
+ The length is 1 + m. The offset is the little-endian unsigned integer
+ denoted by the next 2 bytes.
+ - For l == 3, the offset ranges in [0, 1<<32) and the length in
+ [1, 65). The length is 1 + m. The offset is the little-endian unsigned
+ integer denoted by the next 4 bytes.
+*/
+const ( // chunk tag values: the 2 least significant bits of the first byte of each chunk
+ tagLiteral = 0x00 // literal bytes follow
+ tagCopy1 = 0x01 // copy: offset in [0, 1<<11), length in [4, 12)
+ tagCopy2 = 0x02 // copy: 2-byte little-endian offset, length in [1, 65)
+ tagCopy4 = 0x03 // copy: 4-byte little-endian offset, length in [1, 65)
+)
+
+const ( // sizes and magic strings; their use with the stream (framing) format is presumed from the names — confirm at use sites
+ checksumSize = 4 // matches the width of the uint32 returned by crc; presumably a byte count
+ chunkHeaderSize = 4 // same length as the "\xff\x06\x00\x00" prefix of the magic chunks below
+ magicChunk = "\xff\x06\x00\x00" + magicBody // first byte equals chunkTypeStreamIdentifier (0xff); \x06\x00\x00 is presumably the little-endian length of the 6-byte body — confirm
+ magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy // same 4-byte prefix, followed by the Snappy body
+ magicBodySnappy = "sNaPpY" // 6-byte body used in magicChunkSnappy
+ magicBody = "S2sTwO" // 6-byte body used in magicChunk
+
+ // maxBlockSize is the maximum size of the input to encodeBlock.
+ //
+ // For the framing format (Writer type instead of Encode function),
+ // this is the maximum uncompressed size of a block.
+ maxBlockSize = 4 << 20 // 4 MiB
+
+ // minBlockSize is the minimum size of block setting when creating a writer.
+ minBlockSize = 4 << 10 // 4 KiB
+
+ // Default block size
+ defaultBlockSize = 1 << 20 // 1 MiB
+
+ // maxSnappyBlockSize is the maximum snappy block size.
+ maxSnappyBlockSize = 1 << 16 // 64 KiB
+
+ obufHeaderLen = checksumSize + chunkHeaderSize // 4 + 4 = 8
+)
+
+const ( // chunk type identifiers; presumably the first byte of each chunk header in the stream format — confirm against the framing format spec
+ chunkTypeCompressedData = 0x00
+ chunkTypeUncompressedData = 0x01
+ chunkTypePadding = 0xfe
+ chunkTypeStreamIdentifier = 0xff // same value as the first byte of magicChunk and magicChunkSnappy
+)
+
+var crcTable = crc32.MakeTable(crc32.Castagnoli) // CRC-32 table for the Castagnoli polynomial, built once at package init
+
+// crc implements the checksum specified in section 3 of
+// https://github.com/google/snappy/blob/master/framing_format.txt
+func crc(b []byte) uint32 {
+ c := crc32.Update(0, crcTable, b) // CRC-32 (Castagnoli) of b, starting from a zero CRC
+ return c>>15 | c<<17 + 0xa282ead8 // parses as (c>>15 | c<<17) + 0xa282ead8: rotate right by 15 bits, then add the constant with uint32 wrap-around
+}
+
+// literalExtraSize returns the extra size of encoding n literals.
+// n should be >= 0 and <= math.MaxUint32. The result is 0 for n == 0 and otherwise between 1 and 5, growing with n.
+func literalExtraSize(n int64) int64 {
+ if n == 0 { // no literals, so no overhead at all
+ return 0
+ }
+ switch { // cases are tested in order, so each one only sees values the previous cases rejected
+ case n < 60: // presumably the tag byte alone — confirm against the encoder
+ return 1
+ case n < 1<<8: // presumably tag byte + 1 length byte
+ return 2
+ case n < 1<<16: // presumably tag byte + 2 length bytes
+ return 3
+ case n < 1<<24: // presumably tag byte + 3 length bytes
+ return 4
+ default: // everything from 1<<24 upwards; presumably tag byte + 4 length bytes
+ return 5
+ } // NOTE(review): thresholds test n, not n-1, so n == 60, 256, ... yield one more than a length-minus-one encoding needs — looks like a deliberate upper bound; confirm
+}
+
+type byter interface { // byter is satisfied by any type that exposes its contents through Bytes() []byte
+ Bytes() []byte
+}
+
+var _ byter = &bytes.Buffer{} // compile-time assertion that *bytes.Buffer satisfies byter