17 files changed, 21860 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/.gitignore b/vendor/github.com/klauspost/compress/s2/.gitignore
new file mode 100644
index 00000000..3a89c6e3
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/.gitignore
@@ -0,0 +1,15 @@
+testdata/bench
+
+# These explicitly listed benchmark data files are for an obsolete version of
+# snappy_test.go.
+testdata/alice29.txt
+testdata/asyoulik.txt
+testdata/fireworks.jpeg
+testdata/geo.protodata
+testdata/html
+testdata/html_x_4
+testdata/kppkn.gtb
+testdata/lcet10.txt
+testdata/paper-100k.pdf
+testdata/plrabn12.txt
+testdata/urls.10K
diff --git a/vendor/github.com/klauspost/compress/s2/LICENSE b/vendor/github.com/klauspost/compress/s2/LICENSE
new file mode 100644
index 00000000..1d2d645b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
+Copyright (c) 2019 Klaus Post. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md
new file mode 100644
index 00000000..81fad652
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/README.md
@@ -0,0 +1,717 @@
+# S2 Compression
+
+S2 is an extension of [Snappy](https://github.com/google/snappy).
+
+S2 is aimed at high throughput, which is why it features concurrent compression for bigger payloads.
+
+Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy.
+This means that S2 can seamlessly replace Snappy without converting compressed content.
+
+S2 can produce Snappy compatible output, faster and better than Snappy.
+If you want full benefit of the changes you should use s2 without Snappy compatibility. 
+
+S2 is designed to have high throughput on content that cannot be compressed.
+This is important, so you don't have to worry about spending CPU cycles on already compressed data. 
+
+## Benefits over Snappy
+
+* Better compression
+* Adjustable compression (3 levels) 
+* Concurrent stream compression
+* Faster decompression, even for Snappy compatible content
+* Ability to quickly skip forward in compressed stream
+* Compatible with reading Snappy compressed content
+* Smaller block size overhead on incompressible blocks
+* Block concatenation
+* Uncompressed stream mode
+* Automatic stream size padding
+* Snappy compatible block compression
+
+## Drawbacks over Snappy
+
+* Not optimized for 32 bit systems.
+* Streams use slightly more memory due to larger blocks and concurrency (configurable).
+
+# Usage
+
+Installation: `go get -u github.com/klauspost/compress/s2`
+
+Full package documentation:
+ 
+[![godoc][1]][2]
+
+[1]: https://godoc.org/github.com/klauspost/compress?status.svg
+[2]: https://godoc.org/github.com/klauspost/compress/s2
+
+## Compression
+
+```Go
+func EncodeStream(src io.Reader, dst io.Writer) error {
+    enc := s2.NewWriter(dst)
+    _, err := io.Copy(enc, src)
+    if err != nil {
+        enc.Close()
+        return err
+    }
+    // Blocks until compression is done.
+    return enc.Close() 
+}
+```
+
+You should always call `enc.Close()`, otherwise you will leak resources and your encode will be incomplete.
+
+For the best throughput, you should attempt to reuse the `Writer` using the `Reset()` method.
+
+The Writer in S2 is always buffered, therefore `NewBufferedWriter` in Snappy can be replaced with `NewWriter` in S2.
+It is possible to flush any buffered data using the `Flush()` method. 
+This will block until all data sent to the encoder has been written to the output.
+
+S2 also supports the `io.ReaderFrom` interface, which will consume all input from a reader.
+
+As a final method to compress data, if you have a single block of data you would like to have encoded as a stream,
+a slightly more efficient method is to use the `EncodeBuffer` method.
+This will take ownership of the buffer until the stream is closed.
+
+```Go
+func EncodeStream(src []byte, dst io.Writer) error {
+    enc := s2.NewWriter(dst)
+    // The encoder owns the buffer until Flush or Close is called.
+    err := enc.EncodeBuffer(src)
+    if err != nil {
+        enc.Close()
+        return err
+    }
+    // Blocks until compression is done.
+    return enc.Close()
+}
+```
+
+Each call to `EncodeBuffer` will result in discrete blocks being created without buffering, 
+so it should only be used a single time per stream.
+If you need to write several blocks, you should use the regular io.Writer interface.
+
+
+## Decompression
+
+```Go
+func DecodeStream(src io.Reader, dst io.Writer) error {
+    dec := s2.NewReader(src)
+    _, err := io.Copy(dst, dec)
+    return err
+}
+```
+
+Similar to the Writer, a Reader can be reused using the `Reset` method.
+
+For the best possible throughput, there is an `EncodeBuffer(buf []byte)` function available.
+However, it requires that the provided buffer isn't used after it is handed over to S2 and until the stream is flushed or closed.  
+
+For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`.
+Do however note that these functions (similar to Snappy) do not provide validation of data,
+so data corruption may be undetected. Stream encoding provides CRC checks of data.
+
+It is possible to efficiently skip forward in a compressed stream using the `Skip()` method. 
+For big skips the decompressor is able to skip blocks without decompressing them.
+
+## Single Blocks
+
+Similar to Snappy, S2 offers single block compression.
+Blocks do not offer the same flexibility and safety as streams,
+but may be preferable for very small payloads, less than 100K.
+
+Using a simple `dst := s2.Encode(nil, src)` will compress `src` and return the compressed result. 
+It is possible to provide a destination buffer. 
+If the buffer has a capacity of `s2.MaxEncodedLen(len(src))` it will be used. 
+If not, a new buffer will be allocated.
+
+Alternatively `EncodeBetter`/`EncodeBest` can also be used for better, but slightly slower compression.
+
+Similarly to decompress a block you can use `dst, err := s2.Decode(nil, src)`. 
+Again an optional destination buffer can be supplied. 
+The `s2.DecodedLen(src)` can be used to get the minimum capacity needed. 
+If that is not satisfied a new buffer will be allocated.
+
+Block functions always operate on a single goroutine since they should only be used for small payloads.
+
+# Commandline tools
+
+Some very simple commandline tools are provided; `s2c` for compression and `s2d` for decompression.
+
+Binaries can be downloaded on the [Releases Page](https://github.com/klauspost/compress/releases).
+
+Installing them requires Go to be installed. To install them, use:
+
+`go install github.com/klauspost/compress/s2/cmd/s2c && go install github.com/klauspost/compress/s2/cmd/s2d`
+
+To build binaries to the current folder use:
+
+`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d`
+
+
+## s2c
+
+```
+Usage: s2c [options] file1 file2
+
+Compresses all files supplied as input separately.
+Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'.
+By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and compressed.
+Only http response code 200 is accepted.
+
+Options:
+  -bench int
+    	Run benchmark n times. No output will be written
+  -blocksize string
+    	Max  block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M")
+  -c	Write all output to stdout. Multiple input files will be concatenated
+  -cpu int
+    	Compress using this amount of threads (default 32)
+  -faster
+    	Compress faster, but with a minor compression loss
+  -help
+    	Display help
+  -o string
+        Write output to another file. Single input file only
+  -pad string
+    	Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1")
+  -q	Don't write any output to terminal, except errors
+  -rm
+    	Delete source file(s) after successful compression
+  -safe
+    	Do not overwrite output files
+  -slower
+    	Compress more, but a lot slower
+  -snappy
+        Generate Snappy compatible output stream
+  -verify
+    	Verify written files  
+
+```
+
+## s2d
+
+```
+Usage: s2d [options] file1 file2
+
+Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
+Output file names have the extension removed. By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and decompressed.
+Extensions on downloaded files are ignored. Only http response code 200 is accepted.
+
+Options:
+  -bench int
+    	Run benchmark n times. No output will be written
+  -c	Write all output to stdout. Multiple input files will be concatenated
+  -help
+    	Display help
+  -o string
+        Write output to another file. Single input file only
+  -q	Don't write any output to terminal, except errors
+  -rm
+    	Delete source file(s) after successful decompression
+  -safe
+    	Do not overwrite output files
+  -verify
+    	Verify files, but do not write output                                      
+```
+
+## s2sx: self-extracting archives
+
+s2sx allows creating self-extracting archives with no dependencies.
+
+By default, executables are created for the same platforms as the host os, 
+but this can be overridden with `-os` and `-arch` parameters.
+
+Extracted files have 0666 permissions, except when the untar option is used.
+
+```
+Usage: s2sx [options] file1 file2
+
+Compresses all files supplied as input separately.
+If files have '.s2' extension they are assumed to be compressed already.
+Output files are written as 'filename.s2sx' and with '.exe' for windows targets.
+If output is big, an additional file with ".more" is written. This must be included as well.
+By default output files will be overwritten.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+Options:
+  -arch string
+        Destination architecture (default "amd64")
+  -c    Write all output to stdout. Multiple input files will be concatenated
+  -cpu int
+        Compress using this amount of threads (default 32)
+  -help
+        Display help
+  -max string
+        Maximum executable size. Rest will be written to another file. (default "1G")
+  -os string
+        Destination operating system (default "windows")
+  -q    Don't write any output to terminal, except errors
+  -rm
+        Delete source file(s) after successful compression
+  -safe
+        Do not overwrite output files
+  -untar
+        Untar on destination
+```
+
+Available platforms are:
+
+ * darwin-amd64
+ * darwin-arm64
+ * linux-amd64
+ * linux-arm
+ * linux-arm64
+ * linux-mips64
+ * linux-ppc64le
+ * windows-386
+ * windows-amd64                                                                             
+
+By default, there is a size limit of 1GB for the output executable.
+
+When this is exceeded the remaining file content is written to a file called
+output+`.more`. This file must be included and
+placed alongside the executable for a successful extraction.
+
+This file *must* have the same name as the executable, so if the executable is renamed, 
+so must the `.more` file. 
+
+This functionality is disabled with stdin/stdout. 
+
+### Self-extracting TAR files
+
+If you wrap a TAR file you can specify `-untar` to make it untar on the destination host.
+
+Files are extracted to the current folder with the path specified in the tar file.
+
+Note that tar files are not validated before they are wrapped.
+
+For security reasons files that move below the root folder are not allowed.
+
+# Performance
+
+This section will focus on comparisons to Snappy. 
+This package is solely aimed at replacing Snappy as a high speed compression package.
+If you are mainly looking for better compression [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd)
+gives better compression, but typically at speeds slightly below "better" mode in this package.
+
+Compression is increased compared to Snappy, mostly around 5-20% and the throughput is typically 25-40% increased (single threaded) compared to the Snappy Go implementation.
+
+Streams are concurrently compressed. The stream will be distributed among all available CPU cores for the best possible throughput.
+
+A "better" compression mode is also available. This allows to trade a bit of speed for a minor compression gain.
+The content compressed in this mode is fully compatible with the standard decoder.
+
+Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU):
+
+| File                                                                                                | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
+|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
+| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z)                    | 12.70x   | 10556 MB/s    | 7.35%        | 4.15x       | 3455 MB/s           | 12.79%             |
+| (1 CPU)                                                                                             | 1.14x    | 948 MB/s      | -            | 0.42x       | 349 MB/s            | -                  |
+| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x   | 14484 MB/s    | 31.60%       | 10.09x      | 8533 MB/s           | 37.71%             |
+| (1 CPU)                                                                                             | 1.33x    | 1127 MB/s     | -            | 0.70x       | 589 MB/s            | -                  |
+| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst)         | 15.14x   | 12000 MB/s    | -5.79%       | 6.59x       | 5223 MB/s           | 5.80%              |
+| (1 CPU)                                                                                             | 1.11x    | 877 MB/s      | -            | 0.47x       | 370 MB/s            | -                  |
+| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst)                     | 14.62x   | 12116 MB/s    | 15.90%       | 5.35x       | 4430 MB/s           | 16.08%             |
+| (1 CPU)                                                                                             | 1.38x    | 1146 MB/s     | -            | 0.38x       | 312 MB/s            | -                  |
+| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst)                             | 8.83x    | 17579 MB/s    | 43.86%       | 6.54x       | 13011 MB/s          | 47.23%             |
+| (1 CPU)                                                                                             | 1.14x    | 2259 MB/s     | -            | 0.74x       | 1475 MB/s           | -                  |
+| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z)                                    | 16.72x   | 14019 MB/s    | 24.02%       | 10.11x      | 8477 MB/s           | 30.48%             |
+| (1 CPU)                                                                                             | 1.24x    | 1043 MB/s     | -            | 0.70x       | 586 MB/s            | -                  |
+| [10gb.tar](http://mattmahoney.net/dc/10gb.html)                                                     | 13.33x   | 9254 MB/s     | 1.84%        | 6.75x       | 4686 MB/s           | 6.72%              |
+| (1 CPU)                                                                                             | 0.97x    | 672 MB/s      | -            | 0.53x       | 366 MB/s            | -                  |
+| sharnd.out.2gb                                                                                      | 2.11x    | 12639 MB/s    | 0.01%        | 1.98x       | 11833 MB/s          | 0.01%              |
+| (1 CPU)                                                                                             | 0.93x    | 5594 MB/s     | -            | 1.34x       | 8030 MB/s           | -                  |
+| [enwik9](http://mattmahoney.net/dc/textdata.html)                                                   | 19.34x   | 8220 MB/s     | 3.98%        | 7.87x       | 3345 MB/s           | 15.82%             |
+| (1 CPU)                                                                                             | 1.06x    | 452 MB/s      | -            | 0.50x       | 213 MB/s            | -                  |
+| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip)                                    | 10.48x   | 6124 MB/s     | 5.67%        | 3.76x       | 2197 MB/s           | 12.60%             |
+| (1 CPU)                                                                                             | 0.97x    | 568 MB/s      | -            | 0.46x       | 271 MB/s            | -                  |
+| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results)                                 | 21.07x   | 9020 MB/s     | 6.36%        | 6.91x       | 2959 MB/s           | 16.95%             |
+| (1 CPU)                                                                                             | 1.07x    | 460 MB/s      | -            | 0.51x       | 220 MB/s            | -                  |
+
+### Legend
+
+* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
+* `S2 throughput`: Throughput of S2 in MB/s. 
+* `S2 % smaller`: How many percent of the Snappy output size is S2 better.
+* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy. 
+* `"better" throughput`: Throughput of S2 in MB/s when "better" compression mode is enabled.
+* `"better" % smaller`: How many percent of the Snappy output size is S2 better when using "better" compression.
+
+There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads.
+
+Machine generated data gets by far the biggest compression boost, with size being reduced by up to 45% of Snappy size.
+
+The "better" compression mode sees a good improvement in all cases, but usually at a performance cost.
+
+Incompressible content (`sharnd.out.2gb`, 2GB random data) sees the smallest speedup. 
+This is likely dominated by synchronization overhead, which is confirmed by the fact that single threaded performance is higher (see above). 
+
+## Decompression
+
+S2 attempts to create content that is also fast to decompress, except in "better" mode where the smallest representation is used.
+
+S2 vs Snappy **decompression** speed. Both operating on single core:
+
+| File                                                                                                | S2 Throughput | vs. Snappy | Better Throughput | vs. Snappy |
+|-----------------------------------------------------------------------------------------------------|---------------|------------|-------------------|------------|
+| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z)                    | 2117 MB/s     | 1.14x      | 1738 MB/s         | 0.94x      |
+| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 2401 MB/s     | 1.25x      | 2307 MB/s         | 1.20x      |
+| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst)         | 2075 MB/s     | 0.98x      | 1764 MB/s         | 0.83x      |
+| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst)                     | 2967 MB/s     | 1.05x      | 2885 MB/s         | 1.02x      |
+| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst)                             | 4141 MB/s     | 1.07x      | 4184 MB/s         | 1.08x      |
+| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z)                                    | 2264 MB/s     | 1.12x      | 2185 MB/s         | 1.08x      |
+| [10gb.tar](http://mattmahoney.net/dc/10gb.html)                                                     | 1525 MB/s     | 1.03x      | 1347 MB/s         | 0.91x      |
+| sharnd.out.2gb                                                                                      | 3813 MB/s     | 0.79x      | 3900 MB/s         | 0.81x      |
+| [enwik9](http://mattmahoney.net/dc/textdata.html)                                                   | 1246 MB/s     | 1.29x      | 967 MB/s          | 1.00x      |
+| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip)                                    | 1433 MB/s     | 1.12x      | 1203 MB/s         | 0.94x      |
+| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results)                                 | 1284 MB/s     | 1.32x      | 1010 MB/s         | 1.04x      |
+
+### Legend
+
+* `S2 Throughput`: Decompression speed of S2 encoded content.
+* `Better Throughput`: Decompression speed of S2 "better" encoded content.
+* `vs Snappy`: Decompression speed of S2 "better" mode compared to Snappy and absolute speed.
+
+
+While the decompression code hasn't changed, there is a significant speedup in decompression speed. 
+S2 prefers longer matches and will typically only find matches that are 6 bytes or longer. 
+While this reduces compression a bit, it improves decompression speed.
+
+The "better" compression mode will actively look for shorter matches, which is why it has a decompression speed quite similar to Snappy.   
+
+Without assembly decompression is also very fast; single goroutine decompression speed. No assembly:
+
+| File                           | vs. Snappy   | S2 throughput |
+|--------------------------------|--------------|---------------|
+| consensus.db.10gb.s2           | 1.84x        | 2289.8 MB/s   |
+| 10gb.tar.s2                    | 1.30x        | 867.07 MB/s   |
+| rawstudio-mint14.tar.s2        | 1.66x        | 1329.65 MB/s  |
+| github-june-2days-2019.json.s2 | 2.36x        | 1831.59 MB/s  |
+| github-ranks-backup.bin.s2     | 1.73x        | 1390.7 MB/s   |
+| enwik9.s2                      | 1.67x        | 681.53 MB/s   |
+| adresser.json.s2               | 3.41x        | 4230.53 MB/s  |
+| silesia.tar.s2                 | 1.52x        | 811.58 MB/s   |
+
+Even though S2 typically compresses better than Snappy, decompression speed is always better. 
+
+## Block compression
+
+
+When compressing blocks, no concurrent compression is performed, just as with Snappy.
+This is because blocks are for smaller payloads and generally will not benefit from concurrent compression.
+
+An important change is that incompressible blocks will be at most 10 bytes bigger than the input.
+In rare, worst case scenarios Snappy blocks could be significantly bigger than the input.
+
+### Mixed content blocks
+
+The most reliable benchmark is a wide dataset.
+For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
+53927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
+
+| *                 | Input      | Output     | Reduction | MB/s   |
+|-------------------|------------|------------|-----------|--------|
+| S2                | 4014735833 | 1059723369 | 73.60%    | **934.34** |
+| S2 Better         | 4014735833 | 969670507  | 75.85%    | 532.70 |
+| S2 Best           | 4014735833 | 906625668  | **77.85%** | 46.84 |
+| Snappy            | 4014735833 | 1128706759 | 71.89%    | 762.59 |
+| S2, Snappy Output | 4014735833 | 1093821420 | 72.75%    | 908.60 |
+| LZ4               | 4014735833 | 1079259294 | 73.12%    | 526.94 |
+
+S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".
+"Better" mode provides the same compression speed as LZ4 with better compression ratio. 
+
+When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression.
+
+As can be seen from the other benchmarks decompression should also be easier on the S2 generated output.
+
+Though they cannot be compared due to different decompression speeds here are the speed/size comparisons for
+other Go compressors:
+
+| *                 | Input      | Output     | Reduction | MB/s   |
+|-------------------|------------|------------|-----------|--------|
+| Zstd Fastest (Go) | 4014735833 | 794608518  | 80.21%    | 236.04 |
+| Zstd Best (Go)    | 4014735833 | 704603356  | 82.45%    | 35.63  |
+| Deflate (Go) l1   | 4014735833 | 871294239  | 78.30%    | 214.04 |
+| Deflate (Go) l9   | 4014735833 | 730389060  | 81.81%    | 41.17  |
+
+### Standard block compression
+
+Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
+So individual benchmarks should only be seen as a guideline and the overall picture is more important.
+
+These micro-benchmarks are with data in cache and trained branch predictors. For a more realistic benchmark see the mixed content above. 
+
+Block compression. Parallel benchmark running on 16 cores, 16 goroutines.
+
+AMD64 assembly is used for both S2 and Snappy.
+
+| Absolute Perf         | Snappy size | S2 Size | Snappy Speed | S2 Speed    | Snappy dec  | S2 dec      |
+|-----------------------|-------------|---------|--------------|-------------|-------------|-------------|
+| html                  | 22843       | 21111   | 16246 MB/s   | 17438 MB/s  | 40972 MB/s  | 49263 MB/s  |
+| urls.10K              | 335492      | 287326  | 7943 MB/s    | 9693 MB/s   | 22523 MB/s  | 26484 MB/s  |
+| fireworks.jpeg        | 123034      | 123100  | 349544 MB/s  | 273889 MB/s | 718321 MB/s | 827552 MB/s |
+| fireworks.jpeg (200B) | 146         | 155     | 8869 MB/s    | 17773 MB/s  | 33691 MB/s  | 52421 MB/s  |
+| paper-100k.pdf        | 85304       | 84459   | 167546 MB/s  | 101263 MB/s | 326905 MB/s | 291944 MB/s |
+| html_x_4              | 92234       | 21113   | 15194 MB/s   | 50670 MB/s  | 30843 MB/s  | 32217 MB/s  |
+| alice29.txt           | 88034       | 85975   | 5936 MB/s    | 6139 MB/s   | 12882 MB/s  | 20044 MB/s  |
+| asyoulik.txt          | 77503       | 79650   | 5517 MB/s    | 6366 MB/s   | 12735 MB/s  | 22806 MB/s  |
+| lcet10.txt            | 234661      | 220670  | 6235 MB/s    | 6067 MB/s   | 14519 MB/s  | 18697 MB/s  |
+| plrabn12.txt          | 319267      | 317985  | 5159 MB/s    | 5726 MB/s   | 11923 MB/s  | 19901 MB/s  |
+| geo.protodata         | 23335       | 18690   | 21220 MB/s   | 26529 MB/s  | 56271 MB/s  | 62540 MB/s  |
+| kppkn.gtb             | 69526       | 65312   | 9732 MB/s    | 8559 MB/s   | 18491 MB/s  | 18969 MB/s  |
+| alice29.txt (128B)    | 80          | 82      | 6691 MB/s    | 15489 MB/s  | 31883 MB/s  | 38874 MB/s  |
+| alice29.txt (1000B)   | 774         | 774     | 12204 MB/s   | 13000 MB/s  | 48056 MB/s  | 52341 MB/s  |
+| alice29.txt (10000B)  | 6648        | 6933    | 10044 MB/s   | 12806 MB/s  | 32378 MB/s  | 46322 MB/s  |
+| alice29.txt (20000B)  | 12686       | 13574   | 7733 MB/s    | 11210 MB/s  | 30566 MB/s  | 58969 MB/s  |
+
+
+| Relative Perf         | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed |
+|-----------------------|-------------|------------------|----------|--------------|
+| html                  | 22.31%      | 7.58%            | 1.07x    | 1.20x        |
+| urls.10K              | 47.78%      | 14.36%           | 1.22x    | 1.18x        |
+| fireworks.jpeg        | 99.95%      | -0.05%           | 0.78x    | 1.15x        |
+| fireworks.jpeg (200B) | 73.00%      | -6.16%           | 2.00x    | 1.56x        |
+| paper-100k.pdf        | 83.30%      | 0.99%            | 0.60x    | 0.89x        |
+| html_x_4              | 22.52%      | 77.11%           | 3.33x    | 1.04x        |
+| alice29.txt           | 57.88%      | 2.34%            | 1.03x    | 1.56x        |
+| asyoulik.txt          | 61.91%      | -2.77%           | 1.15x    | 1.79x        |
+| lcet10.txt            | 54.99%      | 5.96%            | 0.97x    | 1.29x        |
+| plrabn12.txt          | 66.26%      | 0.40%            | 1.11x    | 1.67x        |
+| geo.protodata         | 19.68%      | 19.91%           | 1.25x    | 1.11x        |
+| kppkn.gtb             | 37.72%      | 6.06%            | 0.88x    | 1.03x        |
+| alice29.txt (128B)    | 62.50%      | -2.50%           | 2.31x    | 1.22x        |
+| alice29.txt (1000B)   | 77.40%      | 0.00%            | 1.07x    | 1.09x        |
+| alice29.txt (10000B)  | 66.48%      | -4.29%           | 1.27x    | 1.43x        |
+| alice29.txt (20000B)  | 63.43%      | -7.00%           | 1.45x    | 1.93x        |
+
+Speed is generally at or above Snappy. Small blocks get a significant speedup, although at the expense of size.
+
+Decompression speed is better than Snappy, except in one case. 
+
+Since payloads are very small the variance in terms of size is rather big, so they should only be seen as a general guideline.
+
+Size is on average around Snappy, but varies on content type. 
+In cases where compression is worse, it usually is compensated by a speed boost. 
+
+
+### Better compression
+
+Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
+So individual benchmarks should only be seen as a guideline and the overall picture is more important.
+
+| Absolute Perf         | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec  | Better dec  |
+|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------|
+| html                  | 22843       | 19833       | 16246 MB/s   | 7731 MB/s    | 40972 MB/s  | 40292 MB/s  |
+| urls.10K              | 335492      | 253529      | 7943 MB/s    | 3980 MB/s    | 22523 MB/s  | 20981 MB/s  |
+| fireworks.jpeg        | 123034      | 123100      | 349544 MB/s  | 9760 MB/s    | 718321 MB/s | 823698 MB/s |
+| fireworks.jpeg (200B) | 146         | 142         | 8869 MB/s    | 594 MB/s     | 33691 MB/s  | 30101 MB/s  |
+| paper-100k.pdf        | 85304       | 82915       | 167546 MB/s  | 7470 MB/s    | 326905 MB/s | 198869 MB/s |
+| html_x_4              | 92234       | 19841       | 15194 MB/s   | 23403 MB/s   | 30843 MB/s  | 30937 MB/s  |
+| alice29.txt           | 88034       | 73218       | 5936 MB/s    | 2945 MB/s    | 12882 MB/s  | 16611 MB/s  |
+| asyoulik.txt          | 77503       | 66844       | 5517 MB/s    | 2739 MB/s    | 12735 MB/s  | 14975 MB/s  |
+| lcet10.txt            | 234661      | 190589      | 6235 MB/s    | 3099 MB/s    | 14519 MB/s  | 16634 MB/s  |
+| plrabn12.txt          | 319267      | 270828      | 5159 MB/s    | 2600 MB/s    | 11923 MB/s  | 13382 MB/s  |
+| geo.protodata         | 23335       | 18278       | 21220 MB/s   | 11208 MB/s   | 56271 MB/s  | 57961 MB/s  |
+| kppkn.gtb             | 69526       | 61851       | 9732 MB/s    | 4556 MB/s    | 18491 MB/s  | 16524 MB/s  |
+| alice29.txt (128B)    | 80          | 81          | 6691 MB/s    | 529 MB/s     | 31883 MB/s  | 34225 MB/s  |
+| alice29.txt (1000B)   | 774         | 748         | 12204 MB/s   | 1943 MB/s    | 48056 MB/s  | 42068 MB/s  |
+| alice29.txt (10000B)  | 6648        | 6234        | 10044 MB/s   | 2949 MB/s    | 32378 MB/s  | 28813 MB/s  |
+| alice29.txt (20000B)  | 12686       | 11584       | 7733 MB/s    | 2822 MB/s    | 30566 MB/s  | 27315 MB/s  |
+
+
+| Relative Perf         | Snappy size | Better size | Better Speed | Better dec |
+|-----------------------|-------------|-------------|--------------|------------|
+| html                  | 22.31%      | 13.18%      | 0.48x        | 0.98x      |
+| urls.10K              | 47.78%      | 24.43%      | 0.50x        | 0.93x      |
+| fireworks.jpeg        | 99.95%      | -0.05%      | 0.03x        | 1.15x      |
+| fireworks.jpeg (200B) | 73.00%      | 2.74%       | 0.07x        | 0.89x      |
+| paper-100k.pdf        | 83.30%      | 2.80%       | 0.04x        | 0.61x      |
+| html_x_4              | 22.52%      | 78.49%      | 1.54x        | 1.00x      |
+| alice29.txt           | 57.88%      | 16.83%      | 0.50x        | 1.29x      |
+| asyoulik.txt          | 61.91%      | 13.75%      | 0.50x        | 1.18x      |
+| lcet10.txt            | 54.99%      | 18.78%      | 0.50x        | 1.15x      |
+| plrabn12.txt          | 66.26%      | 15.17%      | 0.50x        | 1.12x      |
+| geo.protodata         | 19.68%      | 21.67%      | 0.53x        | 1.03x      |
+| kppkn.gtb             | 37.72%      | 11.04%      | 0.47x        | 0.89x      |
+| alice29.txt (128B)    | 62.50%      | -1.25%      | 0.08x        | 1.07x      |
+| alice29.txt (1000B)   | 77.40%      | 3.36%       | 0.16x        | 0.88x      |
+| alice29.txt (10000B)  | 66.48%      | 6.23%       | 0.29x        | 0.89x      |
+| alice29.txt (20000B)  | 63.43%      | 8.69%       | 0.36x        | 0.89x      |
+
+Except for the mostly incompressible JPEG image, compression is better and usually in the 
+double digits in terms of percentage reduction over Snappy.
+
+The PDF sample shows a significant slowdown compared to Snappy, as this mode tries harder 
+to compress the data. Very small blocks are also not favorable for better compression, so throughput is way down.
+
+This mode aims to provide better compression at the expense of performance and achieves that 
+without a huge performance penalty, except on very small blocks. 
+
+Decompression speed suffers a little compared to the regular S2 mode, 
+but still manages to be close to Snappy in spite of increased compression.  
+ 
+# Best compression mode
+
+S2 offers a "best" compression mode. 
+
+This will compress as much as possible with little regard to CPU usage.
+
+Mainly for offline compression, but where decompression speed should still
+be high and compatible with other S2 compressed data.
+
+Some examples compared on 16 core CPU, amd64 assembly used:
+
+```
+* enwik10
+Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
+Better...  10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
+Best...    10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s
+
+* github-june-2days-2019.json
+Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
+Better...  6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
+Best...    6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s
+
+* nyc-taxi-data-10M.csv
+Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
+Better...  3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
+Best...    3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s
+
+* 10gb.tar
+Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
+Better...  10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
+Best...    10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s
+
+* consensus.db.10gb
+Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
+Better...  10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
+Best...    10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
+```
+
+Decompression speed should be around the same as using the 'better' compression mode. 
+
+# Snappy Compatibility
+
+S2 now offers full compatibility with Snappy.
+
+This means that the efficient encoders of S2 can be used to generate fully Snappy compatible output.
+
+There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by
+simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`.
+This uses "better" mode for all operations.
+If you would like more control, you can use the s2 package as described below: 
+
+## Blocks
+
+Snappy compatible blocks can be generated with the S2 encoder. 
+Compression and speed are typically a bit better, and `MaxEncodedLen` is also smaller, which reduces memory usage. Replace:
+
+| Snappy                     | S2 replacement          |
+|----------------------------|-------------------------|
+| snappy.Encode(...)         | s2.EncodeSnappy(...)   |
+| snappy.MaxEncodedLen(...)  | s2.MaxEncodedLen(...)   |
+
+`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output. 
+
+`s2.ConcatBlocks` is compatible with snappy blocks.
+
+Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
+53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
+
+| Encoder               | Size       | MB/s   | Reduction |
+|-----------------------|------------|--------|-----------|
+| snappy.Encode         | 1128706759 | 725.59 | 71.89%    |
+| s2.EncodeSnappy       | 1093823291 | 899.16 | 72.75%    |
+| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06%    |
+| s2.EncodeSnappyBest   | 944507998  | 66.00  | 76.47%    |
+
+## Streams
+
+For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`.
+All other options are available, but note that block size limit is different for snappy.
+
+Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput: 
+
+| File                        | snappy.NewWriter         | S2 Snappy                 | S2 Snappy, Better        | S2 Snappy, Best         |
+|-----------------------------|--------------------------|---------------------------|--------------------------|-------------------------|
+| nyc-taxi-data-10M.csv       | 1316042016 - 517.54MB/s  | 1307003093 - 8406.29MB/s  | 1174534014 - 4984.35MB/s | 1115904679 - 177.81MB/s |
+| enwik10                     | 5088294643 - 433.45MB/s  | 5175840939 - 8454.52MB/s  | 4560784526 - 4403.10MB/s | 4340299103 - 159.71MB/s |
+| 10gb.tar                    | 6056946612 - 703.25MB/s  | 6208571995 - 9035.75MB/s  | 5741646126 - 2402.08MB/s | 5548973895 - 171.17MB/s |
+| github-june-2days-2019.json | 1525176492 - 908.11MB/s  | 1476519054 - 12625.93MB/s | 1400547532 - 6163.61MB/s | 1321887137 - 200.71MB/s |
+| consensus.db.10gb           | 5412897703 - 1054.38MB/s | 5354073487 - 12634.82MB/s | 5335069899 - 2472.23MB/s | 5201000954 - 166.32MB/s |
+
+# Decompression
+
+All decompression functions map directly to equivalent s2 functions.
+
+| Snappy                 | S2 replacement     |
+|------------------------|--------------------|
+| snappy.Decode(...)     | s2.Decode(...)     |
+| snappy.DecodedLen(...) | s2.DecodedLen(...) |
+| snappy.NewReader(...)  | s2.NewReader(...)  |
+
+Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip)
+are also available for Snappy streams.
+
+If you know you are only decompressing snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize)
+on your Reader will reduce memory consumption.
+
+# Concatenating blocks and streams.
+
+Concatenating streams will concatenate the output of both without recompressing them. 
+While this is inefficient in terms of compression it might be usable in certain scenarios. 
+The 10 byte 'stream identifier' of the second stream can optionally be stripped, but it is not a requirement.
+
+Blocks can be concatenated using the `ConcatBlocks` function.
+
+Snappy blocks/streams can safely be concatenated with S2 blocks and streams. 
+
+# Format Extensions
+
+* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
+* [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB).
+* Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset.
+
+Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0.
+
+The length is determined by reading the 3-bit length from the tag and decoding it using this table:
+
+| Length | Actual Length        |
+|--------|----------------------|
+| 0      | 4                    |
+| 1      | 5                    |
+| 2      | 6                    |
+| 3      | 7                    |
+| 4      | 8                    |
+| 5      | 8 + read 1 byte      |
+| 6      | 260 + read 2 bytes   |
+| 7      | 65540 + read 3 bytes |
+
+This allows any repeat offset + length to be represented by 2 to 5 bytes.
+
+Lengths are stored as little endian values.
+
+The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams.
+
+Default streaming block size is 1MB.
+
+# LICENSE
+
+This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation.
+
+Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go
new file mode 100644
index 00000000..d0ae5304
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode.go
@@ -0,0 +1,565 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+var (
+	// ErrCorrupt reports that the input is invalid, e.g. a malformed length header or a truncated stream.
+	ErrCorrupt = errors.New("s2: corrupt input")
+	// ErrCRC reports that the input failed CRC validation (streams only).
+	ErrCRC = errors.New("s2: corrupt input, crc mismatch")
+	// ErrTooLarge reports that the uncompressed length is too large to be represented as an int on this platform.
+	ErrTooLarge = errors.New("s2: decoded block is too large")
+	// ErrUnsupported reports that the input isn't supported, e.g. a reserved unskippable chunk type.
+	ErrUnsupported = errors.New("s2: unsupported input")
+)
+
+// DecodedLen reports the length of the decoded block stored in src.
+func DecodedLen(src []byte) (int, error) {
+	blockLen, _, err := decodedLen(src)
+	return blockLen, err
+}
+
+// decodedLen parses the uvarint length header at the start of src. It returns
+// the decoded block length and the number of bytes the header occupied.
+func decodedLen(src []byte) (blockLen, headerLen int, err error) {
+	// wordSize is 32 or 64, matching the width of uint on this platform.
+	const wordSize = 32 << (^uint(0) >> 32 & 1)
+	size, used := binary.Uvarint(src)
+	switch {
+	case used <= 0, size > 0xffffffff:
+		return 0, 0, ErrCorrupt
+	case wordSize == 32 && size > 0x7fffffff:
+		return 0, 0, ErrTooLarge
+	}
+	return int(size), used, nil
+}
+
+const (
+	decodeErrCodeCorrupt = 1 // NOTE(review): presumably the non-zero status s2Decode returns for corrupt input; confirm against the decoder.
+)
+
+// Decode decompresses the block in src and returns the result. When dst has
+// enough capacity for the whole decoded block the result is a sub-slice of
+// dst; otherwise a freshly allocated slice is returned.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+func Decode(dst, src []byte) ([]byte, error) {
+	want, hdr, err := decodedLen(src)
+	if err != nil {
+		return nil, err
+	}
+	if cap(dst) < want {
+		dst = make([]byte, want)
+	} else {
+		dst = dst[:want]
+	}
+	if code := s2Decode(dst, src[hdr:]); code != 0 {
+		return nil, ErrCorrupt
+	}
+	return dst, nil
+}
+
+// NewReader returns a new Reader that decompresses from r, using the framing
+// format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt with S2 changes.
+// If an option fails, the returned Reader carries that error.
+func NewReader(r io.Reader, opts ...ReaderOption) *Reader {
+	rd := &Reader{r: r, maxBlock: maxBlockSize}
+	for i := range opts {
+		if err := opts[i](rd); err != nil {
+			rd.err = err
+			return rd
+		}
+	}
+	rd.maxBufSize = MaxEncodedLen(rd.maxBlock) + checksumSize
+	// Size the initial buffer from the lazy allocation hint when one was
+	// given, otherwise from the default block size.
+	initial := rd.lazyBuf
+	if initial <= 0 {
+		initial = defaultBlockSize
+	}
+	rd.buf = make([]byte, MaxEncodedLen(initial)+checksumSize)
+	rd.paramsOK = true
+	return rd
+}
+
+// ReaderOption is an option for creating a decoder. NewReader applies each option to the new Reader and stops at the first error.
+type ReaderOption func(*Reader) error
+
+// ReaderMaxBlockSize sets the largest block the Reader will accept, which
+// bounds its allocations. Use it when the stream was written with a smaller
+// WriterBlockSize, or with the default 1MB.
+// Blocks larger than this limit are rejected with an error.
+//
+// For streams compressed with Snappy this can safely be set to 64KB (64 << 10).
+//
+// Default is the maximum limit of 4MB.
+func ReaderMaxBlockSize(blockSize int) ReaderOption {
+	return func(rd *Reader) error {
+		if blockSize <= 0 || blockSize > maxBlockSize {
+			return errors.New("s2: block size too large. Must be <= 4MB and > 0")
+		}
+		rd.maxBlock = blockSize
+		if blockSize < defaultBlockSize && rd.lazyBuf == 0 {
+			rd.lazyBuf = blockSize
+		}
+		return nil
+	}
+}
+
+// ReaderAllocBlock sets the block size used to dimension the buffer that is
+// allocated up front, so larger frames are not allocated for initially.
+// If a frame bigger than this is seen, a bigger buffer will be allocated.
+// Default is 1MB, which is the default output size.
+func ReaderAllocBlock(blockSize int) ReaderOption {
+	valid := blockSize >= 1024 && blockSize <= maxBlockSize
+	return func(rd *Reader) error {
+		if !valid {
+			return errors.New("s2: invalid ReaderAllocBlock. Must be <= 4MB and >= 1024")
+		}
+		rd.lazyBuf = blockSize
+		return nil
+	}
+}
+
+// Reader is an io.Reader that decompresses framed S2 or Snappy streams read from r.
+type Reader struct {
+	r       io.Reader // underlying source of compressed data
+	err     error     // sticky error; once set, Read and Skip return it
+	decoded []byte    // output buffer holding the most recently decoded block
+	buf     []byte    // scratch buffer for chunk headers and chunk bodies
+	// decoded[i:j] contains decoded bytes that have not yet been passed on.
+	i, j int
+	// maximum block size allowed.
+	maxBlock int
+	// maximum expected buffer size.
+	maxBufSize int
+	// alloc a buffer this size if > 0.
+	lazyBuf     int
+	readHeader  bool // set once the first chunk has been confirmed to be a stream identifier
+	paramsOK    bool // set by NewReader when every option applied cleanly; Reset is a no-op otherwise
+	snappyFrame bool // set by Read when the last stream identifier carried the Snappy magic
+}
+
+// ensureBufferSize makes sure r.buf can hold at least n bytes, reallocating if needed.
+// It returns false, and records ErrCorrupt, when n exceeds the maximum buffer size.
+func (r *Reader) ensureBufferSize(n int) bool {
+	switch {
+	case n <= len(r.buf):
+		return true
+	case n > r.maxBufSize:
+		r.err = ErrCorrupt
+		return false
+	}
+	// Too small but within limits: replace the buffer.
+	r.buf = make([]byte, n)
+	return true
+}
+
+// Reset drops any buffered data, clears all state and makes the Reader read
+// from reader instead. This permits reusing a Reader rather than allocating
+// a new one.
+// It does nothing if the Reader was created with invalid options.
+func (r *Reader) Reset(reader io.Reader) {
+	if r.paramsOK {
+		// Buffers are kept; only the stream state is cleared.
+		r.r = reader
+		r.err = nil
+		r.i, r.j = 0, 0
+		r.readHeader = false
+	}
+}
+
+// readFull fills p from the underlying reader, storing any failure in r.err (premature EOF becomes ErrCorrupt).
+func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
+	_, r.err = io.ReadFull(r.r, p)
+	truncated := r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF)
+	if truncated {
+		r.err = ErrCorrupt
+	}
+	return r.err == nil
+}
+
+// skipN discards the next n bytes of the underlying stream and reports success; failures are stored in r.err.
+// If the supplied reader supports seeking that is used, falling back to reading when the seek fails.
+// tmp is a scratch buffer for reading and does not need to be the size of the read.
+func (r *Reader) skipN(tmp []byte, n int, allowEOF bool) (ok bool) {
+	if rs, seekable := r.r.(io.ReadSeeker); seekable {
+		_, err := rs.Seek(int64(n), io.SeekCurrent)
+		if err == nil {
+			return true
+		}
+		// Classify the error Seek just returned, not the stale r.err.
+		if err == io.ErrUnexpectedEOF || (err == io.EOF && !allowEOF) {
+			r.err = ErrCorrupt
+			return false
+		}
+	}
+	for n > 0 {
+		if n < len(tmp) {
+			tmp = tmp[:n]
+		}
+		if _, r.err = io.ReadFull(r.r, tmp); r.err != nil {
+			if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
+				r.err = ErrCorrupt
+			}
+			return false
+		}
+		n -= len(tmp)
+	}
+	return true
+}
+
+// Read satisfies the io.Reader interface. It hands out buffered decoded bytes if any remain; otherwise it reads chunks from the underlying stream until one yields data or an error occurs. Errors are sticky.
+func (r *Reader) Read(p []byte) (int, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+	for {
+		if r.i < r.j {
+			n := copy(p, r.decoded[r.i:r.j])
+			r.i += n
+			return n, nil
+		}
+		if !r.readFull(r.buf[:4], true) { // 4-byte chunk header; a clean EOF is only acceptable here
+			return 0, r.err
+		}
+		chunkType := r.buf[0]
+		if !r.readHeader { // the first chunk of a stream must be the stream identifier
+			if chunkType != chunkTypeStreamIdentifier {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.readHeader = true
+		}
+		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 // 24-bit little-endian length
+
+		// The chunk types are specified at
+		// https://github.com/google/snappy/blob/master/framing_format.txt
+		switch chunkType {
+		case chunkTypeCompressedData:
+			// Section 4.2. Compressed data (chunk type 0x00).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.ensureBufferSize(chunkLen) {
+				if r.err == nil {
+					r.err = ErrUnsupported
+				}
+				return 0, r.err
+			}
+			buf := r.buf[:chunkLen]
+			if !r.readFull(buf, false) {
+				return 0, r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 // little-endian checksum of the decoded data
+			buf = buf[checksumSize:]
+
+			n, err := DecodedLen(buf)
+			if err != nil {
+				r.err = err
+				return 0, r.err
+			}
+			if r.snappyFrame && n > maxSnappyBlockSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+
+			if n > len(r.decoded) { // grow the output buffer, bounded by maxBlock
+				if n > r.maxBlock {
+					r.err = ErrCorrupt
+					return 0, r.err
+				}
+				r.decoded = make([]byte, n)
+			}
+			if _, err := Decode(r.decoded, buf); err != nil {
+				r.err = err
+				return 0, r.err
+			}
+			if crc(r.decoded[:n]) != checksum {
+				r.err = ErrCRC
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeUncompressedData:
+			// Section 4.3. Uncompressed data (chunk type 0x01).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.ensureBufferSize(chunkLen) {
+				if r.err == nil {
+					r.err = ErrUnsupported
+				}
+				return 0, r.err
+			}
+			buf := r.buf[:checksumSize] // only the checksum goes through r.buf
+			if !r.readFull(buf, false) {
+				return 0, r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			// Read directly into r.decoded instead of via r.buf.
+			n := chunkLen - checksumSize
+			if r.snappyFrame && n > maxSnappyBlockSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if n > len(r.decoded) {
+				if n > r.maxBlock {
+					r.err = ErrCorrupt
+					return 0, r.err
+				}
+				r.decoded = make([]byte, n)
+			}
+			if !r.readFull(r.decoded[:n], false) {
+				return 0, r.err
+			}
+			if crc(r.decoded[:n]) != checksum {
+				r.err = ErrCRC
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.buf[:len(magicBody)], false) {
+				return 0, r.err
+			}
+			if string(r.buf[:len(magicBody)]) != magicBody { // not the S2 magic; accept the Snappy magic as well
+				if string(r.buf[:len(magicBody)]) != magicBodySnappy {
+					r.err = ErrCorrupt
+					return 0, r.err
+				} else {
+					r.snappyFrame = true
+				}
+			} else {
+				r.snappyFrame = false
+			}
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if chunkLen > maxBlockSize {
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+
+		if !r.skipN(r.buf, chunkLen, false) { // discard the chunk body, using r.buf as scratch space
+			return 0, r.err
+		}
+	}
+}
+
+// Skip will skip n bytes forward in the decompressed output.
+// For larger skips this consumes less CPU and is faster than reading output and discarding it.
+// CRC is not checked on blocks that are skipped in their entirety.
+// io.ErrUnexpectedEOF is returned if the stream ends before all bytes have been skipped.
+// If a decoding error is encountered subsequent calls to Read will also fail.
+func (r *Reader) Skip(n int64) error {
+	if n < 0 {
+		return errors.New("attempted negative skip")
+	}
+	if r.err != nil {
+		return r.err
+	}
+
+	for n > 0 {
+		if r.i < r.j {
+			// Skip in buffer.
+			// decoded[i:j] contains decoded bytes that have not yet been passed on.
+			left := int64(r.j - r.i)
+			if left >= n {
+				r.i += int(n)
+				return nil
+			}
+			n -= int64(r.j - r.i)
+			r.i, r.j = 0, 0
+		}
+
+		// Buffer empty; read blocks until we have content.
+		if !r.readFull(r.buf[:4], true) {
+			if r.err == io.EOF {
+				r.err = io.ErrUnexpectedEOF
+			}
+			return r.err
+		}
+		chunkType := r.buf[0]
+		if !r.readHeader {
+			if chunkType != chunkTypeStreamIdentifier {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			r.readHeader = true
+		}
+		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
+
+		// The chunk types are specified at
+		// https://github.com/google/snappy/blob/master/framing_format.txt
+		switch chunkType {
+		case chunkTypeCompressedData:
+			// Section 4.2. Compressed data (chunk type 0x00).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			if !r.ensureBufferSize(chunkLen) {
+				if r.err == nil {
+					r.err = ErrUnsupported
+				}
+				return r.err
+			}
+			buf := r.buf[:chunkLen]
+			if !r.readFull(buf, false) {
+				return r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			buf = buf[checksumSize:]
+
+			dLen, err := DecodedLen(buf)
+			if err != nil {
+				r.err = err
+				return r.err
+			}
+			if dLen > r.maxBlock {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			// Check if destination is within this block
+			if int64(dLen) > n {
+				if len(r.decoded) < dLen {
+					r.decoded = make([]byte, dLen)
+				}
+				if _, err := Decode(r.decoded, buf); err != nil {
+					r.err = err
+					return r.err
+				}
+				if crc(r.decoded[:dLen]) != checksum {
+					r.err = ErrCorrupt
+					return r.err
+				}
+			} else {
+				// Skip block completely
+				n -= int64(dLen)
+				dLen = 0
+			}
+			r.i, r.j = 0, dLen
+			continue
+		case chunkTypeUncompressedData:
+			// Section 4.3. Uncompressed data (chunk type 0x01).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			if !r.ensureBufferSize(chunkLen) {
+				if r.err == nil {
+					r.err = ErrUnsupported
+				}
+				return r.err
+			}
+			buf := r.buf[:checksumSize]
+			if !r.readFull(buf, false) {
+				return r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			// Read directly into r.decoded instead of via r.buf.
+			n2 := chunkLen - checksumSize
+			if n2 > len(r.decoded) {
+				if n2 > r.maxBlock {
+					r.err = ErrCorrupt
+					return r.err
+				}
+				r.decoded = make([]byte, n2)
+			}
+			if !r.readFull(r.decoded[:n2], false) {
+				return r.err
+			}
+			// Verify the checksum only when part of this block will be handed
+			// to the caller; blocks that are skipped entirely are not checked.
+			if int64(n2) > n && crc(r.decoded[:n2]) != checksum {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			r.i, r.j = 0, n2
+			continue
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			if !r.readFull(r.buf[:len(magicBody)], false) {
+				return r.err
+			}
+			isS2 := string(r.buf[:len(magicBody)]) == magicBody
+			if !isS2 && string(r.buf[:len(magicBody)]) != magicBodySnappy {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			// Track the frame type as Read does, so a later Read applies the right block limit.
+			r.snappyFrame = !isS2
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			r.err = ErrUnsupported
+			return r.err
+		}
+		if chunkLen > maxBlockSize {
+			r.err = ErrUnsupported
+			return r.err
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if !r.skipN(r.buf, chunkLen, false) {
+			return r.err
+		}
+	}
+	return nil
+}
+
+// ReadByte satisfies the io.ByteReader interface; it gives up with io.ErrNoProgress after 10 Read calls that return no data.
+func (r *Reader) ReadByte() (byte, error) {
+	switch {
+	case r.err != nil:
+		return 0, r.err
+	case r.i < r.j:
+		b := r.decoded[r.i]
+		r.i++
+		return b, nil
+	}
+	var one [1]byte
+	for attempt := 0; attempt < 10; attempt++ {
+		n, err := r.Read(one[:])
+		if err != nil {
+			return 0, err
+		}
+		if n == 1 {
+			return one[0], nil
+		}
+	}
+	return 0, io.ErrNoProgress
+}
diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
new file mode 100644
index 00000000..9b105e03
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
@@ -0,0 +1,568 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+#define R_TMP0 AX
+#define R_TMP1 BX
+#define R_LEN CX
+#define R_OFF DX
+#define R_SRC SI
+#define R_DST DI
+#define R_DBASE R8
+#define R_DLEN R9
+#define R_DEND R10
+#define R_SBASE R11
+#define R_SLEN R12
+#define R_SEND R13
+#define R_TMP2 R14
+#define R_TMP3 R15
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func s2Decode(dst, src []byte) int
+//
+// All local variables fit into registers. The 56-byte frame is only used to push
+// the memmove args and spill four registers around the CALL. The register allocation:
+//	- R_TMP0	scratch
+//	- R_TMP1	scratch
+//	- R_LEN	    length or x (shared)
+//	- R_OFF	    offset
+//	- R_SRC	    &src[s]
+//	- R_DST	    &dst[d]
+//	+ R_DBASE	dst_base
+//	+ R_DLEN	dst_len
+//	+ R_DEND	dst_base + dst_len
+//	+ R_SBASE	src_base
+//	+ R_SLEN	src_len
+//	+ R_SEND	src_base + src_len
+//	- R_TMP2	used by doCopy
+//	- R_TMP3	used by doCopy
+//
+// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly R_DST - R_DBASE,  and len(dst)-d is R_DEND - R_DST.
+// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
+TEXT ·s2Decode(SB), NOSPLIT, $56-56
+	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
+	MOVQ dst_base+0(FP), R_DBASE
+	MOVQ dst_len+8(FP), R_DLEN
+	MOVQ R_DBASE, R_DST
+	MOVQ R_DBASE, R_DEND
+	ADDQ R_DLEN, R_DEND
+	MOVQ src_base+24(FP), R_SBASE
+	MOVQ src_len+32(FP), R_SLEN
+	MOVQ R_SBASE, R_SRC
+	MOVQ R_SBASE, R_SEND
+	ADDQ R_SLEN, R_SEND
+	XORQ R_OFF, R_OFF
+
+loop:
+	// for s < len(src)
+	CMPQ R_SRC, R_SEND
+	JEQ  end
+
+	// R_LEN = uint32(src[s])
+	//
+	// switch src[s] & 0x03
+	MOVBLZX (R_SRC), R_LEN
+	MOVL    R_LEN, R_TMP1
+	ANDL    $3, R_TMP1
+	CMPL    R_TMP1, $1
+	JAE     tagCopy
+
+	// ----------------------------------------
+	// The code below handles literal tags.
+
+	// case tagLiteral:
+	// x := uint32(src[s] >> 2)
+	// switch
+	SHRL $2, R_LEN
+	CMPL R_LEN, $60
+	JAE  tagLit60Plus
+
+	// case x < 60:
+	// s++
+	INCQ R_SRC
+
+doLit:
+	// This is the end of the inner "switch", when we have a literal tag.
+	//
+	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
+	// used in the pure Go decode_other.go code.
+
+	// length = int(x) + 1
+	//
+	// Unlike the pure Go code, we don't need to check if length <= 0 because
+	// R_LEN can hold 64 bits, so the increment cannot overflow.
+	INCQ R_LEN
+
+	// Prepare to check if copying length bytes will run past the end of dst or
+	// src.
+	//
+	// R_TMP0 = len(dst) - d
+	// R_TMP1 = len(src) - s
+	MOVQ R_DEND, R_TMP0
+	SUBQ R_DST, R_TMP0
+	MOVQ R_SEND, R_TMP1
+	SUBQ R_SRC, R_TMP1
+
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMPQ R_LEN, $16
+	JGT  callMemmove
+	CMPQ R_TMP0, $16
+	JLT  callMemmove
+	CMPQ R_TMP1, $16
+	JLT  callMemmove
+
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(R_SRC), X0
+	MOVOU X0, 0(R_DST)
+
+	// d += length
+	// s += length
+	ADDQ R_LEN, R_DST
+	ADDQ R_LEN, R_SRC
+	JMP  loop
+
+callMemmove:
+	// if length > len(dst)-d || length > len(src)-s { etc }
+	CMPQ R_LEN, R_TMP0
+	JGT  errCorrupt
+	CMPQ R_LEN, R_TMP1
+	JGT  errCorrupt
+
+	// copy(dst[d:], src[s:s+length])
+	//
+	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+	// R_DST, R_SRC and R_LEN as arguments at 0-16(SP). R_DST, R_SRC, R_LEN and R_OFF are
+	// also spilled at 24-48(SP) to survive the CALL: seven 8-byte slots, hence the 56-byte frame.
+	MOVQ R_DST, 0(SP)
+	MOVQ R_SRC, 8(SP)
+	MOVQ R_LEN, 16(SP)
+	MOVQ R_DST, 24(SP)
+	MOVQ R_SRC, 32(SP)
+	MOVQ R_LEN, 40(SP)
+	MOVQ R_OFF, 48(SP)
+	CALL runtime·memmove(SB)
+
+	// Restore local variables: unspill registers from the stack and
+	// re-calculate R_DBASE-R_SEND.
+	MOVQ 24(SP), R_DST
+	MOVQ 32(SP), R_SRC
+	MOVQ 40(SP), R_LEN
+	MOVQ 48(SP), R_OFF
+	MOVQ dst_base+0(FP), R_DBASE
+	MOVQ dst_len+8(FP), R_DLEN
+	MOVQ R_DBASE, R_DEND
+	ADDQ R_DLEN, R_DEND
+	MOVQ src_base+24(FP), R_SBASE
+	MOVQ src_len+32(FP), R_SLEN
+	MOVQ R_SBASE, R_SEND
+	ADDQ R_SLEN, R_SEND
+
+	// d += length
+	// s += length
+	ADDQ R_LEN, R_DST
+	ADDQ R_LEN, R_SRC
+	JMP  loop
+
+tagLit60Plus:
+	// !!! This fragment does the
+	//
+	// s += x - 58; if uint(s) > uint(len(src)) { etc }
+	//
+	// checks. In the asm version, we code it once instead of once per switch case.
+	ADDQ R_LEN, R_SRC
+	SUBQ $58, R_SRC
+	CMPQ R_SRC, R_SEND
+	JA   errCorrupt
+
+	// case x == 60:
+	CMPL R_LEN, $61
+	JEQ  tagLit61
+	JA   tagLit62Plus
+
+	// x = uint32(src[s-1])
+	MOVBLZX -1(R_SRC), R_LEN
+	JMP     doLit
+
+tagLit61:
+	// case x == 61:
+	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
+	MOVWLZX -2(R_SRC), R_LEN
+	JMP     doLit
+
+tagLit62Plus:
+	CMPL R_LEN, $62
+	JA   tagLit63
+
+	// case x == 62:
+	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+	// We read one byte, safe to read one back, since we are just reading tag.
+	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
+	MOVL -4(R_SRC), R_LEN
+	SHRL $8, R_LEN
+	JMP  doLit
+
+tagLit63:
+	// case x == 63:
+	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+	MOVL -4(R_SRC), R_LEN
+	JMP  doLit
+
+// The code above handles literal tags.
+// ----------------------------------------
+// The code below handles copy tags.
+
+tagCopy4:
+	// case tagCopy4:
+	// s += 5
+	ADDQ $5, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	CMPQ R_SRC, R_SEND
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-5])>>2
+	SHRQ $2, R_LEN
+	INCQ R_LEN
+
+	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+	MOVLQZX -4(R_SRC), R_OFF
+	JMP     doCopy
+
+tagCopy2:
+	// case tagCopy2:
+	// s += 3
+	ADDQ $3, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	CMPQ R_SRC, R_SEND
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-3])>>2
+	SHRQ $2, R_LEN
+	INCQ R_LEN
+
+	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+	MOVWQZX -2(R_SRC), R_OFF
+	JMP     doCopy
+
+tagCopy:
+	// We have a copy tag. We assume that:
+	//	- R_TMP1 == src[s] & 0x03
+	//	- R_LEN == src[s]
+	CMPQ R_TMP1, $2
+	JEQ  tagCopy2
+	JA   tagCopy4
+
+	// case tagCopy1:
+	// s += 2
+	ADDQ $2, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	CMPQ R_SRC, R_SEND
+	JA   errCorrupt
+
+	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+	// length = 4 + int(src[s-2])>>2&0x7
+	MOVBQZX -1(R_SRC), R_TMP1
+	MOVQ    R_LEN, R_TMP0
+	SHRQ    $2, R_LEN
+	ANDQ    $0xe0, R_TMP0
+	ANDQ    $7, R_LEN
+	SHLQ    $3, R_TMP0
+	ADDQ    $4, R_LEN
+	ORQ     R_TMP1, R_TMP0
+
+	// check if repeat code, ZF set by ORQ.
+	JZ repeatCode
+
+	// This is a regular copy, transfer our temporary value to R_OFF (offset)
+	MOVQ R_TMP0, R_OFF
+	JMP  doCopy
+
+// This is a repeat code.
+repeatCode:
+	// If length < 9, reuse last offset, with the length already calculated.
+	CMPQ R_LEN, $9
+	JL   doCopyRepeat
+
+	// Read additional bytes for length.
+	JE repeatLen1
+
+	// Rare, so the extra branch shouldn't hurt too much.
+	CMPQ R_LEN, $10
+	JE   repeatLen2
+	JMP  repeatLen3
+
+// Read repeat lengths.
+repeatLen1:
+	// s ++
+	ADDQ $1, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	CMPQ R_SRC, R_SEND
+	JA   errCorrupt
+
+	// length = src[s-1] + 8
+	MOVBQZX -1(R_SRC), R_LEN
+	ADDL    $8, R_LEN
+	JMP     doCopyRepeat
+
+repeatLen2:
+	// s +=2
+	ADDQ $2, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	CMPQ R_SRC, R_SEND
+	JA   errCorrupt
+
+	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
+	MOVWQZX -2(R_SRC), R_LEN
+	ADDL    $260, R_LEN
+	JMP     doCopyRepeat
+
+repeatLen3:
+	// s +=3
+	ADDQ $3, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	CMPQ R_SRC, R_SEND
+	JA   errCorrupt
+
+	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
+	// Read one byte further back (just part of the tag, shifted out)
+	MOVL -4(R_SRC), R_LEN
+	SHRL $8, R_LEN
+	ADDL $65540, R_LEN
+	JMP  doCopyRepeat
+
+doCopy:
+	// This is the end of the outer "switch", when we have a copy tag.
+	//
+	// We assume that:
+	//	- R_LEN == length && R_LEN > 0
+	//	- R_OFF == offset
+
+	// if d < offset { etc }
+	MOVQ R_DST, R_TMP1
+	SUBQ R_DBASE, R_TMP1
+	CMPQ R_TMP1, R_OFF
+	JLT  errCorrupt
+
+	// Repeat values can skip the test above, since any offset > 0 will be in dst.
+doCopyRepeat:
+	// if offset <= 0 { etc }
+	CMPQ R_OFF, $0
+	JLE  errCorrupt
+
+	// if length > len(dst)-d { etc }
+	MOVQ R_DEND, R_TMP1
+	SUBQ R_DST, R_TMP1
+	CMPQ R_LEN, R_TMP1
+	JGT  errCorrupt
+
+	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+	//
+	// Set:
+	//	- R_TMP2 = len(dst)-d
+	//	- R_TMP3 = &dst[d-offset]
+	MOVQ R_DEND, R_TMP2
+	SUBQ R_DST, R_TMP2
+	MOVQ R_DST, R_TMP3
+	SUBQ R_OFF, R_TMP3
+
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto slowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMPQ R_LEN, $16
+	JGT  slowForwardCopy
+	CMPQ R_OFF, $8
+	JLT  slowForwardCopy
+	CMPQ R_TMP2, $16
+	JLT  slowForwardCopy
+	MOVQ 0(R_TMP3), R_TMP0
+	MOVQ R_TMP0, 0(R_DST)
+	MOVQ 8(R_TMP3), R_TMP1
+	MOVQ R_TMP1, 8(R_DST)
+	ADDQ R_LEN, R_DST
+	JMP  loop
+
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy algorithm.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUBQ $10, R_TMP2
+	CMPQ R_LEN, R_TMP2
+	JGT  verySlowForwardCopy
+
+	// We want to keep the offset, so we use R_TMP2 from here.
+	MOVQ R_OFF, R_TMP2
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together means that d-offset, and therefore
+	//   // R_TMP3, is unchanged.
+	// }
+	CMPQ R_TMP2, $8
+	JGE  fixUpSlowForwardCopy
+	MOVQ (R_TMP3), R_TMP1
+	MOVQ R_TMP1, (R_DST)
+	SUBQ R_TMP2, R_LEN
+	ADDQ R_TMP2, R_DST
+	ADDQ R_TMP2, R_TMP2
+	JMP  makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by R_DST being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVQ R_DST, R_TMP0
+	ADDQ R_LEN, R_DST
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	CMPQ R_LEN, $0
+	JLE  loop
+	MOVQ (R_TMP3), R_TMP1
+	MOVQ R_TMP1, (R_TMP0)
+	ADDQ $8, R_TMP3
+	ADDQ $8, R_TMP0
+	SUBQ $8, R_LEN
+	JMP  finishSlowForwardCopy
+
+verySlowForwardCopy:
+	// verySlowForwardCopy is a simple implementation of forward copy. In C
+	// parlance, this is a do/while loop instead of a while loop, since we know
+	// that length > 0. In Go syntax:
+	//
+	// for {
+	//   dst[d] = dst[d - offset]
+	//   d++
+	//   length--
+	//   if length == 0 {
+	//     break
+	//   }
+	// }
+	MOVB (R_TMP3), R_TMP1
+	MOVB R_TMP1, (R_DST)
+	INCQ R_TMP3
+	INCQ R_DST
+	DECQ R_LEN
+	JNZ  verySlowForwardCopy
+	JMP  loop
+
+// The code above handles copy tags.
+// ----------------------------------------
+
+end:
+	// This is the end of the "for s < len(src)".
+	//
+	// if d != len(dst) { etc }
+	CMPQ R_DST, R_DEND
+	JNE  errCorrupt
+
+	// return 0
+	MOVQ $0, ret+48(FP)
+	RET
+
+errCorrupt:
+	// return decodeErrCodeCorrupt
+	MOVQ $1, ret+48(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
new file mode 100644
index 00000000..4b63d508
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
@@ -0,0 +1,574 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+#define R_TMP0 R2
+#define R_TMP1 R3
+#define R_LEN R4
+#define R_OFF R5
+#define R_SRC R6
+#define R_DST R7
+#define R_DBASE R8
+#define R_DLEN R9
+#define R_DEND R10
+#define R_SBASE R11
+#define R_SLEN R12
+#define R_SEND R13
+#define R_TMP2 R14
+#define R_TMP3 R15
+
+// TEST_SRC branches to errCorrupt when R_SRC > R_SEND, i.e. when s has run past len(src).
+#define TEST_SRC() \
+	CMP R_SEND, R_SRC \
+	BGT errCorrupt
+
+// MOVD R_SRC, R_TMP1
+// SUB  R_SBASE, R_TMP1, R_TMP1
+// CMP  R_SLEN, R_TMP1
+// BGT  errCorrupt
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func s2Decode(dst, src []byte) int
+//
+// All local variables fit into registers. The 56-byte frame is only used to push the memmove
+// args and spill registers around the CALL; dst (24), src (24) and the int result (8) make 56 argument bytes. The register allocation:
+//	- R_TMP0	scratch
+//	- R_TMP1	scratch
+//	- R_LEN	length or x
+//	- R_OFF	offset
+//	- R_SRC	&src[s]
+//	- R_DST	&dst[d]
+//	+ R_DBASE	dst_base
+//	+ R_DLEN	dst_len
+//	+ R_DEND	dst_base + dst_len
+//	+ R_SBASE	src_base
+//	+ R_SLEN	src_len
+//	+ R_SEND	src_base + src_len
+//	- R_TMP2	used by doCopy
+//	- R_TMP3	used by doCopy
+//
+// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly R_DST - R_DBASE,  and len(dst)-d is R_DEND - R_DST.
+// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
+TEXT ·s2Decode(SB), NOSPLIT, $56-56
+	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
+	MOVD dst_base+0(FP), R_DBASE
+	MOVD dst_len+8(FP), R_DLEN
+	MOVD R_DBASE, R_DST
+	MOVD R_DBASE, R_DEND
+	ADD  R_DLEN, R_DEND, R_DEND
+	MOVD src_base+24(FP), R_SBASE
+	MOVD src_len+32(FP), R_SLEN
+	MOVD R_SBASE, R_SRC
+	MOVD R_SBASE, R_SEND
+	ADD  R_SLEN, R_SEND, R_SEND
+	MOVD $0, R_OFF
+
+loop:
+	// for s < len(src)
+	CMP R_SEND, R_SRC
+	BEQ end
+
+	// R_LEN = uint32(src[s])
+	//
+	// switch src[s] & 0x03
+	MOVBU (R_SRC), R_LEN
+	MOVW  R_LEN, R_TMP1
+	ANDW  $3, R_TMP1
+	MOVW  $1, R1
+	CMPW  R1, R_TMP1
+	BGE   tagCopy
+
+	// ----------------------------------------
+	// The code below handles literal tags.
+
+	// case tagLiteral:
+	// x := uint32(src[s] >> 2)
+	// switch
+	MOVW $60, R1
+	LSRW $2, R_LEN, R_LEN
+	CMPW R_LEN, R1
+	BLS  tagLit60Plus
+
+	// case x < 60:
+	// s++
+	ADD $1, R_SRC, R_SRC
+
+doLit:
+	// This is the end of the inner "switch", when we have a literal tag.
+	//
+	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
+	// used in the pure Go decode_other.go code.
+
+	// length = int(x) + 1
+	//
+	// Unlike the pure Go code, we don't need to check if length <= 0 because
+	// R_LEN can hold 64 bits, so the increment cannot overflow.
+	ADD $1, R_LEN, R_LEN
+
+	// Prepare to check if copying length bytes will run past the end of dst or
+	// src.
+	//
+	// R_TMP0 = len(dst) - d
+	// R_TMP1 = len(src) - s
+	MOVD R_DEND, R_TMP0
+	SUB  R_DST, R_TMP0, R_TMP0
+	MOVD R_SEND, R_TMP1
+	SUB  R_SRC, R_TMP1, R_TMP1
+
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMP $16, R_LEN
+	BGT callMemmove
+	CMP $16, R_TMP0
+	BLT callMemmove
+	CMP $16, R_TMP1
+	BLT callMemmove
+
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	LDP 0(R_SRC), (R_TMP2, R_TMP3)
+	STP (R_TMP2, R_TMP3), 0(R_DST)
+
+	// d += length
+	// s += length
+	ADD R_LEN, R_DST, R_DST
+	ADD R_LEN, R_SRC, R_SRC
+	B   loop
+
+callMemmove:
+	// if length > len(dst)-d || length > len(src)-s { etc }
+	CMP R_TMP0, R_LEN
+	BGT errCorrupt
+	CMP R_TMP1, R_LEN
+	BGT errCorrupt
+
+	// copy(dst[d:], src[s:s+length])
+	//
+	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+	// R_DST, R_SRC and R_LEN as arguments at 8-24(RSP). R_DST, R_SRC, R_LEN and R_OFF are
+	// also spilled at 32-56(RSP), to save local variables across the CALL.
+	MOVD R_DST, 8(RSP)
+	MOVD R_SRC, 16(RSP)
+	MOVD R_LEN, 24(RSP)
+	MOVD R_DST, 32(RSP)
+	MOVD R_SRC, 40(RSP)
+	MOVD R_LEN, 48(RSP)
+	MOVD R_OFF, 56(RSP)
+	CALL runtime·memmove(SB)
+
+	// Restore local variables: unspill registers from the stack and
+	// re-calculate R_DBASE-R_SEND.
+	MOVD 32(RSP), R_DST
+	MOVD 40(RSP), R_SRC
+	MOVD 48(RSP), R_LEN
+	MOVD 56(RSP), R_OFF
+	MOVD dst_base+0(FP), R_DBASE
+	MOVD dst_len+8(FP), R_DLEN
+	MOVD R_DBASE, R_DEND
+	ADD  R_DLEN, R_DEND, R_DEND
+	MOVD src_base+24(FP), R_SBASE
+	MOVD src_len+32(FP), R_SLEN
+	MOVD R_SBASE, R_SEND
+	ADD  R_SLEN, R_SEND, R_SEND
+
+	// d += length
+	// s += length
+	ADD R_LEN, R_DST, R_DST
+	ADD R_LEN, R_SRC, R_SRC
+	B   loop
+
+tagLit60Plus:
+	// !!! This fragment does the
+	//
+	// s += x - 58; if uint(s) > uint(len(src)) { etc }
+	//
+	// checks. In the asm version, we code it once instead of once per switch case.
+	ADD R_LEN, R_SRC, R_SRC
+	SUB $58, R_SRC, R_SRC
+	TEST_SRC()
+
+	// case x == 60:
+	MOVW $61, R1
+	CMPW R1, R_LEN
+	BEQ  tagLit61
+	BGT  tagLit62Plus
+
+	// x = uint32(src[s-1])
+	MOVBU -1(R_SRC), R_LEN
+	B     doLit
+
+tagLit61:
+	// case x == 61:
+	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
+	MOVHU -2(R_SRC), R_LEN
+	B     doLit
+
+tagLit62Plus:
+	CMPW $62, R_LEN
+	BHI  tagLit63
+
+	// case x == 62:
+	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+	MOVHU -3(R_SRC), R_LEN
+	MOVBU -1(R_SRC), R_TMP1
+	ORR   R_TMP1<<16, R_LEN
+	B     doLit
+
+tagLit63:
+	// case x == 63:
+	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+	MOVWU -4(R_SRC), R_LEN
+	B     doLit
+
+	// The code above handles literal tags.
+	// ----------------------------------------
+	// The code below handles copy tags.
+
+tagCopy4:
+	// case tagCopy4:
+	// s += 5
+	ADD $5, R_SRC, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVD R_SRC, R_TMP1
+	SUB  R_SBASE, R_TMP1, R_TMP1
+	CMP  R_SLEN, R_TMP1
+	BGT  errCorrupt
+
+	// length = 1 + int(src[s-5])>>2
+	MOVD $1, R1
+	ADD  R_LEN>>2, R1, R_LEN
+
+	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+	MOVWU -4(R_SRC), R_OFF
+	B     doCopy
+
+tagCopy2:
+	// case tagCopy2:
+	// s += 3
+	ADD $3, R_SRC, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	TEST_SRC()
+
+	// length = 1 + int(src[s-3])>>2
+	MOVD $1, R1
+	ADD  R_LEN>>2, R1, R_LEN
+
+	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+	MOVHU -2(R_SRC), R_OFF
+	B     doCopy
+
+tagCopy:
+	// We have a copy tag. We assume that:
+	//	- R_TMP1 == src[s] & 0x03
+	//	- R_LEN == src[s]
+	CMP $2, R_TMP1
+	BEQ tagCopy2
+	BGT tagCopy4
+
+	// case tagCopy1:
+	// s += 2
+	ADD $2, R_SRC, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	TEST_SRC()
+
+	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+	// Calculate offset in R_TMP0 in case it is a repeat.
+	MOVD  R_LEN, R_TMP0
+	AND   $0xe0, R_TMP0
+	MOVBU -1(R_SRC), R_TMP1
+	ORR   R_TMP0<<3, R_TMP1, R_TMP0
+
+	// length = 4 + int(src[s-2])>>2&0x7
+	MOVD $7, R1
+	AND  R_LEN>>2, R1, R_LEN
+	ADD  $4, R_LEN, R_LEN
+
+	// check if repeat code with offset 0.
+	CMP $0, R_TMP0
+	BEQ repeatCode
+
+	// This is a regular copy, transfer our temporary value to R_OFF (offset)
+	MOVD R_TMP0, R_OFF
+	B    doCopy
+
+	// This is a repeat code.
+repeatCode:
+	// If length < 9, reuse last offset, with the length already calculated.
+	CMP $9, R_LEN
+	BLT doCopyRepeat
+	BEQ repeatLen1
+	CMP $10, R_LEN
+	BEQ repeatLen2
+
+repeatLen3:
+	// s +=3
+	ADD $3, R_SRC, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	TEST_SRC()
+
+	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
+	MOVBU -1(R_SRC), R_TMP0
+	MOVHU -3(R_SRC), R_LEN
+	ORR   R_TMP0<<16, R_LEN, R_LEN
+	ADD   $65540, R_LEN, R_LEN
+	B     doCopyRepeat
+
+repeatLen2:
+	// s +=2
+	ADD $2, R_SRC, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	TEST_SRC()
+
+	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
+	MOVHU -2(R_SRC), R_LEN
+	ADD   $260, R_LEN, R_LEN
+	B     doCopyRepeat
+
+repeatLen1:
+	// s +=1
+	ADD $1, R_SRC, R_SRC
+
+	// if uint(s) > uint(len(src)) { etc }
+	TEST_SRC()
+
+	// length = src[s-1] + 8
+	MOVBU -1(R_SRC), R_LEN
+	ADD   $8, R_LEN, R_LEN
+	B     doCopyRepeat
+
+doCopy:
+	// This is the end of the outer "switch", when we have a copy tag.
+	//
+	// We assume that:
+	//	- R_LEN == length && R_LEN > 0
+	//	- R_OFF == offset
+
+	// if d < offset { etc }
+	MOVD R_DST, R_TMP1
+	SUB  R_DBASE, R_TMP1, R_TMP1
+	CMP  R_OFF, R_TMP1
+	BLT  errCorrupt
+
+	// Repeat values can skip the test above, since any offset > 0 will be in dst.
+doCopyRepeat:
+
+	// if offset <= 0 { etc }
+	CMP $0, R_OFF
+	BLE errCorrupt
+
+	// if length > len(dst)-d { etc }
+	MOVD R_DEND, R_TMP1
+	SUB  R_DST, R_TMP1, R_TMP1
+	CMP  R_TMP1, R_LEN
+	BGT  errCorrupt
+
+	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+	//
+	// Set:
+	//	- R_TMP2 = len(dst)-d
+	//	- R_TMP3 = &dst[d-offset]
+	MOVD R_DEND, R_TMP2
+	SUB  R_DST, R_TMP2, R_TMP2
+	MOVD R_DST, R_TMP3
+	SUB  R_OFF, R_TMP3, R_TMP3
+
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto slowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMP  $16, R_LEN
+	BGT  slowForwardCopy
+	CMP  $8, R_OFF
+	BLT  slowForwardCopy
+	CMP  $16, R_TMP2
+	BLT  slowForwardCopy
+	MOVD 0(R_TMP3), R_TMP0
+	MOVD R_TMP0, 0(R_DST)
+	MOVD 8(R_TMP3), R_TMP1
+	MOVD R_TMP1, 8(R_DST)
+	ADD  R_LEN, R_DST, R_DST
+	B    loop
+
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy algorithm.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUB $10, R_TMP2, R_TMP2
+	CMP R_TMP2, R_LEN
+	BGT verySlowForwardCopy
+
+	// We want to keep the offset, so we use R_TMP2 from here.
+	MOVD R_OFF, R_TMP2
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together means that d-offset, and therefore
+	//   // R_TMP3, is unchanged.
+	// }
+	CMP  $8, R_TMP2
+	BGE  fixUpSlowForwardCopy
+	MOVD (R_TMP3), R_TMP1
+	MOVD R_TMP1, (R_DST)
+	SUB  R_TMP2, R_LEN, R_LEN
+	ADD  R_TMP2, R_DST, R_DST
+	ADD  R_TMP2, R_TMP2, R_TMP2
+	B    makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by R_DST being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVD R_DST, R_TMP0
+	ADD  R_LEN, R_DST, R_DST
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	MOVD $0, R1
+	CMP  R1, R_LEN
+	BLE  loop
+	MOVD (R_TMP3), R_TMP1
+	MOVD R_TMP1, (R_TMP0)
+	ADD  $8, R_TMP3, R_TMP3
+	ADD  $8, R_TMP0, R_TMP0
+	SUB  $8, R_LEN, R_LEN
+	B    finishSlowForwardCopy
+
+verySlowForwardCopy:
+	// verySlowForwardCopy is a simple implementation of forward copy. In C
+	// parlance, this is a do/while loop instead of a while loop, since we know
+	// that length > 0. In Go syntax:
+	//
+	// for {
+	//   dst[d] = dst[d - offset]
+	//   d++
+	//   length--
+	//   if length == 0 {
+	//     break
+	//   }
+	// }
+	MOVB (R_TMP3), R_TMP1
+	MOVB R_TMP1, (R_DST)
+	ADD  $1, R_TMP3, R_TMP3
+	ADD  $1, R_DST, R_DST
+	SUB  $1, R_LEN, R_LEN
+	CBNZ R_LEN, verySlowForwardCopy
+	B    loop
+
+	// The code above handles copy tags.
+	// ----------------------------------------
+
+end:
+	// This is the end of the "for s < len(src)".
+	//
+	// if d != len(dst) { etc }
+	CMP R_DEND, R_DST
+	BNE errCorrupt
+
+	// return 0
+	MOVD $0, ret+48(FP)
+	RET
+
+errCorrupt:
+	// return decodeErrCodeCorrupt
+	MOVD $1, R_TMP0
+	MOVD R_TMP0, ret+48(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/s2/decode_asm.go b/vendor/github.com/klauspost/compress/s2/decode_asm.go
new file mode 100644
index 00000000..cb3576ed
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_asm.go
@@ -0,0 +1,17 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (amd64 || arm64) && !appengine && gc && !noasm
+// +build amd64 arm64
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package s2
+
+// s2Decode has the same semantics as in decode_other.go; the body lives in decode_amd64.s / decode_arm64.s.
+//
+//go:noescape
+func s2Decode(dst, src []byte) int
diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go
new file mode 100644
index 00000000..1074ebd2
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_other.go
@@ -0,0 +1,267 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (!amd64 && !arm64) || appengine || !gc || noasm
+// +build !amd64,!arm64 appengine !gc noasm
+
+package s2
+
+import (
+	"fmt"
+	"strconv"
+)
+
+// s2Decode writes the decoding of src to dst. It assumes that the varint-encoded
+// length of the decompressed bytes has already been read, and that len(dst)
+// equals that length.
+//
+// It returns 0 on success or a decodeErrCodeXxx error code on failure.
+func s2Decode(dst, src []byte) int {
+	const debug = false
+	if debug {
+		fmt.Println("Starting decode, dst len:", len(dst))
+	}
+	var d, s, length int
+	offset := 0
+
+	// Fast path: while at least 6 bytes remain, any tag and its extra bytes can be read unchecked.
+	for s < len(src)-5 {
+		switch src[s] & 0x03 {
+		case tagLiteral:
+			x := uint32(src[s] >> 2)
+			switch {
+			case x < 60:
+				s++
+			case x == 60:
+				s += 2
+				x = uint32(src[s-1])
+			case x == 61:
+				s += 3
+				x = uint32(src[s-2]) | uint32(src[s-1])<<8
+			case x == 62:
+				s += 4
+				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+			case x == 63:
+				s += 5
+				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+			}
+			length = int(x) + 1
+			if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
+				return decodeErrCodeCorrupt
+			}
+			if debug {
+				fmt.Println("literals, length:", length, "d-after:", d+length)
+			}
+
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			continue
+
+		case tagCopy1:
+			s += 2
+			length = int(src[s-2]) >> 2 & 0x7
+			toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+			if toffset == 0 {
+				if debug {
+					fmt.Print("(repeat) ")
+				}
+				// Zero offset is a repeat: keep the last offset; length codes 5-7 read extra length bytes.
+				switch length {
+				case 5:
+					s += 1
+					length = int(uint32(src[s-1])) + 4
+				case 6:
+					s += 2
+					length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
+				case 7:
+					s += 3
+					length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
+				default: // 0-4: the length code is used as is
+				}
+			} else {
+				offset = toffset
+			}
+			length += 4
+		case tagCopy2:
+			s += 3
+			length = 1 + int(src[s-3])>>2
+			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+
+		case tagCopy4:
+			s += 5
+			length = 1 + int(src[s-5])>>2
+			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+		}
+
+		if offset <= 0 || d < offset || length > len(dst)-d {
+			return decodeErrCodeCorrupt
+		}
+
+		if debug {
+			fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
+		}
+
+		// Copy from an earlier sub-slice of dst to a later sub-slice.
+		// If source and destination do not overlap, use the built-in copy:
+		if offset > length {
+			copy(dst[d:d+length], dst[d-offset:])
+			d += length
+			continue
+		}
+
+		// Unlike the built-in copy function, this byte-by-byte copy always runs
+		// forwards, even if the slices overlap. Conceptually, this is:
+		//
+		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
+		//
+		// We align the slices into a and b and show the compiler they are the same size.
+		// This allows the loop to run without bounds checks.
+		a := dst[d : d+length]
+		b := dst[d-offset:]
+		b = b[:len(a)]
+		for i := range a {
+			a[i] = b[i]
+		}
+		d += length
+	}
+
+	// Slow path: decode the remaining bytes, bounds-checking s after every multi-byte advance.
+	for s < len(src) {
+		switch src[s] & 0x03 {
+		case tagLiteral:
+			x := uint32(src[s] >> 2)
+			switch {
+			case x < 60:
+				s++
+			case x == 60:
+				s += 2
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-1])
+			case x == 61:
+				s += 3
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-2]) | uint32(src[s-1])<<8
+			case x == 62:
+				s += 4
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+			case x == 63:
+				s += 5
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+			}
+			length = int(x) + 1
+			if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
+				return decodeErrCodeCorrupt
+			}
+			if debug {
+				fmt.Println("literals, length:", length, "d-after:", d+length)
+			}
+
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			continue
+
+		case tagCopy1:
+			s += 2
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = int(src[s-2]) >> 2 & 0x7
+			toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+			if toffset == 0 {
+				if debug {
+					fmt.Print("(repeat) ")
+				}
+				// Zero offset is a repeat: keep the last offset; length codes 5-7 read extra length bytes.
+				switch length {
+				case 5:
+					s += 1
+					if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+						return decodeErrCodeCorrupt
+					}
+					length = int(uint32(src[s-1])) + 4
+				case 6:
+					s += 2
+					if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+						return decodeErrCodeCorrupt
+					}
+					length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
+				case 7:
+					s += 3
+					if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+						return decodeErrCodeCorrupt
+					}
+					length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
+				default: // 0-4: the length code is used as is
+				}
+			} else {
+				offset = toffset
+			}
+			length += 4
+		case tagCopy2:
+			s += 3
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-3])>>2
+			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+
+		case tagCopy4:
+			s += 5
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-5])>>2
+			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+		}
+
+		if offset <= 0 || d < offset || length > len(dst)-d {
+			return decodeErrCodeCorrupt
+		}
+
+		if debug {
+			fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
+		}
+
+		// Copy from an earlier sub-slice of dst to a later sub-slice.
+		// If source and destination do not overlap, use the built-in copy:
+		if offset > length {
+			copy(dst[d:d+length], dst[d-offset:])
+			d += length
+			continue
+		}
+
+		// Unlike the built-in copy function, this byte-by-byte copy always runs
+		// forwards, even if the slices overlap. Conceptually, this is:
+		//
+		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
+		//
+		// We align the slices into a and b and show the compiler they are the same size.
+		// This allows the loop to run without bounds checks.
+		a := dst[d : d+length]
+		b := dst[d-offset:]
+		b = b[:len(a)]
+		for i := range a {
+			a[i] = b[i]
+		}
+		d += length
+	}
+
+	if d != len(dst) { // the block must fill dst exactly
+		return decodeErrCodeCorrupt
+	}
+	return 0
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go
new file mode 100644
index 00000000..aa8b108d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode.go
@@ -0,0 +1,1172 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"crypto/rand"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"math"
+	"math/bits"
+	"runtime"
+	"sync"
+)
+
+// Encode compresses src into a single S2 block and returns it. When dst has
+// enough capacity for the worst-case output the result is a sub-slice of
+// dst; otherwise a fresh slice is allocated and returned.
+//
+// dst and src must not overlap. A nil dst is allowed.
+//
+// Decoding a block needs as much memory as encoding it, and a block cannot
+// be decoded concurrently.
+// Blocks carry no CRC, so corrupted data may go unnoticed.
+//
+// For larger amounts of data prefer the streaming interface, which
+// provides all of these features.
+func Encode(dst, src []byte) []byte {
+	maxLen := MaxEncodedLen(len(src))
+	switch {
+	case maxLen < 0:
+		panic(ErrTooLarge)
+	case cap(dst) < maxLen:
+		dst = make([]byte, maxLen)
+	default:
+		dst = dst[:maxLen]
+	}
+
+	// Header: the uncompressed size of src as a uvarint.
+	hdr := binary.PutUvarint(dst, uint64(len(src)))
+	body := dst[hdr:]
+	var written int
+	switch {
+	case len(src) == 0:
+		// Nothing to encode; the block is just the header.
+	case len(src) < minNonLiteralBlockSize:
+		written = emitLiteral(body, src)
+	default:
+		written = encodeBlock(body, src)
+		if written <= 0 { // Not compressible
+			written = emitLiteral(body, src)
+		}
+	}
+	return dst[:hdr+written]
+}
+
+// EncodeBetter returns the encoded form of src. The returned slice is a sub-
+// slice of dst when cap(dst) is at least MaxEncodedLen(len(src)); the length
+// of dst is ignored. Otherwise, a newly allocated slice is returned.
+//
+// EncodeBetter compresses better than Encode but typically with a
+// 10-40% speed decrease on both compression and decompression.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+// It panics with ErrTooLarge if src is too large to encode.
+//
+// Decoding a block requires the same amount of memory as encoding it, and
+// blocks do not allow concurrent decoding.
+// Blocks do not contain CRC information, so corruption may go undetected.
+//
+// For larger amounts of data, consider the streaming interface instead.
+func EncodeBetter(dst, src []byte) []byte {
+	if n := MaxEncodedLen(len(src)); n < 0 {
+		panic(ErrTooLarge)
+	} else if cap(dst) < n {
+		dst = make([]byte, n)
+	} else {
+		dst = dst[:n] // reuse the caller's buffer, as Encode does
+	}
+
+	// The block starts with the varint-encoded length of the decompressed bytes.
+	d := binary.PutUvarint(dst, uint64(len(src)))
+
+	if len(src) == 0 {
+		return dst[:d]
+	}
+	if len(src) < minNonLiteralBlockSize {
+		d += emitLiteral(dst[d:], src)
+		return dst[:d]
+	}
+	if n := encodeBlockBetter(dst[d:], src); n > 0 {
+		return dst[:d+n]
+	}
+	// Not compressible
+	d += emitLiteral(dst[d:], src)
+	return dst[:d]
+}
+
+// EncodeBest returns the encoded form of src. The returned slice is a sub-
+// slice of dst when cap(dst) is at least MaxEncodedLen(len(src)); the length
+// of dst is ignored. Otherwise, a newly allocated slice is returned.
+//
+// EncodeBest compresses as good as reasonably possible but with a
+// big speed decrease.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+// It panics with ErrTooLarge if src is too large to encode.
+//
+// Decoding a block requires the same amount of memory as encoding it, and
+// blocks do not allow concurrent decoding.
+// Blocks do not contain CRC information, so corruption may go undetected.
+//
+// For larger amounts of data, consider the streaming interface instead.
+func EncodeBest(dst, src []byte) []byte {
+	if n := MaxEncodedLen(len(src)); n < 0 {
+		panic(ErrTooLarge)
+	} else if cap(dst) < n {
+		dst = make([]byte, n)
+	} else {
+		dst = dst[:n] // reuse the caller's buffer, as Encode does
+	}
+
+	// The block starts with the varint-encoded length of the decompressed bytes.
+	d := binary.PutUvarint(dst, uint64(len(src)))
+
+	if len(src) == 0 {
+		return dst[:d]
+	}
+	if len(src) < minNonLiteralBlockSize {
+		d += emitLiteral(dst[d:], src)
+		return dst[:d]
+	}
+	if n := encodeBlockBest(dst[d:], src); n > 0 {
+		return dst[:d+n]
+	}
+	// Not compressible
+	d += emitLiteral(dst[d:], src)
+	return dst[:d]
+}
+
+// EncodeSnappy compresses src into a single block and returns it. When dst
+// has enough capacity for the worst-case output the result is a sub-slice
+// of dst; otherwise a fresh slice is allocated and returned.
+//
+// The output is Snappy compatible and will likely decompress faster.
+//
+// dst and src must not overlap. A nil dst is allowed.
+//
+// Decoding a block needs as much memory as encoding it, and a block cannot
+// be decoded concurrently.
+// Blocks carry no CRC, so corrupted data may go unnoticed.
+//
+// For larger amounts of data prefer the streaming interface, which
+// provides all of these features.
+func EncodeSnappy(dst, src []byte) []byte {
+	maxLen := MaxEncodedLen(len(src))
+	switch {
+	case maxLen < 0:
+		panic(ErrTooLarge)
+	case cap(dst) < maxLen:
+		dst = make([]byte, maxLen)
+	default:
+		dst = dst[:maxLen]
+	}
+
+	// Header: the uncompressed size of src as a uvarint.
+	hdr := binary.PutUvarint(dst, uint64(len(src)))
+	body := dst[hdr:]
+
+	var written int
+	switch {
+	case len(src) == 0:
+		// Nothing to encode; the block is just the header.
+	case len(src) < minNonLiteralBlockSize:
+		written = emitLiteral(body, src)
+	default:
+		written = encodeBlockSnappy(body, src)
+		if written <= 0 { // Not compressible
+			written = emitLiteral(body, src)
+		}
+	}
+	return dst[:hdr+written]
+}
+
+// EncodeSnappyBetter compresses src into a single block and returns it. When
+// dst has enough capacity for the worst-case output the result is a
+// sub-slice of dst; otherwise a fresh slice is allocated and returned.
+//
+// The output is Snappy compatible and will likely decompress faster.
+//
+// dst and src must not overlap. A nil dst is allowed.
+//
+// Decoding a block needs as much memory as encoding it, and a block cannot
+// be decoded concurrently.
+// Blocks carry no CRC, so corrupted data may go unnoticed.
+//
+// For larger amounts of data prefer the streaming interface, which
+// provides all of these features.
+func EncodeSnappyBetter(dst, src []byte) []byte {
+	maxLen := MaxEncodedLen(len(src))
+	switch {
+	case maxLen < 0:
+		panic(ErrTooLarge)
+	case cap(dst) < maxLen:
+		dst = make([]byte, maxLen)
+	default:
+		dst = dst[:maxLen]
+	}
+
+	// Header: the uncompressed size of src as a uvarint.
+	hdr := binary.PutUvarint(dst, uint64(len(src)))
+	body := dst[hdr:]
+
+	var written int
+	switch {
+	case len(src) == 0:
+		// Nothing to encode; the block is just the header.
+	case len(src) < minNonLiteralBlockSize:
+		written = emitLiteral(body, src)
+	default:
+		written = encodeBlockBetterSnappy(body, src)
+		if written <= 0 { // Not compressible
+			written = emitLiteral(body, src)
+		}
+	}
+	return dst[:hdr+written]
+}
+
+// EncodeSnappyBest compresses src into a single block and returns it. When
+// dst has enough capacity for the worst-case output the result is a
+// sub-slice of dst; otherwise a fresh slice is allocated and returned.
+//
+// The output is Snappy compatible and will likely decompress faster.
+//
+// dst and src must not overlap. A nil dst is allowed.
+//
+// Decoding a block needs as much memory as encoding it, and a block cannot
+// be decoded concurrently.
+// Blocks carry no CRC, so corrupted data may go unnoticed.
+//
+// For larger amounts of data prefer the streaming interface, which
+// provides all of these features.
+func EncodeSnappyBest(dst, src []byte) []byte {
+	maxLen := MaxEncodedLen(len(src))
+	switch {
+	case maxLen < 0:
+		panic(ErrTooLarge)
+	case cap(dst) < maxLen:
+		dst = make([]byte, maxLen)
+	default:
+		dst = dst[:maxLen]
+	}
+
+	// Header: the uncompressed size of src as a uvarint.
+	hdr := binary.PutUvarint(dst, uint64(len(src)))
+	body := dst[hdr:]
+
+	var written int
+	switch {
+	case len(src) == 0:
+		// Nothing to encode; the block is just the header.
+	case len(src) < minNonLiteralBlockSize:
+		written = emitLiteral(body, src)
+	default:
+		written = encodeBlockBestSnappy(body, src)
+		if written <= 0 { // Not compressible
+			written = emitLiteral(body, src)
+		}
+	}
+	return dst[:hdr+written]
+}
+
+// ConcatBlocks joins the supplied blocks into a single block and appends the
+// result to dst. If dst is nil or lacks capacity, a new slice is allocated.
+// The blocks are not validated, so garbage in = garbage out.
+// dst must not overlap any of the block data.
+// Existing bytes in dst are kept untouched and are not treated as a block.
+func ConcatBlocks(dst []byte, blocks ...[]byte) ([]byte, error) {
+	var decoded uint64
+	var payload int
+	for i := range blocks {
+		n, skip, err := decodedLen(blocks[i])
+		if err != nil {
+			return nil, err
+		}
+		decoded += uint64(n)
+		payload += len(blocks[i]) - skip
+	}
+	switch {
+	case decoded == 0:
+		return append(dst, 0), nil
+	case decoded > math.MaxUint32:
+		return nil, ErrTooLarge
+	}
+	var hdrBuf [binary.MaxVarintLen32]byte
+	hdr := hdrBuf[:binary.PutUvarint(hdrBuf[:], decoded)]
+
+	// Make room for the new header plus every block body in one step.
+	if need := len(hdr) + payload; cap(dst)-len(dst) < need {
+		grown := make([]byte, 0, need+len(dst))
+		dst = append(grown, dst...)
+	}
+	dst = append(dst, hdr...)
+	for i := range blocks {
+		_, skip, err := decodedLen(blocks[i])
+		if err != nil {
+			return nil, err
+		}
+		dst = append(dst, blocks[i][skip:]...)
+	}
+	return dst, nil
+}
+
+// inputMargin is the minimum number of extra input bytes to keep, inside
+// encodeBlock's inner loop. On some architectures, this margin lets us
+// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
+// literals can be implemented as a single load to and store from a 16-byte
+// register. That literal's actual length can be as short as 1 byte, so this
+// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
+// the encoding loop will fix up the copy overrun, and this inputMargin ensures
+// that we don't overrun the dst and src buffers.
+const inputMargin = 8
+
+// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
+// will be accepted by the encoder.
+const minNonLiteralBlockSize = 32
+
+// MaxBlockSize is the maximum value where MaxEncodedLen will return a valid block size.
+// Blocks this big are highly discouraged, though.
+const MaxBlockSize = math.MaxUint32 - binary.MaxVarintLen32 - 5
+
+// MaxEncodedLen reports the largest size a block can take when encoding
+// srcLen uncompressed bytes.
+//
+// A negative result means srcLen is too large (or negative) to encode.
+// 32 bit platforms will have lower thresholds for rejecting big content.
+func MaxEncodedLen(srcLen int) int {
+	const limit = math.MaxUint32
+	size := uint64(srcLen)
+	if size > limit {
+		// A negative srcLen wraps to a huge value and is rejected here too.
+		return -1
+	}
+	// Bytes taken by the uvarint that stores the uncompressed length.
+	size += uint64((bits.Len64(size) + 7) / 7)
+	// Overhead when the whole input has to be emitted as literals.
+	size += uint64(literalExtraSize(int64(srcLen)))
+	if size > limit {
+		return -1
+	}
+	return int(size)
+}
+
+var errClosed = errors.New("s2: Writer is closed")
+
+// NewWriter creates a Writer that compresses everything written to it and
+// sends the result to w in the framing format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+//
+// Close must be called to make sure all data reaches the underlying
+// io.Writer and that resources are released.
+// Flush may be called any number of times before Close.
+func NewWriter(w io.Writer, opts ...WriterOption) *Writer {
+	enc := &Writer{
+		level:       levelFast,
+		randSrc:     rand.Reader,
+		concurrency: runtime.GOMAXPROCS(0),
+		blockSize:   defaultBlockSize,
+	}
+	for i := range opts {
+		if err := opts[i](enc); err != nil {
+			enc.errState = err
+			return enc
+		}
+	}
+	enc.paramsOK = true
+	enc.ibuf = make([]byte, 0, enc.blockSize)
+	enc.obufLen = obufHeaderLen + MaxEncodedLen(enc.blockSize)
+	enc.buffers.New = func() interface{} {
+		return make([]byte, enc.obufLen)
+	}
+	enc.Reset(w)
+	return enc
+}
+
+// Writer is an io.Writer that compresses written bytes into a Snappy/S2 framed stream.
+type Writer struct {
+	errMu    sync.Mutex // held by err while reading or updating errState
+	errState error      // first error recorded via err; nil while healthy
+
+	// ibuf buffers incoming (uncompressed) bytes until a block is emitted.
+	ibuf []byte
+
+	blockSize   int              // maximum uncompressed bytes per block
+	obufLen     int              // obufHeaderLen + MaxEncodedLen(blockSize); length of pooled buffers
+	concurrency int              // 1 selects the synchronous path (writeSync)
+	written     int64            // bytes accepted by writer so far
+	output      chan chan result // ordered queue of pending block results
+	buffers     sync.Pool        // pool of []byte created with length obufLen
+	pad         int              // NOTE(review): unused in this part of the file; presumably output padding - confirm
+
+	writer   io.Writer      // destination of the compressed stream
+	randSrc  io.Reader      // NOTE(review): presumably randomness for padding - confirm
+	writerWg sync.WaitGroup // tracks the goroutine started by Reset
+
+	// wroteStreamHeader reports whether the stream header (magic chunk) has been queued or written.
+	wroteStreamHeader bool
+	paramsOK          bool  // set by NewWriter once all options applied; Reset does nothing without it
+	snappy            bool  // use the Snappy magic chunk and the Snappy-compatible encoders
+	flushOnWrite      bool  // Write and EncodeBuffer skip ibuf and emit blocks immediately
+	level             uint8 // one of the level* constants; selects the block encoder
+}
+
+const (
+	levelUncompressed = iota + 1
+	levelFast
+	levelBetter
+	levelBest
+)
+
+type result []byte
+
+// err reports the writer's sticky error state.
+// The first non-nil error passed in is stored and returned from then on.
+func (w *Writer) err(err error) error {
+	w.errMu.Lock()
+	defer w.errMu.Unlock()
+
+	// Only the first error sticks; a nil argument is a pure query.
+	if w.errState == nil && err != nil {
+		w.errState = err
+	}
+	return w.errState
+}
+
+// Reset discards the writer's state and switches it to write to writer, so a
+// Writer can be reused. It does nothing if the Writer's options were invalid.
+func (w *Writer) Reset(writer io.Writer) {
+	if !w.paramsOK {
+		return
+	}
+	// Shut down the previous writer goroutine, if any, and wait for it to drain.
+	if w.output != nil {
+		close(w.output)
+		w.writerWg.Wait()
+		w.output = nil
+	}
+	w.errState = nil
+	w.ibuf = w.ibuf[:0]
+	w.wroteStreamHeader = false
+	w.written = 0
+	w.writer = writer
+	// If we didn't get a writer, stop here.
+	if writer == nil {
+		return
+	}
+	// If no concurrency requested, don't spin up writer goroutine.
+	if w.concurrency == 1 {
+		return
+	}
+
+	toWrite := make(chan chan result, w.concurrency)
+	w.output = toWrite
+	w.writerWg.Add(1)
+
+	// Start a writer goroutine that will write all output in order.
+	go func() {
+		defer w.writerWg.Done()
+
+		// Take queued writes in the order they were submitted.
+		for write := range toWrite {
+			// Wait for the data to be available.
+			in := <-write
+			if len(in) > 0 {
+				if w.err(nil) == nil {
+					// Cap the slice so the destination cannot see data from previous buffers.
+					payload := in[:len(in):len(in)]
+					// Write to output.
+					n, err := writer.Write(payload)
+					if err == nil && n != len(payload) {
+						err = io.ErrShortWrite // short write without an error, as in writeSync
+					}
+					_ = w.err(err)
+					w.written += int64(n)
+				}
+			}
+			if cap(in) >= w.obufLen { // only full-size buffers go back to the pool
+				w.buffers.Put([]byte(in))
+			}
+			// close the incoming write request.
+			// This can be used for synchronizing flushes.
+			close(write)
+		}
+	}()
+}
+
+// Write implements io.Writer, buffering p until a block can be emitted.
+func (w *Writer) Write(p []byte) (nRet int, errRet error) {
+	if w.flushOnWrite {
+		return w.write(p)
+	}
+	// Emit blocks for as long as p does not fit in the space left in ibuf.
+	for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err(nil) == nil {
+		var consumed int
+		if len(w.ibuf) > 0 {
+			// Top up the partially filled buffer and send it off.
+			free := cap(w.ibuf) - len(w.ibuf)
+			w.ibuf = append(w.ibuf, p[:free]...)
+			consumed = free
+			w.write(w.ibuf)
+			w.ibuf = w.ibuf[:0]
+		} else {
+			// Nothing buffered: compress straight from p, skipping the copy.
+			consumed, _ = w.write(p)
+		}
+		nRet += consumed
+		p = p[consumed:]
+	}
+	if err := w.err(nil); err != nil {
+		return nRet, err
+	}
+	// Whatever is left of p now fits in ibuf; keep it for a later block.
+	w.ibuf = append(w.ibuf, p...)
+	nRet += len(p)
+	return nRet, nil
+}
+
+// ReadFrom implements the io.ReaderFrom interface.
+// Data already buffered by Write is flushed first; r is then read in blocks
+// of up to blockSize bytes directly into pooled buffers, avoiding a copy.
+// If r implements byter, the slice from Bytes() is encoded directly and flushed.
+// It returns the number of bytes read and any error other than io.EOF.
+func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) {
+	if len(w.ibuf) > 0 {
+		err := w.Flush()
+		if err != nil {
+			return 0, err
+		}
+	}
+	if br, ok := r.(byter); ok {
+		buf := br.Bytes()
+		if err := w.EncodeBuffer(buf); err != nil {
+			return 0, err
+		}
+		return int64(len(buf)), w.Flush()
+	}
+	for {
+		inbuf := w.buffers.Get().([]byte)[:w.blockSize+obufHeaderLen]
+		n2, err := io.ReadFull(r, inbuf[obufHeaderLen:]) // leave room for the chunk header in front
+		if err != nil {
+			if err == io.ErrUnexpectedEOF { // a short final block is a normal end of input
+				err = io.EOF
+			}
+			if err != io.EOF {
+				return n, w.err(err)
+			}
+		}
+		if n2 == 0 { // nothing was read: done
+			break
+		}
+		n += int64(n2)
+		err2 := w.writeFull(inbuf[:n2+obufHeaderLen])
+		if w.err(err2) != nil {
+			break
+		}
+
+		if err != nil {
+			// err is io.EOF here: the final block has been handed to writeFull.
+			break
+		}
+	}
+
+	return n, w.err(nil)
+}
+
+// EncodeBuffer will add a buffer to the stream.
+// This is the fastest way to encode a stream,
+// but the input buffer cannot be written to by the caller
+// until Flush or Close has been called when concurrency != 1.
+//
+// If you cannot control that, use the regular Write function.
+//
+// Note that input is not buffered.
+// This means that each call results in discrete blocks of at most blockSize bytes.
+// For buffered writes, use the regular Write function.
+func (w *Writer) EncodeBuffer(buf []byte) (err error) {
+	if err := w.err(nil); err != nil {
+		return err
+	}
+
+	if w.flushOnWrite {
+		_, err := w.write(buf)
+		return err
+	}
+	// Flush queued data first.
+	if len(w.ibuf) > 0 {
+		err := w.Flush()
+		if err != nil {
+			return err
+		}
+	}
+	if w.concurrency == 1 {
+		_, err := w.writeSync(buf)
+		return err
+	}
+
+	// Queue the stream header (magic chunk) once, ahead of the first block.
+	if !w.wroteStreamHeader {
+		w.wroteStreamHeader = true
+		hWriter := make(chan result)
+		w.output <- hWriter
+		if w.snappy {
+			hWriter <- []byte(magicChunkSnappy)
+		} else {
+			hWriter <- []byte(magicChunk)
+		}
+	}
+
+	for len(buf) > 0 {
+		// Cut off the next block of at most blockSize bytes.
+		uncompressed := buf
+		if len(uncompressed) > w.blockSize {
+			uncompressed = uncompressed[:w.blockSize]
+		}
+		buf = buf[len(uncompressed):]
+		// Get an output buffer sized for the chunk header plus the raw block.
+		obuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen]
+		output := make(chan result)
+		// Queue output now, so we keep order.
+		w.output <- output
+		go func() {
+			checksum := crc(uncompressed)
+
+			// Assume the block is stored uncompressed: chunk body is 4 checksum bytes plus the data.
+			chunkType := uint8(chunkTypeUncompressedData)
+			chunkLen := 4 + len(uncompressed)
+
+			// Attempt compressing; the block starts with the uvarint uncompressed length.
+			n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
+			n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
+
+			// Check if we should use this, or store as uncompressed instead.
+			if n2 > 0 {
+				chunkType = uint8(chunkTypeCompressedData)
+				chunkLen = 4 + n + n2
+				obuf = obuf[:obufHeaderLen+n+n2]
+			} else {
+				// No gain: copy the raw bytes in after the header.
+				copy(obuf[obufHeaderLen:], uncompressed)
+			}
+
+			// Fill in the per-chunk header: type, 24-bit little-endian length, 32-bit little-endian checksum.
+			obuf[0] = chunkType
+			obuf[1] = uint8(chunkLen >> 0)
+			obuf[2] = uint8(chunkLen >> 8)
+			obuf[3] = uint8(chunkLen >> 16)
+			obuf[4] = uint8(checksum >> 0)
+			obuf[5] = uint8(checksum >> 8)
+			obuf[6] = uint8(checksum >> 16)
+			obuf[7] = uint8(checksum >> 24)
+
+			// Queue final output.
+			output <- obuf
+		}()
+	}
+	return nil
+}
+
+func (w *Writer) encodeBlock(obuf, uncompressed []byte) int {
+	// Pick the encoder by level, then by output flavour. Levels without an
+	// encoder fall through and yield 0, as does an incompressible block.
+	switch w.level {
+	case levelFast:
+		if w.snappy {
+			return encodeBlockSnappy(obuf, uncompressed)
+		}
+		return encodeBlock(obuf, uncompressed)
+	case levelBetter:
+		if w.snappy {
+			return encodeBlockBetterSnappy(obuf, uncompressed)
+		}
+		return encodeBlockBetter(obuf, uncompressed)
+	case levelBest:
+		if w.snappy {
+			return encodeBlockBestSnappy(obuf, uncompressed)
+		}
+		return encodeBlockBest(obuf, uncompressed)
+	}
+	return 0
+}
+
+func (w *Writer) write(p []byte) (nRet int, errRet error) {
+	if err := w.err(nil); err != nil {
+		return 0, err
+	}
+	if w.concurrency == 1 {
+		return w.writeSync(p)
+	}
+
+	// Split p into blocks; each block is encoded in its own goroutine and queued on w.output.
+	for len(p) > 0 {
+		if !w.wroteStreamHeader {
+			w.wroteStreamHeader = true
+			hWriter := make(chan result)
+			w.output <- hWriter
+			if w.snappy {
+				hWriter <- []byte(magicChunkSnappy)
+			} else {
+				hWriter <- []byte(magicChunk)
+			}
+		}
+
+		var uncompressed []byte
+		if len(p) > w.blockSize {
+			uncompressed, p = p[:w.blockSize], p[w.blockSize:]
+		} else {
+			uncompressed, p = p, nil
+		}
+
+		// Copy input so the goroutine below does not read from the caller's slice.
+		// If the block is incompressible, this copy is used for the result.
+		inbuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen]
+		obuf := w.buffers.Get().([]byte)[:w.obufLen]
+		copy(inbuf[obufHeaderLen:], uncompressed)
+		uncompressed = inbuf[obufHeaderLen:]
+
+		output := make(chan result)
+		// Queue output now, so we keep order.
+		w.output <- output
+		go func() {
+			checksum := crc(uncompressed)
+
+			// Assume the block is stored uncompressed: chunk body is 4 checksum bytes plus the data.
+			chunkType := uint8(chunkTypeUncompressedData)
+			chunkLen := 4 + len(uncompressed)
+
+			// Attempt compressing; the block starts with the uvarint uncompressed length.
+			n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
+			n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
+
+			// Check if we should use this, or store as uncompressed instead.
+			if n2 > 0 {
+				chunkType = uint8(chunkTypeCompressedData)
+				chunkLen = 4 + n + n2
+				obuf = obuf[:obufHeaderLen+n+n2]
+			} else {
+				// Use input as output; the swap makes the unused buffer the one recycled below.
+				obuf, inbuf = inbuf, obuf
+			}
+
+			// Fill in the per-chunk header: type, 24-bit little-endian length, 32-bit little-endian checksum.
+			obuf[0] = chunkType
+			obuf[1] = uint8(chunkLen >> 0)
+			obuf[2] = uint8(chunkLen >> 8)
+			obuf[3] = uint8(chunkLen >> 16)
+			obuf[4] = uint8(checksum >> 0)
+			obuf[5] = uint8(checksum >> 8)
+			obuf[6] = uint8(checksum >> 16)
+			obuf[7] = uint8(checksum >> 24)
+
+			// Queue final output.
+			output <- obuf
+
+			// Put unused buffer back in pool.
+			w.buffers.Put(inbuf)
+		}()
+		nRet += len(uncompressed)
+	}
+	return nRet, nil
+}
+
+// writeFull is a special version of write that will always write the full buffer.
+// Data to be compressed should start at offset obufHeaderLen and fill the remainder of the buffer.
+// The data will be written as a single block.
+// The caller must not use inbuf afterwards: it is either sent as output or put back in the pool.
+func (w *Writer) writeFull(inbuf []byte) (errRet error) {
+	if err := w.err(nil); err != nil {
+		return err
+	}
+
+	if w.concurrency == 1 {
+		_, err := w.writeSync(inbuf[obufHeaderLen:])
+		return err
+	}
+
+	// Queue the stream header (magic chunk) once, ahead of the first block.
+	if !w.wroteStreamHeader {
+		w.wroteStreamHeader = true
+		hWriter := make(chan result)
+		w.output <- hWriter
+		if w.snappy {
+			hWriter <- []byte(magicChunkSnappy)
+		} else {
+			hWriter <- []byte(magicChunk)
+		}
+	}
+
+	// Get an output buffer.
+	obuf := w.buffers.Get().([]byte)[:w.obufLen]
+	uncompressed := inbuf[obufHeaderLen:]
+
+	output := make(chan result)
+	// Queue output now, so we keep order.
+	w.output <- output
+	go func() {
+		checksum := crc(uncompressed)
+
+		// Assume the block is stored uncompressed: chunk body is 4 checksum bytes plus the data.
+		chunkType := uint8(chunkTypeUncompressedData)
+		chunkLen := 4 + len(uncompressed)
+
+		// Attempt compressing; the block starts with the uvarint uncompressed length.
+		n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
+		n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
+
+		// Check if we should use this, or store as uncompressed instead.
+		if n2 > 0 {
+			chunkType = uint8(chunkTypeCompressedData)
+			chunkLen = 4 + n + n2
+			obuf = obuf[:obufHeaderLen+n+n2]
+		} else {
+			// Use input as output; the swap makes the unused buffer the one recycled below.
+			obuf, inbuf = inbuf, obuf
+		}
+
+		// Fill in the per-chunk header: type, 24-bit little-endian length, 32-bit little-endian checksum.
+		obuf[0] = chunkType
+		obuf[1] = uint8(chunkLen >> 0)
+		obuf[2] = uint8(chunkLen >> 8)
+		obuf[3] = uint8(chunkLen >> 16)
+		obuf[4] = uint8(checksum >> 0)
+		obuf[5] = uint8(checksum >> 8)
+		obuf[6] = uint8(checksum >> 16)
+		obuf[7] = uint8(checksum >> 24)
+
+		// Queue final output.
+		output <- obuf
+
+		// Put unused buffer back in pool.
+		w.buffers.Put(inbuf)
+	}()
+	return nil
+}
+
+func (w *Writer) writeSync(p []byte) (nRet int, errRet error) {
+	if err := w.err(nil); err != nil {
+		return 0, err
+	}
+	if !w.wroteStreamHeader {
+		w.wroteStreamHeader = true
+		var n int
+		var err error
+		if w.snappy {
+			n, err = w.writer.Write([]byte(magicChunkSnappy))
+		} else {
+			n, err = w.writer.Write([]byte(magicChunk))
+		}
+		if err != nil {
+			return 0, w.err(err)
+		}
+		if n != len(magicChunk) { // NOTE(review): also used on the snappy path; presumably both magic chunks have equal length - confirm
+			return 0, w.err(io.ErrShortWrite)
+		}
+		w.written += int64(n)
+	}
+
+	for len(p) > 0 {
+		var uncompressed []byte
+		if len(p) > w.blockSize {
+			uncompressed, p = p[:w.blockSize], p[w.blockSize:]
+		} else {
+			uncompressed, p = p, nil
+		}
+
+		obuf := w.buffers.Get().([]byte)[:w.obufLen]
+		checksum := crc(uncompressed)
+
+		// Assume the block is stored uncompressed: chunk body is 4 checksum bytes plus the data.
+		chunkType := uint8(chunkTypeUncompressedData)
+		chunkLen := 4 + len(uncompressed)
+
+		// Attempt compressing; the block starts with the uvarint uncompressed length.
+		n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
+		n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
+
+		if n2 > 0 {
+			chunkType = uint8(chunkTypeCompressedData)
+			chunkLen = 4 + n + n2
+			obuf = obuf[:obufHeaderLen+n+n2]
+		} else {
+			obuf = obuf[:8] // header only; the raw block is written separately below
+		}
+
+		// Fill in the per-chunk header: type, 24-bit little-endian length, 32-bit little-endian checksum.
+		obuf[0] = chunkType
+		obuf[1] = uint8(chunkLen >> 0)
+		obuf[2] = uint8(chunkLen >> 8)
+		obuf[3] = uint8(chunkLen >> 16)
+		obuf[4] = uint8(checksum >> 0)
+		obuf[5] = uint8(checksum >> 8)
+		obuf[6] = uint8(checksum >> 16)
+		obuf[7] = uint8(checksum >> 24)
+
+		n, err := w.writer.Write(obuf)
+		if err != nil {
+			return 0, w.err(err)
+		}
+		if n != len(obuf) {
+			return 0, w.err(io.ErrShortWrite)
+		}
+		w.written += int64(n)
+		if chunkType == chunkTypeUncompressedData {
+			// Write the raw block straight from the input, right after its header.
+			n, err := w.writer.Write(uncompressed)
+			if err != nil {
+				return 0, w.err(err)
+			}
+			if n != len(uncompressed) {
+				return 0, w.err(io.ErrShortWrite)
+			}
+			w.written += int64(n)
+		}
+		w.buffers.Put(obuf)
+		// Count the uncompressed bytes consumed by this block.
+		nRet += len(uncompressed)
+	}
+	return nRet, nil
+}
+
+// Flush flushes the Writer to its underlying io.Writer: buffered input is
+// encoded and, when an output queue exists, Flush waits for it to drain.
+// This does not apply padding.
+func (w *Writer) Flush() error {
+	if err := w.err(nil); err != nil {
+		return err
+	}
+
+	// Hand any data still in the input buffer to the encoder.
+	if len(w.ibuf) > 0 {
+		if !w.wroteStreamHeader {
+			// Stream header not written yet: write everything synchronously and stop.
+			_, werr := w.writeSync(w.ibuf)
+			w.ibuf = w.ibuf[:0]
+			return w.err(werr)
+		}
+		_, werr := w.write(w.ibuf)
+		w.ibuf = w.ibuf[:0]
+		if rec := w.err(werr); rec != nil {
+			return rec
+		}
+	}
+	if w.output == nil {
+		return w.err(nil)
+	}
+
+	// Queue an empty result behind everything already queued.
+	marker := make(chan result)
+	w.output <- marker
+	// Block until this has been picked up.
+	marker <- nil
+	// When it is closed, we have flushed.
+	<-marker
+	return w.err(nil)
+}
+
+// Close calls Flush, stops the output goroutine, writes padding if configured, and closes the Writer.
+// Calling Close multiple times is ok. A failed padding write is returned to the caller.
+func (w *Writer) Close() error {
+	err := w.Flush()
+	if w.output != nil {
+		close(w.output)
+		w.writerWg.Wait()
+		w.output = nil
+	}
+	if w.err(nil) == nil && w.writer != nil && w.pad > 0 {
+		add := calcSkippableFrame(w.written, int64(w.pad))
+		frame, ferr := skippableFrame(w.ibuf[:0], add, w.randSrc)
+		if ferr = w.err(ferr); ferr != nil {
+			return ferr
+		}
+		_, werr := w.writer.Write(frame)
+		err = w.err(werr) // no error is recorded at this point, so this yields werr (or nil)
+	}
+	_ = w.err(errClosed)
+	if err == errClosed {
+		return nil
+	}
+	return err
+}
+
+const skippableFrameHeader = 4
+
+// calcSkippableFrame returns how many bytes must be added for written to
+// become divisible by wantMultiple. It returns 0 when written is already a
+// multiple; otherwise the result is always >= skippableFrameHeader.
+// The function will panic if written < 0 or wantMultiple <= 0.
+func calcSkippableFrame(written, wantMultiple int64) int {
+	switch {
+	case wantMultiple <= 0:
+		panic("wantMultiple <= 0")
+	case written < 0:
+		panic("written < 0")
+	}
+	rem := written % wantMultiple
+	if rem == 0 {
+		return 0
+	}
+	pad := wantMultiple - rem
+	for pad < skippableFrameHeader {
+		pad += wantMultiple
+	}
+	return int(pad)
+}
+
+// skippableFrame appends a padding chunk with a total size of total bytes to
+// dst and returns the extended slice. The chunk body is read from r.
+// A total of 0 is a no-op. Otherwise total must be >= skippableFrameHeader
+// and < maxBlockSize + skippableFrameHeader, or an error is returned.
+func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) {
+	switch {
+	case total == 0:
+		return dst, nil
+	case total < skippableFrameHeader:
+		return dst, fmt.Errorf("s2: requested skippable frame (%d) < 4", total)
+	case int64(total) >= maxBlockSize+skippableFrameHeader:
+		return dst, fmt.Errorf("s2: requested skippable frame (%d) >= max 1<<24", total)
+	}
+	body := uint32(total - skippableFrameHeader)
+	// Chunk type 0xfe "Section 4.4 Padding (chunk type 0xfe)"
+	dst = append(dst, chunkTypePadding)
+	// Body length as 3 bytes, least significant byte first.
+	dst = append(dst, uint8(body), uint8(body>>8), uint8(body>>16))
+	bodyStart := len(dst)
+	dst = append(dst, make([]byte, body)...)
+	_, err := io.ReadFull(r, dst[bodyStart:])
+	return dst, err
+}
+
+// WriterOption is an option for creating an encoder.
+type WriterOption func(*Writer) error
+
+// WriterConcurrency will set the concurrency,
+// meaning the maximum number of encoders to run concurrently.
+// The value supplied must be at least 1; smaller values make the option fail.
+// By default this will be set to GOMAXPROCS.
+func WriterConcurrency(n int) WriterOption {
+	return func(wr *Writer) error {
+		if n < 1 {
+			return errors.New("concurrency must be at least 1")
+		}
+		wr.concurrency = n
+		return nil
+	}
+}
+
+// WriterBetterCompression switches the writer to the "better" level.
+// EncodeBetter compresses better than Encode but typically with a
+// 10-40% speed decrease on both compression and decompression.
+func WriterBetterCompression() WriterOption {
+	return func(wr *Writer) error {
+		wr.level = levelBetter
+		return nil
+	}
+}
+
+// WriterBestCompression switches the writer to the "best" level.
+// It compresses better than the default level but typically with a
+// big speed decrease on compression.
+func WriterBestCompression() WriterOption {
+	return func(wr *Writer) error {
+		wr.level = levelBest
+		return nil
+	}
+}
+
+// WriterUncompressed disables compression.
+// The stream will be written as uncompressed blocks only.
+// If concurrency is > 1 CRC and output will still be done async.
+func WriterUncompressed() WriterOption {
+	return func(wr *Writer) error {
+		wr.level = levelUncompressed
+		return nil
+	}
+}
+
+// WriterBlockSize allows to override the default block size.
+// Blocks will be this size or smaller.
+// Minimum size is 4KB and maximum size is 4MB.
+// Default block size is 1MB.
+//
+// Bigger blocks may give bigger throughput on systems with many cores,
+// and will increase compression slightly, but it will limit the possible
+// concurrency for smaller payloads for both encoding and decoding.
+// If WriterSnappyCompat was applied before this option, the maximum is 64KB.
+func WriterBlockSize(n int) WriterOption {
+	return func(w *Writer) error {
+		switch {
+		case n < minBlockSize:
+			return errors.New("s2: block size too small. Must be >= 4KB")
+		case w.snappy && n > maxSnappyBlockSize:
+			return errors.New("s2: block size too large. Must be <= 64K and >=4KB on for snappy compatible output")
+		case n > maxBlockSize:
+			return errors.New("s2: block size too large. Must be <= 4MB and >=4KB")
+		}
+		w.blockSize = n
+		return nil
+	}
+}
+
+// WriterPadding will add padding to all output so the size will be a multiple of n.
+// This can be used to obfuscate the exact output size or make blocks of a certain size.
+// The contents will be a skippable frame, so it will be invisible to the decoder.
+// n must be > 0 and <= 4MB. A value of 1 disables padding.
+// The padded area will be filled with data from crypto/rand.Reader.
+// The padding will be applied whenever Close is called on the writer.
+func WriterPadding(n int) WriterOption {
+	return func(w *Writer) error {
+		switch {
+		case n <= 0:
+			return fmt.Errorf("s2: padding must be at least 1")
+		case n == 1:
+			// Every size is a multiple of 1. No need to waste our time.
+			w.pad = 0
+			return nil
+		case n > maxBlockSize:
+			return fmt.Errorf("s2: padding must less than 4MB")
+		}
+		w.pad = n
+		return nil
+	}
+}
+
+// WriterPaddingSrc makes the writer read padding bytes from the supplied reader.
+// By default crypto/rand is used.
+func WriterPaddingSrc(reader io.Reader) WriterOption {
+	return func(wr *Writer) error {
+		wr.randSrc = reader
+		return nil
+	}
+}
+
+// WriterSnappyCompat will write snappy compatible output.
+// The output can be decompressed using either snappy or s2.
+// If block size is more than 64KB it is set to that.
+func WriterSnappyCompat() WriterOption {
+	return func(w *Writer) error {
+		w.snappy = true
+		if w.blockSize > 64<<10 {
+			// We choose 8 bytes less than 64K, since that will make literal emits slightly more effective.
+			// And allows us to skip some size checks.
+			w.blockSize = (64 << 10) - 8
+		}
+		return nil
+	}
+}
+
+// WriterFlushOnWrite will compress blocks on each call to the Write function.
+//
+// This is quite inefficient as blocks size will depend on the write size.
+//
+// Use WriterConcurrency(1) to also make sure that output is flushed
+// when Write calls return, otherwise they will be written when compression is done.
+func WriterFlushOnWrite() WriterOption {
+	return func(wr *Writer) error {
+		wr.flushOnWrite = true
+		return nil
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go
new file mode 100644
index 00000000..8b16c38a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_all.go
@@ -0,0 +1,456 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"bytes"
+	"encoding/binary"
+	"math/bits"
+)
+
+func load32(b []byte, i int) uint32 {
+	return binary.LittleEndian.Uint32(b[i:]) // little-endian 32-bit value starting at b[i]
+}
+
+func load64(b []byte, i int) uint64 {
+	return binary.LittleEndian.Uint64(b[i:]) // little-endian 64-bit value starting at b[i]
+}
+
+// hash6 hashes the lowest 6 bytes of u to fit a table with h bits; h should be a constant < 64.
+func hash6(u uint64, h uint8) uint32 {
+	const prime6bytes = 227718039650203
+	low6 := u << (64 - 48) // shift out the top 2 bytes so only the low 6 contribute
+	return uint32((low6 * prime6bytes) >> ((64 - h) & 63))
+}
+
+// encodeGo returns the encoded form of src. dst is used as the output buffer
+// when it is at least MaxEncodedLen(len(src)) long, otherwise a new one is
+// allocated. It panics with ErrTooLarge when MaxEncodedLen reports a negative size.
+func encodeGo(dst, src []byte) []byte {
+	need := MaxEncodedLen(len(src))
+	if need < 0 {
+		panic(ErrTooLarge)
+	}
+	if len(dst) < need {
+		dst = make([]byte, need)
+	}
+
+	// The block starts with the varint-encoded length of the decompressed bytes.
+	pos := binary.PutUvarint(dst, uint64(len(src)))
+	switch {
+	case len(src) == 0:
+		return dst[:pos]
+	case len(src) >= minNonLiteralBlockSize:
+		if n := encodeBlockGo(dst[pos:], src); n > 0 {
+			return dst[:pos+n]
+		}
+	}
+	// Too short to attempt, or not compressible: emit everything as one literal.
+	pos += emitLiteral(dst[pos:], src)
+	return dst[:pos]
+}
+
+// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written. It returns the number of bytes written to dst, or 0 when it
+// bails out because the output would exceed dstLimit.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) && minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockGo(dst, src []byte) (d int) {
+	// Hash table mapping hash6 of the input to the last src position stored for it.
+	const (
+		tableBits    = 14
+		maxTableSize = 1 << tableBits
+
+		debug = false
+	)
+
+	var table [maxTableSize]uint32
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// Bail if we can't compress to at least this (input size minus 1/32 of it minus 5).
+	dstLimit := len(src) - len(src)>>5 - 5
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	cv := load64(src, s)
+
+	// We search for a repeat at -1, but don't output repeats when nextEmit == 0
+	repeat := 1
+
+	for {
+		candidate := 0
+		for {
+			// Next src position to check; the step grows by 1 per 64 bytes since nextEmit.
+			nextS := s + (s-nextEmit)>>6 + 4
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			hash0 := hash6(cv, tableBits)    // bytes starting at s
+			hash1 := hash6(cv>>8, tableBits) // bytes starting at s+1
+			candidate = int(table[hash0])
+			candidate2 := int(table[hash1])
+			table[hash0] = uint32(s)
+			table[hash1] = uint32(s + 1)
+			hash2 := hash6(cv>>16, tableBits) // bytes starting at s+2
+
+			// Check repeat at offset checkRep.
+			const checkRep = 1
+			if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+				base := s + checkRep
+				// Extend back
+				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+					i--
+					base--
+				}
+				d += emitLiteral(dst[d:], src[nextEmit:base])
+
+				// Extend forward
+				candidate := s - repeat + 4 + checkRep
+				s += 4 + checkRep
+				for s <= sLimit {
+					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+						s += bits.TrailingZeros64(diff) >> 3 // number of equal low bytes
+						break
+					}
+					s += 8
+					candidate += 8
+				}
+				if debug {
+					// Validate match.
+					if s <= candidate {
+						panic("s <= candidate")
+					}
+					a := src[base:s]
+					b := src[base-repeat : base-repeat+(s-base)]
+					if !bytes.Equal(a, b) {
+						panic("mismatch")
+					}
+				}
+				if nextEmit > 0 {
+					// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+					d += emitRepeat(dst[d:], repeat, s-base)
+				} else {
+					// First match, cannot be repeat.
+					d += emitCopy(dst[d:], repeat, s-base)
+				}
+				nextEmit = s
+				if s >= sLimit {
+					goto emitRemainder
+				}
+
+				cv = load64(src, s)
+				continue
+			}
+
+			if uint32(cv) == load32(src, candidate) { // 4-byte match at s
+				break
+			}
+			candidate = int(table[hash2])
+			if uint32(cv>>8) == load32(src, candidate2) { // 4-byte match at s+1
+				table[hash2] = uint32(s + 2)
+				candidate = candidate2
+				s++
+				break
+			}
+			table[hash2] = uint32(s + 2)
+			if uint32(cv>>16) == load32(src, candidate) { // 4-byte match at s+2
+				s += 2
+				break
+			}
+
+			cv = load64(src, nextS)
+			s = nextS
+		}
+
+		// Extend backwards.
+		// The top bytes will be rechecked to get the full match.
+		for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+			candidate--
+			s--
+		}
+
+		// Bail if we exceed the maximum size.
+		if d+(s-nextEmit) > dstLimit {
+			return 0
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		d += emitLiteral(dst[d:], src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+			repeat = base - candidate
+
+			// Extend the 4-byte match as long as possible.
+			s += 4
+			candidate += 4
+			for s <= len(src)-8 {
+				if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+					s += bits.TrailingZeros64(diff) >> 3 // number of equal low bytes
+					break
+				}
+				s += 8
+				candidate += 8
+			}
+
+			d += emitCopy(dst[d:], repeat, s-base)
+			if debug {
+				// Validate match.
+				if s <= candidate {
+					panic("s <= candidate")
+				}
+				a := src[base:s]
+				b := src[base-repeat : base-repeat+(s-base)]
+				if !bytes.Equal(a, b) {
+					panic("mismatch")
+				}
+			}
+
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			if d > dstLimit {
+				// Do we have space for more, if not bail.
+				return 0
+			}
+			// Check for an immediate match, otherwise start search at s+1
+			x := load64(src, s-2)
+			m2Hash := hash6(x, tableBits)
+			currHash := hash6(x>>16, tableBits)
+			candidate = int(table[currHash])
+			table[m2Hash] = uint32(s - 2)
+			table[currHash] = uint32(s)
+			if debug && s == candidate {
+				panic("s == candidate")
+			}
+			if uint32(x>>16) != load32(src, candidate) {
+				cv = load64(src, s+1)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
+
+func encodeBlockSnappyGo(dst, src []byte) (d int) {
+	// Same search as encodeBlockGo, but every match is emitted with emitCopyNoRepeat.
+	const (
+		tableBits    = 14
+		maxTableSize = 1 << tableBits
+	)
+	// Hash table mapping hash6 of the input to the last src position stored for it.
+	var table [maxTableSize]uint32
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// Bail if we can't compress to at least this (input size minus 1/32 of it minus 5).
+	dstLimit := len(src) - len(src)>>5 - 5
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	cv := load64(src, s)
+
+	// We search for a repeat at -1, but don't output repeats when nextEmit == 0
+	repeat := 1
+
+	for {
+		candidate := 0
+		for {
+			// Next src position to check; the step grows by 1 per 64 bytes since nextEmit.
+			nextS := s + (s-nextEmit)>>6 + 4
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			hash0 := hash6(cv, tableBits)    // bytes starting at s
+			hash1 := hash6(cv>>8, tableBits) // bytes starting at s+1
+			candidate = int(table[hash0])
+			candidate2 := int(table[hash1])
+			table[hash0] = uint32(s)
+			table[hash1] = uint32(s + 1)
+			hash2 := hash6(cv>>16, tableBits) // bytes starting at s+2
+
+			// Check repeat at offset checkRep.
+			const checkRep = 1
+			if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+				base := s + checkRep
+				// Extend back
+				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+					i--
+					base--
+				}
+				d += emitLiteral(dst[d:], src[nextEmit:base])
+
+				// Extend forward
+				candidate := s - repeat + 4 + checkRep
+				s += 4 + checkRep
+				for s <= sLimit {
+					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+						s += bits.TrailingZeros64(diff) >> 3 // number of equal low bytes
+						break
+					}
+					s += 8
+					candidate += 8
+				}
+
+				d += emitCopyNoRepeat(dst[d:], repeat, s-base)
+				nextEmit = s
+				if s >= sLimit {
+					goto emitRemainder
+				}
+
+				cv = load64(src, s)
+				continue
+			}
+
+			if uint32(cv) == load32(src, candidate) { // 4-byte match at s
+				break
+			}
+			candidate = int(table[hash2])
+			if uint32(cv>>8) == load32(src, candidate2) { // 4-byte match at s+1
+				table[hash2] = uint32(s + 2)
+				candidate = candidate2
+				s++
+				break
+			}
+			table[hash2] = uint32(s + 2)
+			if uint32(cv>>16) == load32(src, candidate) { // 4-byte match at s+2
+				s += 2
+				break
+			}
+
+			cv = load64(src, nextS)
+			s = nextS
+		}
+
+		// Extend backwards
+		for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+			candidate--
+			s--
+		}
+
+		// Bail if we exceed the maximum size.
+		if d+(s-nextEmit) > dstLimit {
+			return 0
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		d += emitLiteral(dst[d:], src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+			repeat = base - candidate
+
+			// Extend the 4-byte match as long as possible.
+			s += 4
+			candidate += 4
+			for s <= len(src)-8 {
+				if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+					s += bits.TrailingZeros64(diff) >> 3 // number of equal low bytes
+					break
+				}
+				s += 8
+				candidate += 8
+			}
+
+			d += emitCopyNoRepeat(dst[d:], repeat, s-base)
+			if false {
+				// Validate match (disabled debug check).
+				a := src[base:s]
+				b := src[base-repeat : base-repeat+(s-base)]
+				if !bytes.Equal(a, b) {
+					panic("mismatch")
+				}
+			}
+
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			if d > dstLimit {
+				// Do we have space for more, if not bail.
+				return 0
+			}
+			// Check for an immediate match, otherwise start search at s+1
+			x := load64(src, s-2)
+			m2Hash := hash6(x, tableBits)
+			currHash := hash6(x>>16, tableBits)
+			candidate = int(table[currHash])
+			table[m2Hash] = uint32(s - 2)
+			table[currHash] = uint32(s)
+			if uint32(x>>16) != load32(src, candidate) {
+				cv = load64(src, s+1)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
new file mode 100644
index 00000000..e612225f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
@@ -0,0 +1,142 @@
+//go:build !appengine && !noasm && gc
+// +build !appengine,!noasm,gc
+
+package s2
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written. The assembly routine is selected from len(src); inputs
+// shorter than minNonLiteralBlockSize are not encoded and yield 0.
+//
+// It also assumes that:
+//
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
+	const (
+		// Inputs below this size use the 12 bit table.
+		limit12B = 16 << 10
+		// Inputs below this size use the 10 bit table.
+		limit10B = 4 << 10
+		// Inputs below this size use the 8 bit table.
+		limit8B = 512
+	)
+
+	switch n := len(src); {
+	case n >= 4<<20:
+		return encodeBlockAsm(dst, src)
+	case n >= limit12B:
+		return encodeBlockAsm4MB(dst, src)
+	case n >= limit10B:
+		return encodeBlockAsm12B(dst, src)
+	case n >= limit8B:
+		return encodeBlockAsm10B(dst, src)
+	case n < minNonLiteralBlockSize:
+		return 0
+	default:
+		return encodeBlockAsm8B(dst, src)
+	}
+}
+
+// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written. The assembly routine is selected from len(src); inputs
+// shorter than minNonLiteralBlockSize are not encoded and yield 0.
+//
+// It also assumes that:
+//
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBetter(dst, src []byte) (d int) {
+	const (
+		// Inputs below this size use the 12 bit table.
+		limit12B = 16 << 10
+		// Inputs below this size use the 10 bit table.
+		limit10B = 4 << 10
+		// Inputs below this size use the 8 bit table.
+		limit8B = 512
+	)
+
+	switch n := len(src); {
+	case n > 4<<20:
+		return encodeBetterBlockAsm(dst, src)
+	case n >= limit12B:
+		return encodeBetterBlockAsm4MB(dst, src)
+	case n >= limit10B:
+		return encodeBetterBlockAsm12B(dst, src)
+	case n >= limit8B:
+		return encodeBetterBlockAsm10B(dst, src)
+	case n < minNonLiteralBlockSize:
+		return 0
+	default:
+		return encodeBetterBlockAsm8B(dst, src)
+	}
+}
+
+// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written. The assembly routine is selected from len(src); inputs
+// shorter than minNonLiteralBlockSize are not encoded and yield 0.
+//
+// It also assumes that:
+//
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockSnappy(dst, src []byte) (d int) {
+	const (
+		// Inputs below this size use the 12 bit table.
+		limit12B = 16 << 10
+		// Inputs below this size use the 10 bit table.
+		limit10B = 4 << 10
+		// Inputs below this size use the 8 bit table.
+		limit8B = 512
+	)
+	switch n := len(src); {
+	case n >= 64<<10:
+		return encodeSnappyBlockAsm(dst, src)
+	case n >= limit12B:
+		return encodeSnappyBlockAsm64K(dst, src)
+	case n >= limit10B:
+		return encodeSnappyBlockAsm12B(dst, src)
+	case n >= limit8B:
+		return encodeSnappyBlockAsm10B(dst, src)
+	case n < minNonLiteralBlockSize:
+		return 0
+	default:
+		return encodeSnappyBlockAsm8B(dst, src)
+	}
+}
+
+// encodeBlockBetterSnappy encodes a non-empty src to a guaranteed-large-enough
+// dst. It assumes that the varint-encoded length of the decompressed bytes has
+// already been written. The assembly routine is selected from len(src); inputs
+// shorter than minNonLiteralBlockSize are not encoded and yield 0.
+//
+// It also assumes that:
+//
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBetterSnappy(dst, src []byte) (d int) {
+	const (
+		// Inputs below this size use the 12 bit table.
+		limit12B = 16 << 10
+		// Inputs below this size use the 10 bit table.
+		limit10B = 4 << 10
+		// Inputs below this size use the 8 bit table.
+		limit8B = 512
+	)
+	switch n := len(src); {
+	case n >= 64<<10:
+		return encodeSnappyBetterBlockAsm(dst, src)
+	case n >= limit12B:
+		return encodeSnappyBetterBlockAsm64K(dst, src)
+	case n >= limit10B:
+		return encodeSnappyBetterBlockAsm12B(dst, src)
+	case n >= limit8B:
+		return encodeSnappyBetterBlockAsm10B(dst, src)
+	case n < minNonLiteralBlockSize:
+		return 0
+	default:
+		return encodeSnappyBetterBlockAsm8B(dst, src)
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go
new file mode 100644
index 00000000..44803477
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_best.go
@@ -0,0 +1,604 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"fmt"
+	"math/bits"
+)
+
+// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst and
+// returns the number of bytes written to dst, or 0 when it bails out (src is too
+// short, or the output is on course to exceed len(src)-5 bytes). It assumes that
+// the varint-encoded length of the decompressed bytes has already been written.
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBest(dst, src []byte) (d int) {
+	// Initialize the hash tables.
+	const (
+		// Long hash matches: lTable is indexed by hash8 of the 8 loaded bytes.
+		lTableBits    = 19
+		maxLTableSize = 1 << lTableBits
+
+		// Short hash matches: sTable is indexed by hash4 of the low 4 bytes.
+		sTableBits    = 16
+		maxSTableSize = 1 << sTableBits
+
+		inputMargin = 8 + 2
+	)
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+	if len(src) < minNonLiteralBlockSize {
+		return 0
+	}
+
+	var lTable [maxLTableSize]uint64 // low 32 bits: newest position, high 32 bits: the position it replaced
+	var sTable [maxSTableSize]uint64 // same packing as lTable
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - 5
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	cv := load64(src, s)
+
+	// We search for a repeat at -1, but don't output repeats when nextEmit == 0
+	repeat := 1
+	const lowbitMask = 0xffffffff
+	getCur := func(x uint64) int {
+		return int(x & lowbitMask)
+	}
+	getPrev := func(x uint64) int {
+		return int(x >> 32)
+	}
+	const maxSkip = 64
+
+	for {
+		type match struct {
+			offset int
+			s      int
+			length int
+			score  int
+			rep    bool
+		}
+		var best match
+		for {
+			// Next src position to check; the step grows with s-nextEmit and is capped at maxSkip.
+			nextS := (s-nextEmit)>>8 + 1
+			if nextS > maxSkip {
+				nextS = s + maxSkip
+			} else {
+				nextS += s
+			}
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			hashL := hash8(cv, lTableBits)
+			hashS := hash4(cv, sTableBits)
+			candidateL := lTable[hashL]
+			candidateS := sTable[hashS]
+
+			score := func(m match) int {
+				// Matches that are longer forward are penalized since we must emit it as a literal.
+				score := m.length - m.s
+				if nextEmit == m.s {
+					// If we do not have to emit literals, we save 1 byte
+					score++
+				}
+				offset := m.s - m.offset
+				if m.rep {
+					return score - emitRepeatSize(offset, m.length)
+				}
+				return score - emitCopySize(offset, m.length)
+			}
+
+			matchAt := func(offset, s int, first uint32, rep bool) match {
+				if best.length != 0 && best.s-best.offset == s-offset {
+					// Don't retest if we have the same offset.
+					return match{offset: offset, s: s}
+				}
+				if load32(src, offset) != first {
+					return match{offset: offset, s: s}
+				}
+				m := match{offset: offset, s: s, length: 4 + offset, rep: rep} // length temporarily holds the candidate-side read position
+				s += 4
+				for s <= sLimit {
+					if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
+						m.length += bits.TrailingZeros64(diff) >> 3
+						break
+					}
+					s += 8
+					m.length += 8
+				}
+				m.length -= offset // turn the candidate-side position back into a match length
+				m.score = score(m)
+				if m.score <= -m.s {
+					// Eliminate if no savings, we might find a better one.
+					m.length = 0
+				}
+				return m
+			}
+
+			bestOf := func(a, b match) match {
+				if b.length == 0 {
+					return a
+				}
+				if a.length == 0 {
+					return b
+				}
+				as := a.score + b.s
+				bs := b.score + a.s
+				if as >= bs { // ties keep a
+					return a
+				}
+				return b
+			}
+
+			best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false))
+			best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false))
+			best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false))
+
+			{
+				best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
+				if best.length > 0 {
+					// Also try the table candidates for position s+1.
+					nextShort := sTable[hash4(cv>>8, sTableBits)]
+					s := s + 1 // shadows the outer s (and cv below) inside this lookahead block only
+					cv := load64(src, s)
+					nextLong := lTable[hash8(cv, lTableBits)]
+					best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
+					best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
+					best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
+					best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
+					// Repeat at + 2
+					best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
+
+					// Same table lookups for position s+2 (the condition is always true).
+					if true {
+						nextShort = sTable[hash4(cv>>8, sTableBits)]
+						s++
+						cv = load64(src, s)
+						nextLong = lTable[hash8(cv, lTableBits)]
+						best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
+						best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
+						best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
+						best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
+					}
+					// Search for a match at best match end, see if that is better.
+					if sAt := best.s + best.length; sAt < sLimit {
+						sBack := best.s
+						backL := best.length
+						// Load initial values
+						cv = load64(src, sBack)
+						// Look up the bytes that follow the current best match in the long table.
+						next := lTable[hash8(load64(src, sAt), lTableBits)]
+						//next := sTable[hash4(load64(src, sAt), sTableBits)]
+
+						if checkAt := getCur(next) - backL; checkAt > 0 {
+							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+						}
+						if checkAt := getPrev(next) - backL; checkAt > 0 {
+							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+						}
+					}
+				}
+			}
+
+			// Update table: s becomes the newest entry, the old newest moves to the upper 32 bits.
+			lTable[hashL] = uint64(s) | candidateL<<32
+			sTable[hashS] = uint64(s) | candidateS<<32
+
+			if best.length > 0 {
+				break
+			}
+
+			cv = load64(src, nextS)
+			s = nextS
+		}
+
+		// Extend backwards, not needed for repeats...
+		s = best.s
+		if !best.rep {
+			for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
+				best.offset--
+				best.length++
+				s--
+			}
+		}
+		if false && best.offset >= s { // sanity check, disabled by the constant false
+			panic(fmt.Errorf("t %d >= s %d", best.offset, s))
+		}
+		// Bail if we exceed the maximum size.
+		if d+(s-nextEmit) > dstLimit {
+			return 0
+		}
+
+		base := s
+		offset := s - best.offset
+
+		s += best.length
+
+		if offset > 65535 && s-base <= 5 && !best.rep {
+			// Bail if the match is equal or worse to the encoding.
+			s = best.s + 1
+			if s >= sLimit {
+				goto emitRemainder
+			}
+			cv = load64(src, s)
+			continue
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:base])
+		if best.rep {
+			if nextEmit > 0 {
+				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+				d += emitRepeat(dst[d:], offset, best.length)
+			} else {
+				// First match, cannot be repeat.
+				d += emitCopy(dst[d:], offset, best.length)
+			}
+		} else {
+			d += emitCopy(dst[d:], offset, best.length)
+		}
+		repeat = offset
+
+		nextEmit = s
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		if d > dstLimit {
+			// Do we have space for more, if not bail.
+			return 0
+		}
+		// Fill tables with the positions from best.s+1 up to, but not including, s.
+		for i := best.s + 1; i < s; i++ {
+			cv0 := load64(src, i)
+			long0 := hash8(cv0, lTableBits)
+			short0 := hash4(cv0, sTableBits)
+			lTable[long0] = uint64(i) | lTable[long0]<<32
+			sTable[short0] = uint64(i) | sTable[short0]<<32
+		}
+		cv = load64(src, s)
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
+
+// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst
+// and returns the number of bytes written to dst, or 0 when it bails out (src is
+// too short, or the output is on course to exceed len(src)-5 bytes). Copies are
+// written with emitCopyNoRepeat. The varint length must already be written.
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBestSnappy(dst, src []byte) (d int) {
+	// Initialize the hash tables.
+	const (
+		// Long hash matches: lTable is indexed by hash8 of the 8 loaded bytes.
+		lTableBits    = 19
+		maxLTableSize = 1 << lTableBits
+
+		// Short hash matches: sTable is indexed by hash4 of the low 4 bytes.
+		sTableBits    = 16
+		maxSTableSize = 1 << sTableBits
+
+		inputMargin = 8 + 2
+	)
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+	if len(src) < minNonLiteralBlockSize {
+		return 0
+	}
+
+	var lTable [maxLTableSize]uint64 // low 32 bits: newest position, high 32 bits: the position it replaced
+	var sTable [maxSTableSize]uint64 // same packing as lTable
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - 5
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	cv := load64(src, s)
+
+	// repeat is the last emitted offset (initially 1); it is retried at s+1 as an ordinary copy candidate.
+	repeat := 1
+	const lowbitMask = 0xffffffff
+	getCur := func(x uint64) int {
+		return int(x & lowbitMask)
+	}
+	getPrev := func(x uint64) int {
+		return int(x >> 32)
+	}
+	const maxSkip = 64
+
+	for {
+		type match struct {
+			offset int
+			s      int
+			length int
+			score  int
+		}
+		var best match
+		for {
+			// Next src position to check; the step grows with s-nextEmit and is capped at maxSkip.
+			nextS := (s-nextEmit)>>8 + 1
+			if nextS > maxSkip {
+				nextS = s + maxSkip
+			} else {
+				nextS += s
+			}
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			hashL := hash8(cv, lTableBits)
+			hashS := hash4(cv, sTableBits)
+			candidateL := lTable[hashL]
+			candidateS := sTable[hashS]
+
+			score := func(m match) int {
+				// Matches that are longer forward are penalized since we must emit it as a literal.
+				score := m.length - m.s
+				if nextEmit == m.s {
+					// If we do not have to emit literals, we save 1 byte
+					score++
+				}
+				offset := m.s - m.offset
+
+				return score - emitCopySize(offset, m.length)
+			}
+
+			matchAt := func(offset, s int, first uint32) match {
+				if best.length != 0 && best.s-best.offset == s-offset {
+					// Don't retest if we have the same offset.
+					return match{offset: offset, s: s}
+				}
+				if load32(src, offset) != first {
+					return match{offset: offset, s: s}
+				}
+				m := match{offset: offset, s: s, length: 4 + offset} // length temporarily holds the candidate-side read position
+				s += 4
+				for s <= sLimit {
+					if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
+						m.length += bits.TrailingZeros64(diff) >> 3
+						break
+					}
+					s += 8
+					m.length += 8
+				}
+				m.length -= offset // turn the candidate-side position back into a match length
+				m.score = score(m)
+				if m.score <= -m.s {
+					// Eliminate if no savings, we might find a better one.
+					m.length = 0
+				}
+				return m
+			}
+
+			bestOf := func(a, b match) match {
+				if b.length == 0 {
+					return a
+				}
+				if a.length == 0 {
+					return b
+				}
+				as := a.score + b.s
+				bs := b.score + a.s
+				if as >= bs { // ties keep a
+					return a
+				}
+				return b
+			}
+
+			best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv)))
+			best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv)))
+			best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv)))
+
+			{
+				best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8)))
+				if best.length > 0 {
+					// Also try the table candidates for position s+1.
+					nextShort := sTable[hash4(cv>>8, sTableBits)]
+					s := s + 1 // shadows the outer s (and cv below) inside this lookahead block only
+					cv := load64(src, s)
+					nextLong := lTable[hash8(cv, lTableBits)]
+					best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
+					best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
+					best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
+					best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
+					// Last offset tried again at + 2
+					best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8)))
+
+					// Same table lookups for position s+2 (the condition is always true).
+					if true {
+						nextShort = sTable[hash4(cv>>8, sTableBits)]
+						s++
+						cv = load64(src, s)
+						nextLong = lTable[hash8(cv, lTableBits)]
+						best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
+						best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
+						best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
+						best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
+					}
+					// Search for a match at best match end, see if that is better.
+					if sAt := best.s + best.length; sAt < sLimit {
+						sBack := best.s
+						backL := best.length
+						// Load initial values
+						cv = load64(src, sBack)
+						// Look up the bytes that follow the current best match in the long table.
+						next := lTable[hash8(load64(src, sAt), lTableBits)]
+						//next := sTable[hash4(load64(src, sAt), sTableBits)]
+
+						if checkAt := getCur(next) - backL; checkAt > 0 {
+							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
+						}
+						if checkAt := getPrev(next) - backL; checkAt > 0 {
+							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
+						}
+					}
+				}
+			}
+
+			// Update table: s becomes the newest entry, the old newest moves to the upper 32 bits.
+			lTable[hashL] = uint64(s) | candidateL<<32
+			sTable[hashS] = uint64(s) | candidateS<<32
+
+			if best.length > 0 {
+				break
+			}
+
+			cv = load64(src, nextS)
+			s = nextS
+		}
+
+		// Extend backwards (the condition below is always true in this variant).
+		s = best.s
+		if true {
+			for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
+				best.offset--
+				best.length++
+				s--
+			}
+		}
+		if false && best.offset >= s { // sanity check, disabled by the constant false
+			panic(fmt.Errorf("t %d >= s %d", best.offset, s))
+		}
+		// Bail if we exceed the maximum size.
+		if d+(s-nextEmit) > dstLimit {
+			return 0
+		}
+
+		base := s
+		offset := s - best.offset
+
+		s += best.length
+
+		if offset > 65535 && s-base <= 5 {
+			// Bail if the match is equal or worse to the encoding.
+			s = best.s + 1
+			if s >= sLimit {
+				goto emitRemainder
+			}
+			cv = load64(src, s)
+			continue
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:base])
+		d += emitCopyNoRepeat(dst[d:], offset, best.length)
+		repeat = offset
+
+		nextEmit = s
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		if d > dstLimit {
+			// Do we have space for more, if not bail.
+			return 0
+		}
+		// Fill tables with the positions from best.s+1 up to, but not including, s.
+		for i := best.s + 1; i < s; i++ {
+			cv0 := load64(src, i)
+			long0 := hash8(cv0, lTableBits)
+			short0 := hash4(cv0, sTableBits)
+			lTable[long0] = uint64(i) | lTable[long0]<<32
+			sTable[short0] = uint64(i) | sTable[short0]<<32
+		}
+		cv = load64(src, s)
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
+
+// emitCopySize returns the size, in bytes, needed to encode a copy of the given
+// offset and length; anything beyond the first copy is priced via emitRepeatSize.
+// It assumes that:
+//	1 <= offset && offset <= math.MaxUint32
+//	4 <= length && length <= 1 << 24
+func emitCopySize(offset, length int) int {
+	if offset >= 65536 {
+		i := 0
+		if length > 64 {
+			length -= 64
+			if length >= 4 {
+				// First 64 bytes cost 5, the remainder is priced as repeats.
+				return 5 + emitRepeatSize(offset, length)
+			}
+			i = 5 // first 64 bytes cost 5; the 1-3 bytes still left are priced below
+		}
+		if length == 0 {
+			return i
+		}
+		return i + 5
+	}
+
+	// Offset fits in 2 bytes (offset < 65536).
+	if length > 64 {
+		// First 60 bytes cost 3; the remainder (at least 4 bytes) is priced as repeats.
+		return 3 + emitRepeatSize(offset, length-60)
+	}
+	if length >= 12 || offset >= 2048 {
+		return 3
+	}
+	// Short length (< 12) with a small offset (< 2048): encoded as 2 bytes.
+	return 2
+}
+
+// emitRepeatSize returns the number of bytes required to encode a repeat.
+// Length must be at least 4 and < 1<<24
+func emitRepeatSize(offset, length int) int {
+	// Short repeats cost 2 bytes: length <= 8, or length < 12 when offset < 2048.
+	if length <= 4+4 || (length < 8+4 && offset < 2048) {
+		return 2
+	}
+	if length < (1<<8)+4+4 {
+		return 3
+	}
+	if length < (1<<16)+(1<<8)+4 {
+		return 4
+	}
+	const maxRepeat = (1 << 24) - 1
+	length -= (1 << 16) - 4
+	left := 0
+	if length > maxRepeat {
+		left = length - maxRepeat + 4
+		length = maxRepeat - 4 // NOTE(review): length is not read again after this assignment
+	}
+	if left > 0 {
+		return 5 + emitRepeatSize(offset, left) // price the overflow as a further repeat
+	}
+	return 5
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go
new file mode 100644
index 00000000..943215b8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_better.go
@@ -0,0 +1,431 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"math/bits"
+)
+
+// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32; the shift count is masked with 31.
+func hash4(u uint64, h uint8) uint32 {
+	const prime4bytes = 2654435761
+	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
+}
+
+// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64; u is shifted left by 24 bits before multiplying.
+func hash5(u uint64, h uint8) uint32 {
+	const prime5bytes = 889523592379
+	return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
+}
+
+// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64; u is shifted left by 8 bits before multiplying.
+func hash7(u uint64, h uint8) uint32 {
+	const prime7bytes = 58295818150454627
+	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
+}
+
+// hash8 returns the hash of all 8 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64; the shift count is masked with 63.
+func hash8(u uint64, h uint8) uint32 {
+	const prime8bytes = 0xcf1bbcdcb7a56463
+	return uint32((u * prime8bytes) >> ((64 - h) & 63))
+}
+
+// encodeBlockBetterGo encodes a non-empty src to a guaranteed-large-enough dst and
+// returns the number of bytes written to dst, or 0 when it bails out (src is too
+// short, or the output is on course to exceed dstLimit). It assumes that the
+// varint-encoded length of the decompressed bytes has already been written.
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBetterGo(dst, src []byte) (d int) {
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+	if len(src) < minNonLiteralBlockSize {
+		return 0
+	}
+
+	// Initialize the hash tables.
+	const (
+		// Long hash matches: lTable is indexed by hash7.
+		lTableBits    = 16
+		maxLTableSize = 1 << lTableBits
+
+		// Short hash matches: sTable is indexed by hash4.
+		sTableBits    = 14
+		maxSTableSize = 1 << sTableBits
+	)
+
+	var lTable [maxLTableSize]uint32
+	var sTable [maxSTableSize]uint32
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - len(src)>>5 - 6
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	cv := load64(src, s)
+
+	// We initialize repeat to 0, so we never match on first attempt
+	repeat := 0
+
+	for {
+		candidateL := 0
+		nextS := 0
+		for {
+			// Next src position to check; the step grows by one per 128 bytes since nextEmit.
+			nextS = s + (s-nextEmit)>>7 + 1
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			hashL := hash7(cv, lTableBits)
+			hashS := hash4(cv, sTableBits)
+			candidateL = int(lTable[hashL])
+			candidateS := int(sTable[hashS])
+			lTable[hashL] = uint32(s)
+			sTable[hashS] = uint32(s)
+
+			// Check repeat at offset checkRep (the leading constant false disables this branch).
+			const checkRep = 1
+			if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+				base := s + checkRep
+				// Extend back
+				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+					i--
+					base--
+				}
+				d += emitLiteral(dst[d:], src[nextEmit:base])
+
+				// Extend forward
+				candidate := s - repeat + 4 + checkRep
+				s += 4 + checkRep
+				for s < len(src) {
+					if len(src)-s < 8 {
+						if src[s] == src[candidate] {
+							s++
+							candidate++
+							continue
+						}
+						break
+					}
+					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+						s += bits.TrailingZeros64(diff) >> 3
+						break
+					}
+					s += 8
+					candidate += 8
+				}
+				if nextEmit > 0 {
+					// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+					d += emitRepeat(dst[d:], repeat, s-base)
+				} else {
+					// First match, cannot be repeat.
+					d += emitCopy(dst[d:], repeat, s-base)
+				}
+				nextEmit = s
+				if s >= sLimit {
+					goto emitRemainder
+				}
+
+				cv = load64(src, s)
+				continue
+			}
+
+			if uint32(cv) == load32(src, candidateL) {
+				break
+			}
+
+			// Check our short candidate
+			if uint32(cv) == load32(src, candidateS) {
+				// Try a long candidate at s+1
+				hashL = hash7(cv>>8, lTableBits)
+				candidateL = int(lTable[hashL])
+				lTable[hashL] = uint32(s + 1)
+				if uint32(cv>>8) == load32(src, candidateL) {
+					s++
+					break
+				}
+				// Use our short candidate.
+				candidateL = candidateS
+				break
+			}
+
+			cv = load64(src, nextS)
+			s = nextS
+		}
+
+		// Extend backwards
+		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
+			candidateL--
+			s--
+		}
+
+		// Bail if we exceed the maximum size.
+		if d+(s-nextEmit) > dstLimit {
+			return 0
+		}
+
+		base := s
+		offset := base - candidateL
+
+		// Extend the 4-byte match as long as possible.
+		s += 4
+		candidateL += 4
+		for s < len(src) {
+			if len(src)-s < 8 {
+				if src[s] == src[candidateL] {
+					s++
+					candidateL++
+					continue
+				}
+				break
+			}
+			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
+				s += bits.TrailingZeros64(diff) >> 3
+				break
+			}
+			s += 8
+			candidateL += 8
+		}
+
+		if offset > 65535 && s-base <= 5 && repeat != offset {
+			// Bail if the match is equal or worse to the encoding.
+			s = nextS + 1
+			if s >= sLimit {
+				goto emitRemainder
+			}
+			cv = load64(src, s)
+			continue
+		}
+
+		d += emitLiteral(dst[d:], src[nextEmit:base])
+		if repeat == offset {
+			d += emitRepeat(dst[d:], offset, s-base)
+		} else {
+			d += emitCopy(dst[d:], offset, s-base)
+			repeat = offset
+		}
+
+		nextEmit = s
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		if d > dstLimit {
+			// Do we have space for more, if not bail.
+			return 0
+		}
+		// index0 is match start+1: long table gets index0 and index0+1, short table index0+1 and index0+2.
+		index0 := base + 1
+		// index1 is match end-2: long table gets index1 and index1+1, short table index1+1.
+		index1 := s - 2
+
+		cv0 := load64(src, index0)
+		cv1 := load64(src, index1)
+		cv = load64(src, s)
+		lTable[hash7(cv0, lTableBits)] = uint32(index0)
+		lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
+		lTable[hash7(cv1, lTableBits)] = uint32(index1)
+		lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
+		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+		sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
+
+// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst
+// and returns the number of bytes written to dst, or 0 when it bails out (src is
+// too short, or the output is on course to exceed dstLimit). Copies are written
+// with emitCopyNoRepeat. The varint-encoded length must already be written.
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+	if len(src) < minNonLiteralBlockSize {
+		return 0
+	}
+
+	// Initialize the hash tables.
+	const (
+		// Long hash matches: lTable is indexed by hash7.
+		lTableBits    = 16
+		maxLTableSize = 1 << lTableBits
+
+		// Short hash matches: sTable is indexed by hash4.
+		sTableBits    = 14
+		maxSTableSize = 1 << sTableBits
+	)
+
+	var lTable [maxLTableSize]uint32
+	var sTable [maxSTableSize]uint32
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - len(src)>>5 - 6
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	cv := load64(src, s)
+
+	// repeat holds the last emitted offset (initially 0); here it only feeds the long-offset bail check.
+	repeat := 0
+	const maxSkip = 100
+
+	for {
+		candidateL := 0
+		nextS := 0
+		for {
+			// Next src position to check; the step grows with s-nextEmit and is capped at maxSkip.
+			nextS = (s-nextEmit)>>7 + 1
+			if nextS > maxSkip {
+				nextS = s + maxSkip
+			} else {
+				nextS += s
+			}
+
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			hashL := hash7(cv, lTableBits)
+			hashS := hash4(cv, sTableBits)
+			candidateL = int(lTable[hashL])
+			candidateS := int(sTable[hashS])
+			lTable[hashL] = uint32(s)
+			sTable[hashS] = uint32(s)
+
+			if uint32(cv) == load32(src, candidateL) {
+				break
+			}
+
+			// Check our short candidate
+			if uint32(cv) == load32(src, candidateS) {
+				// Try a long candidate at s+1
+				hashL = hash7(cv>>8, lTableBits)
+				candidateL = int(lTable[hashL])
+				lTable[hashL] = uint32(s + 1)
+				if uint32(cv>>8) == load32(src, candidateL) {
+					s++
+					break
+				}
+				// Use our short candidate.
+				candidateL = candidateS
+				break
+			}
+
+			cv = load64(src, nextS)
+			s = nextS
+		}
+
+		// Extend backwards
+		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
+			candidateL--
+			s--
+		}
+
+		// Bail if we exceed the maximum size.
+		if d+(s-nextEmit) > dstLimit {
+			return 0
+		}
+
+		base := s
+		offset := base - candidateL
+
+		// Extend the 4-byte match as long as possible.
+		s += 4
+		candidateL += 4
+		for s < len(src) {
+			if len(src)-s < 8 {
+				if src[s] == src[candidateL] {
+					s++
+					candidateL++
+					continue
+				}
+				break
+			}
+			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
+				s += bits.TrailingZeros64(diff) >> 3
+				break
+			}
+			s += 8
+			candidateL += 8
+		}
+
+		if offset > 65535 && s-base <= 5 && repeat != offset {
+			// Bail if the match is equal or worse to the encoding.
+			s = nextS + 1
+			if s >= sLimit {
+				goto emitRemainder
+			}
+			cv = load64(src, s)
+			continue
+		}
+
+		d += emitLiteral(dst[d:], src[nextEmit:base])
+		d += emitCopyNoRepeat(dst[d:], offset, s-base)
+		repeat = offset
+
+		nextEmit = s
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		if d > dstLimit {
+			// Do we have space for more, if not bail.
+			return 0
+		}
+		// index0 is match start+1: long table gets index0 and index0+1, short table index0+1 and index0+2.
+		index0 := base + 1
+		// index1 is match end-2: long table gets index1 and index1+1, short table index1+1.
+		index1 := s - 2
+
+		cv0 := load64(src, index0)
+		cv1 := load64(src, index1)
+		cv = load64(src, s)
+		lTable[hash7(cv0, lTableBits)] = uint32(index0)
+		lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
+		lTable[hash7(cv1, lTableBits)] = uint32(index1)
+		lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
+		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+		sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go
new file mode 100644
index 00000000..43d43534
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_go.go
@@ -0,0 +1,298 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package s2
+
+import (
+	"math/bits"
+)
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst by
+// delegating to encodeBlockGo. It assumes that the varint-encoded length of the
+// decompressed bytes has already been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src))
+func encodeBlock(dst, src []byte) (d int) {
+	if len(src) < minNonLiteralBlockSize {
+		return 0 // src is shorter than minNonLiteralBlockSize: nothing is written to dst
+	}
+	return encodeBlockGo(dst, src)
+}
+
+// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst by
+// delegating to encodeBlockBetterGo. It assumes that the varint-encoded length
+// of the decompressed bytes has already been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src))
+func encodeBlockBetter(dst, src []byte) (d int) {
+	return encodeBlockBetterGo(dst, src)
+}
+
+// encodeBlockBetterSnappy encodes a non-empty src to a guaranteed-large-enough
+// dst by delegating to encodeBlockBetterSnappyGo. It assumes that the
+// varint-encoded length of the decompressed bytes has already been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src))
+func encodeBlockBetterSnappy(dst, src []byte) (d int) {
+	return encodeBlockBetterSnappyGo(dst, src)
+}
+
+// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst by
+// delegating to encodeBlockSnappyGo. It assumes that the varint-encoded length
+// of the decompressed bytes has already been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src))
+func encodeBlockSnappy(dst, src []byte) (d int) {
+	if len(src) < minNonLiteralBlockSize {
+		return 0 // src is shorter than minNonLiteralBlockSize: nothing is written to dst
+	}
+	return encodeBlockSnappyGo(dst, src)
+}
+
+// emitLiteral writes a literal chunk (a 1 to 5 byte header followed by lit)
+// and returns the number of bytes written; an empty lit writes nothing.
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	0 <= len(lit) && len(lit) <= math.MaxUint32
+func emitLiteral(dst, lit []byte) int {
+	if len(lit) == 0 {
+		return 0
+	}
+	const num = 63<<2 | tagLiteral // NOTE(review): num is never referenced in this function
+	i, n := 0, uint(len(lit)-1)
+	switch {
+	case n < 60: // n (length minus one) is stored directly in the tag byte
+		dst[0] = uint8(n)<<2 | tagLiteral
+		i = 1
+	case n < 1<<8: // marker 60 in the tag byte, then n in 1 byte
+		dst[1] = uint8(n)
+		dst[0] = 60<<2 | tagLiteral
+		i = 2
+	case n < 1<<16: // marker 61 in the tag byte, then n in 2 bytes, low byte first
+		dst[2] = uint8(n >> 8)
+		dst[1] = uint8(n)
+		dst[0] = 61<<2 | tagLiteral
+		i = 3
+	case n < 1<<24: // marker 62 in the tag byte, then n in 3 bytes, low byte first
+		dst[3] = uint8(n >> 16)
+		dst[2] = uint8(n >> 8)
+		dst[1] = uint8(n)
+		dst[0] = 62<<2 | tagLiteral
+		i = 4
+	default: // marker 63 in the tag byte, then n in 4 bytes, low byte first
+		dst[4] = uint8(n >> 24)
+		dst[3] = uint8(n >> 16)
+		dst[2] = uint8(n >> 8)
+		dst[1] = uint8(n)
+		dst[0] = 63<<2 | tagLiteral
+		i = 5
+	}
+	return i + copy(dst[i:], lit) // i header bytes plus the literal bytes copied after them
+}
+
+// emitRepeat writes a repeat chunk (2 to 5 bytes per chunk) and returns the
+// number of bytes written. Length must be at least 4 and < 1<<24
+func emitRepeat(dst []byte, offset, length int) int {
+	// Repeat of the previous offset: bias length by the minimum (4) to make it cheaper
+	length -= 4
+	if length <= 4 { // 2-byte form: biased length in the tag byte, second byte is 0
+		dst[0] = uint8(length)<<2 | tagCopy1
+		dst[1] = 0
+		return 2
+	}
+	if length < 8 && offset < 2048 {
+		// 2-byte form WITH offset: high offset bits go in the tag byte, low byte in dst[1]
+		dst[1] = uint8(offset)
+		dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
+		return 2
+	}
+	if length < (1<<8)+4 { // 3-byte form: marker 5, then (biased length - 4) in one byte
+		length -= 4
+		dst[2] = uint8(length)
+		dst[1] = 0
+		dst[0] = 5<<2 | tagCopy1
+		return 3
+	}
+	if length < (1<<16)+(1<<8) { // 4-byte form: marker 6, then (biased length - 256) in two bytes, low byte first
+		length -= 1 << 8
+		dst[3] = uint8(length >> 8)
+		dst[2] = uint8(length >> 0)
+		dst[1] = 0
+		dst[0] = 6<<2 | tagCopy1
+		return 4
+	}
+	const maxRepeat = (1 << 24) - 1 // largest value that fits the 3 length bytes written below
+	length -= 1 << 16
+	left := 0
+	if length > maxRepeat { // too long for one chunk: clamp it and hand the rest to a recursive call
+		left = length - maxRepeat + 4
+		length = maxRepeat - 4
+	}
+	dst[4] = uint8(length >> 16)
+	dst[3] = uint8(length >> 8)
+	dst[2] = uint8(length >> 0)
+	dst[1] = 0
+	dst[0] = 7<<2 | tagCopy1
+	if left > 0 {
+		return 5 + emitRepeat(dst[5:], offset, left)
+	}
+	return 5
+}
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+// A match longer than 64 bytes is split: a first copy, then the remainder via emitRepeat.
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= math.MaxUint32
+//	4 <= length && length <= 1 << 24
+func emitCopy(dst []byte, offset, length int) int {
+	if offset >= 65536 {
+		i := 0
+		if length > 64 {
+			// Emit a length 64 copy, encoded as 5 bytes (tag byte + 4 offset bytes, low byte first).
+			dst[4] = uint8(offset >> 24)
+			dst[3] = uint8(offset >> 16)
+			dst[2] = uint8(offset >> 8)
+			dst[1] = uint8(offset)
+			dst[0] = 63<<2 | tagCopy4
+			length -= 64
+			if length >= 4 {
+				// Emit remaining as repeats
+				return 5 + emitRepeat(dst[5:], offset, length)
+			}
+			i = 5
+		}
+		if length == 0 { // the 64-byte copy above consumed the whole match
+			return i
+		}
+		// Emit a copy, offset encoded as 4 bytes.
+		dst[i+0] = uint8(length-1)<<2 | tagCopy4
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		dst[i+3] = uint8(offset >> 16)
+		dst[i+4] = uint8(offset >> 24)
+		return i + 5
+	}
+
+	// Offset no more than 2 bytes.
+	if length > 64 {
+		// Emit a length 60 copy, encoded as 3 bytes.
+		// Emit remaining as repeat value (minimum 4 bytes).
+		dst[2] = uint8(offset >> 8)
+		dst[1] = uint8(offset)
+		dst[0] = 59<<2 | tagCopy2
+		length -= 60
+		// Emit remaining as repeats, at least 4 bytes remain.
+		return 3 + emitRepeat(dst[3:], offset, length)
+	}
+	if length >= 12 || offset >= 2048 {
+		// Emit the remaining copy, encoded as 3 bytes.
+		dst[2] = uint8(offset >> 8)
+		dst[1] = uint8(offset)
+		dst[0] = uint8(length-1)<<2 | tagCopy2
+		return 3
+	}
+	// Emit the remaining copy, encoded as 2 bytes (high offset bits share the tag byte).
+	dst[1] = uint8(offset)
+	dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+	return 2
+}
+
+// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
+// Unlike emitCopy it never calls emitRepeat: long matches become several copy chunks.
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= math.MaxUint32
+//	4 <= length && length <= 1 << 24
+func emitCopyNoRepeat(dst []byte, offset, length int) int {
+	if offset >= 65536 {
+		i := 0
+		if length > 64 {
+			// Emit a length 64 copy, encoded as 5 bytes (tag byte + 4 offset bytes, low byte first).
+			dst[4] = uint8(offset >> 24)
+			dst[3] = uint8(offset >> 16)
+			dst[2] = uint8(offset >> 8)
+			dst[1] = uint8(offset)
+			dst[0] = 63<<2 | tagCopy4
+			length -= 64
+			if length >= 4 {
+				// Emit remaining as further copy chunks (no repeat chunks are used here).
+				return 5 + emitCopyNoRepeat(dst[5:], offset, length)
+			}
+			i = 5
+		}
+		if length == 0 { // the 64-byte copy above consumed the whole match
+			return i
+		}
+		// Emit a copy, offset encoded as 4 bytes.
+		dst[i+0] = uint8(length-1)<<2 | tagCopy4
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		dst[i+3] = uint8(offset >> 16)
+		dst[i+4] = uint8(offset >> 24)
+		return i + 5
+	}
+
+	// Offset no more than 2 bytes.
+	if length > 64 {
+		// Emit a length 60 copy, encoded as 3 bytes.
+		// Emit remaining as another copy (minimum 4 bytes).
+		dst[2] = uint8(offset >> 8)
+		dst[1] = uint8(offset)
+		dst[0] = 59<<2 | tagCopy2
+		length -= 60
+		// Emit remaining as further copy chunks, at least 4 bytes remain.
+		return 3 + emitCopyNoRepeat(dst[3:], offset, length)
+	}
+	if length >= 12 || offset >= 2048 {
+		// Emit the remaining copy, encoded as 3 bytes.
+		dst[2] = uint8(offset >> 8)
+		dst[1] = uint8(offset)
+		dst[0] = uint8(length-1)<<2 | tagCopy2
+		return 3
+	}
+	// Emit the remaining copy, encoded as 2 bytes (high offset bits share the tag byte).
+	dst[1] = uint8(offset)
+	dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+	return 2
+}
+
+// matchLen returns how many leading bytes of a and b are equal, comparing at
+// most len(a) bytes.
+// It assumes that:
+//   len(a) <= len(b)
+//
+func matchLen(a []byte, b []byte) int {
+	b = b[:len(a)]
+	var checked int // number of bytes already verified equal
+	if len(a) > 4 {
+		// Try 4 bytes first
+		if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
+			return bits.TrailingZeros32(diff) >> 3 // index of first differing byte; assumes load32 is little-endian - TODO confirm
+		}
+		// Switch to 8 byte matching.
+		checked = 4
+		a = a[4:]
+		b = b[4:]
+		for len(a) >= 8 {
+			b = b[:len(a)]
+			if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
+				return checked + (bits.TrailingZeros64(diff) >> 3)
+			}
+			checked += 8
+			a = a[8:]
+			b = b[8:]
+		}
+	}
+	b = b[:len(a)]
+	for i := range a { // byte-by-byte compare of whatever remains
+		if a[i] != b[i] {
+			return int(i) + checked
+		}
+	}
+	return len(a) + checked
+}
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
new file mode 100644
index 00000000..c8cf7b69
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
@@ -0,0 +1,189 @@
+// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc
+// +build !appengine,!noasm,gc
+
+package s2
+
+// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4294967295 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm(dst []byte, src []byte) int
+
+// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4194304 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm4MB(dst []byte, src []byte) int
+
+// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 16383 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm12B(dst []byte, src []byte) int
+
+// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4095 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm10B(dst []byte, src []byte) int
+
+// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 511 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBlockAsm8B(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4294967295 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4194304 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 16383 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm12B(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4095 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm10B(dst []byte, src []byte) int
+
+// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 511 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeBetterBlockAsm8B(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4294967295 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 65535 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 16383 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4095 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
+
+// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 511 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4294967295 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 65535 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 16383 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 4095 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
+
+// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
+// Maximum input 511 bytes.
+// It assumes that the varint-encoded length of the decompressed bytes has already been written.
+//
+//go:noescape
+func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
+
+// emitLiteral writes a literal chunk and returns the number of bytes written.
+//
+// It assumes that:
+//   dst is long enough to hold the encoded bytes with margin of 0 bytes
+//   0 <= len(lit) && len(lit) <= math.MaxUint32
+//
+//go:noescape
+func emitLiteral(dst []byte, lit []byte) int
+
+// emitRepeat writes a repeat chunk and returns the number of bytes written.
+// Length must be at least 4 and < 1<<32
+//
+//go:noescape
+func emitRepeat(dst []byte, offset int, length int) int
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+//   dst is long enough to hold the encoded bytes
+//   1 <= offset && offset <= math.MaxUint32
+//   4 <= length && length <= 1 << 24
+//
+//go:noescape
+func emitCopy(dst []byte, offset int, length int) int
+
+// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+//   dst is long enough to hold the encoded bytes
+//   1 <= offset && offset <= math.MaxUint32
+//   4 <= length && length <= 1 << 24
+//
+//go:noescape
+func emitCopyNoRepeat(dst []byte, offset int, length int) int
+
+// matchLen returns how many bytes match in a and b
+//
+// It assumes that:
+//   len(a) <= len(b)
+//
+//go:noescape
+func matchLen(a []byte, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
new file mode 100644
index 00000000..1ac65a0e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -0,0 +1,15678 @@
+// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
+
+// +build !appengine
+// +build !noasm
+// +build gc
+
+#include "textflag.h"
+
+// func encodeBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm(SB), $65560-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000200, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeBlockAsm:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBlockAsm
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  CX, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeBlockAsm:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x06, SI
+	LEAL  4(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBlockAsm
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x10, R11
+	IMULQ R9, R11
+	SHRQ  $0x32, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  24(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4)
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11
+	JNE   no_repeat_found_encodeBlockAsm
+	LEAL  1(CX), DI
+	MOVL  12(SP), R8
+	MOVL  DI, SI
+	SUBL  16(SP), SI
+	JZ    repeat_extend_back_end_encodeBlockAsm
+
+repeat_extend_back_loop_encodeBlockAsm:
+	CMPL DI, R8
+	JLE  repeat_extend_back_end_encodeBlockAsm
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeBlockAsm
+	LEAL -1(DI), DI
+	DECL SI
+	JNZ  repeat_extend_back_loop_encodeBlockAsm
+
+repeat_extend_back_end_encodeBlockAsm:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeBlockAsm
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeBlockAsm
+	CMPL SI, $0x00010000
+	JLT  three_bytes_repeat_emit_encodeBlockAsm
+	CMPL SI, $0x01000000
+	JLT  four_bytes_repeat_emit_encodeBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL SI, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_repeat_emit_encodeBlockAsm
+
+four_bytes_repeat_emit_encodeBlockAsm:
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (AX)
+	MOVW SI, 1(AX)
+	MOVB R11, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_repeat_emit_encodeBlockAsm
+
+three_bytes_repeat_emit_encodeBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeBlockAsm
+
+two_bytes_repeat_emit_encodeBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_repeat_emit_encodeBlockAsm
+	JMP  memmove_long_repeat_emit_encodeBlockAsm
+
+one_byte_repeat_emit_encodeBlockAsm:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
+	MOVQ (R10), R11
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm:
+	MOVQ SI, AX
+	JMP  emit_literal_done_repeat_emit_encodeBlockAsm
+
+memmove_long_repeat_emit_encodeBlockAsm:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R12
+	SHRQ  $0x05, R12
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R13
+	SUBQ  R11, R13
+	DECQ  R12
+	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R13*1), R11
+	LEAQ  -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R14)
+	MOVOA X5, 16(R14)
+	ADDQ  $0x20, R14
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R13
+	DECQ  R12
+	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R13*1), X4
+	MOVOU -16(R10)(R13*1), X5
+	MOVOA X4, -32(AX)(R13*1)
+	MOVOA X5, -16(AX)(R13*1)
+	ADDQ  $0x20, R13
+	CMPQ  R9, R13
+	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm:
+	ADDL $0x05, CX
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R9
+	SUBL CX, R9
+	LEAQ (DX)(CX*1), R10
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R12, R12
+	CMPL R9, $0x08
+	JL   matchlen_single_repeat_extend_encodeBlockAsm
+
+matchlen_loopback_repeat_extend_encodeBlockAsm:
+	MOVQ  (R10)(R12*1), R11
+	XORQ  (SI)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_repeat_extend_encodeBlockAsm
+	BSFQ  R11, R11
+	SARQ  $0x03, R11
+	LEAL  (R12)(R11*1), R12
+	JMP   repeat_extend_forward_end_encodeBlockAsm
+
+matchlen_loop_repeat_extend_encodeBlockAsm:
+	LEAL -8(R9), R9
+	LEAL 8(R12), R12
+	CMPL R9, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm
+
+matchlen_single_repeat_extend_encodeBlockAsm:
+	TESTL R9, R9
+	JZ    repeat_extend_forward_end_encodeBlockAsm
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm:
+	MOVB (R10)(R12*1), R11
+	CMPB (SI)(R12*1), R11
+	JNE  repeat_extend_forward_end_encodeBlockAsm
+	LEAL 1(R12), R12
+	DECL R9
+	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm
+
+repeat_extend_forward_end_encodeBlockAsm:
+	ADDL  R12, CX
+	MOVL  CX, SI
+	SUBL  DI, SI
+	MOVL  16(SP), DI
+	TESTL R8, R8
+	JZ    repeat_as_copy_encodeBlockAsm
+
+	// emitRepeat
+emit_repeat_again_match_repeat_encodeBlockAsm:
+	MOVL SI, R8
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_match_repeat_encodeBlockAsm
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_match_repeat_encodeBlockAsm
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_match_repeat_encodeBlockAsm
+	CMPL SI, $0x00010100
+	JLT  repeat_four_match_repeat_encodeBlockAsm
+	CMPL SI, $0x0100ffff
+	JLT  repeat_five_match_repeat_encodeBlockAsm
+	LEAL -16842747(SI), SI
+	MOVW $0x001d, (AX)
+	MOVW $0xfffb, 2(AX)
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	JMP  emit_repeat_again_match_repeat_encodeBlockAsm
+
+repeat_five_match_repeat_encodeBlockAsm:
+	LEAL -65536(SI), SI
+	MOVL SI, DI
+	MOVW $0x001d, (AX)
+	MOVW SI, 2(AX)
+	SARL $0x10, DI
+	MOVB DI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_four_match_repeat_encodeBlockAsm:
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_three_match_repeat_encodeBlockAsm:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_two_match_repeat_encodeBlockAsm:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_match_repeat_encodeBlockAsm:
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_as_copy_encodeBlockAsm:
+	// emitCopy
+	CMPL DI, $0x00010000
+	JL   two_byte_offset_repeat_as_copy_encodeBlockAsm
+
+four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
+	CMPL SI, $0x40
+	JLE  four_bytes_remain_repeat_as_copy_encodeBlockAsm
+	MOVB $0xff, (AX)
+	MOVL DI, 1(AX)
+	LEAL -64(SI), SI
+	ADDQ $0x05, AX
+	CMPL SI, $0x04
+	JL   four_bytes_remain_repeat_as_copy_encodeBlockAsm
+
+	// emitRepeat
+emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
+	MOVL SI, R8
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
+	CMPL SI, $0x00010100
+	JLT  repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
+	CMPL SI, $0x0100ffff
+	JLT  repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
+	LEAL -16842747(SI), SI
+	MOVW $0x001d, (AX)
+	MOVW $0xfffb, 2(AX)
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
+
+repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
+	LEAL -65536(SI), SI
+	MOVL SI, DI
+	MOVW $0x001d, (AX)
+	MOVW SI, 2(AX)
+	SARL $0x10, DI
+	MOVB DI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+	JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm
+
+four_bytes_remain_repeat_as_copy_encodeBlockAsm:
+	TESTL SI, SI
+	JZ    repeat_end_emit_encodeBlockAsm
+	MOVB  $0x03, BL
+	LEAL  -4(BX)(SI*4), SI
+	MOVB  SI, (AX)
+	MOVL  DI, 1(AX)
+	ADDQ  $0x05, AX
+	JMP   repeat_end_emit_encodeBlockAsm
+
+two_byte_offset_repeat_as_copy_encodeBlockAsm:
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm
+	MOVB $0xee, (AX)
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+
+	// emitRepeat
+emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+	MOVL SI, R8
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
+	CMPL SI, $0x00010100
+	JLT  repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
+	CMPL SI, $0x0100ffff
+	JLT  repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
+	LEAL -16842747(SI), SI
+	MOVW $0x001d, (AX)
+	MOVW $0xfffb, 2(AX)
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
+
+repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+	LEAL -65536(SI), SI
+	MOVL SI, DI
+	MOVW $0x001d, (AX)
+	MOVW SI, 2(AX)
+	SARL $0x10, DI
+	MOVB DI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm
+	CMPL DI, $0x00000800
+	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI
+	MOVB DI, 1(AX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm:
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm:
+	MOVL CX, 12(SP)
+	JMP  search_loop_encodeBlockAsm
+
+no_repeat_found_encodeBlockAsm:
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate_match_encodeBlockAsm
+	SHRQ $0x08, DI
+	MOVL 24(SP)(R10*4), SI
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI
+	JEQ  candidate2_match_encodeBlockAsm
+	MOVL R9, 24(SP)(R10*4)
+	SHRQ $0x08, DI
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate3_match_encodeBlockAsm
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeBlockAsm
+
+candidate3_match_encodeBlockAsm:
+	ADDL $0x02, CX
+	JMP  candidate_match_encodeBlockAsm
+
+candidate2_match_encodeBlockAsm:
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX
+	MOVL R8, SI
+
+candidate_match_encodeBlockAsm:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBlockAsm
+
+match_extend_back_loop_encodeBlockAsm:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeBlockAsm
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeBlockAsm
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBlockAsm
+	JMP  match_extend_back_loop_encodeBlockAsm
+
+match_extend_back_end_encodeBlockAsm:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 5(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeBlockAsm:
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ  emit_literal_done_match_emit_encodeBlockAsm
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI
+	SUBL R8, R9
+	LEAL -1(R9), R8
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeBlockAsm
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeBlockAsm
+	CMPL R8, $0x00010000
+	JLT  three_bytes_match_emit_encodeBlockAsm
+	CMPL R8, $0x01000000
+	JLT  four_bytes_match_emit_encodeBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL R8, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_match_emit_encodeBlockAsm
+
+four_bytes_match_emit_encodeBlockAsm:
+	MOVL R8, R10
+	SHRL $0x10, R10
+	MOVB $0xf8, (AX)
+	MOVW R8, 1(AX)
+	MOVB R10, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_match_emit_encodeBlockAsm
+
+three_bytes_match_emit_encodeBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBlockAsm
+
+two_bytes_match_emit_encodeBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL   memmove_match_emit_encodeBlockAsm
+	JMP  memmove_long_match_emit_encodeBlockAsm
+
+one_byte_match_emit_encodeBlockAsm:
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
+	MOVQ (DI), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBlockAsm
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm:
+	MOVQ R8, AX
+	JMP  emit_literal_done_match_emit_encodeBlockAsm
+
+memmove_long_match_emit_encodeBlockAsm:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveLong
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm:
+match_nolit_loop_encodeBlockAsm:
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeBlockAsm
+
+matchlen_loopback_match_nolit_encodeBlockAsm:
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeBlockAsm
+	BSFQ  R9, R9
+	SARQ  $0x03, R9
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeBlockAsm
+
+matchlen_loop_match_nolit_encodeBlockAsm:
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBlockAsm
+
+matchlen_single_match_nolit_encodeBlockAsm:
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeBlockAsm
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm
+
+match_nolit_end_encodeBlockAsm:
+	ADDL R10, CX
+	MOVL 16(SP), SI
+	ADDL $0x04, R10
+	MOVL CX, 12(SP)
+
+	// emitCopy
+	CMPL SI, $0x00010000
+	JL   two_byte_offset_match_nolit_encodeBlockAsm
+
+four_bytes_loop_back_match_nolit_encodeBlockAsm:
+	CMPL R10, $0x40
+	JLE  four_bytes_remain_match_nolit_encodeBlockAsm
+	MOVB $0xff, (AX)
+	MOVL SI, 1(AX)
+	LEAL -64(R10), R10
+	ADDQ $0x05, AX
+	CMPL R10, $0x04
+	JL   four_bytes_remain_match_nolit_encodeBlockAsm
+
+	// emitRepeat
+emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
+	MOVL R10, DI
+	LEAL -4(R10), R10
+	CMPL DI, $0x08
+	JLE  repeat_two_match_nolit_encodeBlockAsm_emit_copy
+	CMPL DI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
+	CMPL SI, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
+	CMPL R10, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBlockAsm_emit_copy
+	CMPL R10, $0x00010100
+	JLT  repeat_four_match_nolit_encodeBlockAsm_emit_copy
+	CMPL R10, $0x0100ffff
+	JLT  repeat_five_match_nolit_encodeBlockAsm_emit_copy
+	LEAL -16842747(R10), R10
+	MOVW $0x001d, (AX)
+	MOVW $0xfffb, 2(AX)
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
+
+repeat_five_match_nolit_encodeBlockAsm_emit_copy:
+	LEAL -65536(R10), R10
+	MOVL R10, SI
+	MOVW $0x001d, (AX)
+	MOVW R10, 2(AX)
+	SARL $0x10, SI
+	MOVB SI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_four_match_nolit_encodeBlockAsm_emit_copy:
+	LEAL -256(R10), R10
+	MOVW $0x0019, (AX)
+	MOVW R10, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_three_match_nolit_encodeBlockAsm_emit_copy:
+	LEAL -4(R10), R10
+	MOVW $0x0015, (AX)
+	MOVB R10, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_match_nolit_encodeBlockAsm_emit_copy:
+	SHLL $0x02, R10
+	ORL  $0x01, R10
+	MOVW R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
+	XORQ DI, DI
+	LEAL 1(DI)(R10*4), R10
+	MOVB SI, 1(AX)
+	SARL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+	JMP four_bytes_loop_back_match_nolit_encodeBlockAsm
+
+four_bytes_remain_match_nolit_encodeBlockAsm:
+	TESTL R10, R10
+	JZ    match_nolit_emitcopy_end_encodeBlockAsm
+	MOVB  $0x03, BL
+	LEAL  -4(BX)(R10*4), R10
+	MOVB  R10, (AX)
+	MOVL  SI, 1(AX)
+	ADDQ  $0x05, AX
+	JMP   match_nolit_emitcopy_end_encodeBlockAsm
+
+two_byte_offset_match_nolit_encodeBlockAsm:
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm
+	MOVB $0xee, (AX)
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+
+	// emitRepeat
+emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
+	MOVL R10, DI
+	LEAL -4(R10), R10
+	CMPL DI, $0x08
+	JLE  repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
+	CMPL DI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
+	CMPL SI, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
+	CMPL R10, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
+	CMPL R10, $0x00010100
+	JLT  repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
+	CMPL R10, $0x0100ffff
+	JLT  repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
+	LEAL -16842747(R10), R10
+	MOVW $0x001d, (AX)
+	MOVW $0xfffb, 2(AX)
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
+
+repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
+	LEAL -65536(R10), R10
+	MOVL R10, SI
+	MOVW $0x001d, (AX)
+	MOVW R10, 2(AX)
+	SARL $0x10, SI
+	MOVB SI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
+	LEAL -256(R10), R10
+	MOVW $0x0019, (AX)
+	MOVW R10, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
+	LEAL -4(R10), R10
+	MOVW $0x0015, (AX)
+	MOVB R10, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
+	SHLL $0x02, R10
+	ORL  $0x01, R10
+	MOVW R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
+	XORQ DI, DI
+	LEAL 1(DI)(R10*4), R10
+	MOVB SI, 1(AX)
+	SARL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+	JMP two_byte_offset_match_nolit_encodeBlockAsm
+
+two_byte_offset_short_match_nolit_encodeBlockAsm:
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBlockAsm
+	CMPL SI, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeBlockAsm
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10
+	MOVB SI, 1(AX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm
+
+emit_copy_three_match_nolit_encodeBlockAsm:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeBlockAsm
+	MOVQ -2(DX)(CX*1), DI
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeBlockAsm:
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI
+	MOVQ  DI, SI
+	SHLQ  $0x10, R8
+	IMULQ R9, R8
+	SHRQ  $0x32, R8
+	SHLQ  $0x10, SI
+	IMULQ R9, SI
+	SHRQ  $0x32, SI
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI
+	MOVL  R9, 24(SP)(R8*4)
+	MOVL  CX, (R10)
+	CMPL  (DX)(SI*1), DI
+	JEQ   match_nolit_loop_encodeBlockAsm
+	INCL  CX
+	JMP   search_loop_encodeBlockAsm
+
+emit_remainder_encodeBlockAsm:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 5(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeBlockAsm:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBlockAsm
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBlockAsm
+	CMPL DX, $0x00010000
+	JLT  three_bytes_emit_remainder_encodeBlockAsm
+	CMPL DX, $0x01000000
+	JLT  four_bytes_emit_remainder_encodeBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL DX, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_emit_remainder_encodeBlockAsm
+
+four_bytes_emit_remainder_encodeBlockAsm:
+	MOVL DX, BX
+	SHRL $0x10, BX
+	MOVB $0xf8, (AX)
+	MOVW DX, 1(AX)
+	MOVB BL, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_emit_remainder_encodeBlockAsm
+
+three_bytes_emit_remainder_encodeBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBlockAsm
+
+two_bytes_emit_remainder_encodeBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBlockAsm
+	JMP  memmove_long_emit_remainder_encodeBlockAsm
+
+one_byte_emit_remainder_encodeBlockAsm:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeBlockAsm
+
+memmove_long_emit_remainder_encodeBlockAsm:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeBlockAsm4MB(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm4MB(SB), $65560-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000200, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeBlockAsm4MB:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBlockAsm4MB
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  CX, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeBlockAsm4MB:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x06, SI
+	LEAL  4(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBlockAsm4MB
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x10, R11
+	IMULQ R9, R11
+	SHRQ  $0x32, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  24(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4)
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11
+	JNE   no_repeat_found_encodeBlockAsm4MB
+	LEAL  1(CX), DI
+	MOVL  12(SP), R8
+	MOVL  DI, SI
+	SUBL  16(SP), SI
+	JZ    repeat_extend_back_end_encodeBlockAsm4MB
+
+repeat_extend_back_loop_encodeBlockAsm4MB:
+	CMPL DI, R8
+	JLE  repeat_extend_back_end_encodeBlockAsm4MB
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeBlockAsm4MB
+	LEAL -1(DI), DI
+	DECL SI
+	JNZ  repeat_extend_back_loop_encodeBlockAsm4MB
+
+repeat_extend_back_end_encodeBlockAsm4MB:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm4MB
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeBlockAsm4MB
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeBlockAsm4MB
+	CMPL SI, $0x00010000
+	JLT  three_bytes_repeat_emit_encodeBlockAsm4MB
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (AX)
+	MOVW SI, 1(AX)
+	MOVB R11, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB
+
+three_bytes_repeat_emit_encodeBlockAsm4MB:
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB
+
+two_bytes_repeat_emit_encodeBlockAsm4MB:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_repeat_emit_encodeBlockAsm4MB
+	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB
+
+one_byte_repeat_emit_encodeBlockAsm4MB:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm4MB:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
+	MOVQ (R10), R11
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
+	MOVQ SI, AX
+	JMP  emit_literal_done_repeat_emit_encodeBlockAsm4MB
+
+memmove_long_repeat_emit_encodeBlockAsm4MB:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R12
+	SHRQ  $0x05, R12
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R13
+	SUBQ  R11, R13
+	DECQ  R12
+	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R13*1), R11
+	LEAQ  -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R14)
+	MOVOA X5, 16(R14)
+	ADDQ  $0x20, R14
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R13
+	DECQ  R12
+	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R13*1), X4
+	MOVOU -16(R10)(R13*1), X5
+	MOVOA X4, -32(AX)(R13*1)
+	MOVOA X5, -16(AX)(R13*1)
+	ADDQ  $0x20, R13
+	CMPQ  R9, R13
+	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm4MB:
+	ADDL $0x05, CX
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R9
+	SUBL CX, R9
+	LEAQ (DX)(CX*1), R10
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R12, R12
+	CMPL R9, $0x08
+	JL   matchlen_single_repeat_extend_encodeBlockAsm4MB
+
+matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
+	MOVQ  (R10)(R12*1), R11
+	XORQ  (SI)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_repeat_extend_encodeBlockAsm4MB
+	BSFQ  R11, R11
+	SARQ  $0x03, R11
+	LEAL  (R12)(R11*1), R12
+	JMP   repeat_extend_forward_end_encodeBlockAsm4MB
+
+matchlen_loop_repeat_extend_encodeBlockAsm4MB:
+	LEAL -8(R9), R9
+	LEAL 8(R12), R12
+	CMPL R9, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm4MB
+
+matchlen_single_repeat_extend_encodeBlockAsm4MB:
+	TESTL R9, R9
+	JZ    repeat_extend_forward_end_encodeBlockAsm4MB
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB:
+	MOVB (R10)(R12*1), R11
+	CMPB (SI)(R12*1), R11
+	JNE  repeat_extend_forward_end_encodeBlockAsm4MB
+	LEAL 1(R12), R12
+	DECL R9
+	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB
+
+repeat_extend_forward_end_encodeBlockAsm4MB:
+	ADDL  R12, CX
+	MOVL  CX, SI
+	SUBL  DI, SI
+	MOVL  16(SP), DI
+	TESTL R8, R8
+	JZ    repeat_as_copy_encodeBlockAsm4MB
+
+	// emitRepeat
+	MOVL SI, R8
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_match_repeat_encodeBlockAsm4MB
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_match_repeat_encodeBlockAsm4MB
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_match_repeat_encodeBlockAsm4MB
+	CMPL SI, $0x00010100
+	JLT  repeat_four_match_repeat_encodeBlockAsm4MB
+	LEAL -65536(SI), SI
+	MOVL SI, DI
+	MOVW $0x001d, (AX)
+	MOVW SI, 2(AX)
+	SARL $0x10, DI
+	MOVB DI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_match_repeat_encodeBlockAsm4MB:
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_match_repeat_encodeBlockAsm4MB:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_match_repeat_encodeBlockAsm4MB:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_match_repeat_encodeBlockAsm4MB:
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_as_copy_encodeBlockAsm4MB:
+	// emitCopy
+	CMPL DI, $0x00010000
+	JL   two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
+
+four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB:
+	CMPL SI, $0x40
+	JLE  four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
+	MOVB $0xff, (AX)
+	MOVL DI, 1(AX)
+	LEAL -64(SI), SI
+	ADDQ $0x05, AX
+	CMPL SI, $0x04
+	JL   four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
+
+	// emitRepeat
+	MOVL SI, R8
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+	CMPL SI, $0x00010100
+	JLT  repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+	LEAL -65536(SI), SI
+	MOVL SI, DI
+	MOVW $0x001d, (AX)
+	MOVW SI, 2(AX)
+	SARL $0x10, DI
+	MOVB DI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+	JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB
+
+four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
+	TESTL SI, SI
+	JZ    repeat_end_emit_encodeBlockAsm4MB
+	MOVB  $0x03, BL
+	LEAL  -4(BX)(SI*4), SI
+	MOVB  SI, (AX)
+	MOVL  DI, 1(AX)
+	ADDQ  $0x05, AX
+	JMP   repeat_end_emit_encodeBlockAsm4MB
+
+two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
+	MOVB $0xee, (AX)
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+
+	// emitRepeat
+	MOVL SI, R8
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+	CMPL SI, $0x00010100
+	JLT  repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+	LEAL -65536(SI), SI
+	MOVL SI, DI
+	MOVW $0x001d, (AX)
+	MOVW SI, 2(AX)
+	SARL $0x10, DI
+	MOVB DI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
+	CMPL DI, $0x00000800
+	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI
+	MOVB DI, 1(AX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm4MB
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm4MB:
+	MOVL CX, 12(SP)
+	JMP  search_loop_encodeBlockAsm4MB
+
+no_repeat_found_encodeBlockAsm4MB:
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate_match_encodeBlockAsm4MB
+	SHRQ $0x08, DI
+	MOVL 24(SP)(R10*4), SI
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI
+	JEQ  candidate2_match_encodeBlockAsm4MB
+	MOVL R9, 24(SP)(R10*4)
+	SHRQ $0x08, DI
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate3_match_encodeBlockAsm4MB
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeBlockAsm4MB
+
+candidate3_match_encodeBlockAsm4MB:
+	ADDL $0x02, CX
+	JMP  candidate_match_encodeBlockAsm4MB
+
+candidate2_match_encodeBlockAsm4MB:
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX
+	MOVL R8, SI
+
+candidate_match_encodeBlockAsm4MB:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBlockAsm4MB
+
+match_extend_back_loop_encodeBlockAsm4MB:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeBlockAsm4MB
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeBlockAsm4MB
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBlockAsm4MB
+	JMP  match_extend_back_loop_encodeBlockAsm4MB
+
+match_extend_back_end_encodeBlockAsm4MB:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 4(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeBlockAsm4MB
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeBlockAsm4MB:
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ  emit_literal_done_match_emit_encodeBlockAsm4MB
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI
+	SUBL R8, R9
+	LEAL -1(R9), R8
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeBlockAsm4MB
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeBlockAsm4MB
+	CMPL R8, $0x00010000
+	JLT  three_bytes_match_emit_encodeBlockAsm4MB
+	MOVL R8, R10
+	SHRL $0x10, R10
+	MOVB $0xf8, (AX)
+	MOVW R8, 1(AX)
+	MOVB R10, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_match_emit_encodeBlockAsm4MB
+
+three_bytes_match_emit_encodeBlockAsm4MB:
+	MOVB $0xf4, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBlockAsm4MB
+
+two_bytes_match_emit_encodeBlockAsm4MB:
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL   memmove_match_emit_encodeBlockAsm4MB
+	JMP  memmove_long_match_emit_encodeBlockAsm4MB
+
+one_byte_match_emit_encodeBlockAsm4MB:
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm4MB:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
+	MOVQ (DI), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm4MB:
+	MOVQ R8, AX
+	JMP  emit_literal_done_match_emit_encodeBlockAsm4MB
+
+memmove_long_match_emit_encodeBlockAsm4MB:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveLong
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm4MB:
+match_nolit_loop_encodeBlockAsm4MB:
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeBlockAsm4MB
+
+matchlen_loopback_match_nolit_encodeBlockAsm4MB:
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeBlockAsm4MB
+	BSFQ  R9, R9
+	SARQ  $0x03, R9
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeBlockAsm4MB
+
+matchlen_loop_match_nolit_encodeBlockAsm4MB:
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBlockAsm4MB
+
+matchlen_single_match_nolit_encodeBlockAsm4MB:
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeBlockAsm4MB
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm4MB:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeBlockAsm4MB
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm4MB
+
+match_nolit_end_encodeBlockAsm4MB:
+	ADDL R10, CX
+	MOVL 16(SP), SI
+	ADDL $0x04, R10
+	MOVL CX, 12(SP)
+
+	// emitCopy
+	CMPL SI, $0x00010000
+	JL   two_byte_offset_match_nolit_encodeBlockAsm4MB
+
+four_bytes_loop_back_match_nolit_encodeBlockAsm4MB:
+	CMPL R10, $0x40
+	JLE  four_bytes_remain_match_nolit_encodeBlockAsm4MB
+	MOVB $0xff, (AX)
+	MOVL SI, 1(AX)
+	LEAL -64(R10), R10
+	ADDQ $0x05, AX
+	CMPL R10, $0x04
+	JL   four_bytes_remain_match_nolit_encodeBlockAsm4MB
+
+	// emitRepeat
+	MOVL R10, DI
+	LEAL -4(R10), R10
+	CMPL DI, $0x08
+	JLE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
+	CMPL DI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
+	CMPL SI, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
+	CMPL R10, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
+	CMPL R10, $0x00010100
+	JLT  repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
+	LEAL -65536(R10), R10
+	MOVL R10, SI
+	MOVW $0x001d, (AX)
+	MOVW R10, 2(AX)
+	SARL $0x10, SI
+	MOVB SI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
+	LEAL -256(R10), R10
+	MOVW $0x0019, (AX)
+	MOVW R10, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
+	LEAL -4(R10), R10
+	MOVW $0x0015, (AX)
+	MOVB R10, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
+	SHLL $0x02, R10
+	ORL  $0x01, R10
+	MOVW R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
+	XORQ DI, DI
+	LEAL 1(DI)(R10*4), R10
+	MOVB SI, 1(AX)
+	SARL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+	JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB
+
+four_bytes_remain_match_nolit_encodeBlockAsm4MB:
+	TESTL R10, R10
+	JZ    match_nolit_emitcopy_end_encodeBlockAsm4MB
+	MOVB  $0x03, BL
+	LEAL  -4(BX)(R10*4), R10
+	MOVB  R10, (AX)
+	MOVL  SI, 1(AX)
+	ADDQ  $0x05, AX
+	JMP   match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+two_byte_offset_match_nolit_encodeBlockAsm4MB:
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm4MB
+	MOVB $0xee, (AX)
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+
+	// emitRepeat
+	MOVL R10, DI
+	LEAL -4(R10), R10
+	CMPL DI, $0x08
+	JLE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
+	CMPL DI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
+	CMPL SI, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+	CMPL R10, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
+	CMPL R10, $0x00010100
+	JLT  repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
+	LEAL -65536(R10), R10
+	MOVL R10, SI
+	MOVW $0x001d, (AX)
+	MOVW R10, 2(AX)
+	SARL $0x10, SI
+	MOVB SI, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+	LEAL -256(R10), R10
+	MOVW $0x0019, (AX)
+	MOVW R10, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+	LEAL -4(R10), R10
+	MOVW $0x0015, (AX)
+	MOVB R10, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+	SHLL $0x02, R10
+	ORL  $0x01, R10
+	MOVW R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+	XORQ DI, DI
+	LEAL 1(DI)(R10*4), R10
+	MOVB SI, 1(AX)
+	SARL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+	JMP two_byte_offset_match_nolit_encodeBlockAsm4MB
+
+two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBlockAsm4MB
+	CMPL SI, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeBlockAsm4MB
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10
+	MOVB SI, 1(AX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+emit_copy_three_match_nolit_encodeBlockAsm4MB:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm4MB:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeBlockAsm4MB
+	MOVQ -2(DX)(CX*1), DI
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeBlockAsm4MB
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeBlockAsm4MB:
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI
+	MOVQ  DI, SI
+	SHLQ  $0x10, R8
+	IMULQ R9, R8
+	SHRQ  $0x32, R8
+	SHLQ  $0x10, SI
+	IMULQ R9, SI
+	SHRQ  $0x32, SI
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI
+	MOVL  R9, 24(SP)(R8*4)
+	MOVL  CX, (R10)
+	CMPL  (DX)(SI*1), DI
+	JEQ   match_nolit_loop_encodeBlockAsm4MB
+	INCL  CX
+	JMP   search_loop_encodeBlockAsm4MB
+
+emit_remainder_encodeBlockAsm4MB:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 4(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeBlockAsm4MB
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeBlockAsm4MB:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm4MB
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBlockAsm4MB
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBlockAsm4MB
+	CMPL DX, $0x00010000
+	JLT  three_bytes_emit_remainder_encodeBlockAsm4MB
+	MOVL DX, BX
+	SHRL $0x10, BX
+	MOVB $0xf8, (AX)
+	MOVW DX, 1(AX)
+	MOVB BL, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB
+
+three_bytes_emit_remainder_encodeBlockAsm4MB:
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB
+
+two_bytes_emit_remainder_encodeBlockAsm4MB:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBlockAsm4MB
+	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB
+
+one_byte_emit_remainder_encodeBlockAsm4MB:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm4MB:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeBlockAsm4MB
+
+memmove_long_emit_remainder_encodeBlockAsm4MB:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm4MB:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm12B(SB), $16408-56 // Encodes src into dst as literal/copy/repeat ops; ret = bytes written, or 0 when the output limit in (SP) is reached. Frame = 24 scratch bytes + 16384-byte table (4096 x 4-byte src positions) at 24(SP).
+	MOVQ dst_base+0(FP), AX // AX = output write pointer
+	MOVQ $0x00000080, CX // 128 passes x 128 bytes = 16384 bytes to clear
+	LEAQ 24(SP), DX // DX = start of the position table
+	PXOR X0, X0
+
+zero_loop_encodeBlockAsm12B: // zero the position table, 128 bytes per pass
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBlockAsm12B
+	MOVL  $0x00000000, 12(SP) // 12(SP) = src position where the pending (not yet emitted) literals start
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP) // 8(SP) = src_len-8: upper bound for search positions
+	SHRQ  $0x05, CX
+	SUBL  CX, DX // DX = src_len - 9 - src_len/32
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP) // (SP) = dst + that amount: output limit checked before emitting
+	MOVL  $0x00000001, CX // CX = current src position, starts at 1
+	MOVL  CX, 16(SP) // 16(SP) = current repeat offset, starts at 1
+	MOVQ  src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeBlockAsm12B: // look for a match at src position CX
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x05, SI
+	LEAL  4(CX)(SI*1), SI // SI = CX + 4 + (CX - literal start)/32: next position to try
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBlockAsm12B // past the search limit: flush what is left
+	MOVQ  (DX)(CX*1), DI // DI = 8 bytes at src[CX]
+	MOVL  SI, 20(SP) // 20(SP) = position to resume at when nothing matches here
+	MOVQ  $0x000000cf1bbcdcbb, R9 // hash multiplier
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11 // R11 = bytes starting at src[CX+1]
+	SHLQ  $0x18, R10 // shift out the top 3 bytes: 5 bytes take part in the hash
+	IMULQ R9, R10
+	SHRQ  $0x34, R10 // keep the top 12 bits: table index for position CX
+	SHLQ  $0x18, R11
+	IMULQ R9, R11
+	SHRQ  $0x34, R11 // table index for position CX+1
+	MOVL  24(SP)(R10*4), SI // SI = candidate position read from the CX slot
+	MOVL  24(SP)(R11*4), R8 // R8 = candidate position read from the CX+1 slot
+	MOVL  CX, 24(SP)(R10*4) // record CX in its slot
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4) // record CX+1 in its slot
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x18, R10
+	IMULQ R9, R10
+	SHRQ  $0x34, R10 // R10 = table index for position CX+2
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11 // R11 = 4 bytes at src[CX+1 - repeat offset]
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11 // do the 4 bytes at src[CX+1] repeat at the current offset?
+	JNE   no_repeat_found_encodeBlockAsm12B
+	LEAL  1(CX), DI // DI = CX+1: start of the repeat match
+	MOVL  12(SP), R8 // R8 = literal start: lower bound for extending backwards
+	MOVL  DI, SI
+	SUBL  16(SP), SI // SI = DI - repeat offset: position of the earlier copy
+	JZ    repeat_extend_back_end_encodeBlockAsm12B // earlier copy is at src[0]: nothing before it
+
+repeat_extend_back_loop_encodeBlockAsm12B: // grow the match backwards while the preceding bytes agree
+	CMPL DI, R8
+	JLE  repeat_extend_back_end_encodeBlockAsm12B // do not go back past the pending literal start
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeBlockAsm12B
+	LEAL -1(DI), DI
+	DECL SI
+	JNZ  repeat_extend_back_loop_encodeBlockAsm12B
+
+repeat_extend_back_end_encodeBlockAsm12B: // emit src[12(SP):DI] as a literal, if non-empty
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm12B // no pending literals
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10 // R10 = pointer to the first literal byte
+	SUBL SI, R9 // R9 = literal length
+	LEAL -1(R9), SI // SI = length-1, the value stored in the header
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeBlockAsm12B // length-1 < 60: one-byte header
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeBlockAsm12B // length-1 < 256: two-byte header
+	MOVB $0xf4, (AX) // three-byte header: 0xf4 then 16-bit length-1
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeBlockAsm12B
+
+two_bytes_repeat_emit_encodeBlockAsm12B: // two-byte header: 0xf0 then 8-bit length-1
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_repeat_emit_encodeBlockAsm12B // length <= 64 fits the short copy
+	JMP  memmove_long_repeat_emit_encodeBlockAsm12B
+
+one_byte_repeat_emit_encodeBlockAsm12B: // one-byte header: (length-1)<<2
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm12B: // copy R9 (<= 64) literal bytes from R10 to AX
+	LEAQ (AX)(R9*1), SI // SI = output pointer after the literal
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: // NOTE(review): always moves 8 bytes, even when R9 < 8 - confirm src/dst slack
+	MOVQ (R10), R11
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: // two 8-byte moves covering first and last 8 bytes
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: // two 16-byte moves covering first and last 16 bytes
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: // four 16-byte moves covering first and last 32 bytes
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm12B:
+	MOVQ SI, AX // advance the output pointer past the literal
+	JMP  emit_literal_done_repeat_emit_encodeBlockAsm12B
+
+memmove_long_repeat_emit_encodeBlockAsm12B: // copy R9 literal bytes from R10 to AX in 32-byte steps
+	LEAQ (AX)(R9*1), SI // SI = output pointer after the literal
+
+	// genMemMoveLong
+	MOVOU (R10), X0 // X0-X3 = first and last 32 source bytes, stored after the loop
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R12
+	SHRQ  $0x05, R12 // R12 = length / 32
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R13
+	SUBQ  R11, R13 // R13 = 64 - (AX & 31), so AX+R13-32 is 32-byte aligned for the MOVOA stores
+	DECQ  R12
+	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R13*1), R11
+	LEAQ  -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R14)
+	MOVOA X5, 16(R14)
+	ADDQ  $0x20, R14
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R13
+	DECQ  R12
+	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: // 32 bytes per pass while R13 <= length
+	MOVOU -32(R10)(R13*1), X4
+	MOVOU -16(R10)(R13*1), X5
+	MOVOA X4, -32(AX)(R13*1)
+	MOVOA X5, -16(AX)(R13*1)
+	ADDQ  $0x20, R13
+	CMPQ  R9, R13
+	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // unaligned head and tail written from the saved registers
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX // advance the output pointer past the literal
+
+emit_literal_done_repeat_emit_encodeBlockAsm12B: // extend the repeat match forwards
+	ADDL $0x05, CX // skip to CX+5: the 4 bytes compared at CX+1 are known to match
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R9
+	SUBL CX, R9 // R9 = bytes left in src from CX
+	LEAQ (DX)(CX*1), R10 // R10 = pointer to src[CX]
+	LEAQ (DX)(SI*1), SI // SI = pointer to src[CX - repeat offset]
+
+	// matchLen
+	XORL R12, R12 // R12 = number of matching bytes found so far
+	CMPL R9, $0x08
+	JL   matchlen_single_repeat_extend_encodeBlockAsm12B
+
+matchlen_loopback_repeat_extend_encodeBlockAsm12B: // compare 8 bytes at a time
+	MOVQ  (R10)(R12*1), R11
+	XORQ  (SI)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_repeat_extend_encodeBlockAsm12B
+	BSFQ  R11, R11 // index of the lowest differing bit
+	SARQ  $0x03, R11 // bit index / 8 = matching bytes within this word
+	LEAL  (R12)(R11*1), R12
+	JMP   repeat_extend_forward_end_encodeBlockAsm12B
+
+matchlen_loop_repeat_extend_encodeBlockAsm12B: // all 8 bytes matched: advance
+	LEAL -8(R9), R9
+	LEAL 8(R12), R12
+	CMPL R9, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm12B
+
+matchlen_single_repeat_extend_encodeBlockAsm12B: // fewer than 8 bytes left: compare byte by byte
+	TESTL R9, R9
+	JZ    repeat_extend_forward_end_encodeBlockAsm12B
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm12B:
+	MOVB (R10)(R12*1), R11
+	CMPB (SI)(R12*1), R11
+	JNE  repeat_extend_forward_end_encodeBlockAsm12B
+	LEAL 1(R12), R12
+	DECL R9
+	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm12B
+
+repeat_extend_forward_end_encodeBlockAsm12B:
+	ADDL  R12, CX // CX = end of the match
+	MOVL  CX, SI
+	SUBL  DI, SI // SI = match length (DI = back-extended match start)
+	MOVL  16(SP), DI // DI = repeat offset
+	TESTL R8, R8 // R8 = literal start as it was before this match
+	JZ    repeat_as_copy_encodeBlockAsm12B // literal start was 0: encode as a copy with explicit offset (presumably no earlier offset exists to repeat - confirm)
+
+	// emitRepeat
+	MOVL SI, R8 // R8 = length, SI = length-4
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_match_repeat_encodeBlockAsm12B // length <= 8: two-byte form
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_match_repeat_encodeBlockAsm12B // length < 12 and offset < 2048: two-byte form carrying the offset
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_match_repeat_encodeBlockAsm12B // length-4 < 260: three-byte form
+	LEAL -256(SI), SI // four-byte form: 0x19 0x00 then 16-bit (length-4-256)
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm12B
+
+repeat_three_match_repeat_encodeBlockAsm12B: // three-byte form: 0x15 0x00 then 8-bit (length-8)
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_match_repeat_encodeBlockAsm12B: // two-byte form: ((length-4)<<2 | 1) stored as a 16-bit word
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_offset_match_repeat_encodeBlockAsm12B: // two bytes: ((length-4)<<2 | 1 | (offset>>8)<<5), then low offset byte
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm12B
+
+repeat_as_copy_encodeBlockAsm12B:
+	// emitCopy
+two_byte_offset_repeat_as_copy_encodeBlockAsm12B: // SI = length, DI = offset
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B // length <= 64: a single copy op
+	MOVB $0xee, (AX) // 0xee then 16-bit offset; 60 bytes are taken off the length
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+
+	// emitRepeat
+	MOVL SI, R8 // remaining length is encoded as a repeat; R8 = length, SI = length-4
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+	LEAL -256(SI), SI // four-byte form: 0x19 0x00 then 16-bit (length-4-256)
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm12B
+
+repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: // three-byte form: 0x15 0x00 then 8-bit (length-8)
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: // two-byte form: ((length-4)<<2 | 1) as a 16-bit word
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: // two bytes: ((length-4)<<2 | 1 | (offset>>8)<<5), then low offset byte
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm12B
+	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B // unreachable: the preceding JMP is unconditional
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: // length <= 64
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
+	CMPL DI, $0x00000800
+	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
+	MOVB $0x01, BL // length < 12 and offset < 2048: two-byte copy
+	LEAL -16(BX)(SI*4), SI // low byte = 1 + (length-4)*4; only the low byte is stored below
+	MOVB DI, 1(AX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm12B
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm12B: // three-byte copy: (2 + (length-1)*4) then 16-bit offset
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm12B:
+	MOVL CX, 12(SP) // everything up to CX is emitted; pending literals now start here
+	JMP  search_loop_encodeBlockAsm12B
+
+no_repeat_found_encodeBlockAsm12B: // check the table candidates for CX, CX+1 and CX+2 in turn
+	CMPL (DX)(SI*1), DI // 4 bytes at the CX candidate vs 4 bytes at src[CX]
+	JEQ  candidate_match_encodeBlockAsm12B
+	SHRQ $0x08, DI // DI now starts at src[CX+1]
+	MOVL 24(SP)(R10*4), SI // SI = candidate read from the CX+2 slot
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI // 4 bytes at the CX+1 candidate vs 4 bytes at src[CX+1]
+	JEQ  candidate2_match_encodeBlockAsm12B
+	MOVL R9, 24(SP)(R10*4) // record CX+2 in its slot
+	SHRQ $0x08, DI // DI now starts at src[CX+2]
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate3_match_encodeBlockAsm12B
+	MOVL 20(SP), CX // nothing matched: jump ahead to the saved next position
+	JMP  search_loop_encodeBlockAsm12B
+
+candidate3_match_encodeBlockAsm12B: // match found at CX+2
+	ADDL $0x02, CX
+	JMP  candidate_match_encodeBlockAsm12B
+
+candidate2_match_encodeBlockAsm12B: // match found at CX+1
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX
+	MOVL R8, SI
+
+candidate_match_encodeBlockAsm12B: // CX = match position, SI = candidate position
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBlockAsm12B // candidate is at src[0]: nothing before it
+
+match_extend_back_loop_encodeBlockAsm12B: // grow the match backwards while the preceding bytes agree
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeBlockAsm12B // do not go back past the pending literal start
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeBlockAsm12B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBlockAsm12B
+	JMP  match_extend_back_loop_encodeBlockAsm12B
+
+match_extend_back_end_encodeBlockAsm12B: // make sure the pending literal fits below the output limit
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI // DI = AX + literal length + 3 (largest literal header written here)
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+	RET
+
+match_dst_size_check_encodeBlockAsm12B: // emit src[12(SP):CX] as a literal, if non-empty
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ  emit_literal_done_match_emit_encodeBlockAsm12B // no pending literals
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI // DI = pointer to the first literal byte
+	SUBL R8, R9 // R9 = literal length
+	LEAL -1(R9), R8 // R8 = length-1, the value stored in the header
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeBlockAsm12B // length-1 < 60: one-byte header
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeBlockAsm12B // length-1 < 256: two-byte header
+	MOVB $0xf4, (AX) // three-byte header: 0xf4 then 16-bit length-1
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBlockAsm12B
+
+two_bytes_match_emit_encodeBlockAsm12B: // two-byte header: 0xf0 then 8-bit length-1
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL   memmove_match_emit_encodeBlockAsm12B // length <= 64 fits the short copy
+	JMP  memmove_long_match_emit_encodeBlockAsm12B
+
+one_byte_match_emit_encodeBlockAsm12B: // one-byte header: (length-1)<<2
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm12B: // copy R9 (<= 64) literal bytes from DI to AX
+	LEAQ (AX)(R9*1), R8 // R8 = output pointer after the literal
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: // NOTE(review): always moves 8 bytes, even when R9 < 8 - confirm src/dst slack
+	MOVQ (DI), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: // two 8-byte moves covering first and last 8 bytes
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: // two 16-byte moves covering first and last 16 bytes
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: // four 16-byte moves covering first and last 32 bytes
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm12B:
+	MOVQ R8, AX // advance the output pointer past the literal
+	JMP  emit_literal_done_match_emit_encodeBlockAsm12B
+
+memmove_long_match_emit_encodeBlockAsm12B: // copy R9 literal bytes from DI to AX in 32-byte steps
+	LEAQ (AX)(R9*1), R8 // R8 = output pointer after the literal
+
+	// genMemMoveLong
+	MOVOU (DI), X0 // X0-X3 = first and last 32 source bytes, stored after the loop
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11 // R11 = length / 32
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12 // R12 = 64 - (AX & 31), so AX+R12-32 is 32-byte aligned for the MOVOA stores
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: // 32 bytes per pass while R12 <= length
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // unaligned head and tail written from the saved registers
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX // advance the output pointer past the literal
+
+emit_literal_done_match_emit_encodeBlockAsm12B:
+match_nolit_loop_encodeBlockAsm12B: // CX = match position, SI = candidate position; the first 4 bytes are known equal
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP) // 16(SP) = CX - SI: this match's offset becomes the repeat offset
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI // DI = bytes left in src from CX
+	LEAQ (DX)(CX*1), R8 // R8 = pointer to src[CX]
+	LEAQ (DX)(SI*1), SI // SI = pointer to the candidate bytes
+
+	// matchLen
+	XORL R10, R10 // R10 = number of matching bytes found so far
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeBlockAsm12B: // compare 8 bytes at a time
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeBlockAsm12B
+	BSFQ  R9, R9 // index of the lowest differing bit
+	SARQ  $0x03, R9 // bit index / 8 = matching bytes within this word
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeBlockAsm12B
+
+matchlen_loop_match_nolit_encodeBlockAsm12B: // all 8 bytes matched: advance
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBlockAsm12B
+
+matchlen_single_match_nolit_encodeBlockAsm12B: // fewer than 8 bytes left: compare byte by byte
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm12B:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeBlockAsm12B
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm12B
+
+match_nolit_end_encodeBlockAsm12B:
+	ADDL R10, CX // CX = end of the match
+	MOVL 16(SP), SI // SI = offset
+	ADDL $0x04, R10 // R10 = total match length, including the first 4 bytes
+	MOVL CX, 12(SP) // pending literals now start after the match
+
+	// emitCopy
+two_byte_offset_match_nolit_encodeBlockAsm12B: // R10 = length, SI = offset
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm12B // length <= 64: a single copy op
+	MOVB $0xee, (AX) // 0xee then 16-bit offset; 60 bytes are taken off the length
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+
+	// emitRepeat
+	MOVL R10, DI // remaining length is encoded as a repeat; DI = length, R10 = length-4
+	LEAL -4(R10), R10
+	CMPL DI, $0x08
+	JLE  repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
+	CMPL DI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
+	CMPL SI, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
+	CMPL R10, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
+	LEAL -256(R10), R10 // four-byte form: 0x19 0x00 then 16-bit (length-4-256)
+	MOVW $0x0019, (AX)
+	MOVW R10, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: // three-byte form: 0x15 0x00 then 8-bit (length-8)
+	LEAL -4(R10), R10
+	MOVW $0x0015, (AX)
+	MOVB R10, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: // two-byte form: ((length-4)<<2 | 1) as a 16-bit word
+	SHLL $0x02, R10
+	ORL  $0x01, R10
+	MOVW R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: // two bytes: ((length-4)<<2 | 1 | (offset>>8)<<5), then low offset byte
+	XORQ DI, DI
+	LEAL 1(DI)(R10*4), R10
+	MOVB SI, 1(AX)
+	SARL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B
+	JMP two_byte_offset_match_nolit_encodeBlockAsm12B // unreachable: the preceding JMP is unconditional
+
+two_byte_offset_short_match_nolit_encodeBlockAsm12B: // length <= 64
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBlockAsm12B
+	CMPL SI, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeBlockAsm12B
+	MOVB $0x01, BL // length < 12 and offset < 2048: two-byte copy
+	LEAL -16(BX)(R10*4), R10 // low byte = 1 + (length-4)*4; only the low byte is stored below
+	MOVB SI, 1(AX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B
+
+emit_copy_three_match_nolit_encodeBlockAsm12B: // three-byte copy: (2 + (length-1)*4) then 16-bit offset
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm12B: // after a copy: update the table and test for an immediate follow-on match
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeBlockAsm12B // past the search limit: flush what is left
+	MOVQ -2(DX)(CX*1), DI // DI = 8 bytes at src[CX-2]
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP) // output pointer reached the limit: return 0
+	RET
+
+match_nolit_dst_ok_encodeBlockAsm12B:
+	MOVQ  $0x000000cf1bbcdcbb, R9 // hash multiplier
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI // DI = bytes starting at src[CX]
+	MOVQ  DI, SI
+	SHLQ  $0x18, R8
+	IMULQ R9, R8
+	SHRQ  $0x34, R8 // R8 = table index for position CX-2
+	SHLQ  $0x18, SI
+	IMULQ R9, SI
+	SHRQ  $0x34, SI // SI = table index for position CX
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI // SI = candidate position read from the CX slot
+	MOVL  R9, 24(SP)(R8*4) // record CX-2 in its slot
+	MOVL  CX, (R10) // record CX in its slot
+	CMPL  (DX)(SI*1), DI // 4 bytes at the candidate vs 4 bytes at src[CX]
+	JEQ   match_nolit_loop_encodeBlockAsm12B // another match with no literals in between
+	INCL  CX
+	JMP   search_loop_encodeBlockAsm12B
+
+emit_remainder_encodeBlockAsm12B: // emit src[12(SP):src_len] as a final literal
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX // CX = AX + remaining length + 3 (largest literal header written here)
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+	RET
+
+emit_remainder_ok_encodeBlockAsm12B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm12B // nothing left to emit
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX // CX = pointer to the first literal byte
+	SUBL BX, SI // SI = literal length
+	LEAL -1(SI), DX // DX = length-1 (the src base pointer is overwritten from here on)
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBlockAsm12B // length-1 < 60: one-byte header
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBlockAsm12B // length-1 < 256: two-byte header
+	MOVB $0xf4, (AX) // three-byte header: 0xf4 then 16-bit length-1
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBlockAsm12B
+
+two_bytes_emit_remainder_encodeBlockAsm12B: // two-byte header: 0xf0 then 8-bit length-1
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBlockAsm12B // length <= 64 fits the short copy
+	JMP  memmove_long_emit_remainder_encodeBlockAsm12B
+
+one_byte_emit_remainder_encodeBlockAsm12B: // one-byte header: (length-1)<<2
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm12B: // copy SI (<= 64) literal bytes from CX to AX
+	LEAQ (AX)(SI*1), DX // DX = output pointer after the literal
+	MOVL SI, BX // BX = length
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: // NOTE(review): always moves 8 bytes, even when BX < 8; this literal ends at src_len, so the load may extend past src - confirm callers guarantee slack
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: // two 8-byte moves covering first and last 8 bytes
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: // two 16-byte moves covering first and last 16 bytes
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: // four 16-byte moves covering first and last 32 bytes
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm12B:
+	MOVQ DX, AX // advance the output pointer past the literal
+	JMP  emit_literal_done_emit_remainder_encodeBlockAsm12B
+
+memmove_long_emit_remainder_encodeBlockAsm12B: // copy SI literal bytes from CX to AX in 32-byte steps
+	LEAQ (AX)(SI*1), DX // DX = output pointer after the literal
+	MOVL SI, BX // BX = length
+
+	// genMemMoveLong
+	MOVOU (CX), X0 // X0-X3 = first and last 32 source bytes, stored after the loop
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI // DI = length / 32
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8 // R8 = 64 - (AX & 31), so AX+R8-32 is 32-byte aligned for the MOVOA stores
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: // 32 bytes per pass while R8 <= length
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // unaligned head and tail written from the saved registers
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX // advance the output pointer past the literal
+
+emit_literal_done_emit_remainder_encodeBlockAsm12B: // return the number of bytes written to dst
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst as literal/copy/repeat ops; returns bytes written to dst, or 0 when the output limit kept at (SP) would be reached.
+TEXT ·encodeBlockAsm10B(SB), $4120-56 // frame: scratch slots at 0..23(SP), 4096-byte table of 32-bit positions at 24(SP)
+	MOVQ dst_base+0(FP), AX // AX = output cursor, starts at dst_base
+	MOVQ $0x00000020, CX // 32 iterations x 128 bytes = 4096 bytes to clear
+	LEAQ 24(SP), DX // DX = start of the table
+	PXOR X0, X0
+
+zero_loop_encodeBlockAsm10B:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBlockAsm10B
+	MOVL  $0x00000000, 12(SP) // 12(SP) = start of the not-yet-emitted literals
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP) // 8(SP) = src_len - 8: bound for the search position
+	SHRQ  $0x05, CX
+	SUBL  CX, DX // DX = src_len - 9 - src_len/32
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP) // (SP) = dst_base + DX: output pointer limit checked before emitting
+	MOVL  $0x00000001, CX // CX = current position in src, starts at 1
+	MOVL  CX, 16(SP) // 16(SP) = repeat offset, starts at 1
+	MOVQ  src_base+24(FP), DX // DX = src_base (until emit_remainder_ok reuses DX)
+
+search_loop_encodeBlockAsm10B:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x05, SI
+	LEAL  4(CX)(SI*1), SI // SI = CX + 4 + (CX - 12(SP))/32: position to continue at on a miss
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBlockAsm10B // stop searching once SI >= 8(SP)
+	MOVQ  (DX)(CX*1), DI // DI = 8 bytes of src at CX
+	MOVL  SI, 20(SP) // 20(SP) = saved continue position
+	MOVQ  $0x9e3779b1, R9 // hash multiplier
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11
+	SHLQ  $0x20, R10 // keep only the low 4 bytes for hashing
+	IMULQ R9, R10
+	SHRQ  $0x36, R10 // top 10 bits: table index for the 4 bytes at CX
+	SHLQ  $0x20, R11
+	IMULQ R9, R11
+	SHRQ  $0x36, R11 // table index for the 4 bytes at CX+1
+	MOVL  24(SP)(R10*4), SI // SI = candidate position for CX
+	MOVL  24(SP)(R11*4), R8 // R8 = candidate position for CX+1
+	MOVL  CX, 24(SP)(R10*4) // table entry for CX := CX
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4) // table entry for CX+1 := CX+1
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x20, R10
+	IMULQ R9, R10
+	SHRQ  $0x36, R10 // R10 = table index for the 4 bytes at CX+2
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11 // R11 = 4 bytes at CX + 1 - repeat offset
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11 // against the 4 bytes at CX+1
+	JNE   no_repeat_found_encodeBlockAsm10B
+	LEAL  1(CX), DI // DI = CX+1: start of the repeat match
+	MOVL  12(SP), R8 // R8 = literal start; tested again in repeat_extend_forward_end
+	MOVL  DI, SI
+	SUBL  16(SP), SI // SI = earlier position the repeat matches against
+	JZ    repeat_extend_back_end_encodeBlockAsm10B
+
+repeat_extend_back_loop_encodeBlockAsm10B: // move DI/SI backwards while the preceding bytes are equal and DI > R8
+	CMPL DI, R8
+	JLE  repeat_extend_back_end_encodeBlockAsm10B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeBlockAsm10B
+	LEAL -1(DI), DI
+	DECL SI
+	JNZ  repeat_extend_back_loop_encodeBlockAsm10B
+
+repeat_extend_back_end_encodeBlockAsm10B:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm10B // no pending literals
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10 // R10 = address of the pending literals
+	SUBL SI, R9 // R9 = literal length
+	LEAL -1(R9), SI // SI = length - 1, the value written into the header
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeBlockAsm10B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeBlockAsm10B
+	MOVB $0xf4, (AX) // 3-byte header: 0xf4, then 16-bit length-1
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeBlockAsm10B
+
+two_bytes_repeat_emit_encodeBlockAsm10B:
+	MOVB $0xf0, (AX) // 2-byte header: 0xf0, then 8-bit length-1
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_repeat_emit_encodeBlockAsm10B
+	JMP  memmove_long_repeat_emit_encodeBlockAsm10B
+
+one_byte_repeat_emit_encodeBlockAsm10B:
+	SHLB $0x02, SI // 1-byte header: (length-1)<<2
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm10B:
+	LEAQ (AX)(R9*1), SI // SI = output cursor after the copy
+
+	// genMemMoveShort: copy R9 (at most 64) bytes from (R10) to (AX)
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
+	MOVQ (R10), R11 // always moves a full 8 bytes, even when R9 < 8
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm10B:
+	MOVQ SI, AX // cursor advances by exactly R9 bytes
+	JMP  emit_literal_done_repeat_emit_encodeBlockAsm10B
+
+memmove_long_repeat_emit_encodeBlockAsm10B:
+	LEAQ (AX)(R9*1), SI // SI = output cursor after the copy
+
+	// genMemMoveLong: copy R9 (more than 64) bytes from (R10) to (AX)
+	MOVOU (R10), X0 // X0-X3 = first and last 32 source bytes, stored after the loops
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R12
+	SHRQ  $0x05, R12
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R13
+	SUBQ  R11, R13 // R13 = 64 - (AX & 31)
+	DECQ  R12
+	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R13*1), R11
+	LEAQ  -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R14)
+	MOVOA X5, 16(R14)
+	ADDQ  $0x20, R14
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R13
+	DECQ  R12
+	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R13*1), X4
+	MOVOU -16(R10)(R13*1), X5
+	MOVOA X4, -32(AX)(R13*1)
+	MOVOA X5, -16(AX)(R13*1)
+	ADDQ  $0x20, R13
+	CMPQ  R9, R13
+	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // unaligned head and tail written last
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm10B:
+	ADDL $0x05, CX // CX+1 plus the 4 bytes already compared
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R9
+	SUBL CX, R9 // R9 = bytes of src left after CX
+	LEAQ (DX)(CX*1), R10
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen: R12 = number of equal leading bytes at (R10) and (SI), at most R9
+	XORL R12, R12
+	CMPL R9, $0x08
+	JL   matchlen_single_repeat_extend_encodeBlockAsm10B
+
+matchlen_loopback_repeat_extend_encodeBlockAsm10B:
+	MOVQ  (R10)(R12*1), R11
+	XORQ  (SI)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_repeat_extend_encodeBlockAsm10B
+	BSFQ  R11, R11 // index of the lowest differing bit
+	SARQ  $0x03, R11 // /8: equal bytes within this 8-byte word
+	LEAL  (R12)(R11*1), R12
+	JMP   repeat_extend_forward_end_encodeBlockAsm10B
+
+matchlen_loop_repeat_extend_encodeBlockAsm10B:
+	LEAL -8(R9), R9
+	LEAL 8(R12), R12
+	CMPL R9, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm10B
+
+matchlen_single_repeat_extend_encodeBlockAsm10B:
+	TESTL R9, R9
+	JZ    repeat_extend_forward_end_encodeBlockAsm10B
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm10B:
+	MOVB (R10)(R12*1), R11
+	CMPB (SI)(R12*1), R11
+	JNE  repeat_extend_forward_end_encodeBlockAsm10B
+	LEAL 1(R12), R12
+	DECL R9
+	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm10B
+
+repeat_extend_forward_end_encodeBlockAsm10B:
+	ADDL  R12, CX
+	MOVL  CX, SI
+	SUBL  DI, SI // SI = match length (CX - match start)
+	MOVL  16(SP), DI // DI = repeat offset
+	TESTL R8, R8
+	JZ    repeat_as_copy_encodeBlockAsm10B // literal start was 0 before this match: emit a copy instead of a repeat
+
+	// emitRepeat: length in SI, offset in DI
+	MOVL SI, R8
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_match_repeat_encodeBlockAsm10B
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_match_repeat_encodeBlockAsm10B
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_match_repeat_encodeBlockAsm10B
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX) // 4-byte form: word 0x0019, then 16-bit length-260
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm10B
+
+repeat_three_match_repeat_encodeBlockAsm10B:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX) // 3-byte form: word 0x0015, then 8-bit length-8
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_match_repeat_encodeBlockAsm10B:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX) // 2-byte form: word ((length-4)<<2)|1
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_offset_match_repeat_encodeBlockAsm10B:
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX) // byte 1 = low 8 bits of the offset
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI // offset bits 8 and up go into bits 5+ of byte 0
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm10B
+
+repeat_as_copy_encodeBlockAsm10B:
+	// emitCopy: length in SI, offset in DI
+two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
+	MOVB $0xee, (AX) // 3-byte op: 0xee + 16-bit offset; 60 is then subtracted from the length
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+
+	// emitRepeat: encode the remaining length in SI
+	MOVL SI, R8
+	LEAL -4(SI), SI
+	CMPL R8, $0x08
+	JLE  repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+	CMPL R8, $0x0c
+	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+	CMPL DI, $0x00000800
+	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm10B
+
+repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm10B
+	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B // not reached: follows an unconditional JMP
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
+	CMPL DI, $0x00000800
+	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI // low byte = ((length-4)<<2)|1; only the low byte is stored below
+	MOVB DI, 1(AX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm10B
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI // low byte = ((length-1)<<2)|2, followed by the 16-bit offset
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm10B:
+	MOVL CX, 12(SP) // everything before CX has been emitted
+	JMP  search_loop_encodeBlockAsm10B
+
+no_repeat_found_encodeBlockAsm10B:
+	CMPL (DX)(SI*1), DI // 4 bytes at the candidate vs 4 bytes at CX
+	JEQ  candidate_match_encodeBlockAsm10B
+	SHRQ $0x08, DI // DI now begins with the bytes at CX+1
+	MOVL 24(SP)(R10*4), SI // SI = candidate position for CX+2
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI
+	JEQ  candidate2_match_encodeBlockAsm10B
+	MOVL R9, 24(SP)(R10*4) // table entry for CX+2 := CX+2
+	SHRQ $0x08, DI
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate3_match_encodeBlockAsm10B
+	MOVL 20(SP), CX // no match: continue at the saved position
+	JMP  search_loop_encodeBlockAsm10B
+
+candidate3_match_encodeBlockAsm10B:
+	ADDL $0x02, CX // the match starts at CX+2
+	JMP  candidate_match_encodeBlockAsm10B
+
+candidate2_match_encodeBlockAsm10B:
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX // the match starts at CX+1
+	MOVL R8, SI
+
+candidate_match_encodeBlockAsm10B:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBlockAsm10B
+
+match_extend_back_loop_encodeBlockAsm10B: // move CX/SI backwards while the preceding bytes are equal and CX > 12(SP)
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeBlockAsm10B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeBlockAsm10B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBlockAsm10B
+	JMP  match_extend_back_loop_encodeBlockAsm10B
+
+match_extend_back_end_encodeBlockAsm10B:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI // cursor + pending literal length + 3
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+	RET
+
+match_dst_size_check_encodeBlockAsm10B:
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ  emit_literal_done_match_emit_encodeBlockAsm10B // no pending literals
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI // DI = address of the pending literals
+	SUBL R8, R9 // R9 = literal length
+	LEAL -1(R9), R8 // R8 = length - 1, the value written into the header
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeBlockAsm10B
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeBlockAsm10B
+	MOVB $0xf4, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBlockAsm10B
+
+two_bytes_match_emit_encodeBlockAsm10B:
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL   memmove_match_emit_encodeBlockAsm10B
+	JMP  memmove_long_match_emit_encodeBlockAsm10B
+
+one_byte_match_emit_encodeBlockAsm10B:
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm10B:
+	LEAQ (AX)(R9*1), R8 // R8 = output cursor after the copy
+
+	// genMemMoveShort: copy R9 (at most 64) bytes from (DI) to (AX)
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
+	MOVQ (DI), R10 // always moves a full 8 bytes, even when R9 < 8
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm10B:
+	MOVQ R8, AX // cursor advances by exactly R9 bytes
+	JMP  emit_literal_done_match_emit_encodeBlockAsm10B
+
+memmove_long_match_emit_encodeBlockAsm10B:
+	LEAQ (AX)(R9*1), R8 // R8 = output cursor after the copy
+
+	// genMemMoveLong: copy R9 (more than 64) bytes from (DI) to (AX)
+	MOVOU (DI), X0 // X0-X3 = first and last 32 source bytes, stored after the loops
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12 // R12 = 64 - (AX & 31)
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // unaligned head and tail written last
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm10B:
+match_nolit_loop_encodeBlockAsm10B:
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP) // repeat offset := CX - candidate position
+	ADDL $0x04, CX // skip the 4 bytes already known to match
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI // DI = bytes of src left after CX
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen: R10 = number of equal leading bytes at (R8) and (SI), at most DI
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeBlockAsm10B:
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeBlockAsm10B
+	BSFQ  R9, R9 // index of the lowest differing bit
+	SARQ  $0x03, R9 // /8: equal bytes within this 8-byte word
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeBlockAsm10B
+
+matchlen_loop_match_nolit_encodeBlockAsm10B:
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBlockAsm10B
+
+matchlen_single_match_nolit_encodeBlockAsm10B:
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm10B:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeBlockAsm10B
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm10B
+
+match_nolit_end_encodeBlockAsm10B:
+	ADDL R10, CX
+	MOVL 16(SP), SI // SI = offset to encode
+	ADDL $0x04, R10 // total length includes the 4 bytes skipped before matchLen
+	MOVL CX, 12(SP) // everything before CX counts as emitted
+
+	// emitCopy: length in R10, offset in SI
+two_byte_offset_match_nolit_encodeBlockAsm10B:
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm10B
+	MOVB $0xee, (AX) // 3-byte op: 0xee + 16-bit offset; 60 is then subtracted from the length
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+
+	// emitRepeat: encode the remaining length in R10
+	MOVL R10, DI
+	LEAL -4(R10), R10
+	CMPL DI, $0x08
+	JLE  repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
+	CMPL DI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
+	CMPL SI, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
+	CMPL R10, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
+	LEAL -256(R10), R10
+	MOVW $0x0019, (AX)
+	MOVW R10, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
+	LEAL -4(R10), R10
+	MOVW $0x0015, (AX)
+	MOVB R10, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
+	SHLL $0x02, R10
+	ORL  $0x01, R10
+	MOVW R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
+	XORQ DI, DI
+	LEAL 1(DI)(R10*4), R10
+	MOVB SI, 1(AX)
+	SARL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
+	JMP two_byte_offset_match_nolit_encodeBlockAsm10B // not reached: follows an unconditional JMP
+
+two_byte_offset_short_match_nolit_encodeBlockAsm10B:
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBlockAsm10B
+	CMPL SI, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeBlockAsm10B
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10 // low byte = ((length-4)<<2)|1; only the low byte is stored below
+	MOVB SI, 1(AX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
+
+emit_copy_three_match_nolit_encodeBlockAsm10B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10 // low byte = ((length-1)<<2)|2, followed by the 16-bit offset
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm10B:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeBlockAsm10B // past the search bound: flush what is left
+	MOVQ -2(DX)(CX*1), DI // DI = 8 bytes of src starting at CX-2
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP) // output limit reached: return 0
+	RET
+
+match_nolit_dst_ok_encodeBlockAsm10B:
+	MOVQ  $0x9e3779b1, R9
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI // DI now begins with the bytes at CX
+	MOVQ  DI, SI
+	SHLQ  $0x20, R8
+	IMULQ R9, R8
+	SHRQ  $0x36, R8 // R8 = table index for the 4 bytes at CX-2
+	SHLQ  $0x20, SI
+	IMULQ R9, SI
+	SHRQ  $0x36, SI // SI = table index for the 4 bytes at CX
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI // SI = candidate position for CX
+	MOVL  R9, 24(SP)(R8*4) // table entry for CX-2 := CX-2
+	MOVL  CX, (R10) // table entry for CX := CX
+	CMPL  (DX)(SI*1), DI
+	JEQ   match_nolit_loop_encodeBlockAsm10B // another match right at CX, no literals in between
+	INCL  CX
+	JMP   search_loop_encodeBlockAsm10B
+
+emit_remainder_encodeBlockAsm10B:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX // CX = number of src bytes not yet emitted
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+	RET
+
+emit_remainder_ok_encodeBlockAsm10B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm10B // nothing left to emit
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX // CX = address of the trailing literals
+	SUBL BX, SI // SI = literal length
+	LEAL -1(SI), DX // DX is reused from here on: length - 1 for the header
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBlockAsm10B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBlockAsm10B
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBlockAsm10B
+
+two_bytes_emit_remainder_encodeBlockAsm10B:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBlockAsm10B
+	JMP  memmove_long_emit_remainder_encodeBlockAsm10B
+
+one_byte_emit_remainder_encodeBlockAsm10B:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm10B:
+	LEAQ (AX)(SI*1), DX // DX = output cursor after the copy
+	MOVL SI, BX
+
+	// genMemMoveShort: copy BX (at most 64) bytes from (CX) to (AX)
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8:
+	MOVQ (CX), SI // always moves a full 8 bytes, even when BX < 8
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm10B:
+	MOVQ DX, AX // cursor advances by exactly BX bytes
+	JMP  emit_literal_done_emit_remainder_encodeBlockAsm10B
+
+memmove_long_emit_remainder_encodeBlockAsm10B:
+	LEAQ (AX)(SI*1), DX // DX = output cursor after the copy
+	MOVL SI, BX
+
+	// genMemMoveLong: copy BX (more than 64) bytes from (CX) to (AX)
+	MOVOU (CX), X0 // X0-X3 = first and last 32 source bytes, stored after the loops
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8 // R8 = 64 - (AX & 31)
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // unaligned head and tail written last
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm10B:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX // AX = number of bytes written to dst
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm8B(SB), $1048-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000008, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeBlockAsm8B:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBlockAsm8B
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  CX, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeBlockAsm8B:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x04, SI
+	LEAL  4(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBlockAsm8B
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x9e3779b1, R9
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11
+	SHLQ  $0x20, R10
+	IMULQ R9, R10
+	SHRQ  $0x38, R10
+	SHLQ  $0x20, R11
+	IMULQ R9, R11
+	SHRQ  $0x38, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  24(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4)
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x20, R10
+	IMULQ R9, R10
+	SHRQ  $0x38, R10
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11
+	JNE   no_repeat_found_encodeBlockAsm8B
+	LEAL  1(CX), DI
+	MOVL  12(SP), R8
+	MOVL  DI, SI
+	SUBL  16(SP), SI
+	JZ    repeat_extend_back_end_encodeBlockAsm8B
+
+repeat_extend_back_loop_encodeBlockAsm8B:
+	CMPL DI, R8
+	JLE  repeat_extend_back_end_encodeBlockAsm8B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeBlockAsm8B
+	LEAL -1(DI), DI
+	DECL SI
+	JNZ  repeat_extend_back_loop_encodeBlockAsm8B
+
+repeat_extend_back_end_encodeBlockAsm8B:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm8B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeBlockAsm8B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeBlockAsm8B
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeBlockAsm8B
+
+two_bytes_repeat_emit_encodeBlockAsm8B:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_repeat_emit_encodeBlockAsm8B
+	JMP  memmove_long_repeat_emit_encodeBlockAsm8B
+
+one_byte_repeat_emit_encodeBlockAsm8B:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm8B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
+	MOVQ (R10), R11
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm8B:
+	MOVQ SI, AX
+	JMP  emit_literal_done_repeat_emit_encodeBlockAsm8B
+
+memmove_long_repeat_emit_encodeBlockAsm8B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R12
+	SHRQ  $0x05, R12
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R13
+	SUBQ  R11, R13
+	DECQ  R12
+	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R13*1), R11
+	LEAQ  -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R14)
+	MOVOA X5, 16(R14)
+	ADDQ  $0x20, R14
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R13
+	DECQ  R12
+	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R13*1), X4
+	MOVOU -16(R10)(R13*1), X5
+	MOVOA X4, -32(AX)(R13*1)
+	MOVOA X5, -16(AX)(R13*1)
+	ADDQ  $0x20, R13
+	CMPQ  R9, R13
+	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm8B:
+	ADDL $0x05, CX
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R9
+	SUBL CX, R9
+	LEAQ (DX)(CX*1), R10
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R12, R12
+	CMPL R9, $0x08
+	JL   matchlen_single_repeat_extend_encodeBlockAsm8B
+
+matchlen_loopback_repeat_extend_encodeBlockAsm8B:
+	MOVQ  (R10)(R12*1), R11
+	XORQ  (SI)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_repeat_extend_encodeBlockAsm8B
+	BSFQ  R11, R11
+	SARQ  $0x03, R11
+	LEAL  (R12)(R11*1), R12
+	JMP   repeat_extend_forward_end_encodeBlockAsm8B
+
+matchlen_loop_repeat_extend_encodeBlockAsm8B:
+	LEAL -8(R9), R9
+	LEAL 8(R12), R12
+	CMPL R9, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm8B
+
+matchlen_single_repeat_extend_encodeBlockAsm8B:
+	TESTL R9, R9
+	JZ    repeat_extend_forward_end_encodeBlockAsm8B
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm8B:
+	MOVB (R10)(R12*1), R11
+	CMPB (SI)(R12*1), R11
+	JNE  repeat_extend_forward_end_encodeBlockAsm8B
+	LEAL 1(R12), R12
+	DECL R9
+	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm8B
+
+repeat_extend_forward_end_encodeBlockAsm8B:
+	ADDL  R12, CX
+	MOVL  CX, SI
+	SUBL  DI, SI
+	MOVL  16(SP), DI
+	TESTL R8, R8
+	JZ    repeat_as_copy_encodeBlockAsm8B
+
+	// emitRepeat
+	MOVL SI, DI
+	LEAL -4(SI), SI
+	CMPL DI, $0x08
+	JLE  repeat_two_match_repeat_encodeBlockAsm8B
+	CMPL DI, $0x0c
+	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_match_repeat_encodeBlockAsm8B
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm8B
+
+repeat_three_match_repeat_encodeBlockAsm8B:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm8B
+
+repeat_two_match_repeat_encodeBlockAsm8B:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm8B
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm8B
+
+repeat_as_copy_encodeBlockAsm8B:
+	// emitCopy
+two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
+	MOVB $0xee, (AX)
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+
+	// emitRepeat
+	MOVL SI, DI
+	LEAL -4(SI), SI
+	CMPL DI, $0x08
+	JLE  repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
+	CMPL DI, $0x0c
+	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+	CMPL SI, $0x00000104
+	JLT  repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
+	LEAL -256(SI), SI
+	MOVW $0x0019, (AX)
+	MOVW SI, 2(AX)
+	ADDQ $0x04, AX
+	JMP  repeat_end_emit_encodeBlockAsm8B
+
+repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (AX)
+	MOVB SI, 2(AX)
+	ADDQ $0x03, AX
+	JMP  repeat_end_emit_encodeBlockAsm8B
+
+repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+	SHLL $0x02, SI
+	ORL  $0x01, SI
+	MOVW SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm8B
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(AX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm8B
+	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm8B
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI
+	MOVB DI, 1(AX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeBlockAsm8B
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm8B:
+	MOVL CX, 12(SP)
+	JMP  search_loop_encodeBlockAsm8B
+
+no_repeat_found_encodeBlockAsm8B:
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate_match_encodeBlockAsm8B
+	SHRQ $0x08, DI
+	MOVL 24(SP)(R10*4), SI
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI
+	JEQ  candidate2_match_encodeBlockAsm8B
+	MOVL R9, 24(SP)(R10*4)
+	SHRQ $0x08, DI
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate3_match_encodeBlockAsm8B
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeBlockAsm8B
+
+candidate3_match_encodeBlockAsm8B:
+	ADDL $0x02, CX
+	JMP  candidate_match_encodeBlockAsm8B
+
+candidate2_match_encodeBlockAsm8B:
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX
+	MOVL R8, SI
+
+candidate_match_encodeBlockAsm8B:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBlockAsm8B
+
+match_extend_back_loop_encodeBlockAsm8B:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeBlockAsm8B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeBlockAsm8B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBlockAsm8B
+	JMP  match_extend_back_loop_encodeBlockAsm8B
+
+match_extend_back_end_encodeBlockAsm8B:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeBlockAsm8B:
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ  emit_literal_done_match_emit_encodeBlockAsm8B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI
+	SUBL R8, R9
+	LEAL -1(R9), R8
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeBlockAsm8B
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeBlockAsm8B
+	MOVB $0xf4, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBlockAsm8B
+
+two_bytes_match_emit_encodeBlockAsm8B:
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL   memmove_match_emit_encodeBlockAsm8B
+	JMP  memmove_long_match_emit_encodeBlockAsm8B
+
+one_byte_match_emit_encodeBlockAsm8B:
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm8B:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
+	MOVQ (DI), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm8B:
+	MOVQ R8, AX
+	JMP  emit_literal_done_match_emit_encodeBlockAsm8B
+
+memmove_long_match_emit_encodeBlockAsm8B:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveLong
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm8B:
+match_nolit_loop_encodeBlockAsm8B:
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeBlockAsm8B:
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeBlockAsm8B
+	BSFQ  R9, R9
+	SARQ  $0x03, R9
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeBlockAsm8B
+
+matchlen_loop_match_nolit_encodeBlockAsm8B:
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBlockAsm8B
+
+matchlen_single_match_nolit_encodeBlockAsm8B:
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm8B:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeBlockAsm8B
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm8B
+
+match_nolit_end_encodeBlockAsm8B:
+	ADDL R10, CX
+	MOVL 16(SP), SI
+	ADDL $0x04, R10
+	MOVL CX, 12(SP)
+
+	// emitCopy
+two_byte_offset_match_nolit_encodeBlockAsm8B:
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm8B
+	MOVB $0xee, (AX)
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+
+	// emitRepeat
+	MOVL R10, SI
+	LEAL -4(R10), R10
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
+	CMPL R10, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
+	LEAL -256(R10), R10
+	MOVW $0x0019, (AX)
+	MOVW R10, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
+	LEAL -4(R10), R10
+	MOVW $0x0015, (AX)
+	MOVB R10, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
+	SHLL $0x02, R10
+	ORL  $0x01, R10
+	MOVW R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
+	XORQ DI, DI
+	LEAL 1(DI)(R10*4), R10
+	MOVB SI, 1(AX)
+	SARL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
+	JMP two_byte_offset_match_nolit_encodeBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeBlockAsm8B:
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBlockAsm8B
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10
+	MOVB SI, 1(AX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
+
+emit_copy_three_match_nolit_encodeBlockAsm8B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm8B:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeBlockAsm8B
+	MOVQ -2(DX)(CX*1), DI
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeBlockAsm8B:
+	MOVQ  $0x9e3779b1, R9
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI
+	MOVQ  DI, SI
+	SHLQ  $0x20, R8
+	IMULQ R9, R8
+	SHRQ  $0x38, R8
+	SHLQ  $0x20, SI
+	IMULQ R9, SI
+	SHRQ  $0x38, SI
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI
+	MOVL  R9, 24(SP)(R8*4)
+	MOVL  CX, (R10)
+	CMPL  (DX)(SI*1), DI
+	JEQ   match_nolit_loop_encodeBlockAsm8B
+	INCL  CX
+	JMP   search_loop_encodeBlockAsm8B
+
+emit_remainder_encodeBlockAsm8B:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeBlockAsm8B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm8B
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBlockAsm8B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBlockAsm8B
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBlockAsm8B
+
+two_bytes_emit_remainder_encodeBlockAsm8B:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBlockAsm8B
+	JMP  memmove_long_emit_remainder_encodeBlockAsm8B
+
+one_byte_emit_remainder_encodeBlockAsm8B:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm8B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm8B:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeBlockAsm8B
+
+memmove_long_emit_remainder_encodeBlockAsm8B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm8B:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeBetterBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm(SB), $327704-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000a00, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBetterBlockAsm
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -6(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  $0x00000000, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeBetterBlockAsm:
+	MOVL CX, SI
+	SUBL 12(SP), SI
+	SHRL $0x07, SI
+	CMPL SI, $0x63
+	JLE  check_maxskip_ok_encodeBetterBlockAsm
+	LEAL 100(CX), SI
+	JMP  check_maxskip_cont_encodeBetterBlockAsm
+
+check_maxskip_ok_encodeBetterBlockAsm:
+	LEAL 1(CX)(SI*1), SI
+
+check_maxskip_cont_encodeBetterBlockAsm:
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBetterBlockAsm
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x00cf1bbcdcbfa563, R9
+	MOVQ  $0x9e3779b1, SI
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x08, R10
+	IMULQ R9, R10
+	SHRQ  $0x30, R10
+	SHLQ  $0x20, R11
+	IMULQ SI, R11
+	SHRQ  $0x32, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  262168(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	MOVL  CX, 262168(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeBetterBlockAsm
+	CMPL  (DX)(R8*1), DI
+	JEQ   candidateS_match_encodeBetterBlockAsm
+	MOVL  20(SP), CX
+	JMP   search_loop_encodeBetterBlockAsm
+
+candidateS_match_encodeBetterBlockAsm:
+	SHRQ  $0x08, DI
+	MOVQ  DI, R10
+	SHLQ  $0x08, R10
+	IMULQ R9, R10
+	SHRQ  $0x30, R10
+	MOVL  24(SP)(R10*4), SI
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeBetterBlockAsm
+	DECL  CX
+	MOVL  R8, SI
+
+candidate_match_encodeBetterBlockAsm:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBetterBlockAsm
+
+match_extend_back_loop_encodeBetterBlockAsm:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeBetterBlockAsm
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeBetterBlockAsm
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBetterBlockAsm
+	JMP  match_extend_back_loop_encodeBetterBlockAsm
+
+match_extend_back_end_encodeBetterBlockAsm:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 5(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeBetterBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeBetterBlockAsm:
+	MOVL CX, DI
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), R10
+
+	// matchLen
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeBetterBlockAsm
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm:
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm
+	BSFQ  R11, R11
+	SARQ  $0x03, R11
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeBetterBlockAsm
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm
+
+matchlen_single_match_nolit_encodeBetterBlockAsm:
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeBetterBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeBetterBlockAsm
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm
+
+match_nolit_end_encodeBetterBlockAsm:
+	MOVL CX, R8
+	SUBL SI, R8
+
+	// Check if repeat
+	CMPL 16(SP), R8
+	JEQ  match_is_repeat_encodeBetterBlockAsm
+	CMPL R12, $0x01
+	JG   match_length_ok_encodeBetterBlockAsm
+	CMPL R8, $0x0000ffff
+	JLE  match_length_ok_encodeBetterBlockAsm
+	MOVL 20(SP), CX
+	INCL CX
+	JMP  search_loop_encodeBetterBlockAsm
+
+match_length_ok_encodeBetterBlockAsm:
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeBetterBlockAsm
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeBetterBlockAsm
+	CMPL SI, $0x00010000
+	JLT  three_bytes_match_emit_encodeBetterBlockAsm
+	CMPL SI, $0x01000000
+	JLT  four_bytes_match_emit_encodeBetterBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL SI, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm
+
+four_bytes_match_emit_encodeBetterBlockAsm:
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (AX)
+	MOVW SI, 1(AX)
+	MOVB R11, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm
+
+three_bytes_match_emit_encodeBetterBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm
+
+two_bytes_match_emit_encodeBetterBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_encodeBetterBlockAsm
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm
+
+one_byte_match_emit_encodeBetterBlockAsm:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
+	CMPQ R9, $0x08
+	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (AX)
+	MOVL R10, -4(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm
+
+memmove_long_match_emit_encodeBetterBlockAsm:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm:
+	ADDL R12, CX
+	ADDL $0x04, R12
+	MOVL CX, 12(SP)
+
+	// emitCopy
+	CMPL R8, $0x00010000
+	JL   two_byte_offset_match_nolit_encodeBetterBlockAsm
+
+four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
+	CMPL R12, $0x40
+	JLE  four_bytes_remain_match_nolit_encodeBetterBlockAsm
+	MOVB $0xff, (AX)
+	MOVL R8, 1(AX)
+	LEAL -64(R12), R12
+	ADDQ $0x05, AX
+	CMPL R12, $0x04
+	JL   four_bytes_remain_match_nolit_encodeBetterBlockAsm
+
+	// emitRepeat
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
+	CMPL R12, $0x00010100
+	JLT  repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
+	CMPL R12, $0x0100ffff
+	JLT  repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
+	LEAL -16842747(R12), R12
+	MOVW $0x001d, (AX)
+	MOVW $0xfffb, 2(AX)
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (AX)
+	MOVW R12, 2(AX)
+	SARL $0x10, R8
+	MOVB R8, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX)
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+	JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm
+
+four_bytes_remain_match_nolit_encodeBetterBlockAsm:
+	TESTL R12, R12
+	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm
+	MOVB  $0x03, BL
+	LEAL  -4(BX)(R12*4), R12
+	MOVB  R12, (AX)
+	MOVL  R8, 1(AX)
+	ADDQ  $0x05, AX
+	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+two_byte_offset_match_nolit_encodeBetterBlockAsm:
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm
+	MOVB $0xee, (AX)
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+
+	// emitRepeat
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	CMPL R12, $0x00010100
+	JLT  repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	CMPL R12, $0x0100ffff
+	JLT  repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	LEAL -16842747(R12), R12
+	MOVW $0x001d, (AX)
+	MOVW $0xfffb, 2(AX)
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (AX)
+	MOVW R12, 2(AX)
+	SARL $0x10, R8
+	MOVB R8, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX)
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm
+	CMPL R8, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm
+	MOVB $0x01, BL
+	LEAL -16(BX)(R12*4), R12
+	MOVB R8, 1(AX)
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12
+	MOVB R12, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+match_is_repeat_encodeBetterBlockAsm:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm
+	CMPL SI, $0x00010000
+	JLT  three_bytes_match_emit_repeat_encodeBetterBlockAsm
+	CMPL SI, $0x01000000
+	JLT  four_bytes_match_emit_repeat_encodeBetterBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL SI, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+four_bytes_match_emit_repeat_encodeBetterBlockAsm:
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (AX)
+	MOVW SI, 1(AX)
+	MOVB R11, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+three_bytes_match_emit_repeat_encodeBetterBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_repeat_encodeBetterBlockAsm
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
+	CMPQ R9, $0x08
+	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (AX)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (AX)
+	MOVL R10, -4(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
+	ADDL R12, CX
+	ADDL $0x04, R12
+	MOVL CX, 12(SP)
+
+	// emitRepeat
+emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm
+	CMPL R12, $0x00010100
+	JLT  repeat_four_match_nolit_repeat_encodeBetterBlockAsm
+	CMPL R12, $0x0100ffff
+	JLT  repeat_five_match_nolit_repeat_encodeBetterBlockAsm
+	LEAL -16842747(R12), R12
+	MOVW $0x001d, (AX)
+	MOVW $0xfffb, 2(AX)
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	JMP  emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
+
+repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (AX)
+	MOVW R12, 2(AX)
+	SARL $0x10, R8
+	MOVB R8, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX)
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeBetterBlockAsm
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeBetterBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm:
+	MOVQ  $0x00cf1bbcdcbfa563, SI
+	MOVQ  $0x9e3779b1, R8
+	INCL  DI
+	MOVQ  (DX)(DI*1), R9
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	SHRQ  $0x10, R12
+	LEAL  1(DI), R14
+	LEAL  2(DI), R15
+	MOVQ  -2(DX)(CX*1), R9
+	SHLQ  $0x08, R10
+	IMULQ SI, R10
+	SHRQ  $0x30, R10
+	SHLQ  $0x08, R13
+	IMULQ SI, R13
+	SHRQ  $0x30, R13
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x32, R11
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x32, R12
+	MOVL  DI, 24(SP)(R10*4)
+	MOVL  R14, 24(SP)(R13*4)
+	MOVL  R14, 262168(SP)(R11*4)
+	MOVL  R15, 262168(SP)(R12*4)
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	LEAL  -2(CX), R9
+	LEAL  -1(CX), DI
+	SHLQ  $0x08, R10
+	IMULQ SI, R10
+	SHRQ  $0x30, R10
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x32, R11
+	SHLQ  $0x08, R13
+	IMULQ SI, R13
+	SHRQ  $0x30, R13
+	MOVL  R9, 24(SP)(R10*4)
+	MOVL  DI, 262168(SP)(R11*4)
+	MOVL  DI, 24(SP)(R13*4)
+	JMP   search_loop_encodeBetterBlockAsm
+
+emit_remainder_encodeBetterBlockAsm:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 5(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeBetterBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeBetterBlockAsm:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBetterBlockAsm
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm
+	CMPL DX, $0x00010000
+	JLT  three_bytes_emit_remainder_encodeBetterBlockAsm
+	CMPL DX, $0x01000000
+	JLT  four_bytes_emit_remainder_encodeBetterBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL DX, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
+
+four_bytes_emit_remainder_encodeBetterBlockAsm:
+	MOVL DX, BX
+	SHRL $0x10, BX
+	MOVB $0xf8, (AX)
+	MOVW DX, 1(AX)
+	MOVB BL, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
+
+three_bytes_emit_remainder_encodeBetterBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
+
+two_bytes_emit_remainder_encodeBetterBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBetterBlockAsm
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
+
+one_byte_emit_remainder_encodeBetterBlockAsm:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x04
+	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4
+	CMPQ BX, $0x08
+	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4:
+	MOVL (CX), SI
+	MOVL SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
+	MOVL (CX), SI
+	MOVL -4(CX)(BX*1), CX
+	MOVL SI, (AX)
+	MOVL CX, -4(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm
+
+memmove_long_emit_remainder_encodeBetterBlockAsm:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 // stores the number of bytes written to dst in ret+48(FP), or 0 when an output-limit check fails
+	MOVQ dst_base+0(FP), AX // AX: output write pointer
+	MOVQ $0x00000a00, CX // 0xa00 iterations x 128 bytes = 327680 bytes to clear
+	LEAQ 24(SP), DX // the cleared region starts at 24(SP)
+	PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm4MB:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBetterBlockAsm4MB
+	MOVL  $0x00000000, 12(SP) // 12(SP): src offset of the first byte not yet emitted
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -6(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP) // 8(SP): search limit = len(src) - 8
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP) // (SP): output limit = dst + len(src) - 6 - len(src)/32
+	MOVL  $0x00000001, CX // CX: current src offset, starting at 1
+	MOVL  $0x00000000, 16(SP) // 16(SP): offset of the previously emitted match
+	MOVQ  src_base+24(FP), DX // DX: src base pointer
+
+search_loop_encodeBetterBlockAsm4MB:
+	MOVL CX, SI
+	SUBL 12(SP), SI
+	SHRL $0x07, SI // extra step: one per 128 bytes scanned since the last emit
+	CMPL SI, $0x63
+	JLE  check_maxskip_ok_encodeBetterBlockAsm4MB
+	LEAL 100(CX), SI // extra step above 99: advance by a fixed 100
+	JMP  check_maxskip_cont_encodeBetterBlockAsm4MB
+
+check_maxskip_ok_encodeBetterBlockAsm4MB:
+	LEAL 1(CX)(SI*1), SI // next offset = current + 1 + extra step
+
+check_maxskip_cont_encodeBetterBlockAsm4MB:
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBetterBlockAsm4MB // next offset reaches the search limit: flush the tail
+	MOVQ  (DX)(CX*1), DI // DI: 8 src bytes at the current offset
+	MOVL  SI, 20(SP) // 20(SP): offset to resume at if nothing matches here
+	MOVQ  $0x00cf1bbcdcbfa563, R9
+	MOVQ  $0x9e3779b1, SI
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x08, R10 // long hash: drop the top byte, multiply, keep the top 16 bits
+	IMULQ R9, R10
+	SHRQ  $0x30, R10
+	SHLQ  $0x20, R11 // short hash: keep the low 4 bytes, multiply, keep the top 14 bits
+	IMULQ SI, R11
+	SHRQ  $0x32, R11
+	MOVL  24(SP)(R10*4), SI // SI: candidate offset from the long-hash table at 24(SP)
+	MOVL  262168(SP)(R11*4), R8 // R8: candidate offset from the short-hash table at 262168(SP)
+	MOVL  CX, 24(SP)(R10*4) // record the current offset in both tables
+	MOVL  CX, 262168(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI // do 4 bytes at the long candidate equal the current 4 bytes?
+	JEQ   candidate_match_encodeBetterBlockAsm4MB
+	CMPL  (DX)(R8*1), DI // same test for the short candidate
+	JEQ   candidateS_match_encodeBetterBlockAsm4MB
+	MOVL  20(SP), CX
+	JMP   search_loop_encodeBetterBlockAsm4MB
+
+candidateS_match_encodeBetterBlockAsm4MB:
+	SHRQ  $0x08, DI // DI now holds the bytes starting one position later
+	MOVQ  DI, R10
+	SHLQ  $0x08, R10
+	IMULQ R9, R10
+	SHRQ  $0x30, R10
+	MOVL  24(SP)(R10*4), SI // long-table candidate for offset CX+1
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeBetterBlockAsm4MB // long match one byte ahead is used instead
+	DECL  CX // otherwise go back and use the short candidate
+	MOVL  R8, SI
+
+candidate_match_encodeBetterBlockAsm4MB:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBetterBlockAsm4MB
+
+match_extend_back_loop_encodeBetterBlockAsm4MB:
+	CMPL CX, DI // stop at the first byte not yet emitted
+	JLE  match_extend_back_end_encodeBetterBlockAsm4MB
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8 // stop when the preceding bytes differ
+	JNE  match_extend_back_end_encodeBetterBlockAsm4MB
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBetterBlockAsm4MB // stop when the candidate reaches offset 0
+	JMP  match_extend_back_loop_encodeBetterBlockAsm4MB
+
+match_extend_back_end_encodeBetterBlockAsm4MB:
+	MOVL CX, DI
+	SUBL 12(SP), DI // pending literal byte count
+	LEAQ 4(AX)(DI*1), DI
+	CMPQ DI, (SP) // write pointer + literals + 4 must stay below the output limit
+	JL   match_dst_size_check_encodeBetterBlockAsm4MB
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeBetterBlockAsm4MB:
+	MOVL CX, DI // DI: offset where the match starts
+	ADDL $0x04, CX // the first 4 bytes were already compared
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8 // R8: bytes left in src after CX
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), R10
+
+	// matchLen: R12 counts the equal bytes at R9 and R10, limited to R8
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeBetterBlockAsm4MB
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11 // non-zero bits mark differing bytes
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
+	BSFQ  R11, R11 // index of the lowest differing bit
+	SARQ  $0x03, R11 // bit index / 8 = number of equal leading bytes
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB
+
+matchlen_single_match_nolit_encodeBetterBlockAsm4MB:
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB:
+	MOVB (R9)(R12*1), R11 // fewer than 8 bytes left: compare one byte at a time
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeBetterBlockAsm4MB
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB
+
+match_nolit_end_encodeBetterBlockAsm4MB:
+	MOVL CX, R8
+	SUBL SI, R8 // R8: match offset (distance back to the candidate)
+
+	// Check if repeat: same offset as the previously emitted match
+	CMPL 16(SP), R8
+	JEQ  match_is_repeat_encodeBetterBlockAsm4MB
+	CMPL R12, $0x01
+	JG   match_length_ok_encodeBetterBlockAsm4MB
+	CMPL R8, $0x0000ffff
+	JLE  match_length_ok_encodeBetterBlockAsm4MB
+	MOVL 20(SP), CX // at most 1 extra byte with an offset above 65535: discard the match
+	INCL CX
+	JMP  search_loop_encodeBetterBlockAsm4MB
+
+match_length_ok_encodeBetterBlockAsm4MB:
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm4MB // no pending literals
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10 // R10: first pending literal byte
+	SUBL SI, R9 // R9: literal length
+	LEAL -1(R9), SI // SI: length - 1, the value written into the header
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeBetterBlockAsm4MB
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeBetterBlockAsm4MB
+	CMPL SI, $0x00010000
+	JLT  three_bytes_match_emit_encodeBetterBlockAsm4MB
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (AX) // tag 0xf8 followed by a 3-byte length-1
+	MOVW SI, 1(AX)
+	MOVB R11, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+three_bytes_match_emit_encodeBetterBlockAsm4MB:
+	MOVB $0xf4, (AX) // tag 0xf4 followed by a 2-byte length-1
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+two_bytes_match_emit_encodeBetterBlockAsm4MB:
+	MOVB $0xf0, (AX) // tag 0xf0 followed by a 1-byte length-1
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_encodeBetterBlockAsm4MB
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+one_byte_match_emit_encodeBetterBlockAsm4MB:
+	SHLB $0x02, SI // single tag byte: (length-1) << 2
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm4MB:
+	LEAQ (AX)(R9*1), SI // SI: write pointer after the literal bytes
+
+	// genMemMoveShort: copy R9 bytes from R10 to AX using overlapping head/tail moves
+	CMPQ R9, $0x04
+	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
+	CMPQ R9, $0x08
+	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (AX)
+	MOVL R10, -4(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm4MB
+
+memmove_long_match_emit_encodeBetterBlockAsm4MB:
+	LEAQ (AX)(R9*1), SI // SI: write pointer after the literal bytes
+
+	// genMemMoveLong: first and last 32 bytes go through X0-X3, the middle is copied in 32-byte steps
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13 // R13: number of whole 32-byte chunks
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14 // R14: 64 - (AX mod 32), so -32(AX)(R14*1) is 32-byte aligned for MOVOA
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF untouched, so JA sees ZF from DECQ and CF from the SUBQ above
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+	MOVOU X0, (AX) // head and tail are stored last, from the copies taken before the loop
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
+	ADDL R12, CX // CX: offset just past the match
+	ADDL $0x04, R12 // R12: full match length, including the 4 bytes compared first
+	MOVL CX, 12(SP)
+
+	// emitCopy: R8 = offset, R12 = length
+	CMPL R8, $0x00010000
+	JL   two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
+
+four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
+	CMPL R12, $0x40
+	JLE  four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
+	MOVB $0xff, (AX) // tag 0xff followed by the 4-byte offset; accounts for 64 bytes of the match
+	MOVL R8, 1(AX)
+	LEAL -64(R12), R12
+	ADDQ $0x05, AX
+	CMPL R12, $0x04
+	JL   four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
+
+	// emitRepeat: encode the remaining length R12 without repeating the offset
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+	CMPL R12, $0x00010100
+	JLT  repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (AX) // 5-byte form: 2-byte tag 0x001d, then 3 length bytes
+	MOVW R12, 2(AX)
+	SARL $0x10, R8
+	MOVB R8, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX) // 4-byte form: 2-byte tag 0x0019, then 2 length bytes
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX) // 3-byte form: 2-byte tag 0x0015, then 1 length byte
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX) // 2-byte form: ((length-4) << 2) | 1 stored as a 16-bit word
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX) // low 8 offset bits go in the second byte
+	SARL $0x08, R8
+	SHLL $0x05, R8 // remaining offset bits go in bits 5 and up of the tag byte
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+	JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB // never reached: unlabeled and follows an unconditional JMP
+
+four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
+	TESTL R12, R12
+	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+	MOVB  $0x03, BL
+	LEAL  -4(BX)(R12*4), R12 // only the low byte is stored below: 3 + ((length-1) << 2)
+	MOVB  R12, (AX)
+	MOVL  R8, 1(AX) // 4-byte offset
+	ADDQ  $0x05, AX
+	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
+	MOVB $0xee, (AX) // tag 0xee followed by the 2-byte offset; accounts for 60 bytes of the match
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+
+	// emitRepeat: encode the remaining length R12 without repeating the offset
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+	CMPL R12, $0x00010100
+	JLT  repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (AX)
+	MOVW R12, 2(AX)
+	SARL $0x10, R8
+	MOVB R8, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX)
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB // never reached: unlabeled and follows an unconditional JMP
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
+	CMPL R8, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
+	MOVB $0x01, BL // length < 12 and offset < 2048: 2-byte copy
+	LEAL -16(BX)(R12*4), R12 // only the low byte is stored below: 1 + ((length-4) << 2)
+	MOVB R8, 1(AX) // low 8 offset bits go in the second byte
+	SHRL $0x08, R8
+	SHLL $0x05, R8 // remaining offset bits go in bits 5 and up of the tag byte
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12 // only the low byte is stored below: 2 + ((length-1) << 2)
+	MOVB R12, (AX)
+	MOVW R8, 1(AX) // 2-byte offset
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+match_is_repeat_encodeBetterBlockAsm4MB:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB // no pending literals
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10 // R10: first pending literal byte
+	SUBL SI, R9 // R9: literal length
+	LEAL -1(R9), SI // SI: length - 1, the value written into the header
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
+	CMPL SI, $0x00010000
+	JLT  three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (AX)
+	MOVW SI, 1(AX)
+	MOVB R11, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_repeat_encodeBetterBlockAsm4MB
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort: copy R9 bytes from R10 to AX using overlapping head/tail moves
+	CMPQ R9, $0x04
+	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
+	CMPQ R9, $0x08
+	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (AX)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (AX)
+	MOVL R10, -4(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong: first and last 32 bytes go through X0-X3, the middle is copied in 32-byte steps
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
+	ADDL R12, CX // CX: offset just past the match
+	ADDL $0x04, R12 // R12: full match length, including the 4 bytes compared first
+	MOVL CX, 12(SP)
+
+	// emitRepeat: the offset equals 16(SP), so only the length R12 is encoded
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
+	CMPL R12, $0x00010100
+	JLT  repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (AX)
+	MOVW R12, 2(AX)
+	SARL $0x10, R8
+	MOVB R8, 4(AX)
+	ADDQ $0x05, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX)
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeBetterBlockAsm4MB // past the search limit: flush the tail
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeBetterBlockAsm4MB
+	MOVQ $0x00000000, ret+48(FP) // write pointer reached the output limit: return 0
+	RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm4MB:
+	MOVQ  $0x00cf1bbcdcbfa563, SI
+	MOVQ  $0x9e3779b1, R8
+	INCL  DI // DI: match start + 1
+	MOVQ  (DX)(DI*1), R9
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11 // bytes starting at DI+1
+	MOVQ  R11, R13
+	SHRQ  $0x10, R12 // bytes starting at DI+2
+	LEAL  1(DI), R14
+	LEAL  2(DI), R15
+	MOVQ  -2(DX)(CX*1), R9 // 8 bytes starting 2 before the end of the match
+	SHLQ  $0x08, R10
+	IMULQ SI, R10
+	SHRQ  $0x30, R10
+	SHLQ  $0x08, R13
+	IMULQ SI, R13
+	SHRQ  $0x30, R13
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x32, R11
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x32, R12
+	MOVL  DI, 24(SP)(R10*4) // long table: offsets DI and DI+1
+	MOVL  R14, 24(SP)(R13*4)
+	MOVL  R14, 262168(SP)(R11*4) // short table: offsets DI+1 and DI+2
+	MOVL  R15, 262168(SP)(R12*4)
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	LEAL  -2(CX), R9
+	LEAL  -1(CX), DI
+	SHLQ  $0x08, R10
+	IMULQ SI, R10
+	SHRQ  $0x30, R10
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x32, R11
+	SHLQ  $0x08, R13
+	IMULQ SI, R13
+	SHRQ  $0x30, R13
+	MOVL  R9, 24(SP)(R10*4) // long table: offset CX-2
+	MOVL  DI, 262168(SP)(R11*4) // short table: offset CX-1
+	MOVL  DI, 24(SP)(R13*4) // long table: offset CX-1
+	JMP   search_loop_encodeBetterBlockAsm4MB
+
+emit_remainder_encodeBetterBlockAsm4MB:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX // bytes of src not yet emitted
+	LEAQ 4(AX)(CX*1), CX
+	CMPQ CX, (SP) // write pointer + remaining bytes + 4 must stay below the output limit
+	JL   emit_remainder_ok_encodeBetterBlockAsm4MB
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeBetterBlockAsm4MB:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB // nothing left to emit
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX // CX: first remaining src byte
+	SUBL BX, SI // SI: remaining length
+	LEAL -1(SI), DX // DX: length - 1; the src base pointer is no longer needed
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBetterBlockAsm4MB
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm4MB
+	CMPL DX, $0x00010000
+	JLT  three_bytes_emit_remainder_encodeBetterBlockAsm4MB
+	MOVL DX, BX
+	SHRL $0x10, BX
+	MOVB $0xf8, (AX) // tag 0xf8 followed by a 3-byte length-1
+	MOVW DX, 1(AX)
+	MOVB BL, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
+	MOVB $0xf4, (AX) // tag 0xf4 followed by a 2-byte length-1
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
+	MOVB $0xf0, (AX) // tag 0xf0 followed by a 1-byte length-1
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBetterBlockAsm4MB
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+one_byte_emit_remainder_encodeBetterBlockAsm4MB:
+	SHLB $0x02, DL // single tag byte: (length-1) << 2
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm4MB:
+	LEAQ (AX)(SI*1), DX // DX: write pointer after the literal bytes
+	MOVL SI, BX
+
+	// genMemMoveShort: copy BX bytes from CX to AX using overlapping head/tail moves
+	CMPQ BX, $0x04
+	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4
+	CMPQ BX, $0x08
+	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4:
+	MOVL (CX), SI
+	MOVL SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
+	MOVL (CX), SI
+	MOVL -4(CX)(BX*1), CX
+	MOVL SI, (AX)
+	MOVL CX, -4(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
+
+memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
+	LEAQ (AX)(SI*1), DX // DX: write pointer after the literal bytes
+	MOVL SI, BX
+
+	// genMemMoveLong: first and last 32 bytes go through X0-X3, the middle is copied in 32-byte steps
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX // bytes written = write pointer - dst base
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000280, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm12B:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBetterBlockAsm12B
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -6(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  $0x00000000, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeBetterBlockAsm12B:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x06, SI
+	LEAL  1(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBetterBlockAsm12B
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  $0x9e3779b1, SI
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x20, R11
+	IMULQ SI, R11
+	SHRQ  $0x34, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  65560(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	MOVL  CX, 65560(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeBetterBlockAsm12B
+	CMPL  (DX)(R8*1), DI
+	JEQ   candidateS_match_encodeBetterBlockAsm12B
+	MOVL  20(SP), CX
+	JMP   search_loop_encodeBetterBlockAsm12B
+
+candidateS_match_encodeBetterBlockAsm12B:
+	SHRQ  $0x08, DI
+	MOVQ  DI, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	MOVL  24(SP)(R10*4), SI
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeBetterBlockAsm12B
+	DECL  CX
+	MOVL  R8, SI
+
+candidate_match_encodeBetterBlockAsm12B:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBetterBlockAsm12B
+
+match_extend_back_loop_encodeBetterBlockAsm12B:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeBetterBlockAsm12B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeBetterBlockAsm12B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBetterBlockAsm12B
+	JMP  match_extend_back_loop_encodeBetterBlockAsm12B
+
+match_extend_back_end_encodeBetterBlockAsm12B:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeBetterBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeBetterBlockAsm12B:
+	MOVL CX, DI
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), R10
+
+	// matchLen
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeBetterBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm12B
+	BSFQ  R11, R11
+	SARQ  $0x03, R11
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeBetterBlockAsm12B
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm12B
+
+matchlen_single_match_nolit_encodeBetterBlockAsm12B:
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeBetterBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeBetterBlockAsm12B
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B
+
+match_nolit_end_encodeBetterBlockAsm12B:
+	MOVL CX, R8
+	SUBL SI, R8
+
+	// Check if repeat
+	CMPL 16(SP), R8
+	JEQ  match_is_repeat_encodeBetterBlockAsm12B
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm12B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeBetterBlockAsm12B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeBetterBlockAsm12B
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B
+
+two_bytes_match_emit_encodeBetterBlockAsm12B:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_encodeBetterBlockAsm12B
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B
+
+one_byte_match_emit_encodeBetterBlockAsm12B:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm12B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
+	CMPQ R9, $0x08
+	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (AX)
+	MOVL R10, -4(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm12B
+
+memmove_long_match_emit_encodeBetterBlockAsm12B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm12B:
+	ADDL R12, CX
+	ADDL $0x04, R12
+	MOVL CX, 12(SP)
+
+	// emitCopy
+two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
+	MOVB $0xee, (AX)
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+
+	// emitRepeat
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX)
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
+	CMPL R8, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
+	MOVB $0x01, BL
+	LEAL -16(BX)(R12*4), R12
+	MOVB R8, 1(AX)
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12
+	MOVB R12, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+match_is_repeat_encodeBetterBlockAsm12B:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm12B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_repeat_encodeBetterBlockAsm12B
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm12B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
+	CMPQ R9, $0x08
+	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (AX)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (AX)
+	MOVL R10, -4(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
+	ADDL R12, CX
+	ADDL $0x04, R12
+	MOVL CX, 12(SP)
+
+	// emitRepeat
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX)
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeBetterBlockAsm12B
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeBetterBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm12B:
+	MOVQ  $0x0000cf1bbcdcbf9b, SI
+	MOVQ  $0x9e3779b1, R8
+	INCL  DI
+	MOVQ  (DX)(DI*1), R9
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	SHRQ  $0x10, R12
+	LEAL  1(DI), R14
+	LEAL  2(DI), R15
+	MOVQ  -2(DX)(CX*1), R9
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x32, R13
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x34, R11
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x34, R12
+	MOVL  DI, 24(SP)(R10*4)
+	MOVL  R14, 24(SP)(R13*4)
+	MOVL  R14, 65560(SP)(R11*4)
+	MOVL  R15, 65560(SP)(R12*4)
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	LEAL  -2(CX), R9
+	LEAL  -1(CX), DI
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x34, R11
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x32, R13
+	MOVL  R9, 24(SP)(R10*4)
+	MOVL  DI, 65560(SP)(R11*4)
+	MOVL  DI, 24(SP)(R13*4)
+	JMP   search_loop_encodeBetterBlockAsm12B
+
+emit_remainder_encodeBetterBlockAsm12B:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeBetterBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeBetterBlockAsm12B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBetterBlockAsm12B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm12B
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B
+
+two_bytes_emit_remainder_encodeBetterBlockAsm12B:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBetterBlockAsm12B
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B
+
+one_byte_emit_remainder_encodeBetterBlockAsm12B:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm12B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x04
+	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4
+	CMPQ BX, $0x08
+	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4:
+	MOVL (CX), SI
+	MOVL SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
+	MOVL (CX), SI
+	MOVL -4(CX)(BX*1), CX
+	MOVL SI, (AX)
+	MOVL CX, -4(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
+
+memmove_long_emit_remainder_encodeBetterBlockAsm12B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x000000a0, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm10B:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBetterBlockAsm10B
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -6(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  $0x00000000, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeBetterBlockAsm10B:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x05, SI
+	LEAL  1(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBetterBlockAsm10B
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  $0x9e3779b1, SI
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x34, R10
+	SHLQ  $0x20, R11
+	IMULQ SI, R11
+	SHRQ  $0x36, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  16408(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	MOVL  CX, 16408(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeBetterBlockAsm10B
+	CMPL  (DX)(R8*1), DI
+	JEQ   candidateS_match_encodeBetterBlockAsm10B
+	MOVL  20(SP), CX
+	JMP   search_loop_encodeBetterBlockAsm10B
+
+candidateS_match_encodeBetterBlockAsm10B:
+	SHRQ  $0x08, DI
+	MOVQ  DI, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x34, R10
+	MOVL  24(SP)(R10*4), SI
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeBetterBlockAsm10B
+	DECL  CX
+	MOVL  R8, SI
+
+candidate_match_encodeBetterBlockAsm10B:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBetterBlockAsm10B
+
+match_extend_back_loop_encodeBetterBlockAsm10B:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeBetterBlockAsm10B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeBetterBlockAsm10B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBetterBlockAsm10B
+	JMP  match_extend_back_loop_encodeBetterBlockAsm10B
+
+match_extend_back_end_encodeBetterBlockAsm10B:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeBetterBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeBetterBlockAsm10B:
+	MOVL CX, DI
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), R10
+
+	// matchLen
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeBetterBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm10B
+	BSFQ  R11, R11
+	SARQ  $0x03, R11
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeBetterBlockAsm10B
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
+
+matchlen_single_match_nolit_encodeBetterBlockAsm10B:
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeBetterBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeBetterBlockAsm10B
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B
+
+match_nolit_end_encodeBetterBlockAsm10B:
+	MOVL CX, R8
+	SUBL SI, R8
+
+	// Check if repeat
+	CMPL 16(SP), R8
+	JEQ  match_is_repeat_encodeBetterBlockAsm10B
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm10B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeBetterBlockAsm10B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeBetterBlockAsm10B
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B
+
+two_bytes_match_emit_encodeBetterBlockAsm10B:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_encodeBetterBlockAsm10B
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B
+
+one_byte_match_emit_encodeBetterBlockAsm10B:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm10B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
+	CMPQ R9, $0x08
+	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (AX)
+	MOVL R10, -4(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm10B
+
+memmove_long_match_emit_encodeBetterBlockAsm10B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm10B:
+	ADDL R12, CX
+	ADDL $0x04, R12
+	MOVL CX, 12(SP)
+
+	// emitCopy
+two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
+	MOVB $0xee, (AX)
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+
+	// emitRepeat
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX)
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
+	CMPL R8, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
+	MOVB $0x01, BL
+	LEAL -16(BX)(R12*4), R12
+	MOVB R8, 1(AX)
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12
+	MOVB R12, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+match_is_repeat_encodeBetterBlockAsm10B:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm10B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_repeat_encodeBetterBlockAsm10B
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm10B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
+	CMPQ R9, $0x08
+	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (AX)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (AX)
+	MOVL R10, -4(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
+	ADDL R12, CX
+	ADDL $0x04, R12
+	MOVL CX, 12(SP)
+
+	// emitRepeat
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
+	CMPL R8, $0x00000800
+	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX)
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeBetterBlockAsm10B
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeBetterBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm10B:
+	MOVQ  $0x0000cf1bbcdcbf9b, SI
+	MOVQ  $0x9e3779b1, R8
+	INCL  DI
+	MOVQ  (DX)(DI*1), R9
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	SHRQ  $0x10, R12
+	LEAL  1(DI), R14
+	LEAL  2(DI), R15
+	MOVQ  -2(DX)(CX*1), R9
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x34, R10
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x34, R13
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x36, R11
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x36, R12
+	MOVL  DI, 24(SP)(R10*4)
+	MOVL  R14, 24(SP)(R13*4)
+	MOVL  R14, 16408(SP)(R11*4)
+	MOVL  R15, 16408(SP)(R12*4)
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	LEAL  -2(CX), R9
+	LEAL  -1(CX), DI
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x34, R10
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x36, R11
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x34, R13
+	MOVL  R9, 24(SP)(R10*4)
+	MOVL  DI, 16408(SP)(R11*4)
+	MOVL  DI, 24(SP)(R13*4)
+	JMP   search_loop_encodeBetterBlockAsm10B
+
+emit_remainder_encodeBetterBlockAsm10B:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeBetterBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeBetterBlockAsm10B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBetterBlockAsm10B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm10B
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B
+
+two_bytes_emit_remainder_encodeBetterBlockAsm10B:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBetterBlockAsm10B
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B
+
+one_byte_emit_remainder_encodeBetterBlockAsm10B:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm10B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x04
+	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4
+	CMPQ BX, $0x08
+	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4:
+	MOVL (CX), SI
+	MOVL SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
+	MOVL (CX), SI
+	MOVL -4(CX)(BX*1), CX
+	MOVL SI, (AX)
+	MOVL CX, -4(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
+
+memmove_long_emit_remainder_encodeBetterBlockAsm10B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst and returns the number of bytes written, or 0 when the dst limit kept at (SP) would be reached.
+TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
+	MOVQ dst_base+0(FP), AX // AX = dst write cursor
+	MOVQ $0x00000028, CX // 40 passes x 128 bytes = 5120 bytes of hash tables to clear
+	LEAQ 24(SP), DX // tables start at 24(SP); 0..23(SP) hold scalar state
+	PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm8B: // zero both hash tables, 128 bytes per pass
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeBetterBlockAsm8B
+	MOVL  $0x00000000, 12(SP) // 12(SP) = src position where the not-yet-emitted literals start
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -6(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP) // 8(SP) = len(src)-8, upper limit for the search position
+	SHRQ  $0x05, CX
+	SUBL  CX, DX // DX = len(src) - 6 - len(src)/32
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP) // (SP) = dst address the output must stay below
+	MOVL  $0x00000001, CX // CX = current src position, starting at 1
+	MOVL  $0x00000000, 16(SP) // 16(SP) = offset of the previous match, initially 0
+	MOVQ  src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeBetterBlockAsm8B: // hash the bytes at CX and probe both tables for a match candidate
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x04, SI
+	LEAL  1(CX)(SI*1), SI // next position = CX + 1 + (CX - 12(SP))>>4: the step grows with the unmatched run
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeBetterBlockAsm8B
+	MOVQ  (DX)(CX*1), DI // DI = 8 bytes of src at CX
+	MOVL  SI, 20(SP) // 20(SP) = saved next position
+	MOVQ  $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
+	MOVQ  $0x9e3779b1, SI // multiplier for the 4-byte hash
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x10, R10 // drop the top 2 bytes so only the low 6 bytes are hashed
+	IMULQ R9, R10
+	SHRQ  $0x36, R10 // keep the top 10 bits: index into the 1024-entry table at 24(SP)
+	SHLQ  $0x20, R11 // only the low 4 bytes are hashed
+	IMULQ SI, R11
+	SHRQ  $0x38, R11 // keep the top 8 bits: index into the 256-entry table at 4120(SP)
+	MOVL  24(SP)(R10*4), SI // SI = candidate from the 6-byte-hash table
+	MOVL  4120(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
+	MOVL  CX, 24(SP)(R10*4) // record CX in both tables
+	MOVL  CX, 4120(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI // compare the 4 bytes at each candidate with the 4 bytes at CX
+	JEQ   candidate_match_encodeBetterBlockAsm8B
+	CMPL  (DX)(R8*1), DI
+	JEQ   candidateS_match_encodeBetterBlockAsm8B
+	MOVL  20(SP), CX // no match: continue at the saved next position
+	JMP   search_loop_encodeBetterBlockAsm8B
+
+candidateS_match_encodeBetterBlockAsm8B: // 4-byte-hash candidate matched; first try the 6-byte table at CX+1
+	SHRQ  $0x08, DI // DI = the loaded bytes starting at CX+1
+	MOVQ  DI, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x36, R10
+	MOVL  24(SP)(R10*4), SI
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeBetterBlockAsm8B
+	DECL  CX // the CX+1 probe failed: restore CX and use the 4-byte-hash candidate
+	MOVL  R8, SI
+
+candidate_match_encodeBetterBlockAsm8B: // CX = match position, SI = candidate position
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeBetterBlockAsm8B
+
+match_extend_back_loop_encodeBetterBlockAsm8B: // move CX and SI backwards while the preceding bytes are equal
+	CMPL CX, DI // stop once CX reaches the pending-literal start
+	JLE  match_extend_back_end_encodeBetterBlockAsm8B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeBetterBlockAsm8B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeBetterBlockAsm8B
+	JMP  match_extend_back_loop_encodeBetterBlockAsm8B
+
+match_extend_back_end_encodeBetterBlockAsm8B: // return 0 unless dst + 3 + pending literal length stays below the limit
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeBetterBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeBetterBlockAsm8B:
+	MOVL CX, DI // DI = match start; also the end of the pending literals
+	ADDL $0x04, CX // skip the 4 bytes already compared
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8 // R8 = bytes left in src after CX
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), R10
+
+	// matchLen: R12 = number of equal bytes at R9 and R10, examining at most R8 bytes
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeBetterBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: // compare 8 bytes at a time
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm8B
+	BSFQ  R11, R11 // index of the lowest differing bit
+	SARQ  $0x03, R11 // divided by 8 = number of equal leading bytes
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeBetterBlockAsm8B
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm8B
+
+matchlen_single_match_nolit_encodeBetterBlockAsm8B: // fewer than 8 bytes left: compare byte by byte
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeBetterBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeBetterBlockAsm8B
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B
+
+match_nolit_end_encodeBetterBlockAsm8B:
+	MOVL CX, R8
+	SUBL SI, R8 // R8 = match offset (CX - SI)
+
+	// Check if repeat: the offset equals the previous match offset kept at 16(SP)
+	CMPL 16(SP), R8
+	JEQ  match_is_repeat_encodeBetterBlockAsm8B
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm8B // no pending literals
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10 // R10 = address of the pending literals
+	SUBL SI, R9 // R9 = literal length
+	LEAL -1(R9), SI // SI = literal length - 1, the value placed in the header
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeBetterBlockAsm8B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeBetterBlockAsm8B
+	MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by the 16-bit length-1
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B
+
+two_bytes_match_emit_encodeBetterBlockAsm8B: // 2-byte header: 0xf0 followed by the 8-bit length-1
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40 // lengths up to 64 use the short copy, longer ones the long copy
+	JL   memmove_match_emit_encodeBetterBlockAsm8B
+	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B
+
+one_byte_match_emit_encodeBetterBlockAsm8B: // 1-byte header: (length-1)<<2
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm8B:
+	LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+	// genMemMoveShort: copy R9 (at most 64) bytes from R10 to AX with overlapping head/tail moves
+	CMPQ R9, $0x04
+	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
+	CMPQ R9, $0x08
+	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: // always moves 4 bytes; AX is then set to SI
+	MOVL (R10), R11
+	MOVL R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: // first 4 and last 4 bytes
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (AX)
+	MOVL R10, -4(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: // first 8 and last 8 bytes
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: // first 16 and last 16 bytes
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: // first 32 and last 32 bytes
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
+	MOVQ SI, AX // advance the dst cursor past the literals
+	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm8B
+
+memmove_long_match_emit_encodeBetterBlockAsm8B:
+	LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+	// genMemMoveLong: copy R9 (65 or more) bytes from R10 to AX; head and tail are kept in X0-X3, the middle uses aligned stores
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13 // R13 = length / 32
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14 // R14 = 64 - (AX & 31), so AX+R14-32 is 32-byte aligned for the MOVOA stores
+	DECQ  R13 // NOTE(review): DEC leaves CF untouched, so JA/JNA below also see CF from the preceding SUBQ/ADDQ - confirm against the generator
+	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: // 32 bytes per pass via running pointers R11 (src) and R15 (dst)
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: // 32 bytes per pass at offset R14-32 while R9 >= R14
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved first and last 32 bytes with unaligned moves
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm8B:
+	ADDL R12, CX // CX = end of the match
+	ADDL $0x04, R12 // R12 = full match length including the 4 bytes compared first
+	MOVL CX, 12(SP) // pending literals now start after the match
+
+	// emitCopy: encode a copy of R12 bytes at offset R8
+two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
+	MOVB $0xee, (AX) // length above 64: write 0xee plus the 16-bit offset, which accounts for 60 bytes
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+
+	// emitRepeat: encode the remaining R12 bytes as a repeat
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short // both outcomes reach the label directly below
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX) // 4-byte form: bytes 0x19, 0x00, then the 16-bit value length-260
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: // 3-byte form: bytes 0x15, 0x00, then the 8-bit value length-8
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: // 2-byte form: the 16-bit value ((length-4)<<2)|1
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+	XORQ SI, SI // unreachable from here to the blank line: follows an unconditional JMP and has no label
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: // match length of 64 or less
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm8B
+	MOVB $0x01, BL // NOTE(review): no offset-range check precedes this 2-byte form; presumably offsets are small for this variant - confirm against caller
+	LEAL -16(BX)(R12*4), R12 // low byte = ((length-4)<<2)|1; only the low byte is stored, so the upper bits of BX do not matter
+	MOVB R8, 1(AX) // low 8 bits of the offset
+	SHRL $0x08, R8
+	SHLL $0x05, R8 // remaining offset bits go into bits 5 and up of the first byte
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm8B: // 3-byte copy: ((length-1)<<2)|2 followed by the 16-bit offset
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12
+	MOVB R12, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+match_is_repeat_encodeBetterBlockAsm8B: // same offset as the previous match: emit pending literals, then a repeat
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B // no pending literals
+	MOVL DI, R8
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R9 // R9 = address of the pending literals
+	SUBL SI, R8 // R8 = literal length (the offset in R8 is not needed on this path)
+	LEAL -1(R8), SI // SI = literal length - 1, the value placed in the header
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm8B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
+	MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by the 16-bit length-1
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: // 2-byte header: 0xf0 followed by the 8-bit length-1
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_repeat_encodeBetterBlockAsm8B
+	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm8B: // 1-byte header: (length-1)<<2
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm8B:
+	LEAQ (AX)(R8*1), SI // SI = dst cursor after the literals
+
+	// genMemMoveShort: copy R8 (at most 64) bytes from R9 to AX with overlapping head/tail moves
+	CMPQ R8, $0x04
+	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
+	CMPQ R8, $0x08
+	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
+	CMPQ R8, $0x10
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
+	CMPQ R8, $0x20
+	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: // always moves 4 bytes; AX is then set to SI
+	MOVL (R9), R10
+	MOVL R10, (AX)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: // first 4 and last 4 bytes
+	MOVL (R9), R10
+	MOVL -4(R9)(R8*1), R9
+	MOVL R10, (AX)
+	MOVL R9, -4(AX)(R8*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: // first 8 and last 8 bytes
+	MOVQ (R9), R10
+	MOVQ -8(R9)(R8*1), R9
+	MOVQ R10, (AX)
+	MOVQ R9, -8(AX)(R8*1)
+	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: // first 16 and last 16 bytes
+	MOVOU (R9), X0
+	MOVOU -16(R9)(R8*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R8*1)
+	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: // first 32 and last 32 bytes
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
+	MOVQ SI, AX // advance the dst cursor past the literals
+	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
+	LEAQ (AX)(R8*1), SI // SI = dst cursor after the literals
+
+	// genMemMoveLong: copy R8 (65 or more) bytes from R9 to AX; head and tail are kept in X0-X3, the middle uses aligned stores
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVQ  R8, R11
+	SHRQ  $0x05, R11 // R11 = length / 32
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R13
+	SUBQ  R10, R13 // R13 = 64 - (AX & 31), so AX+R13-32 is 32-byte aligned for the MOVOA stores
+	DECQ  R11 // NOTE(review): DEC leaves CF untouched, so JA/JNA below also see CF from the preceding SUBQ/ADDQ - confirm against the generator
+	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(R9)(R13*1), R10
+	LEAQ  -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: // 32 bytes per pass via running pointers R10 (src) and R14 (dst)
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R14)
+	MOVOA X5, 16(R14)
+	ADDQ  $0x20, R14
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R13
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: // 32 bytes per pass at offset R13-32 while R8 >= R13
+	MOVOU -32(R9)(R13*1), X4
+	MOVOU -16(R9)(R13*1), X5
+	MOVOA X4, -32(AX)(R13*1)
+	MOVOA X5, -16(AX)(R13*1)
+	ADDQ  $0x20, R13
+	CMPQ  R8, R13
+	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved first and last 32 bytes with unaligned moves
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
+	ADDL R12, CX // CX = end of the match
+	ADDL $0x04, R12 // R12 = full match length including the 4 bytes compared first
+	MOVL CX, 12(SP) // pending literals now start after the match
+
+	// emitRepeat: encode R12 bytes as a repeat of the previous offset
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B // both outcomes reach the label directly below
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
+	CMPL R12, $0x00000104
+	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
+	LEAL -256(R12), R12
+	MOVW $0x0019, (AX) // 4-byte form: bytes 0x19, 0x00, then the 16-bit value length-260
+	MOVW R12, 2(AX)
+	ADDQ $0x04, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: // 3-byte form: bytes 0x15, 0x00, then the 8-bit value length-8
+	LEAL -4(R12), R12
+	MOVW $0x0015, (AX)
+	MOVB R12, 2(AX)
+	ADDQ $0x03, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: // 2-byte form: the 16-bit value ((length-4)<<2)|1
+	SHLL $0x02, R12
+	ORL  $0x01, R12
+	MOVW R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+	XORQ SI, SI // unreachable from here to the blank line: follows an unconditional JMP and has no label
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(AX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
+	CMPL CX, 8(SP) // past the search limit: emit the remaining bytes as literals
+	JGE  emit_remainder_encodeBetterBlockAsm8B
+	CMPQ AX, (SP) // return 0 once the dst cursor has reached the limit
+	JL   match_nolit_dst_ok_encodeBetterBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm8B: // index positions near the start (DI+1, +2, +3) and end (CX-2, CX-1) of the match
+	MOVQ  $0x0000cf1bbcdcbf9b, SI // 6-byte hash multiplier
+	MOVQ  $0x9e3779b1, R8 // 4-byte hash multiplier
+	INCL  DI // DI = match start + 1
+	MOVQ  (DX)(DI*1), R9 // R9 = 8 bytes of src at DI
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11 // bytes starting at DI+1
+	MOVQ  R11, R13
+	SHRQ  $0x10, R12 // bytes starting at DI+2
+	LEAL  1(DI), R14
+	LEAL  2(DI), R15
+	MOVQ  -2(DX)(CX*1), R9 // R9 = 8 bytes of src at CX-2
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x36, R10 // R10 = 6-byte hash at DI
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x36, R13 // R13 = 6-byte hash at DI+1
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x38, R11 // R11 = 4-byte hash at DI+1
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x38, R12 // R12 = 4-byte hash at DI+2
+	MOVL  DI, 24(SP)(R10*4)
+	MOVL  R14, 24(SP)(R13*4)
+	MOVL  R14, 4120(SP)(R11*4)
+	MOVL  R15, 4120(SP)(R12*4)
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11 // bytes starting at CX-1
+	MOVQ  R11, R13
+	LEAL  -2(CX), R9
+	LEAL  -1(CX), DI
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x36, R10 // R10 = 6-byte hash at CX-2
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x38, R11 // R11 = 4-byte hash at CX-1
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x36, R13 // R13 = 6-byte hash at CX-1
+	MOVL  R9, 24(SP)(R10*4)
+	MOVL  DI, 4120(SP)(R11*4)
+	MOVL  DI, 24(SP)(R13*4)
+	JMP   search_loop_encodeBetterBlockAsm8B
+
+emit_remainder_encodeBetterBlockAsm8B: // return 0 unless dst + 3 + remaining literal length stays below the limit
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeBetterBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeBetterBlockAsm8B: // emit src[12(SP):len(src)] as one final literal run
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B // nothing left to emit
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX // CX = address of the remaining literals
+	SUBL BX, SI // SI = literal length
+	LEAL -1(SI), DX // DX = literal length - 1; the src base in DX is not used again
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeBetterBlockAsm8B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm8B
+	MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by the 16-bit length-1
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B
+
+two_bytes_emit_remainder_encodeBetterBlockAsm8B: // 2-byte header: 0xf0 followed by the 8-bit length-1
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeBetterBlockAsm8B
+	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B
+
+one_byte_emit_remainder_encodeBetterBlockAsm8B: // 1-byte header: (length-1)<<2
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm8B:
+	LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+	MOVL SI, BX // BX = literal length
+
+	// genMemMoveShort: copy BX (at most 64) bytes from CX to AX with overlapping head/tail moves
+	CMPQ BX, $0x04
+	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4
+	CMPQ BX, $0x08
+	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: // always moves 4 bytes; AX is then set to DX
+	MOVL (CX), SI
+	MOVL SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: // first 4 and last 4 bytes
+	MOVL (CX), SI
+	MOVL -4(CX)(BX*1), CX
+	MOVL SI, (AX)
+	MOVL CX, -4(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: // first 8 and last 8 bytes
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: // first 16 and last 16 bytes
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: // first 32 and last 32 bytes
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
+	MOVQ DX, AX // advance the dst cursor past the literals
+	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
+
+memmove_long_emit_remainder_encodeBetterBlockAsm8B:
+	LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+	MOVL SI, BX // BX = literal length
+
+	// genMemMoveLong: copy BX (65 or more) bytes from CX to AX; head and tail are kept in X0-X3, the middle uses aligned stores
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI // DI = length / 32
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8 // R8 = 64 - (AX & 31), so AX+R8-32 is 32-byte aligned for the MOVOA stores
+	DECQ  DI // NOTE(review): DEC leaves CF untouched, so JA/JNA below also see CF from the preceding SUBQ/ADDQ - confirm against the generator
+	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: // 32 bytes per pass via running pointers SI (src) and R9 (dst)
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: // 32 bytes per pass at offset R8-32 while BX >= R8
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved first and last 32 bytes with unaligned moves
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: // return the number of bytes written: dst cursor minus dst base
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm(SB), $65560-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000200, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBlockAsm
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  CX, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x06, SI
+	LEAL  4(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBlockAsm
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x10, R11
+	IMULQ R9, R11
+	SHRQ  $0x32, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  24(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4)
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11
+	JNE   no_repeat_found_encodeSnappyBlockAsm
+	LEAL  1(CX), DI
+	MOVL  12(SP), SI
+	MOVL  DI, R8
+	SUBL  16(SP), R8
+	JZ    repeat_extend_back_end_encodeSnappyBlockAsm
+
+repeat_extend_back_loop_encodeSnappyBlockAsm:
+	CMPL DI, SI
+	JLE  repeat_extend_back_end_encodeSnappyBlockAsm
+	MOVB -1(DX)(R8*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeSnappyBlockAsm
+	LEAL -1(DI), DI
+	DECL R8
+	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm
+
+repeat_extend_back_end_encodeSnappyBlockAsm:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm
+	MOVL DI, R8
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R9
+	SUBL SI, R8
+	LEAL -1(R8), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm
+	CMPL SI, $0x00010000
+	JLT  three_bytes_repeat_emit_encodeSnappyBlockAsm
+	CMPL SI, $0x01000000
+	JLT  four_bytes_repeat_emit_encodeSnappyBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL SI, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+four_bytes_repeat_emit_encodeSnappyBlockAsm:
+	MOVL SI, R10
+	SHRL $0x10, R10
+	MOVB $0xf8, (AX)
+	MOVW SI, 1(AX)
+	MOVB R10, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+three_bytes_repeat_emit_encodeSnappyBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_repeat_emit_encodeSnappyBlockAsm
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+one_byte_repeat_emit_encodeSnappyBlockAsm:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm:
+	LEAQ (AX)(R8*1), SI
+
+	// genMemMoveShort
+	CMPQ R8, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
+	CMPQ R8, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
+	CMPQ R8, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
+	MOVQ (R9), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
+	MOVQ (R9), R10
+	MOVQ -8(R9)(R8*1), R9
+	MOVQ R10, (AX)
+	MOVQ R9, -8(AX)(R8*1)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
+	MOVOU (R9), X0
+	MOVOU -16(R9)(R8*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R8*1)
+	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
+	MOVQ SI, AX
+	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm:
+	LEAQ (AX)(R8*1), SI
+
+	// genMemMoveLong
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVQ  R8, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12
+	DECQ  R11
+	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(R9)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(R9)(R12*1), X4
+	MOVOU -16(R9)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R8, R12
+	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+	MOVQ  SI, AX
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
+	ADDL $0x05, CX
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R11, R11
+	CMPL R8, $0x08
+	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
+	MOVQ  (R9)(R11*1), R10
+	XORQ  (SI)(R11*1), R10
+	TESTQ R10, R10
+	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm
+	BSFQ  R10, R10
+	SARQ  $0x03, R10
+	LEAL  (R11)(R10*1), R11
+	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
+	LEAL -8(R8), R8
+	LEAL 8(R11), R11
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm:
+	TESTL R8, R8
+	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm:
+	MOVB (R9)(R11*1), R10
+	CMPB (SI)(R11*1), R10
+	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm
+	LEAL 1(R11), R11
+	DECL R8
+	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm
+
+repeat_extend_forward_end_encodeSnappyBlockAsm:
+	ADDL R11, CX
+	MOVL CX, SI
+	SUBL DI, SI
+	MOVL 16(SP), DI
+
+	// emitCopy
+	CMPL DI, $0x00010000
+	JL   two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
+
+four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
+	CMPL SI, $0x40
+	JLE  four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
+	MOVB $0xff, (AX)
+	MOVL DI, 1(AX)
+	LEAL -64(SI), SI
+	ADDQ $0x05, AX
+	CMPL SI, $0x04
+	JL   four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
+	JMP  four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
+
+four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
+	TESTL SI, SI
+	JZ    repeat_end_emit_encodeSnappyBlockAsm
+	MOVB  $0x03, BL
+	LEAL  -4(BX)(SI*4), SI
+	MOVB  SI, (AX)
+	MOVL  DI, 1(AX)
+	ADDQ  $0x05, AX
+	JMP   repeat_end_emit_encodeSnappyBlockAsm
+
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
+	MOVB $0xee, (AX)
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
+	CMPL DI, $0x00000800
+	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI
+	MOVB DI, 1(AX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeSnappyBlockAsm
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm:
+	MOVL CX, 12(SP)
+	JMP  search_loop_encodeSnappyBlockAsm
+
+no_repeat_found_encodeSnappyBlockAsm:
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate_match_encodeSnappyBlockAsm
+	SHRQ $0x08, DI
+	MOVL 24(SP)(R10*4), SI
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI
+	JEQ  candidate2_match_encodeSnappyBlockAsm
+	MOVL R9, 24(SP)(R10*4)
+	SHRQ $0x08, DI
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate3_match_encodeSnappyBlockAsm
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeSnappyBlockAsm
+
+candidate3_match_encodeSnappyBlockAsm:
+	ADDL $0x02, CX
+	JMP  candidate_match_encodeSnappyBlockAsm
+
+candidate2_match_encodeSnappyBlockAsm:
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX
+	MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeSnappyBlockAsm
+
+match_extend_back_loop_encodeSnappyBlockAsm:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeSnappyBlockAsm
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBlockAsm
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBlockAsm
+	JMP  match_extend_back_loop_encodeSnappyBlockAsm
+
+match_extend_back_end_encodeSnappyBlockAsm:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 5(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeSnappyBlockAsm:
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI
+	SUBL R8, R9
+	LEAL -1(R9), R8
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBlockAsm
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBlockAsm
+	CMPL R8, $0x00010000
+	JLT  three_bytes_match_emit_encodeSnappyBlockAsm
+	CMPL R8, $0x01000000
+	JLT  four_bytes_match_emit_encodeSnappyBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL R8, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm
+
+four_bytes_match_emit_encodeSnappyBlockAsm:
+	MOVL R8, R10
+	SHRL $0x10, R10
+	MOVB $0xf8, (AX)
+	MOVW R8, 1(AX)
+	MOVB R10, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm
+
+three_bytes_match_emit_encodeSnappyBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm
+
+two_bytes_match_emit_encodeSnappyBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL   memmove_match_emit_encodeSnappyBlockAsm
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm
+
+one_byte_match_emit_encodeSnappyBlockAsm:
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
+	MOVQ (DI), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm:
+	MOVQ R8, AX
+	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm
+
+memmove_long_match_emit_encodeSnappyBlockAsm:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveLong
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm:
+match_nolit_loop_encodeSnappyBlockAsm:
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm
+	BSFQ  R9, R9
+	SARQ  $0x03, R9
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeSnappyBlockAsm
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm:
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm:
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeSnappyBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeSnappyBlockAsm
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm
+
+match_nolit_end_encodeSnappyBlockAsm:
+	ADDL R10, CX
+	MOVL 16(SP), SI
+	ADDL $0x04, R10
+	MOVL CX, 12(SP)
+
+	// emitCopy
+	CMPL SI, $0x00010000
+	JL   two_byte_offset_match_nolit_encodeSnappyBlockAsm
+
+four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
+	CMPL R10, $0x40
+	JLE  four_bytes_remain_match_nolit_encodeSnappyBlockAsm
+	MOVB $0xff, (AX)
+	MOVL SI, 1(AX)
+	LEAL -64(R10), R10
+	ADDQ $0x05, AX
+	CMPL R10, $0x04
+	JL   four_bytes_remain_match_nolit_encodeSnappyBlockAsm
+	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
+
+four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
+	TESTL R10, R10
+	JZ    match_nolit_emitcopy_end_encodeSnappyBlockAsm
+	MOVB  $0x03, BL
+	LEAL  -4(BX)(R10*4), R10
+	MOVB  R10, (AX)
+	MOVL  SI, 1(AX)
+	ADDQ  $0x05, AX
+	JMP   match_nolit_emitcopy_end_encodeSnappyBlockAsm
+
+two_byte_offset_match_nolit_encodeSnappyBlockAsm:
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
+	MOVB $0xee, (AX)
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
+	CMPL SI, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10
+	MOVB SI, 1(AX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeSnappyBlockAsm
+	MOVQ -2(DX)(CX*1), DI
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeSnappyBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm:
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI
+	MOVQ  DI, SI
+	SHLQ  $0x10, R8
+	IMULQ R9, R8
+	SHRQ  $0x32, R8
+	SHLQ  $0x10, SI
+	IMULQ R9, SI
+	SHRQ  $0x32, SI
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI
+	MOVL  R9, 24(SP)(R8*4)
+	MOVL  CX, (R10)
+	CMPL  (DX)(SI*1), DI
+	JEQ   match_nolit_loop_encodeSnappyBlockAsm
+	INCL  CX
+	JMP   search_loop_encodeSnappyBlockAsm
+
+emit_remainder_encodeSnappyBlockAsm:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 5(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeSnappyBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeSnappyBlockAsm:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm
+	CMPL DX, $0x00010000
+	JLT  three_bytes_emit_remainder_encodeSnappyBlockAsm
+	CMPL DX, $0x01000000
+	JLT  four_bytes_emit_remainder_encodeSnappyBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL DX, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+four_bytes_emit_remainder_encodeSnappyBlockAsm:
+	MOVL DX, BX
+	SHRL $0x10, BX
+	MOVB $0xf8, (AX)
+	MOVW DX, 1(AX)
+	MOVB BL, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+three_bytes_emit_remainder_encodeSnappyBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeSnappyBlockAsm
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+one_byte_emit_remainder_encodeSnappyBlockAsm:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000200, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm64K:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBlockAsm64K
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  CX, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm64K:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x06, SI
+	LEAL  4(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBlockAsm64K
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x10, R11
+	IMULQ R9, R11
+	SHRQ  $0x32, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  24(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4)
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11
+	JNE   no_repeat_found_encodeSnappyBlockAsm64K
+	LEAL  1(CX), DI
+	MOVL  12(SP), SI
+	MOVL  DI, R8
+	SUBL  16(SP), R8
+	JZ    repeat_extend_back_end_encodeSnappyBlockAsm64K
+
+repeat_extend_back_loop_encodeSnappyBlockAsm64K:
+	CMPL DI, SI
+	JLE  repeat_extend_back_end_encodeSnappyBlockAsm64K
+	MOVB -1(DX)(R8*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeSnappyBlockAsm64K
+	LEAL -1(DI), DI
+	DECL R8
+	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm64K
+
+repeat_extend_back_end_encodeSnappyBlockAsm64K:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
+	MOVL DI, R8
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R9
+	SUBL SI, R8
+	LEAL -1(R8), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm64K
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm64K
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_repeat_emit_encodeSnappyBlockAsm64K
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K
+
+one_byte_repeat_emit_encodeSnappyBlockAsm64K:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm64K:
+	LEAQ (AX)(R8*1), SI
+
+	// genMemMoveShort
+	CMPQ R8, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
+	CMPQ R8, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
+	CMPQ R8, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
+	MOVQ (R9), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
+	MOVQ (R9), R10
+	MOVQ -8(R9)(R8*1), R9
+	MOVQ R10, (AX)
+	MOVQ R9, -8(AX)(R8*1)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
+	MOVOU (R9), X0
+	MOVOU -16(R9)(R8*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R8*1)
+	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
+	MOVQ SI, AX
+	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
+	LEAQ (AX)(R8*1), SI
+
+	// genMemMoveLong
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVQ  R8, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12
+	DECQ  R11
+	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	LEAQ  -32(R9)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+	MOVOU -32(R9)(R12*1), X4
+	MOVOU -16(R9)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R8, R12
+	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+	MOVQ  SI, AX
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
+	ADDL $0x05, CX
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R11, R11
+	CMPL R8, $0x08
+	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm64K
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
+	MOVQ  (R9)(R11*1), R10
+	XORQ  (SI)(R11*1), R10
+	TESTQ R10, R10
+	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
+	BSFQ  R10, R10
+	SARQ  $0x03, R10
+	LEAL  (R11)(R10*1), R11
+	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm64K
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
+	LEAL -8(R8), R8
+	LEAL 8(R11), R11
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm64K:
+	TESTL R8, R8
+	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm64K
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K:
+	MOVB (R9)(R11*1), R10
+	CMPB (SI)(R11*1), R10
+	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm64K
+	LEAL 1(R11), R11
+	DECL R8
+	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K
+
+repeat_extend_forward_end_encodeSnappyBlockAsm64K:
+	ADDL R11, CX
+	MOVL CX, SI
+	SUBL DI, SI
+	MOVL 16(SP), DI
+
+	// emitCopy
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
+	MOVB $0xee, (AX)
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
+	CMPL DI, $0x00000800
+	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI
+	MOVB DI, 1(AX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeSnappyBlockAsm64K
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm64K:
+	MOVL CX, 12(SP)
+	JMP  search_loop_encodeSnappyBlockAsm64K
+
+no_repeat_found_encodeSnappyBlockAsm64K:
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate_match_encodeSnappyBlockAsm64K
+	SHRQ $0x08, DI
+	MOVL 24(SP)(R10*4), SI
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI
+	JEQ  candidate2_match_encodeSnappyBlockAsm64K
+	MOVL R9, 24(SP)(R10*4)
+	SHRQ $0x08, DI
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate3_match_encodeSnappyBlockAsm64K
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeSnappyBlockAsm64K
+
+candidate3_match_encodeSnappyBlockAsm64K:
+	ADDL $0x02, CX
+	JMP  candidate_match_encodeSnappyBlockAsm64K
+
+candidate2_match_encodeSnappyBlockAsm64K:
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX
+	MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm64K:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeSnappyBlockAsm64K
+
+match_extend_back_loop_encodeSnappyBlockAsm64K:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeSnappyBlockAsm64K
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBlockAsm64K
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBlockAsm64K
+	JMP  match_extend_back_loop_encodeSnappyBlockAsm64K
+
+match_extend_back_end_encodeSnappyBlockAsm64K:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBlockAsm64K
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeSnappyBlockAsm64K:
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm64K
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI
+	SUBL R8, R9
+	LEAL -1(R9), R8
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBlockAsm64K
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBlockAsm64K
+	MOVB $0xf4, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K
+
+two_bytes_match_emit_encodeSnappyBlockAsm64K:
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL   memmove_match_emit_encodeSnappyBlockAsm64K
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K
+
+one_byte_match_emit_encodeSnappyBlockAsm64K:
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm64K:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
+	MOVQ (DI), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
+	MOVQ R8, AX
+	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm64K
+
+memmove_long_match_emit_encodeSnappyBlockAsm64K:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveLong
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
+match_nolit_loop_encodeSnappyBlockAsm64K:
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm64K
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
+	BSFQ  R9, R9
+	SARQ  $0x03, R9
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeSnappyBlockAsm64K
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm64K:
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeSnappyBlockAsm64K
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeSnappyBlockAsm64K
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K
+
+match_nolit_end_encodeSnappyBlockAsm64K:
+	ADDL R10, CX
+	MOVL 16(SP), SI
+	ADDL $0x04, R10
+	MOVL CX, 12(SP)
+
+	// emitCopy
+two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
+	MOVB $0xee, (AX)
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
+	CMPL SI, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10
+	MOVB SI, 1(AX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeSnappyBlockAsm64K
+	MOVQ -2(DX)(CX*1), DI
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeSnappyBlockAsm64K
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm64K:
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI
+	MOVQ  DI, SI
+	SHLQ  $0x10, R8
+	IMULQ R9, R8
+	SHRQ  $0x32, R8
+	SHLQ  $0x10, SI
+	IMULQ R9, SI
+	SHRQ  $0x32, SI
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI
+	MOVL  R9, 24(SP)(R8*4)
+	MOVL  CX, (R10)
+	CMPL  (DX)(SI*1), DI
+	JEQ   match_nolit_loop_encodeSnappyBlockAsm64K
+	INCL  CX
+	JMP   search_loop_encodeSnappyBlockAsm64K
+
+emit_remainder_encodeSnappyBlockAsm64K:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeSnappyBlockAsm64K
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeSnappyBlockAsm64K:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm64K
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm64K
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeSnappyBlockAsm64K
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K
+
+one_byte_emit_remainder_encodeSnappyBlockAsm64K:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm64K:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst using a 12-bit (4096-entry) hash table kept on the stack; returns the number of bytes written to dst, or 0 when the output would reach the internal dst limit.
+TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 // frame: 24 bytes of scratch slots + 16384-byte table (4096 x uint32) at 24(SP)
+	MOVQ dst_base+0(FP), AX // AX = output cursor, starts at dst_base
+	MOVQ $0x00000080, CX // 128 iterations x 128 bytes = 16384 bytes to clear
+	LEAQ 24(SP), DX // DX = start of the hash table
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm12B: // clear the whole hash table, 128 bytes per iteration
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBlockAsm12B
+	MOVL  $0x00000000, 12(SP) // 12(SP) = nextEmit: start of the not-yet-emitted literal run
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP) // 8(SP) = search limit (src_len - 8); 8-byte loads are issued below this position
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP) // (SP) = dst limit = dst_base + src_len - 9 - src_len/32
+	MOVL  $0x00000001, CX // CX = s, the current source position; starts at 1
+	MOVL  CX, 16(SP) // 16(SP) = repeat offset, initialised to 1
+	MOVQ  src_base+24(FP), DX // DX = src base pointer for the rest of the function
+
+search_loop_encodeSnappyBlockAsm12B: // main loop: probe the table for a match at s, s+1 and s+2
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x05, SI
+	LEAL  4(CX)(SI*1), SI // nextS = s + 4 + (s - nextEmit)/32: skip grows with the pending literal run
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBlockAsm12B
+	MOVQ  (DX)(CX*1), DI // DI = 8 source bytes at s
+	MOVL  SI, 20(SP) // 20(SP) = nextS, used if no candidate matches
+	MOVQ  $0x000000cf1bbcdcbb, R9 // hash multiplier
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11
+	SHLQ  $0x18, R10 // drop the top 3 bytes: the hash covers 5 bytes
+	IMULQ R9, R10
+	SHRQ  $0x34, R10 // keep the top 12 bits: R10 = table index for s
+	SHLQ  $0x18, R11
+	IMULQ R9, R11
+	SHRQ  $0x34, R11 // R11 = table index for s+1
+	MOVL  24(SP)(R10*4), SI // SI = candidate for s
+	MOVL  24(SP)(R11*4), R8 // R8 = candidate for s+1
+	MOVL  CX, 24(SP)(R10*4) // table[hash(s)] = s
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4) // table[hash(s+1)] = s+1
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x18, R10
+	IMULQ R9, R10
+	SHRQ  $0x34, R10 // R10 = table index for s+2
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11 // R11 = 4 bytes at s + 1 - repeat
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11 // do the 4 bytes at s+1 equal the bytes one repeat offset back?
+	JNE   no_repeat_found_encodeSnappyBlockAsm12B
+	LEAL  1(CX), DI // DI = base = s + 1, start of the repeat match
+	MOVL  12(SP), SI
+	MOVL  DI, R8
+	SUBL  16(SP), R8 // R8 = base - repeat; nothing to extend backwards when it is 0
+	JZ    repeat_extend_back_end_encodeSnappyBlockAsm12B
+
+repeat_extend_back_loop_encodeSnappyBlockAsm12B: // grow the repeat match backwards, but not before nextEmit
+	CMPL DI, SI
+	JLE  repeat_extend_back_end_encodeSnappyBlockAsm12B
+	MOVB -1(DX)(R8*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeSnappyBlockAsm12B
+	LEAL -1(DI), DI
+	DECL R8
+	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm12B
+
+repeat_extend_back_end_encodeSnappyBlockAsm12B: // emit src[nextEmit:base] as a literal, if non-empty
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
+	MOVL DI, R8
+	MOVL DI, 12(SP) // nextEmit = base
+	LEAQ (DX)(SI*1), R9 // R9 = literal source pointer
+	SUBL SI, R8 // R8 = literal length
+	LEAL -1(R8), SI // SI = length - 1, the value stored in the header
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm12B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm12B
+	MOVB $0xf4, (AX) // 3-byte header: tag 61<<2, then 16-bit (length-1)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm12B: // 2-byte header: tag 60<<2, then 8-bit (length-1)
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40 // length-1 < 64 uses the short copy, otherwise the long copy
+	JL   memmove_repeat_emit_encodeSnappyBlockAsm12B
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B
+
+one_byte_repeat_emit_encodeSnappyBlockAsm12B: // 1-byte header: (length-1)<<2
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm12B:
+	LEAQ (AX)(R8*1), SI // SI = output cursor after the literal bytes
+
+	// genMemMoveShort: copy R8 bytes from (R9) to (AX) using overlapping head/tail moves
+	CMPQ R8, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
+	CMPQ R8, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
+	CMPQ R8, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: // length <= 8: always moves a full 8 bytes
+	MOVQ (R9), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
+	MOVQ (R9), R10
+	MOVQ -8(R9)(R8*1), R9
+	MOVQ R10, (AX)
+	MOVQ R9, -8(AX)(R8*1)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: // first 16 and last 16 bytes
+	MOVOU (R9), X0
+	MOVOU -16(R9)(R8*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R8*1)
+	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: // first 32 and last 32 bytes
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
+	MOVQ SI, AX // advance the output cursor past the literal
+	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
+	LEAQ (AX)(R8*1), SI // SI = output cursor after the literal bytes
+
+	// genMemMoveLong: copy R8 bytes from (R9) to (AX); X0-X3 keep the first/last 32 bytes and are stored last
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVQ  R8, R11
+	SHRQ  $0x05, R11 // R11 = length / 32
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10 // R10 = dst misalignment within 32 bytes
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12 // R12 = 64 - misalignment: running offset for the aligned stores below
+	DECQ  R11
+	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(R9)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: // 32 bytes per iteration: unaligned loads, aligned (MOVOA) stores
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: // keep copying 32-byte chunks while length >= offset
+	MOVOU -32(R9)(R12*1), X4
+	MOVOU -16(R9)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R8, R12
+	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved head and tail
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+	MOVQ  SI, AX // advance the output cursor past the literal
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: // extend the repeat match forwards
+	ADDL $0x05, CX // s += 5: the +1 step plus the 4 bytes already compared
+	MOVL CX, SI
+	SUBL 16(SP), SI // SI = s - repeat, the matching earlier position
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8 // R8 = bytes remaining from s to the end of src
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen: R11 = number of equal bytes at (R9) and (SI), at most R8
+	XORL R11, R11
+	CMPL R8, $0x08
+	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm12B
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: // compare 8 bytes at a time
+	MOVQ  (R9)(R11*1), R10
+	XORQ  (SI)(R11*1), R10
+	TESTQ R10, R10
+	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
+	BSFQ  R10, R10 // lowest differing bit ...
+	SARQ  $0x03, R10 // ... divided by 8 = equal bytes within this word
+	LEAL  (R11)(R10*1), R11
+	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm12B
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: // all 8 bytes equal: advance and retry
+	LEAL -8(R8), R8
+	LEAL 8(R11), R11
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm12B: // fewer than 8 bytes left: compare byte by byte
+	TESTL R8, R8
+	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm12B
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B:
+	MOVB (R9)(R11*1), R10
+	CMPB (SI)(R11*1), R10
+	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm12B
+	LEAL 1(R11), R11
+	DECL R8
+	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B
+
+repeat_extend_forward_end_encodeSnappyBlockAsm12B:
+	ADDL R11, CX // s += matched bytes
+	MOVL CX, SI
+	SUBL DI, SI // SI = total match length = s - base
+	MOVL 16(SP), DI // DI = offset (the repeat offset)
+
+	// emitCopy: encode a copy of length SI at offset DI
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: // while length > 64, emit 60-byte copies
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
+	MOVB $0xee, (AX) // tag 0xee = (59<<2)|2, followed by the 16-bit offset
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: // 2-byte form only when length < 12 and offset < 2048
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
+	CMPL DI, $0x00000800
+	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI // low byte = 1 + 4*(length-4)
+	MOVB DI, 1(AX) // second byte = low 8 bits of the offset
+	SHRL $0x08, DI
+	SHLL $0x05, DI // offset bits 8..10 go into tag bits 5..7
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeSnappyBlockAsm12B
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: // 3-byte form: tag, then 16-bit offset
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI // low byte = 2 + 4*(length-1)
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm12B:
+	MOVL CX, 12(SP) // nextEmit = s
+	JMP  search_loop_encodeSnappyBlockAsm12B
+
+no_repeat_found_encodeSnappyBlockAsm12B: // test the table candidates for s, s+1 and s+2 in turn
+	CMPL (DX)(SI*1), DI // 4 bytes at candidate(s) vs 4 bytes at s
+	JEQ  candidate_match_encodeSnappyBlockAsm12B
+	SHRQ $0x08, DI // DI now starts at the bytes of s+1
+	MOVL 24(SP)(R10*4), SI // SI = candidate for s+2
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI // 4 bytes at candidate(s+1) vs 4 bytes at s+1
+	JEQ  candidate2_match_encodeSnappyBlockAsm12B
+	MOVL R9, 24(SP)(R10*4) // table[hash(s+2)] = s+2
+	SHRQ $0x08, DI // DI now starts at the bytes of s+2
+	CMPL (DX)(SI*1), DI // 4 bytes at candidate(s+2) vs 4 bytes at s+2
+	JEQ  candidate3_match_encodeSnappyBlockAsm12B
+	MOVL 20(SP), CX // no match: s = nextS
+	JMP  search_loop_encodeSnappyBlockAsm12B
+
+candidate3_match_encodeSnappyBlockAsm12B: // match at s+2
+	ADDL $0x02, CX
+	JMP  candidate_match_encodeSnappyBlockAsm12B
+
+candidate2_match_encodeSnappyBlockAsm12B: // match at s+1
+	MOVL R9, 24(SP)(R10*4) // table[hash(s+2)] = s+2 (this path skipped the store above)
+	INCL CX
+	MOVL R8, SI // SI = matching candidate position
+
+candidate_match_encodeSnappyBlockAsm12B: // CX = match position s, SI = candidate position
+	MOVL  12(SP), DI // DI = nextEmit
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeSnappyBlockAsm12B
+
+match_extend_back_loop_encodeSnappyBlockAsm12B: // grow the match backwards while bytes agree, s > nextEmit and candidate > 0
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeSnappyBlockAsm12B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBlockAsm12B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBlockAsm12B
+	JMP  match_extend_back_loop_encodeSnappyBlockAsm12B
+
+match_extend_back_end_encodeSnappyBlockAsm12B: // give up (return 0) if cursor + 3 + literal length reaches the dst limit
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeSnappyBlockAsm12B: // emit src[nextEmit:s] as a literal, if non-empty
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm12B
+	MOVL DI, R9
+	MOVL DI, 12(SP) // nextEmit = s
+	LEAQ (DX)(R8*1), DI // DI = literal source pointer
+	SUBL R8, R9 // R9 = literal length
+	LEAL -1(R9), R8 // R8 = length - 1, the value stored in the header
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBlockAsm12B
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBlockAsm12B
+	MOVB $0xf4, (AX) // 3-byte header: tag 61<<2, then 16-bit (length-1)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B
+
+two_bytes_match_emit_encodeSnappyBlockAsm12B: // 2-byte header: tag 60<<2, then 8-bit (length-1)
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40 // length-1 < 64 uses the short copy, otherwise the long copy
+	JL   memmove_match_emit_encodeSnappyBlockAsm12B
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B
+
+one_byte_match_emit_encodeSnappyBlockAsm12B: // 1-byte header: (length-1)<<2
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm12B:
+	LEAQ (AX)(R9*1), R8 // R8 = output cursor after the literal bytes
+
+	// genMemMoveShort: copy R9 bytes from (DI) to (AX) using overlapping head/tail moves
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: // length <= 8: always moves a full 8 bytes
+	MOVQ (DI), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: // first 16 and last 16 bytes
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: // first 32 and last 32 bytes
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
+	MOVQ R8, AX // advance the output cursor past the literal
+	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm12B
+
+memmove_long_match_emit_encodeSnappyBlockAsm12B:
+	LEAQ (AX)(R9*1), R8 // R8 = output cursor after the literal bytes
+
+	// genMemMoveLong: copy R9 bytes from (DI) to (AX); X0-X3 keep the first/last 32 bytes and are stored last
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11 // R11 = length / 32
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10 // R10 = dst misalignment within 32 bytes
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12 // R12 = 64 - misalignment: running offset for the aligned stores below
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: // 32 bytes per iteration: unaligned loads, aligned (MOVOA) stores
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: // keep copying 32-byte chunks while length >= offset
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved head and tail
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX // advance the output cursor past the literal
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
+match_nolit_loop_encodeSnappyBlockAsm12B: // a match at s (CX) against candidate (SI) with no pending literal
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP) // repeat offset = s - candidate
+	ADDL $0x04, CX // skip the 4 bytes already known to match
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI // DI = bytes remaining from s to the end of src
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen: R10 = number of equal bytes at (R8) and (SI), at most DI
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: // compare 8 bytes at a time
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
+	BSFQ  R9, R9 // lowest differing bit ...
+	SARQ  $0x03, R9 // ... divided by 8 = equal bytes within this word
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeSnappyBlockAsm12B
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: // all 8 bytes equal: advance and retry
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm12B: // fewer than 8 bytes left: compare byte by byte
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeSnappyBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeSnappyBlockAsm12B
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B
+
+match_nolit_end_encodeSnappyBlockAsm12B:
+	ADDL R10, CX // s += matched bytes
+	MOVL 16(SP), SI // SI = offset
+	ADDL $0x04, R10 // R10 = total match length, including the first 4 bytes
+	MOVL CX, 12(SP) // nextEmit = s
+
+	// emitCopy: encode a copy of length R10 at offset SI
+two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: // while length > 64, emit 60-byte copies
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
+	MOVB $0xee, (AX) // tag 0xee = (59<<2)|2, followed by the 16-bit offset
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: // 2-byte form only when length < 12 and offset < 2048
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
+	CMPL SI, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10 // low byte = 1 + 4*(length-4)
+	MOVB SI, 1(AX) // second byte = low 8 bits of the offset
+	SHRL $0x08, SI
+	SHLL $0x05, SI // offset bits 8..10 go into tag bits 5..7
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: // 3-byte form: tag, then 16-bit offset
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10 // low byte = 2 + 4*(length-1)
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
+	CMPL CX, 8(SP) // past the search limit: flush what is left
+	JGE  emit_remainder_encodeSnappyBlockAsm12B
+	MOVQ -2(DX)(CX*1), DI // DI = 8 source bytes at s-2
+	CMPQ AX, (SP) // return 0 once the output cursor reaches the dst limit
+	JL   match_nolit_dst_ok_encodeSnappyBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm12B: // index s-2 and s, then test for an immediate match at s
+	MOVQ  $0x000000cf1bbcdcbb, R9 // hash multiplier
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI // DI now starts at the bytes of s
+	MOVQ  DI, SI
+	SHLQ  $0x18, R8
+	IMULQ R9, R8
+	SHRQ  $0x34, R8 // R8 = table index for s-2
+	SHLQ  $0x18, SI
+	IMULQ R9, SI
+	SHRQ  $0x34, SI // SI = table index for s
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI // SI = candidate for s
+	MOVL  R9, 24(SP)(R8*4) // table[hash(s-2)] = s-2
+	MOVL  CX, (R10) // table[hash(s)] = s
+	CMPL  (DX)(SI*1), DI // 4 bytes at candidate vs 4 bytes at s
+	JEQ   match_nolit_loop_encodeSnappyBlockAsm12B
+	INCL  CX
+	JMP   search_loop_encodeSnappyBlockAsm12B
+
+emit_remainder_encodeSnappyBlockAsm12B: // flush src[nextEmit:src_len] as a final literal
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP) // return 0 if cursor + 3 + remaining length reaches the dst limit
+	JL   emit_remainder_ok_encodeSnappyBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeSnappyBlockAsm12B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX // nothing left when nextEmit == src_len
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX // CX = literal source pointer
+	SUBL BX, SI // SI = literal length
+	LEAL -1(SI), DX // DX = length - 1 (the src base in DX is no longer needed)
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm12B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm12B
+	MOVB $0xf4, (AX) // 3-byte header: tag 61<<2, then 16-bit (length-1)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm12B: // 2-byte header: tag 60<<2, then 8-bit (length-1)
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40 // length-1 < 64 uses the short copy, otherwise the long copy
+	JL   memmove_emit_remainder_encodeSnappyBlockAsm12B
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B
+
+one_byte_emit_remainder_encodeSnappyBlockAsm12B: // 1-byte header: (length-1)<<2
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm12B:
+	LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+	MOVL SI, BX // BX = literal length
+
+	// genMemMoveShort: copy BX bytes from (CX) to (AX) using overlapping head/tail moves
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: // length <= 8: always moves a full 8 bytes
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: // first 16 and last 16 bytes
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: // first 32 and last 32 bytes
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
+	MOVQ DX, AX // advance the output cursor past the literal
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
+	LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+	MOVL SI, BX // BX = literal length
+
+	// genMemMoveLong: copy BX bytes from (CX) to (AX); X0-X3 keep the first/last 32 bytes and are stored last
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI // DI = length / 32
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI // SI = dst misalignment within 32 bytes
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8 // R8 = 64 - misalignment: running offset for the aligned stores below
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: // 32 bytes per iteration: unaligned loads, aligned (MOVOA) stores
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: // keep copying 32-byte chunks while length >= offset
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved head and tail
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX // advance the output cursor past the literal
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: // return the number of bytes written: cursor - dst_base
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000020, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm10B:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBlockAsm10B
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  CX, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm10B:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x05, SI
+	LEAL  4(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBlockAsm10B
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x9e3779b1, R9
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11
+	SHLQ  $0x20, R10
+	IMULQ R9, R10
+	SHRQ  $0x36, R10
+	SHLQ  $0x20, R11
+	IMULQ R9, R11
+	SHRQ  $0x36, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  24(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4)
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x20, R10
+	IMULQ R9, R10
+	SHRQ  $0x36, R10
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11
+	JNE   no_repeat_found_encodeSnappyBlockAsm10B
+	LEAL  1(CX), DI
+	MOVL  12(SP), SI
+	MOVL  DI, R8
+	SUBL  16(SP), R8
+	JZ    repeat_extend_back_end_encodeSnappyBlockAsm10B
+
+repeat_extend_back_loop_encodeSnappyBlockAsm10B:
+	CMPL DI, SI
+	JLE  repeat_extend_back_end_encodeSnappyBlockAsm10B
+	MOVB -1(DX)(R8*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeSnappyBlockAsm10B
+	LEAL -1(DI), DI
+	DECL R8
+	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm10B
+
+repeat_extend_back_end_encodeSnappyBlockAsm10B:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
+	MOVL DI, R8
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R9
+	SUBL SI, R8
+	LEAL -1(R8), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm10B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm10B
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_repeat_emit_encodeSnappyBlockAsm10B
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B
+
+one_byte_repeat_emit_encodeSnappyBlockAsm10B:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm10B:
+	LEAQ (AX)(R8*1), SI
+
+	// genMemMoveShort
+	CMPQ R8, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
+	CMPQ R8, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
+	CMPQ R8, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
+	MOVQ (R9), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
+	MOVQ (R9), R10
+	MOVQ -8(R9)(R8*1), R9
+	MOVQ R10, (AX)
+	MOVQ R9, -8(AX)(R8*1)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
+	MOVOU (R9), X0
+	MOVOU -16(R9)(R8*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R8*1)
+	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
+	MOVQ SI, AX
+	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
+	LEAQ (AX)(R8*1), SI
+
+	// genMemMoveLong
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVQ  R8, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12
+	DECQ  R11
+	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(R9)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
+	MOVOU -32(R9)(R12*1), X4
+	MOVOU -16(R9)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R8, R12
+	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+	MOVQ  SI, AX
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
+	ADDL $0x05, CX
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R11, R11
+	CMPL R8, $0x08
+	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm10B
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
+	MOVQ  (R9)(R11*1), R10
+	XORQ  (SI)(R11*1), R10
+	TESTQ R10, R10
+	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
+	BSFQ  R10, R10
+	SARQ  $0x03, R10
+	LEAL  (R11)(R10*1), R11
+	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm10B
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
+	LEAL -8(R8), R8
+	LEAL 8(R11), R11
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm10B:
+	TESTL R8, R8
+	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm10B
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B:
+	MOVB (R9)(R11*1), R10
+	CMPB (SI)(R11*1), R10
+	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm10B
+	LEAL 1(R11), R11
+	DECL R8
+	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B
+
+repeat_extend_forward_end_encodeSnappyBlockAsm10B:
+	ADDL R11, CX
+	MOVL CX, SI
+	SUBL DI, SI
+	MOVL 16(SP), DI
+
+	// emitCopy
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
+	MOVB $0xee, (AX)
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
+	CMPL DI, $0x00000800
+	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI
+	MOVB DI, 1(AX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeSnappyBlockAsm10B
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm10B:
+	MOVL CX, 12(SP)
+	JMP  search_loop_encodeSnappyBlockAsm10B
+
+no_repeat_found_encodeSnappyBlockAsm10B:
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate_match_encodeSnappyBlockAsm10B
+	SHRQ $0x08, DI
+	MOVL 24(SP)(R10*4), SI
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI
+	JEQ  candidate2_match_encodeSnappyBlockAsm10B
+	MOVL R9, 24(SP)(R10*4)
+	SHRQ $0x08, DI
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate3_match_encodeSnappyBlockAsm10B
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeSnappyBlockAsm10B
+
+candidate3_match_encodeSnappyBlockAsm10B:
+	ADDL $0x02, CX
+	JMP  candidate_match_encodeSnappyBlockAsm10B
+
+candidate2_match_encodeSnappyBlockAsm10B:
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX
+	MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm10B:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeSnappyBlockAsm10B
+
+match_extend_back_loop_encodeSnappyBlockAsm10B:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeSnappyBlockAsm10B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBlockAsm10B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBlockAsm10B
+	JMP  match_extend_back_loop_encodeSnappyBlockAsm10B
+
+match_extend_back_end_encodeSnappyBlockAsm10B:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeSnappyBlockAsm10B:
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm10B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI
+	SUBL R8, R9
+	LEAL -1(R9), R8
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBlockAsm10B
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBlockAsm10B
+	MOVB $0xf4, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B
+
+two_bytes_match_emit_encodeSnappyBlockAsm10B:
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL   memmove_match_emit_encodeSnappyBlockAsm10B
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B
+
+one_byte_match_emit_encodeSnappyBlockAsm10B:
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm10B:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
+	MOVQ (DI), R10
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
+	MOVQ R8, AX
+	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm10B
+
+memmove_long_match_emit_encodeSnappyBlockAsm10B:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveLong
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
+match_nolit_loop_encodeSnappyBlockAsm10B:
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
+	BSFQ  R9, R9
+	SARQ  $0x03, R9
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeSnappyBlockAsm10B
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm10B:
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeSnappyBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeSnappyBlockAsm10B
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B
+
+match_nolit_end_encodeSnappyBlockAsm10B:
+	ADDL R10, CX
+	MOVL 16(SP), SI
+	ADDL $0x04, R10
+	MOVL CX, 12(SP)
+
+	// emitCopy
+two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
+	MOVB $0xee, (AX)
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
+	CMPL SI, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10
+	MOVB SI, 1(AX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeSnappyBlockAsm10B
+	MOVQ -2(DX)(CX*1), DI
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeSnappyBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm10B:
+	MOVQ  $0x9e3779b1, R9
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI
+	MOVQ  DI, SI
+	SHLQ  $0x20, R8
+	IMULQ R9, R8
+	SHRQ  $0x36, R8
+	SHLQ  $0x20, SI
+	IMULQ R9, SI
+	SHRQ  $0x36, SI
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI
+	MOVL  R9, 24(SP)(R8*4)
+	MOVL  CX, (R10)
+	CMPL  (DX)(SI*1), DI
+	JEQ   match_nolit_loop_encodeSnappyBlockAsm10B
+	INCL  CX
+	JMP   search_loop_encodeSnappyBlockAsm10B
+
+emit_remainder_encodeSnappyBlockAsm10B:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeSnappyBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeSnappyBlockAsm10B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm10B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm10B
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeSnappyBlockAsm10B
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B
+
+one_byte_emit_remainder_encodeSnappyBlockAsm10B:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm10B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2. Writes an encoding of src to dst and returns the number of bytes written, or 0 when the output would reach the limit stored at (SP).
+TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 // stack: (SP) dst limit, 8(SP) search limit, 12(SP) start of pending literals, 16(SP) repeat offset, 20(SP) next search index, 24(SP).. 256 x 4-byte hash table
+	MOVQ dst_base+0(FP), AX // AX = output write pointer
+	MOVQ $0x00000008, CX // 8 iterations x 128 bytes clears the 1024-byte table
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm8B:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBlockAsm8B
+	MOVL  $0x00000000, 12(SP) // pending literals start at src index 0
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP) // search limit = len(src) - 8
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP) // output limit = dst + len(src) - 9 - len(src)/32
+	MOVL  $0x00000001, CX // CX = current src index s, starting at 1
+	MOVL  CX, 16(SP) // initial repeat offset = 1
+	MOVQ  src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBlockAsm8B:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x04, SI
+	LEAL  4(CX)(SI*1), SI // next index = s + 4 + (s - literal start)/16
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBlockAsm8B
+	MOVQ  (DX)(CX*1), DI // DI = 8 bytes of src starting at s
+	MOVL  SI, 20(SP)
+	MOVQ  $0x9e3779b1, R9 // hash multiplier
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHRQ  $0x08, R11
+	SHLQ  $0x20, R10 // keep only the low 4 bytes before multiplying
+	IMULQ R9, R10
+	SHRQ  $0x38, R10 // R10 = 8-bit hash of the 4 bytes at s
+	SHLQ  $0x20, R11
+	IMULQ R9, R11
+	SHRQ  $0x38, R11 // R11 = 8-bit hash of the 4 bytes at s+1
+	MOVL  24(SP)(R10*4), SI // SI = previous table entry for hash(s)
+	MOVL  24(SP)(R11*4), R8 // R8 = previous table entry for hash(s+1)
+	MOVL  CX, 24(SP)(R10*4)
+	LEAL  1(CX), R10
+	MOVL  R10, 24(SP)(R11*4)
+	MOVQ  DI, R10
+	SHRQ  $0x10, R10
+	SHLQ  $0x20, R10
+	IMULQ R9, R10
+	SHRQ  $0x38, R10 // R10 = 8-bit hash of the 4 bytes at s+2
+	MOVL  CX, R9
+	SUBL  16(SP), R9
+	MOVL  1(DX)(R9*1), R11 // 4 bytes at s + 1 - repeat offset
+	MOVQ  DI, R9
+	SHRQ  $0x08, R9
+	CMPL  R9, R11 // compare with the 4 bytes at s+1
+	JNE   no_repeat_found_encodeSnappyBlockAsm8B
+	LEAL  1(CX), DI // DI = s+1, start of the repeat match (moved backwards below)
+	MOVL  12(SP), SI
+	MOVL  DI, R8
+	SUBL  16(SP), R8 // R8 = index of the earlier copy; stop if it is 0
+	JZ    repeat_extend_back_end_encodeSnappyBlockAsm8B
+
+repeat_extend_back_loop_encodeSnappyBlockAsm8B:
+	CMPL DI, SI // never extend back past the start of pending literals
+	JLE  repeat_extend_back_end_encodeSnappyBlockAsm8B
+	MOVB -1(DX)(R8*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE  repeat_extend_back_end_encodeSnappyBlockAsm8B
+	LEAL -1(DI), DI
+	DECL R8
+	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm8B
+
+repeat_extend_back_end_encodeSnappyBlockAsm8B:
+	MOVL 12(SP), SI
+	CMPL SI, DI // nothing pending: skip the literal emit
+	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
+	MOVL DI, R8
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R9 // R9 = pointer to the pending literal bytes
+	SUBL SI, R8 // R8 = literal length
+	LEAL -1(R8), SI // SI = length - 1, the value stored in the header
+	CMPL SI, $0x3c
+	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm8B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm8B
+	MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit length-1
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
+	MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit length-1
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_repeat_emit_encodeSnappyBlockAsm8B
+	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B
+
+one_byte_repeat_emit_encodeSnappyBlockAsm8B:
+	SHLB $0x02, SI // 1-byte header: (length-1) << 2
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm8B:
+	LEAQ (AX)(R8*1), SI // SI = write pointer after the literal bytes
+
+	// genMemMoveShort: copy R8 bytes from (R9) to (AX), dispatching on the length
+	CMPQ R8, $0x08
+	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
+	CMPQ R8, $0x10
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
+	CMPQ R8, $0x20
+	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
+	MOVQ (R9), R10 // always moves 8 bytes, even when the length is smaller
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
+	MOVQ (R9), R10
+	MOVQ -8(R9)(R8*1), R9 // first and last 8 bytes; the two moves may overlap
+	MOVQ R10, (AX)
+	MOVQ R9, -8(AX)(R8*1)
+	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
+	MOVOU (R9), X0
+	MOVOU -16(R9)(R8*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R8*1)
+	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
+	MOVQ SI, AX // advance the write pointer by the literal length
+	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
+	LEAQ (AX)(R8*1), SI // SI = write pointer after the literal bytes
+
+	// genMemMoveLong: copy R8 bytes from (R9) to (AX); the first and last 32 bytes are loaded up front and stored last
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVQ  R8, R11
+	SHRQ  $0x05, R11 // R11 = length / 32
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12 // R12 = 64 - (AX & 31), so AX + R12 - 32 is 32-byte aligned
+	DECQ  R11
+	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(R9)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(R9)(R12*1), X4
+	MOVOU -16(R9)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R8, R12
+	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+	MOVQ  SI, AX // advance the write pointer by the literal length
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
+	ADDL $0x05, CX // s += 5: past s+1 and the 4 bytes already compared equal
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8 // R8 = bytes left in src from s
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen: R11 = number of equal bytes at (R9) and (SI), at most R8
+	XORL R11, R11
+	CMPL R8, $0x08
+	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm8B
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
+	MOVQ  (R9)(R11*1), R10
+	XORQ  (SI)(R11*1), R10
+	TESTQ R10, R10
+	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
+	BSFQ  R10, R10
+	SARQ  $0x03, R10 // lowest differing bit / 8 = count of matching bytes in this word
+	LEAL  (R11)(R10*1), R11
+	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm8B
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
+	LEAL -8(R8), R8
+	LEAL 8(R11), R11
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm8B:
+	TESTL R8, R8
+	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm8B
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B:
+	MOVB (R9)(R11*1), R10
+	CMPB (SI)(R11*1), R10
+	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm8B
+	LEAL 1(R11), R11
+	DECL R8
+	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B
+
+repeat_extend_forward_end_encodeSnappyBlockAsm8B:
+	ADDL R11, CX
+	MOVL CX, SI
+	SUBL DI, SI // SI = match length = end index - (back-extended) start index
+	MOVL 16(SP), DI // DI = offset to encode
+
+	// emitCopy: SI = length, DI = offset
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
+	CMPL SI, $0x40
+	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
+	MOVB $0xee, (AX) // length > 64: emit a 3-byte element (tag 0xee + 16-bit offset) and take 60 off the length
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
+	CMPL SI, $0x0c
+	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI // low byte = ((length-4) << 2) | 1; only the low byte is stored
+	MOVB DI, 1(AX) // second byte = low 8 bits of the offset
+	SHRL $0x08, DI
+	SHLL $0x05, DI // offset bits above the low 8 go into the tag from bit 5 up
+	ORL  DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP  repeat_end_emit_encodeSnappyBlockAsm8B
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI // low byte = ((length-1) << 2) | 2
+	MOVB SI, (AX)
+	MOVW DI, 1(AX) // followed by the 16-bit offset
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm8B:
+	MOVL CX, 12(SP) // everything before s has been emitted
+	JMP  search_loop_encodeSnappyBlockAsm8B
+
+no_repeat_found_encodeSnappyBlockAsm8B:
+	CMPL (DX)(SI*1), DI // 4 bytes at the hash(s) candidate vs the 4 bytes at s
+	JEQ  candidate_match_encodeSnappyBlockAsm8B
+	SHRQ $0x08, DI // DI now starts with the bytes at s+1
+	MOVL 24(SP)(R10*4), SI // SI = table entry for hash(s+2)
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI // 4 bytes at the hash(s+1) candidate vs the 4 bytes at s+1
+	JEQ  candidate2_match_encodeSnappyBlockAsm8B
+	MOVL R9, 24(SP)(R10*4) // table[hash(s+2)] = s+2
+	SHRQ $0x08, DI // DI now starts with the bytes at s+2
+	CMPL (DX)(SI*1), DI
+	JEQ  candidate3_match_encodeSnappyBlockAsm8B
+	MOVL 20(SP), CX // no candidate matched: continue at the precomputed next index
+	JMP  search_loop_encodeSnappyBlockAsm8B
+
+candidate3_match_encodeSnappyBlockAsm8B:
+	ADDL $0x02, CX // match starts at s+2
+	JMP  candidate_match_encodeSnappyBlockAsm8B
+
+candidate2_match_encodeSnappyBlockAsm8B:
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX // match starts at s+1
+	MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm8B:
+	MOVL  12(SP), DI
+	TESTL SI, SI // candidate at index 0 cannot be extended backwards
+	JZ    match_extend_back_end_encodeSnappyBlockAsm8B
+
+match_extend_back_loop_encodeSnappyBlockAsm8B:
+	CMPL CX, DI // never extend back past the start of pending literals
+	JLE  match_extend_back_end_encodeSnappyBlockAsm8B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBlockAsm8B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBlockAsm8B
+	JMP  match_extend_back_loop_encodeSnappyBlockAsm8B
+
+match_extend_back_end_encodeSnappyBlockAsm8B:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI // write pointer + pending literal length + 3
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP) // output would reach the limit: return 0
+	RET
+
+match_dst_size_check_encodeSnappyBlockAsm8B:
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI // nothing pending: skip the literal emit
+	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm8B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI // DI = pointer to the pending literal bytes
+	SUBL R8, R9 // R9 = literal length
+	LEAL -1(R9), R8 // R8 = length - 1, the value stored in the header
+	CMPL R8, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBlockAsm8B
+	CMPL R8, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBlockAsm8B
+	MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit length-1
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B
+
+two_bytes_match_emit_encodeSnappyBlockAsm8B:
+	MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit length-1
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL   memmove_match_emit_encodeSnappyBlockAsm8B
+	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B
+
+one_byte_match_emit_encodeSnappyBlockAsm8B:
+	SHLB $0x02, R8 // 1-byte header: (length-1) << 2
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm8B:
+	LEAQ (AX)(R9*1), R8 // R8 = write pointer after the literal bytes
+
+	// genMemMoveShort: copy R9 bytes from (DI) to (AX), dispatching on the length
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
+	MOVQ (DI), R10 // always moves 8 bytes, even when the length is smaller
+	MOVQ R10, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI // first and last 8 bytes; the two moves may overlap
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
+	MOVQ R8, AX // advance the write pointer by the literal length
+	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm8B
+
+memmove_long_match_emit_encodeSnappyBlockAsm8B:
+	LEAQ (AX)(R9*1), R8 // R8 = write pointer after the literal bytes
+
+	// genMemMoveLong: copy R9 bytes from (DI) to (AX); the first and last 32 bytes are loaded up front and stored last
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ  R9, R11
+	SHRQ  $0x05, R11 // R11 = length / 32
+	MOVQ  AX, R10
+	ANDL  $0x0000001f, R10
+	MOVQ  $0x00000040, R12
+	SUBQ  R10, R12 // R12 = 64 - (AX & 31), so AX + R12 - 32 is 32-byte aligned
+	DECQ  R11
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(DI)(R12*1), R10
+	LEAQ  -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ  $0x20, R13
+	ADDQ  $0x20, R10
+	ADDQ  $0x20, R12
+	DECQ  R11
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ  $0x20, R12
+	CMPQ  R9, R12
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  R8, AX // advance the write pointer by the literal length
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
+match_nolit_loop_encodeSnappyBlockAsm8B:
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP) // repeat offset = s - candidate
+	ADDL $0x04, CX // skip the 4 bytes already compared equal
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI // DI = bytes left in src from s
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen: R10 = number of equal bytes at (R8) and (SI), at most DI
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
+	MOVQ  (R8)(R10*1), R9
+	XORQ  (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
+	BSFQ  R9, R9
+	SARQ  $0x03, R9 // lowest differing bit / 8 = count of matching bytes in this word
+	LEAL  (R10)(R9*1), R10
+	JMP   match_nolit_end_encodeSnappyBlockAsm8B
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm8B:
+	TESTL DI, DI
+	JZ    match_nolit_end_encodeSnappyBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE  match_nolit_end_encodeSnappyBlockAsm8B
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B
+
+match_nolit_end_encodeSnappyBlockAsm8B:
+	ADDL R10, CX // s = end of the match
+	MOVL 16(SP), SI // SI = offset to encode
+	ADDL $0x04, R10 // R10 = total match length including the first 4 bytes
+	MOVL CX, 12(SP) // everything before s is covered by this copy
+
+	// emitCopy: R10 = length, SI = offset
+two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
+	CMPL R10, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
+	MOVB $0xee, (AX) // length > 64: emit a 3-byte element (tag 0xee + 16-bit offset) and take 60 off the length
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
+	CMPL R10, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10 // low byte = ((length-4) << 2) | 1; only the low byte is stored
+	MOVB SI, 1(AX) // second byte = low 8 bits of the offset
+	SHRL $0x08, SI
+	SHLL $0x05, SI // offset bits above the low 8 go into the tag from bit 5 up
+	ORL  SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10 // low byte = ((length-1) << 2) | 2
+	MOVB R10, (AX)
+	MOVW SI, 1(AX) // followed by the 16-bit offset
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
+	CMPL CX, 8(SP) // past the search limit: flush what is left as literals
+	JGE  emit_remainder_encodeSnappyBlockAsm8B
+	MOVQ -2(DX)(CX*1), DI // DI = 8 bytes of src starting at s-2
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeSnappyBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP) // write pointer reached the limit: return 0
+	RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm8B:
+	MOVQ  $0x9e3779b1, R9 // hash multiplier
+	MOVQ  DI, R8
+	SHRQ  $0x10, DI // DI now starts with the bytes at s
+	MOVQ  DI, SI
+	SHLQ  $0x20, R8
+	IMULQ R9, R8
+	SHRQ  $0x38, R8 // R8 = 8-bit hash of the 4 bytes at s-2
+	SHLQ  $0x20, SI
+	IMULQ R9, SI
+	SHRQ  $0x38, SI // SI = 8-bit hash of the 4 bytes at s
+	LEAL  -2(CX), R9
+	LEAQ  24(SP)(SI*4), R10
+	MOVL  (R10), SI // SI = previous table entry for hash(s)
+	MOVL  R9, 24(SP)(R8*4) // table[hash(s-2)] = s-2
+	MOVL  CX, (R10) // table[hash(s)] = s
+	CMPL  (DX)(SI*1), DI // immediate match at s: emit another copy with no literals
+	JEQ   match_nolit_loop_encodeSnappyBlockAsm8B
+	INCL  CX
+	JMP   search_loop_encodeSnappyBlockAsm8B
+
+emit_remainder_encodeSnappyBlockAsm8B:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX // write pointer + remaining literal length + 3
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeSnappyBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP) // output would reach the limit: return 0
+	RET
+
+emit_remainder_ok_encodeSnappyBlockAsm8B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX // nothing left to emit
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX // CX = pointer to the remaining literal bytes
+	SUBL BX, SI // SI = literal length
+	LEAL -1(SI), DX // DX = length - 1; the src base in DX is overwritten here
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm8B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm8B
+	MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit length-1
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
+	MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit length-1
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeSnappyBlockAsm8B
+	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B
+
+one_byte_emit_remainder_encodeSnappyBlockAsm8B:
+	SHLB $0x02, DL // 1-byte header: (length-1) << 2
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm8B:
+	LEAQ (AX)(SI*1), DX // DX = write pointer after the literal bytes
+	MOVL SI, BX
+
+	// genMemMoveShort: copy BX bytes from (CX) to (AX), dispatching on the length
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8:
+	MOVQ (CX), SI // always moves 8 bytes, even when the length is smaller
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX // first and last 8 bytes; the two moves may overlap
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
+	MOVQ DX, AX // advance the write pointer by the literal length
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
+	LEAQ (AX)(SI*1), DX // DX = write pointer after the literal bytes
+	MOVL SI, BX
+
+	// genMemMoveLong: copy BX bytes from (CX) to (AX); the first and last 32 bytes are loaded up front and stored last
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI // DI = length / 32
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8 // R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX // advance the write pointer by the literal length
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX // bytes written = write pointer - dst base
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000a00, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBetterBlockAsm
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  $0x00000000, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeSnappyBetterBlockAsm:
+	MOVL CX, SI
+	SUBL 12(SP), SI
+	SHRL $0x07, SI
+	CMPL SI, $0x63
+	JLE  check_maxskip_ok_encodeSnappyBetterBlockAsm
+	LEAL 100(CX), SI
+	JMP  check_maxskip_cont_encodeSnappyBetterBlockAsm
+
+check_maxskip_ok_encodeSnappyBetterBlockAsm:
+	LEAL 1(CX)(SI*1), SI
+
+check_maxskip_cont_encodeSnappyBetterBlockAsm:
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBetterBlockAsm
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x00cf1bbcdcbfa563, R9
+	MOVQ  $0x9e3779b1, SI
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x08, R10
+	IMULQ R9, R10
+	SHRQ  $0x30, R10
+	SHLQ  $0x20, R11
+	IMULQ SI, R11
+	SHRQ  $0x32, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  262168(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	MOVL  CX, 262168(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm
+	CMPL  (DX)(R8*1), DI
+	JEQ   candidateS_match_encodeSnappyBetterBlockAsm
+	MOVL  20(SP), CX
+	JMP   search_loop_encodeSnappyBetterBlockAsm
+
+candidateS_match_encodeSnappyBetterBlockAsm:
+	SHRQ  $0x08, DI
+	MOVQ  DI, R10
+	SHLQ  $0x08, R10
+	IMULQ R9, R10
+	SHRQ  $0x30, R10
+	MOVL  24(SP)(R10*4), SI
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm
+	DECL  CX
+	MOVL  R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm
+	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm
+
+match_extend_back_end_encodeSnappyBetterBlockAsm:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 5(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBetterBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm:
+	MOVL CX, DI
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), R10
+
+	// matchLen
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
+	BSFQ  R11, R11
+	SARQ  $0x03, R11
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeSnappyBetterBlockAsm
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm:
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeSnappyBetterBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeSnappyBetterBlockAsm
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm
+
+match_nolit_end_encodeSnappyBetterBlockAsm:
+	MOVL CX, R8
+	SUBL SI, R8
+
+	// Check if repeat
+	CMPL R12, $0x01
+	JG   match_length_ok_encodeSnappyBetterBlockAsm
+	CMPL R8, $0x0000ffff
+	JLE  match_length_ok_encodeSnappyBetterBlockAsm
+	MOVL 20(SP), CX
+	INCL CX
+	JMP  search_loop_encodeSnappyBetterBlockAsm
+
+match_length_ok_encodeSnappyBetterBlockAsm:
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm
+	CMPL SI, $0x00010000
+	JLT  three_bytes_match_emit_encodeSnappyBetterBlockAsm
+	CMPL SI, $0x01000000
+	JLT  four_bytes_match_emit_encodeSnappyBetterBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL SI, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+four_bytes_match_emit_encodeSnappyBetterBlockAsm:
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (AX)
+	MOVW SI, 1(AX)
+	MOVB R11, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+three_bytes_match_emit_encodeSnappyBetterBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_encodeSnappyBetterBlockAsm
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
+	MOVQ (R10), R11
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
+	ADDL R12, CX
+	ADDL $0x04, R12
+	MOVL CX, 12(SP)
+
+	// emitCopy
+	CMPL R8, $0x00010000
+	JL   two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
+
+four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
+	CMPL R12, $0x40
+	JLE  four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
+	MOVB $0xff, (AX)
+	MOVL R8, 1(AX)
+	LEAL -64(R12), R12
+	ADDQ $0x05, AX
+	CMPL R12, $0x04
+	JL   four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
+	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
+
+four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
+	TESTL R12, R12
+	JZ    match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+	MOVB  $0x03, BL
+	LEAL  -4(BX)(R12*4), R12
+	MOVB  R12, (AX)
+	MOVL  R8, 1(AX)
+	ADDQ  $0x05, AX
+	JMP   match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
+	MOVB $0xee, (AX)
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
+	CMPL R8, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
+	MOVB $0x01, BL
+	LEAL -16(BX)(R12*4), R12
+	MOVB R8, 1(AX)
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12
+	MOVB R12, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeSnappyBetterBlockAsm
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
+	MOVQ  $0x00cf1bbcdcbfa563, SI
+	MOVQ  $0x9e3779b1, R8
+	INCL  DI
+	MOVQ  (DX)(DI*1), R9
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	SHRQ  $0x10, R12
+	LEAL  1(DI), R14
+	LEAL  2(DI), R15
+	MOVQ  -2(DX)(CX*1), R9
+	SHLQ  $0x08, R10
+	IMULQ SI, R10
+	SHRQ  $0x30, R10
+	SHLQ  $0x08, R13
+	IMULQ SI, R13
+	SHRQ  $0x30, R13
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x32, R11
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x32, R12
+	MOVL  DI, 24(SP)(R10*4)
+	MOVL  R14, 24(SP)(R13*4)
+	MOVL  R14, 262168(SP)(R11*4)
+	MOVL  R15, 262168(SP)(R12*4)
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	LEAL  -2(CX), R9
+	LEAL  -1(CX), DI
+	SHLQ  $0x08, R10
+	IMULQ SI, R10
+	SHRQ  $0x30, R10
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x32, R11
+	SHLQ  $0x08, R13
+	IMULQ SI, R13
+	SHRQ  $0x30, R13
+	MOVL  R9, 24(SP)(R10*4)
+	MOVL  DI, 262168(SP)(R11*4)
+	MOVL  DI, 24(SP)(R13*4)
+	JMP   search_loop_encodeSnappyBetterBlockAsm
+
+emit_remainder_encodeSnappyBetterBlockAsm:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 5(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
+	CMPL DX, $0x00010000
+	JLT  three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
+	CMPL DX, $0x01000000
+	JLT  four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
+	MOVB $0xfc, (AX)
+	MOVL DX, 1(AX)
+	ADDQ $0x05, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
+	MOVL DX, BX
+	SHRL $0x10, BX
+	MOVB $0xf8, (AX)
+	MOVW DX, 1(AX)
+	MOVB BL, 3(AX)
+	ADDQ $0x04, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 // Encodes src into dst as literal and copy elements; returns the number of bytes written to dst, or 0 when the output would reach the dst limit kept in (SP).
+	MOVQ dst_base+0(FP), AX // AX = dst write pointer for the whole function
+	MOVQ $0x00000a00, CX // 0xa00 iterations * 128 bytes = 327680 bytes to clear
+	LEAQ 24(SP), DX // tables start at 24(SP); 0..23(SP) hold the scalar locals
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm64K: // clear both hash tables: 24(SP) (65536 x 4 bytes) and 262168(SP) (16384 x 4 bytes)
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBetterBlockAsm64K
+	MOVL  $0x00000000, 12(SP) // 12(SP) = offset in src of the first byte not yet emitted (start of pending literals)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP) // 8(SP) = src_len - 8: positions at or beyond this go to emit_remainder
+	SHRQ  $0x05, CX
+	SUBL  CX, DX // DX = src_len - 9 - (src_len >> 5)
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP) // (SP) = dst limit pointer; the function returns 0 when output would reach it
+	MOVL  $0x00000001, CX // CX = current search position in src, starting at 1
+	MOVL  $0x00000000, 16(SP) // 16(SP) = last match offset (written below, never read in this function)
+	MOVQ  src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBetterBlockAsm64K: // probe both hash tables for a 4-byte match at position CX
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x07, SI // skip grows by one for every 128 bytes between CX and the pending-literal start
+	LEAL  1(CX)(SI*1), SI // SI = next position to try = CX + 1 + skip
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBetterBlockAsm64K
+	MOVQ  (DX)(CX*1), DI // DI = 8 bytes of src at CX
+	MOVL  SI, 20(SP) // 20(SP) = next position, reloaded into CX if no candidate matches
+	MOVQ  $0x00cf1bbcdcbfa563, R9 // multiplier for the long hash (table at 24(SP))
+	MOVQ  $0x9e3779b1, SI // multiplier for the short hash (table at 262168(SP))
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x08, R10 // drop the top byte so only the low 7 bytes are hashed
+	IMULQ R9, R10
+	SHRQ  $0x30, R10 // keep the top 16 bits: index into the table at 24(SP)
+	SHLQ  $0x20, R11 // keep only the low 4 bytes
+	IMULQ SI, R11
+	SHRQ  $0x32, R11 // keep the top 14 bits: index into the table at 262168(SP)
+	MOVL  24(SP)(R10*4), SI // SI = candidate position from the long-hash table
+	MOVL  262168(SP)(R11*4), R8 // R8 = candidate position from the short-hash table
+	MOVL  CX, 24(SP)(R10*4) // record the current position in both tables
+	MOVL  CX, 262168(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI // compare 4 bytes at the long candidate with the 4 bytes at CX
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
+	CMPL  (DX)(R8*1), DI // same comparison for the short candidate
+	JEQ   candidateS_match_encodeSnappyBetterBlockAsm64K
+	MOVL  20(SP), CX // no match: advance to the next position
+	JMP   search_loop_encodeSnappyBetterBlockAsm64K
+
+candidateS_match_encodeSnappyBetterBlockAsm64K: // short candidate matched; first try the long-hash candidate for position CX+1
+	SHRQ  $0x08, DI // DI = src bytes starting at CX+1
+	MOVQ  DI, R10
+	SHLQ  $0x08, R10
+	IMULQ R9, R10
+	SHRQ  $0x30, R10
+	MOVL  24(SP)(R10*4), SI // SI = long-hash candidate for CX+1
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4) // record CX+1 in the long-hash table
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
+	DECL  CX // no match at CX+1: fall back to the short candidate at the original position
+	MOVL  R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm64K: // CX = match position, SI = candidate position
+	MOVL  12(SP), DI // DI = start of pending literals; backward extension stops there
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm64K
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm64K: // extend the match backwards while the preceding bytes are equal
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm64K
+	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm64K
+
+match_extend_back_end_encodeSnappyBetterBlockAsm64K: // check that the pending literals still fit below the dst limit
+	MOVL CX, DI
+	SUBL 12(SP), DI // DI = number of pending literal bytes
+	LEAQ 3(AX)(DI*1), DI // dst pointer after those literals plus 3 more bytes
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBetterBlockAsm64K
+	MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0 (presumably "not compressible" — confirm against the Go caller)
+	RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm64K:
+	MOVL CX, DI // DI = match start, which is also the end of the pending literals
+	ADDL $0x04, CX // skip the 4 bytes already compared
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8 // R8 = bytes left in src from CX
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), R10
+
+	// matchLen: R12 = number of equal bytes at R9 and R10, bounded by R8
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: // compare 8 bytes at a time
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
+	BSFQ  R11, R11 // index of the lowest differing bit
+	SARQ  $0x03, R11 // bits -> number of whole equal bytes
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeSnappyBetterBlockAsm64K
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K: // fewer than 8 bytes left: compare byte by byte
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeSnappyBetterBlockAsm64K
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeSnappyBetterBlockAsm64K
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
+
+match_nolit_end_encodeSnappyBetterBlockAsm64K:
+	MOVL CX, R8
+	SUBL SI, R8 // R8 = match offset (distance from candidate to current position)
+
+	// Check if repeat (this variant performs no repeat check; it records the offset and emits the pending literals)
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10 // R10 = pointer to the pending literal bytes
+	SUBL SI, R9 // R9 = literal length
+	LEAL -1(R9), SI // SI = literal length - 1, the value stored in the header
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm64K
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
+	MOVB $0xf4, (AX) // tag 61<<2 followed by a 16-bit length-1
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
+	MOVB $0xf0, (AX) // tag 60<<2 followed by an 8-bit length-1
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40 // lengths up to 64 use the short copy, longer ones the long copy
+	JL   memmove_match_emit_encodeSnappyBetterBlockAsm64K
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
+	SHLB $0x02, SI // single tag byte = (length-1)<<2
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm64K:
+	LEAQ (AX)(R9*1), SI // SI = dst pointer after the literals
+
+	// genMemMoveShort: copy R9 (at most 64) bytes from R10 to AX
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
+	MOVQ (R10), R11 // always moves 8 bytes, even when R9 < 8; AX advances by only R9 afterwards
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: // two 8-byte moves, the second ending at byte R9
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: // two 16-byte moves, the second ending at byte R9
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: // first 32 bytes plus last 32 bytes
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
+	MOVQ SI, AX // advance the dst pointer past the literals
+	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
+	LEAQ (AX)(R9*1), SI // SI = dst pointer after the literals
+
+	// genMemMoveLong: copy R9 bytes from R10 to AX in 32-byte steps; X0-X3 keep the first and last 32 bytes
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13 // R13 = number of whole 32-byte chunks
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14 // R14 = 64 - (AX & 31), so -32(AX)(R14*1) is 32-byte aligned
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF untouched, so JA also sees CF from the SUBQ above — confirm intended
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: // 32 bytes per iteration: unaligned loads, aligned stores
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: // copy aligned 32-byte chunks while R9 >= R14
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved first and last 32 bytes with unaligned moves
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX // advance the dst pointer past the literals
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
+	ADDL R12, CX // CX = end of the match (CX already covered the first 4 bytes)
+	ADDL $0x04, R12 // R12 = total match length
+	MOVL CX, 12(SP) // pending literals now start at the end of the match
+
+	// emitCopy: encode a copy of R12 bytes at offset R8
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: // while more than 64 bytes remain, emit a 3-byte element and subtract 60
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
+	MOVB $0xee, (AX) // tag byte 0xee = (59<<2)|2, followed by the 16-bit offset
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: // the 2-byte form is used only when length < 12 and offset < 2048
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
+	CMPL R8, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
+	MOVB $0x01, BL
+	LEAL -16(BX)(R12*4), R12 // low byte = 1 + ((length-4)<<2); only the low byte is stored below
+	MOVB R8, 1(AX) // low 8 bits of the offset
+	SHRL $0x08, R8
+	SHLL $0x05, R8 // offset bits 8 and up move to bit 5 and up of the tag byte
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: // 3-byte form: tag byte plus 16-bit offset
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12 // low byte = 2 + ((length-1)<<2); only the low byte is stored below
+	MOVB R12, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
+	CMPL CX, 8(SP) // too close to the end of src to keep searching
+	JGE  emit_remainder_encodeSnappyBetterBlockAsm64K
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
+	MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+	RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: // index positions just after the match start and just before the match end, then resume searching
+	MOVQ  $0x00cf1bbcdcbfa563, SI // long-hash multiplier
+	MOVQ  $0x9e3779b1, R8 // short-hash multiplier
+	INCL  DI // DI = match start + 1
+	MOVQ  (DX)(DI*1), R9 // 8 bytes of src at DI
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11 // R11 = bytes starting at DI+1
+	MOVQ  R11, R13
+	SHRQ  $0x10, R12 // R12 = bytes starting at DI+2
+	LEAL  1(DI), R14
+	LEAL  2(DI), R15
+	MOVQ  -2(DX)(CX*1), R9 // 8 bytes of src at CX-2, hashed in the second half below
+	SHLQ  $0x08, R10
+	IMULQ SI, R10
+	SHRQ  $0x30, R10 // R10 = long hash of the bytes at DI
+	SHLQ  $0x08, R13
+	IMULQ SI, R13
+	SHRQ  $0x30, R13 // R13 = long hash of the bytes at DI+1
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x32, R11 // R11 = short hash of the bytes at DI+1
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x32, R12 // R12 = short hash of the bytes at DI+2
+	MOVL  DI, 24(SP)(R10*4)
+	MOVL  R14, 24(SP)(R13*4)
+	MOVL  R14, 262168(SP)(R11*4)
+	MOVL  R15, 262168(SP)(R12*4)
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11 // R11 = bytes starting at CX-1
+	MOVQ  R11, R13
+	LEAL  -2(CX), R9
+	LEAL  -1(CX), DI
+	SHLQ  $0x08, R10
+	IMULQ SI, R10
+	SHRQ  $0x30, R10 // R10 = long hash of the bytes at CX-2
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x32, R11 // R11 = short hash of the bytes at CX-1
+	SHLQ  $0x08, R13
+	IMULQ SI, R13
+	SHRQ  $0x30, R13 // R13 = long hash of the bytes at CX-1
+	MOVL  R9, 24(SP)(R10*4)
+	MOVL  DI, 262168(SP)(R11*4)
+	MOVL  DI, 24(SP)(R13*4)
+	JMP   search_loop_encodeSnappyBetterBlockAsm64K
+
+emit_remainder_encodeSnappyBetterBlockAsm64K: // emit everything from 12(SP) to the end of src as literals
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX // CX = number of bytes not yet emitted
+	LEAQ 3(AX)(CX*1), CX // dst pointer after those bytes plus 3 more
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm64K
+	MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+	RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX // BX = start of the bytes not yet emitted
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX // CX = pointer to the remaining literal bytes
+	SUBL BX, SI // SI = number of remaining bytes
+	LEAL -1(SI), DX // DX = length - 1; DX is not used as the src base after this point
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
+	MOVB $0xf4, (AX) // tag 61<<2 followed by a 16-bit length-1
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
+	MOVB $0xf0, (AX) // tag 60<<2 followed by an 8-bit length-1
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40 // lengths up to 64 use the short copy, longer ones the long copy
+	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
+	SHLB $0x02, DL // single tag byte = (length-1)<<2
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
+	LEAQ (AX)(SI*1), DX // DX = dst pointer after the literals
+	MOVL SI, BX // BX = number of bytes to copy
+
+	// genMemMoveShort: copy BX (at most 64) bytes from CX to AX
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8:
+	MOVQ (CX), SI // always moves 8 bytes, even when BX < 8; AX advances by only BX afterwards
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: // two 8-byte moves, the second ending at byte BX
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: // two 16-byte moves, the second ending at byte BX
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: // first 32 bytes plus last 32 bytes
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
+	MOVQ DX, AX // advance the dst pointer past the literals
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
+	LEAQ (AX)(SI*1), DX // DX = dst pointer after the literals
+	MOVL SI, BX // BX = number of bytes to copy
+
+	// genMemMoveLong: copy BX bytes from CX to AX in 32-byte steps; X0-X3 keep the first and last 32 bytes
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI // DI = number of whole 32-byte chunks
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8 // R8 = 64 - (AX & 31), so -32(AX)(R8*1) is 32-byte aligned
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF untouched, so JA also sees CF from the SUBQ above — confirm intended
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: // 32 bytes per iteration: unaligned loads, aligned stores
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: // copy aligned 32-byte chunks while BX >= R8
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved first and last 32 bytes with unaligned moves
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX // advance the dst pointer past the literals
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX // AX = number of bytes written to dst
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000280, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm12B:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBetterBlockAsm12B
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  $0x00000000, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeSnappyBetterBlockAsm12B:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x06, SI
+	LEAL  1(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBetterBlockAsm12B
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  $0x9e3779b1, SI
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x20, R11
+	IMULQ SI, R11
+	SHRQ  $0x34, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  65560(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	MOVL  CX, 65560(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
+	CMPL  (DX)(R8*1), DI
+	JEQ   candidateS_match_encodeSnappyBetterBlockAsm12B
+	MOVL  20(SP), CX
+	JMP   search_loop_encodeSnappyBetterBlockAsm12B
+
+candidateS_match_encodeSnappyBetterBlockAsm12B:
+	SHRQ  $0x08, DI
+	MOVQ  DI, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x32, R10
+	MOVL  24(SP)(R10*4), SI
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
+	DECL  CX
+	MOVL  R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm12B:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm12B
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm12B
+	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm12B
+
+match_extend_back_end_encodeSnappyBetterBlockAsm12B:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBetterBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm12B:
+	MOVL CX, DI
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), R10
+
+	// matchLen
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B
+	BSFQ  R11, R11
+	SARQ  $0x03, R11
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeSnappyBetterBlockAsm12B
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B:
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeSnappyBetterBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeSnappyBetterBlockAsm12B
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
+
+match_nolit_end_encodeSnappyBetterBlockAsm12B:
+	MOVL CX, R8
+	SUBL SI, R8
+
+	// Check if repeat
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm12B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_encodeSnappyBetterBlockAsm12B
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm12B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
+	MOVQ (R10), R11
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
+	ADDL R12, CX
+	ADDL $0x04, R12
+	MOVL CX, 12(SP)
+
+	// emitCopy
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
+	MOVB $0xee, (AX)
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
+	CMPL R8, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
+	MOVB $0x01, BL
+	LEAL -16(BX)(R12*4), R12
+	MOVB R8, 1(AX)
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12
+	MOVB R12, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeSnappyBetterBlockAsm12B
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
+	MOVQ  $0x0000cf1bbcdcbf9b, SI
+	MOVQ  $0x9e3779b1, R8
+	INCL  DI
+	MOVQ  (DX)(DI*1), R9
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	SHRQ  $0x10, R12
+	LEAL  1(DI), R14
+	LEAL  2(DI), R15
+	MOVQ  -2(DX)(CX*1), R9
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x32, R13
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x34, R11
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x34, R12
+	MOVL  DI, 24(SP)(R10*4)
+	MOVL  R14, 24(SP)(R13*4)
+	MOVL  R14, 65560(SP)(R11*4)
+	MOVL  R15, 65560(SP)(R12*4)
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	LEAL  -2(CX), R9
+	LEAL  -1(CX), DI
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x32, R10
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x34, R11
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x32, R13
+	MOVL  R9, 24(SP)(R10*4)
+	MOVL  DI, 65560(SP)(R11*4)
+	MOVL  DI, 24(SP)(R13*4)
+	JMP   search_loop_encodeSnappyBetterBlockAsm12B
+
+emit_remainder_encodeSnappyBetterBlockAsm12B:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm12B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst as literal and copy tags using two position tables kept in the stack frame; returns the number of bytes written, or 0 when the output limit stored at (SP) would be reached.
+TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 // frame = 24 bytes of scalar slots + 20480 bytes of tables
+	MOVQ dst_base+0(FP), AX // AX = output write cursor
+	MOVQ $0x000000a0, CX // 160 passes x 128 bytes = 20480 bytes to clear
+	LEAQ 24(SP), DX // tables begin at 24(SP)
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm10B: // zero both tables, 128 bytes per pass
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBetterBlockAsm10B
+	MOVL  $0x00000000, 12(SP) // 12(SP) = start of the not-yet-emitted literal run (nextEmit)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP) // 8(SP) = src_len-8, the search limit
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP) // (SP) = dst + src_len - 9 - src_len/32, the output limit
+	MOVL  $0x00000001, CX // CX = current source position s, starting at 1
+	MOVL  $0x00000000, 16(SP) // 16(SP) = offset of the most recent match
+	MOVQ  src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBetterBlockAsm10B: // probe both tables at s until a 4-byte match is found
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x05, SI
+	LEAL  1(CX)(SI*1), SI // next s = s + 1 + (s-nextEmit)/32, so the stride grows with the unmatched run
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBetterBlockAsm10B
+	MOVQ  (DX)(CX*1), DI // DI = 8 source bytes at s
+	MOVL  SI, 20(SP) // 20(SP) = next position to try
+	MOVQ  $0x0000cf1bbcdcbf9b, R9 // multiplier for the long hash
+	MOVQ  $0x9e3779b1, SI // multiplier for the short hash
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x10, R10 // keep the low 6 bytes
+	IMULQ R9, R10
+	SHRQ  $0x34, R10 // 12-bit index into the long table at 24(SP)
+	SHLQ  $0x20, R11 // keep the low 4 bytes
+	IMULQ SI, R11
+	SHRQ  $0x36, R11 // 10-bit index into the short table at 16408(SP)
+	MOVL  24(SP)(R10*4), SI // SI = long-table candidate position
+	MOVL  16408(SP)(R11*4), R8 // R8 = short-table candidate position
+	MOVL  CX, 24(SP)(R10*4) // record s in both tables
+	MOVL  CX, 16408(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI // 4-byte compare against the long candidate
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
+	CMPL  (DX)(R8*1), DI // 4-byte compare against the short candidate
+	JEQ   candidateS_match_encodeSnappyBetterBlockAsm10B
+	MOVL  20(SP), CX // no match: advance to the next position
+	JMP   search_loop_encodeSnappyBetterBlockAsm10B
+
+candidateS_match_encodeSnappyBetterBlockAsm10B: // short candidate matched; first try the long table for s+1
+	SHRQ  $0x08, DI // DI = source bytes starting at s+1
+	MOVQ  DI, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x34, R10
+	MOVL  24(SP)(R10*4), SI
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4) // record s+1 in the long table
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
+	DECL  CX // long probe at s+1 failed: go back to s
+	MOVL  R8, SI // and use the short candidate
+
+candidate_match_encodeSnappyBetterBlockAsm10B:
+	MOVL  12(SP), DI // DI = nextEmit, lower bound for backward extension
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm10B
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm10B: // extend the match backwards while the preceding bytes agree
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm10B
+	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm10B
+
+match_extend_back_end_encodeSnappyBetterBlockAsm10B: // make sure the pending literals fit below the output limit
+	MOVL CX, DI
+	SUBL 12(SP), DI // DI = pending literal length
+	LEAQ 3(AX)(DI*1), DI // cursor + literal length + 3
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBetterBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+	RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm10B:
+	MOVL CX, DI // DI = match start in src
+	ADDL $0x04, CX // the first 4 bytes are already known to match
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8 // R8 = source bytes remaining after s+4
+	LEAQ (DX)(CX*1), R9 // R9 = &src[s+4]
+	LEAQ (DX)(SI*1), R10 // R10 = &src[candidate+4]
+
+	// matchLen: R12 = count of further matching bytes, bounded by R8
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: // compare 8 bytes at a time
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
+	BSFQ  R11, R11 // index of the lowest differing bit
+	SARQ  $0x03, R11 // bit index -> byte index
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeSnappyBetterBlockAsm10B
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B: // fewer than 8 bytes left: compare one byte at a time
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeSnappyBetterBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeSnappyBetterBlockAsm10B
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
+
+match_nolit_end_encodeSnappyBetterBlockAsm10B:
+	MOVL CX, R8
+	SUBL SI, R8 // R8 = match offset (s - candidate)
+
+	// Check if repeat (this variant only records the offset in 16(SP))
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B // no pending literals
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10 // R10 = &src[nextEmit], source of the literal bytes
+	SUBL SI, R9 // R9 = literal length
+	LEAL -1(R9), SI // SI = length-1, the value stored in the literal tag
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm10B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
+	MOVB $0xf4, (AX) // tag 61<<2 followed by a 16-bit length-1
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
+	MOVB $0xf0, (AX) // tag 60<<2 followed by an 8-bit length-1
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_encodeSnappyBetterBlockAsm10B
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
+	SHLB $0x02, SI // tag = (length-1)<<2
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm10B:
+	LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+	// genMemMoveShort: copy R9 (at most 64) bytes with overlapping fixed-size moves
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
+	MOVQ (R10), R11 // always moves a full 8 bytes; AX is set to the true end afterwards
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
+	MOVQ (R10), R11 // first 8 bytes
+	MOVQ -8(R10)(R9*1), R10 // last 8 bytes (may overlap the first)
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
+	MOVOU (R10), X0 // first 16 bytes
+	MOVOU -16(R10)(R9*1), X1 // last 16 bytes (may overlap the first)
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
+	MOVOU (R10), X0 // first 32 bytes in X0/X1, last 32 bytes in X2/X3
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
+	MOVQ SI, AX // advance the output cursor past the literal
+	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
+	LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+	// genMemMoveLong: head and tail go through X0-X3, the middle is copied in 32-byte blocks
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13 // R13 = length / 32
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11 // R11 = misalignment of the destination within 32 bytes
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14 // R14 = 64 - misalignment, so -32(AX)(R14*1) is 32-byte aligned
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: // 32-byte block copy with aligned stores
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: // copy 32 bytes per pass while R14 <= length
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved head and tail with unaligned moves
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX // advance the output cursor past the literal
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
+	ADDL R12, CX // s now points just past the match
+	ADDL $0x04, R12 // R12 = full match length including the first 4 bytes
+	MOVL CX, 12(SP) // nextEmit = s
+
+	// emitCopy: R12 = length, R8 = offset
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
+	MOVB $0xee, (AX) // tag (60-1)<<2|2: a 60-byte copy with 16-bit offset
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B // repeat until at most 64 bytes remain
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B // length >= 12 needs the 3-byte form
+	CMPL R8, $0x00000800
+	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B // offset >= 2048 needs the 3-byte form
+	MOVB $0x01, BL
+	LEAL -16(BX)(R12*4), R12 // low byte = (length-4)<<2 | 1; only the low byte is stored below
+	MOVB R8, 1(AX) // low 8 bits of the offset
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12 // high offset bits go into tag bits 5 and up
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12 // low byte = (length-1)<<2 | 2; only the low byte is stored below
+	MOVB R12, (AX)
+	MOVW R8, 1(AX) // 16-bit offset
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeSnappyBetterBlockAsm10B // past the search limit: flush the tail
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP) // reached the output limit: return 0
+	RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: // seed the tables with positions near the start and end of the match
+	MOVQ  $0x0000cf1bbcdcbf9b, SI // long-hash multiplier
+	MOVQ  $0x9e3779b1, R8 // short-hash multiplier
+	INCL  DI // DI = match start + 1
+	MOVQ  (DX)(DI*1), R9
+	MOVQ  R9, R10 // R10: bytes at start+1 (long hash)
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11 // R11: bytes at start+2 (short hash)
+	MOVQ  R11, R13 // R13: bytes at start+2 (long hash)
+	SHRQ  $0x10, R12 // R12: bytes at start+3 (short hash)
+	LEAL  1(DI), R14 // R14 = start+2
+	LEAL  2(DI), R15 // R15 = start+3
+	MOVQ  -2(DX)(CX*1), R9 // R9 = 8 source bytes at s-2
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x34, R10
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x34, R13
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x36, R11
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x36, R12
+	MOVL  DI, 24(SP)(R10*4) // long table: start+1
+	MOVL  R14, 24(SP)(R13*4) // long table: start+2
+	MOVL  R14, 16408(SP)(R11*4) // short table: start+2
+	MOVL  R15, 16408(SP)(R12*4) // short table: start+3
+	MOVQ  R9, R10 // R10: bytes at s-2 (long hash)
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11 // R11: bytes at s-1 (short hash)
+	MOVQ  R11, R13 // R13: bytes at s-1 (long hash)
+	LEAL  -2(CX), R9 // R9 = s-2
+	LEAL  -1(CX), DI // DI = s-1
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x34, R10
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x36, R11
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x34, R13
+	MOVL  R9, 24(SP)(R10*4) // long table: s-2
+	MOVL  DI, 16408(SP)(R11*4) // short table: s-1
+	MOVL  DI, 24(SP)(R13*4) // long table: s-1
+	JMP   search_loop_encodeSnappyBetterBlockAsm10B
+
+emit_remainder_encodeSnappyBetterBlockAsm10B: // flush src[nextEmit:] as one final literal, if it fits
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX // CX = remaining literal length
+	LEAQ 3(AX)(CX*1), CX // cursor + literal length + 3
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm10B
+	MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+	RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B // nothing left to emit
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX // CX = &src[nextEmit], source of the literal bytes
+	SUBL BX, SI // SI = literal length
+	LEAL -1(SI), DX // DX = length-1; the src base pointer in DX is no longer needed
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
+	MOVB $0xf4, (AX) // tag 61<<2 followed by a 16-bit length-1
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
+	MOVB $0xf0, (AX) // tag 60<<2 followed by an 8-bit length-1
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
+	SHLB $0x02, DL // tag = (length-1)<<2
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
+	LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+	MOVL SI, BX // BX = literal length
+
+	// genMemMoveShort: copy BX (at most 64) bytes with overlapping fixed-size moves
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8:
+	MOVQ (CX), SI // always moves a full 8 bytes; AX is set to the true end afterwards
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
+	MOVQ (CX), SI // first 8 bytes
+	MOVQ -8(CX)(BX*1), CX // last 8 bytes (may overlap the first)
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
+	MOVOU (CX), X0 // first 16 bytes
+	MOVOU -16(CX)(BX*1), X1 // last 16 bytes (may overlap the first)
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
+	MOVOU (CX), X0 // first 32 bytes in X0/X1, last 32 bytes in X2/X3
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
+	MOVQ DX, AX // advance the output cursor past the literal
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
+	LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+	MOVL SI, BX // BX = literal length
+
+	// genMemMoveLong: head and tail go through X0-X3, the middle is copied in 32-byte blocks
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI // DI = length / 32
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI // SI = misalignment of the destination within 32 bytes
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8 // R8 = 64 - misalignment, so -32(AX)(R8*1) is 32-byte aligned
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: // 32-byte block copy with aligned stores
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: // copy 32 bytes per pass while R8 <= length
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+	MOVOU X0, (AX) // finally store the saved head and tail with unaligned moves
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX // advance the output cursor past the literal
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX // bytes written = cursor - dst base
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000028, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm8B:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ  $0x80, DX
+	DECQ  CX
+	JNZ   zero_loop_encodeSnappyBetterBlockAsm8B
+	MOVL  $0x00000000, 12(SP)
+	MOVQ  src_len+32(FP), CX
+	LEAQ  -9(CX), DX
+	LEAQ  -8(CX), SI
+	MOVL  SI, 8(SP)
+	SHRQ  $0x05, CX
+	SUBL  CX, DX
+	LEAQ  (AX)(DX*1), DX
+	MOVQ  DX, (SP)
+	MOVL  $0x00000001, CX
+	MOVL  $0x00000000, 16(SP)
+	MOVQ  src_base+24(FP), DX
+
+search_loop_encodeSnappyBetterBlockAsm8B:
+	MOVL  CX, SI
+	SUBL  12(SP), SI
+	SHRL  $0x04, SI
+	LEAL  1(CX)(SI*1), SI
+	CMPL  SI, 8(SP)
+	JGE   emit_remainder_encodeSnappyBetterBlockAsm8B
+	MOVQ  (DX)(CX*1), DI
+	MOVL  SI, 20(SP)
+	MOVQ  $0x0000cf1bbcdcbf9b, R9
+	MOVQ  $0x9e3779b1, SI
+	MOVQ  DI, R10
+	MOVQ  DI, R11
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x36, R10
+	SHLQ  $0x20, R11
+	IMULQ SI, R11
+	SHRQ  $0x38, R11
+	MOVL  24(SP)(R10*4), SI
+	MOVL  4120(SP)(R11*4), R8
+	MOVL  CX, 24(SP)(R10*4)
+	MOVL  CX, 4120(SP)(R11*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
+	CMPL  (DX)(R8*1), DI
+	JEQ   candidateS_match_encodeSnappyBetterBlockAsm8B
+	MOVL  20(SP), CX
+	JMP   search_loop_encodeSnappyBetterBlockAsm8B
+
+candidateS_match_encodeSnappyBetterBlockAsm8B:
+	SHRQ  $0x08, DI
+	MOVQ  DI, R10
+	SHLQ  $0x10, R10
+	IMULQ R9, R10
+	SHRQ  $0x36, R10
+	MOVL  24(SP)(R10*4), SI
+	INCL  CX
+	MOVL  CX, 24(SP)(R10*4)
+	CMPL  (DX)(SI*1), DI
+	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
+	DECL  CX
+	MOVL  R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm8B:
+	MOVL  12(SP), DI
+	TESTL SI, SI
+	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm8B
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
+	CMPL CX, DI
+	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
+	LEAL -1(CX), CX
+	DECL SI
+	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm8B
+	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm8B
+
+match_extend_back_end_encodeSnappyBetterBlockAsm8B:
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL   match_dst_size_check_encodeSnappyBetterBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm8B:
+	MOVL CX, DI
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), R10
+
+	// matchLen
+	XORL R12, R12
+	CMPL R8, $0x08
+	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
+	MOVQ  (R9)(R12*1), R11
+	XORQ  (R10)(R12*1), R11
+	TESTQ R11, R11
+	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
+	BSFQ  R11, R11
+	SARQ  $0x03, R11
+	LEAL  (R12)(R11*1), R12
+	JMP   match_nolit_end_encodeSnappyBetterBlockAsm8B
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	CMPL R8, $0x08
+	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B:
+	TESTL R8, R8
+	JZ    match_nolit_end_encodeSnappyBetterBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE  match_nolit_end_encodeSnappyBetterBlockAsm8B
+	LEAL 1(R12), R12
+	DECL R8
+	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
+
+match_nolit_end_encodeSnappyBetterBlockAsm8B:
+	MOVL CX, R8
+	SUBL SI, R8
+
+	// Check if repeat
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm8B
+	CMPL SI, $0x00000100
+	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_match_emit_encodeSnappyBetterBlockAsm8B
+	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm8B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
+	MOVQ (R10), R11
+	MOVQ R11, (AX)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (AX)
+	MOVQ R10, -8(AX)(R9*1)
+	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
+	MOVQ SI, AX
+	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
+	LEAQ (AX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ  R9, R13
+	SHRQ  $0x05, R13
+	MOVQ  AX, R11
+	ANDL  $0x0000001f, R11
+	MOVQ  $0x00000040, R14
+	SUBQ  R11, R14
+	DECQ  R13
+	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(R10)(R14*1), R11
+	LEAQ  -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ  $0x20, R15
+	ADDQ  $0x20, R11
+	ADDQ  $0x20, R14
+	DECQ  R13
+	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(AX)(R14*1)
+	MOVOA X5, -16(AX)(R14*1)
+	ADDQ  $0x20, R14
+	CMPQ  R9, R14
+	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ  SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
+	ADDL R12, CX
+	ADDL $0x04, R12
+	MOVL CX, 12(SP)
+
+	// emitCopy
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
+	CMPL R12, $0x40
+	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
+	MOVB $0xee, (AX)
+	MOVW R8, 1(AX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, AX
+	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
+	CMPL R12, $0x0c
+	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
+	MOVB $0x01, BL
+	LEAL -16(BX)(R12*4), R12
+	MOVB R8, 1(AX)
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL  R8, R12
+	MOVB R12, (AX)
+	ADDQ $0x02, AX
+	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R12*4), R12
+	MOVB R12, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
+	CMPL CX, 8(SP)
+	JGE  emit_remainder_encodeSnappyBetterBlockAsm8B
+	CMPQ AX, (SP)
+	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
+	MOVQ  $0x0000cf1bbcdcbf9b, SI
+	MOVQ  $0x9e3779b1, R8
+	INCL  DI
+	MOVQ  (DX)(DI*1), R9
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	MOVQ  R9, R12
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	SHRQ  $0x10, R12
+	LEAL  1(DI), R14
+	LEAL  2(DI), R15
+	MOVQ  -2(DX)(CX*1), R9
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x36, R10
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x36, R13
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x38, R11
+	SHLQ  $0x20, R12
+	IMULQ R8, R12
+	SHRQ  $0x38, R12
+	MOVL  DI, 24(SP)(R10*4)
+	MOVL  R14, 24(SP)(R13*4)
+	MOVL  R14, 4120(SP)(R11*4)
+	MOVL  R15, 4120(SP)(R12*4)
+	MOVQ  R9, R10
+	MOVQ  R9, R11
+	SHRQ  $0x08, R11
+	MOVQ  R11, R13
+	LEAL  -2(CX), R9
+	LEAL  -1(CX), DI
+	SHLQ  $0x10, R10
+	IMULQ SI, R10
+	SHRQ  $0x36, R10
+	SHLQ  $0x20, R11
+	IMULQ R8, R11
+	SHRQ  $0x38, R11
+	SHLQ  $0x10, R13
+	IMULQ SI, R13
+	SHRQ  $0x36, R13
+	MOVL  R9, 24(SP)(R10*4)
+	MOVL  DI, 4120(SP)(R11*4)
+	MOVL  DI, 24(SP)(R13*4)
+	JMP   search_loop_encodeSnappyBetterBlockAsm8B
+
+emit_remainder_encodeSnappyBetterBlockAsm8B:
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm8B
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
+	CMPL DX, $0x00000100
+	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
+	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x08
+	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8
+	CMPQ BX, $0x10
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
+	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
+	MOVQ DX, AX
+	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ  BX, DI
+	SHRQ  $0x05, DI
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8
+	DECQ  DI
+	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  BX, R8
+	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ  DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func emitLiteral(dst []byte, lit []byte) int
+// Requires: SSE2
+TEXT ·emitLiteral(SB), NOSPLIT, $0-56 // Writes a literal header followed by a copy of lit into dst and returns the bytes written (0 for empty lit). dst_len is never read here, so the caller must size dst.
+	MOVQ  lit_len+32(FP), DX // DX = len(lit)
+	MOVQ  dst_base+0(FP), AX // AX = write cursor in dst
+	MOVQ  lit_base+24(FP), CX // CX = read pointer in lit
+	TESTQ DX, DX
+	JZ    emit_literal_end_standalone_skip // empty lit: return 0 without writing
+	MOVL  DX, BX // BX = return value; starts at len(lit), header size is added below
+	LEAL  -1(DX), SI // SI = len(lit)-1, the value encoded in the header
+	CMPL  SI, $0x3c
+	JLT   one_byte_standalone // len-1 < 60: stored inside the tag byte
+	CMPL  SI, $0x00000100
+	JLT   two_bytes_standalone // len-1 < 1<<8: tag + 1 length byte
+	CMPL  SI, $0x00010000
+	JLT   three_bytes_standalone // len-1 < 1<<16: tag + 2 length bytes
+	CMPL  SI, $0x01000000
+	JLT   four_bytes_standalone // len-1 < 1<<24: tag + 3 length bytes
+	MOVB  $0xfc, (AX) // 0xfc = 63<<2: tag + 4 length bytes
+	MOVL  SI, 1(AX) // len-1 as a 32-bit little-endian value
+	ADDQ  $0x05, BX
+	ADDQ  $0x05, AX
+	JMP   memmove_long_standalone
+
+four_bytes_standalone:
+	MOVL SI, DI
+	SHRL $0x10, DI // DI = bits 16..23 of len-1
+	MOVB $0xf8, (AX) // 0xf8 = 62<<2
+	MOVW SI, 1(AX) // low 16 bits of len-1
+	MOVB DI, 3(AX) // third length byte
+	ADDQ $0x04, BX
+	ADDQ $0x04, AX
+	JMP  memmove_long_standalone
+
+three_bytes_standalone:
+	MOVB $0xf4, (AX) // 0xf4 = 61<<2
+	MOVW SI, 1(AX) // len-1 as 16 bits
+	ADDQ $0x03, BX
+	ADDQ $0x03, AX
+	JMP  memmove_long_standalone
+
+two_bytes_standalone:
+	MOVB $0xf0, (AX) // 0xf0 = 60<<2
+	MOVB SI, 1(AX) // len-1 as a single byte
+	ADDQ $0x02, BX
+	ADDQ $0x02, AX
+	CMPL SI, $0x40
+	JL   memmove_standalone // len-1 < 64: the short copy below covers up to 64 bytes
+	JMP  memmove_long_standalone
+
+one_byte_standalone:
+	SHLB $0x02, SI // tag byte = (len-1)<<2; the two low bits stay 0
+	MOVB SI, (AX)
+	ADDQ $0x01, BX
+	ADDQ $0x01, AX
+
+memmove_standalone:
+	// genMemMoveShort: copies DX bytes from CX to AX, dispatching on size.
+	CMPQ DX, $0x03
+	JB   emit_lit_memmove_standalone_memmove_move_1or2
+	JE   emit_lit_memmove_standalone_memmove_move_3 // reuses the flags of the compare against 3
+	CMPQ DX, $0x08
+	JB   emit_lit_memmove_standalone_memmove_move_4through7
+	CMPQ DX, $0x10
+	JBE  emit_lit_memmove_standalone_memmove_move_8through16
+	CMPQ DX, $0x20
+	JBE  emit_lit_memmove_standalone_memmove_move_17through32
+	JMP  emit_lit_memmove_standalone_memmove_move_33through64
+
+emit_lit_memmove_standalone_memmove_move_1or2:
+	MOVB (CX), SI // first byte
+	MOVB -1(CX)(DX*1), CL // last byte (the same byte when DX == 1)
+	MOVB SI, (AX)
+	MOVB CL, -1(AX)(DX*1)
+	JMP  emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_3:
+	MOVW (CX), SI // bytes 0..1
+	MOVB 2(CX), CL // byte 2
+	MOVW SI, (AX)
+	MOVB CL, 2(AX)
+	JMP  emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_4through7:
+	MOVL (CX), SI // first 4 bytes
+	MOVL -4(CX)(DX*1), CX // last 4 bytes; the two moves may overlap
+	MOVL SI, (AX)
+	MOVL CX, -4(AX)(DX*1)
+	JMP  emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_8through16:
+	MOVQ (CX), SI // first 8 bytes
+	MOVQ -8(CX)(DX*1), CX // last 8 bytes; the two moves may overlap
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(DX*1)
+	JMP  emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_17through32:
+	MOVOU (CX), X0 // first 16 bytes
+	MOVOU -16(CX)(DX*1), X1 // last 16 bytes; the two moves may overlap
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(DX*1)
+	JMP   emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_33through64:
+	MOVOU (CX), X0 // first 32 bytes in X0/X1
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(DX*1), X2 // last 32 bytes in X2/X3; may overlap the first 32
+	MOVOU -16(CX)(DX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(DX*1)
+	MOVOU X3, -16(AX)(DX*1)
+	JMP   emit_literal_end_standalone
+	JMP emit_literal_end_standalone // unreachable: follows an unconditional JMP
+
+memmove_long_standalone:
+	// genMemMoveLong: copies DX bytes from CX to AX in 32-byte steps.
+	MOVOU (CX), X0 // first 32 bytes saved in X0/X1 ...
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(DX*1), X2 // ... and last 32 bytes in X2/X3; both are stored after the loop
+	MOVOU -16(CX)(DX*1), X3
+	MOVQ  DX, DI
+	SHRQ  $0x05, DI // DI = len / 32
+	MOVQ  AX, SI
+	ANDL  $0x0000001f, SI // SI = AX mod 32
+	MOVQ  $0x00000040, R8
+	SUBQ  SI, R8 // R8 = 64 - (AX mod 32), so AX+R8-32 is 32-byte aligned
+	DECQ  DI
+	JA    emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 // DECQ leaves CF alone (clear after the SUBQ), so this is taken when DI is non-zero
+	LEAQ  -32(CX)(R8*1), SI
+	LEAQ  -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_standalonelarge_big_loop_back: // NOTE(review): reached only by fall-through when the JA above is not taken
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ  $0x20, R9
+	ADDQ  $0x20, SI
+	ADDQ  $0x20, R8
+	DECQ  DI
+	JNA   emit_lit_memmove_long_standalonelarge_big_loop_back
+
+emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4 // unaligned loads from lit
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1) // aligned stores into dst
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ  $0x20, R8
+	CMPQ  DX, R8
+	JAE   emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 // continue while len >= R8
+	MOVOU X0, (AX) // finally write the saved head ...
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(DX*1) // ... and the saved tail with unaligned stores
+	MOVOU X3, -16(AX)(DX*1)
+	JMP   emit_literal_end_standalone
+	JMP emit_literal_end_standalone // unreachable: follows an unconditional JMP
+
+emit_literal_end_standalone_skip:
+	XORQ BX, BX // empty input: result is 0
+
+emit_literal_end_standalone:
+	MOVQ BX, ret+48(FP) // return header size + len(lit)
+	RET
+
+// func emitRepeat(dst []byte, offset int, length int) int
+TEXT ·emitRepeat(SB), NOSPLIT, $0-48 // Writes one or more repeat chunks for length to dst and returns the bytes written. dst_len is never read here, so the caller must size dst.
+	XORQ BX, BX // BX = bytes written so far (return value)
+	MOVQ dst_base+0(FP), AX // AX = write cursor in dst
+	MOVQ offset+24(FP), CX // CX = offset
+	MOVQ length+32(FP), DX // DX = length
+
+	// emitRepeat
+emit_repeat_again_standalone:
+	MOVL DX, SI // SI = length as passed in (or the remainder on a later pass)
+	LEAL -4(DX), DX // DX = length-4, the value that gets encoded
+	CMPL SI, $0x08
+	JLE  repeat_two_standalone // length <= 8: two-byte form without offset
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_standalone // length >= 12: too long for the offset form
+	CMPL CX, $0x00000800
+	JLT  repeat_two_offset_standalone // offset < 2048: two-byte form carrying the offset
+
+cant_repeat_two_offset_standalone:
+	CMPL DX, $0x00000104
+	JLT  repeat_three_standalone // length-4 < 260: one extra length byte
+	CMPL DX, $0x00010100
+	JLT  repeat_four_standalone // length-4 < 65792: two extra length bytes
+	CMPL DX, $0x0100ffff
+	JLT  repeat_five_standalone // three extra length bytes
+	LEAL -16842747(DX), DX // too long for one chunk: keep the remainder (DX - 0x0100fffb) for the next pass
+	MOVW $0x001d, (AX) // bytes 0x1d, 0x00 (0x1d = 7<<2|1)
+	MOVW $0xfffb, 2(AX) // fixed length field 0xfffffb, low 16 bits ...
+	MOVB $0xff, 4(AX) // ... and high byte
+	ADDQ $0x05, AX
+	ADDQ $0x05, BX
+	JMP  emit_repeat_again_standalone // encode what is left
+
+repeat_five_standalone:
+	LEAL -65536(DX), DX // stored value = length-4-65536
+	MOVL DX, CX
+	MOVW $0x001d, (AX) // bytes 0x1d, 0x00 (0x1d = 7<<2|1)
+	MOVW DX, 2(AX) // low 16 bits of the stored value
+	SARL $0x10, CX
+	MOVB CL, 4(AX) // bits 16..23 of the stored value
+	ADDQ $0x05, BX
+	ADDQ $0x05, AX
+	JMP  gen_emit_repeat_end
+
+repeat_four_standalone:
+	LEAL -256(DX), DX // stored value = length-4-256
+	MOVW $0x0019, (AX) // bytes 0x19, 0x00 (0x19 = 6<<2|1)
+	MOVW DX, 2(AX) // 16-bit stored value
+	ADDQ $0x04, BX
+	ADDQ $0x04, AX
+	JMP  gen_emit_repeat_end
+
+repeat_three_standalone:
+	LEAL -4(DX), DX // stored value = length-8
+	MOVW $0x0015, (AX) // bytes 0x15, 0x00 (0x15 = 5<<2|1)
+	MOVB DL, 2(AX) // 8-bit stored value
+	ADDQ $0x03, BX
+	ADDQ $0x03, AX
+	JMP  gen_emit_repeat_end
+
+repeat_two_standalone:
+	SHLL $0x02, DX
+	ORL  $0x01, DX // tag = (length-4)<<2 | 1
+	MOVW DX, (AX) // writes the tag and the high byte of DX (zero when 0 <= length-4 <= 4)
+	ADDQ $0x02, BX
+	ADDQ $0x02, AX
+	JMP  gen_emit_repeat_end
+
+repeat_two_offset_standalone:
+	XORQ SI, SI
+	LEAL 1(SI)(DX*4), DX // DX = (length-4)*4 + 1
+	MOVB CL, 1(AX) // byte 1 = low 8 bits of offset
+	SARL $0x08, CX
+	SHLL $0x05, CX // offset bits 8 and up moved to bit 5 and up
+	ORL  CX, DX
+	MOVB DL, (AX) // byte 0 = tag | length bits | high offset bits
+	ADDQ $0x02, BX
+	ADDQ $0x02, AX
+
+gen_emit_repeat_end:
+	MOVQ BX, ret+40(FP) // return total bytes written
+	RET
+
+// func emitCopy(dst []byte, offset int, length int) int
+TEXT ·emitCopy(SB), NOSPLIT, $0-48 // Writes a copy chunk for (offset, length) to dst, encoding any excess length as repeat chunks, and returns the bytes written. dst_len is never read here.
+	XORQ BX, BX // BX = bytes written so far (return value)
+	MOVQ dst_base+0(FP), AX // AX = write cursor in dst
+	MOVQ offset+24(FP), CX // CX = offset
+	MOVQ length+32(FP), DX // DX = remaining length
+
+	// emitCopy
+	CMPL CX, $0x00010000
+	JL   two_byte_offset_standalone // offset < 1<<16: two-byte offset forms are enough
+
+four_bytes_loop_back_standalone:
+	CMPL DX, $0x40
+	JLE  four_bytes_remain_standalone // length <= 64 fits in one 5-byte chunk
+	MOVB $0xff, (AX) // 0xff = 63<<2|3: 4-byte-offset copy of length 64
+	MOVL CX, 1(AX) // 32-bit little-endian offset
+	LEAL -64(DX), DX
+	ADDQ $0x05, BX
+	ADDQ $0x05, AX
+	CMPL DX, $0x04
+	JL   four_bytes_remain_standalone // fewer than 4 bytes left: finish with a plain copy chunk
+
+	// emitRepeat: the rest of the length is encoded as repeat chunks
+emit_repeat_again_standalone_emit_copy:
+	MOVL DX, SI // SI = remaining length
+	LEAL -4(DX), DX // DX = length-4, the value that gets encoded
+	CMPL SI, $0x08
+	JLE  repeat_two_standalone_emit_copy // length <= 8
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_standalone_emit_copy // length >= 12
+	CMPL CX, $0x00000800
+	JLT  repeat_two_offset_standalone_emit_copy // offset < 2048
+
+cant_repeat_two_offset_standalone_emit_copy:
+	CMPL DX, $0x00000104
+	JLT  repeat_three_standalone_emit_copy
+	CMPL DX, $0x00010100
+	JLT  repeat_four_standalone_emit_copy
+	CMPL DX, $0x0100ffff
+	JLT  repeat_five_standalone_emit_copy
+	LEAL -16842747(DX), DX // too long for one chunk: keep the remainder for the next pass
+	MOVW $0x001d, (AX) // bytes 0x1d, 0x00 (0x1d = 7<<2|1)
+	MOVW $0xfffb, 2(AX) // fixed length field 0xfffffb
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	ADDQ $0x05, BX
+	JMP  emit_repeat_again_standalone_emit_copy
+
+repeat_five_standalone_emit_copy:
+	LEAL -65536(DX), DX // stored value = length-4-65536
+	MOVL DX, CX
+	MOVW $0x001d, (AX) // bytes 0x1d, 0x00
+	MOVW DX, 2(AX) // low 16 bits of the stored value
+	SARL $0x10, CX
+	MOVB CL, 4(AX) // bits 16..23 of the stored value
+	ADDQ $0x05, BX
+	ADDQ $0x05, AX
+	JMP  gen_emit_copy_end
+
+repeat_four_standalone_emit_copy:
+	LEAL -256(DX), DX // stored value = length-4-256
+	MOVW $0x0019, (AX) // bytes 0x19, 0x00 (0x19 = 6<<2|1)
+	MOVW DX, 2(AX)
+	ADDQ $0x04, BX
+	ADDQ $0x04, AX
+	JMP  gen_emit_copy_end
+
+repeat_three_standalone_emit_copy:
+	LEAL -4(DX), DX // stored value = length-8
+	MOVW $0x0015, (AX) // bytes 0x15, 0x00 (0x15 = 5<<2|1)
+	MOVB DL, 2(AX)
+	ADDQ $0x03, BX
+	ADDQ $0x03, AX
+	JMP  gen_emit_copy_end
+
+repeat_two_standalone_emit_copy:
+	SHLL $0x02, DX
+	ORL  $0x01, DX // tag = (length-4)<<2 | 1
+	MOVW DX, (AX) // writes the tag and the high byte of DX
+	ADDQ $0x02, BX
+	ADDQ $0x02, AX
+	JMP  gen_emit_copy_end
+
+repeat_two_offset_standalone_emit_copy:
+	XORQ SI, SI
+	LEAL 1(SI)(DX*4), DX // DX = (length-4)*4 + 1
+	MOVB CL, 1(AX) // byte 1 = low 8 bits of offset
+	SARL $0x08, CX
+	SHLL $0x05, CX // offset bits 8 and up moved to bit 5 and up
+	ORL  CX, DX
+	MOVB DL, (AX)
+	ADDQ $0x02, BX
+	ADDQ $0x02, AX
+	JMP  gen_emit_copy_end
+	JMP four_bytes_loop_back_standalone // unreachable: follows an unconditional JMP
+
+four_bytes_remain_standalone:
+	TESTL DX, DX
+	JZ    gen_emit_copy_end // nothing left to encode
+	MOVB  $0x03, SI // only the low byte of SI is set; only DL is stored below, so SI's upper bits do not matter
+	LEAL  -4(SI)(DX*4), DX // low byte = (length-1)<<2 | 3
+	MOVB  DL, (AX)
+	MOVL  CX, 1(AX) // 32-bit little-endian offset
+	ADDQ  $0x05, BX
+	ADDQ  $0x05, AX
+	JMP   gen_emit_copy_end
+
+two_byte_offset_standalone:
+	CMPL DX, $0x40
+	JLE  two_byte_offset_short_standalone // length <= 64: a single chunk is enough
+	MOVB $0xee, (AX) // 0xee = 59<<2|2: 2-byte-offset copy of length 60
+	MOVW CX, 1(AX) // 16-bit little-endian offset
+	LEAL -60(DX), DX // length was > 64, so at least 5 bytes remain
+	ADDQ $0x03, AX
+	ADDQ $0x03, BX
+
+	// emitRepeat: the rest of the length is encoded as repeat chunks
+emit_repeat_again_standalone_emit_copy_short:
+	MOVL DX, SI // SI = remaining length
+	LEAL -4(DX), DX // DX = length-4, the value that gets encoded
+	CMPL SI, $0x08
+	JLE  repeat_two_standalone_emit_copy_short // length <= 8
+	CMPL SI, $0x0c
+	JGE  cant_repeat_two_offset_standalone_emit_copy_short // length >= 12
+	CMPL CX, $0x00000800
+	JLT  repeat_two_offset_standalone_emit_copy_short // offset < 2048
+
+cant_repeat_two_offset_standalone_emit_copy_short:
+	CMPL DX, $0x00000104
+	JLT  repeat_three_standalone_emit_copy_short
+	CMPL DX, $0x00010100
+	JLT  repeat_four_standalone_emit_copy_short
+	CMPL DX, $0x0100ffff
+	JLT  repeat_five_standalone_emit_copy_short
+	LEAL -16842747(DX), DX // too long for one chunk: keep the remainder for the next pass
+	MOVW $0x001d, (AX) // bytes 0x1d, 0x00 (0x1d = 7<<2|1)
+	MOVW $0xfffb, 2(AX) // fixed length field 0xfffffb
+	MOVB $0xff, 4(AX)
+	ADDQ $0x05, AX
+	ADDQ $0x05, BX
+	JMP  emit_repeat_again_standalone_emit_copy_short
+
+repeat_five_standalone_emit_copy_short:
+	LEAL -65536(DX), DX // stored value = length-4-65536
+	MOVL DX, CX
+	MOVW $0x001d, (AX) // bytes 0x1d, 0x00
+	MOVW DX, 2(AX) // low 16 bits of the stored value
+	SARL $0x10, CX
+	MOVB CL, 4(AX) // bits 16..23 of the stored value
+	ADDQ $0x05, BX
+	ADDQ $0x05, AX
+	JMP  gen_emit_copy_end
+
+repeat_four_standalone_emit_copy_short:
+	LEAL -256(DX), DX // stored value = length-4-256
+	MOVW $0x0019, (AX) // bytes 0x19, 0x00 (0x19 = 6<<2|1)
+	MOVW DX, 2(AX)
+	ADDQ $0x04, BX
+	ADDQ $0x04, AX
+	JMP  gen_emit_copy_end
+
+repeat_three_standalone_emit_copy_short:
+	LEAL -4(DX), DX // stored value = length-8
+	MOVW $0x0015, (AX) // bytes 0x15, 0x00 (0x15 = 5<<2|1)
+	MOVB DL, 2(AX)
+	ADDQ $0x03, BX
+	ADDQ $0x03, AX
+	JMP  gen_emit_copy_end
+
+repeat_two_standalone_emit_copy_short:
+	SHLL $0x02, DX
+	ORL  $0x01, DX // tag = (length-4)<<2 | 1
+	MOVW DX, (AX) // writes the tag and the high byte of DX
+	ADDQ $0x02, BX
+	ADDQ $0x02, AX
+	JMP  gen_emit_copy_end
+
+repeat_two_offset_standalone_emit_copy_short:
+	XORQ SI, SI
+	LEAL 1(SI)(DX*4), DX // DX = (length-4)*4 + 1
+	MOVB CL, 1(AX) // byte 1 = low 8 bits of offset
+	SARL $0x08, CX
+	SHLL $0x05, CX // offset bits 8 and up moved to bit 5 and up
+	ORL  CX, DX
+	MOVB DL, (AX)
+	ADDQ $0x02, BX
+	ADDQ $0x02, AX
+	JMP  gen_emit_copy_end
+	JMP two_byte_offset_standalone // unreachable: follows an unconditional JMP
+
+two_byte_offset_short_standalone:
+	CMPL DX, $0x0c
+	JGE  emit_copy_three_standalone // length >= 12 does not fit the 2-byte form
+	CMPL CX, $0x00000800
+	JGE  emit_copy_three_standalone // offset >= 2048 does not fit the 2-byte form
+	MOVB $0x01, SI // only the low byte of SI is set; only DL is stored below
+	LEAL -16(SI)(DX*4), DX // low byte = (length-4)<<2 | 1
+	MOVB CL, 1(AX) // byte 1 = low 8 bits of offset
+	SHRL $0x08, CX
+	SHLL $0x05, CX // offset bits 8..10 moved to bits 5..7
+	ORL  CX, DX
+	MOVB DL, (AX)
+	ADDQ $0x02, BX
+	ADDQ $0x02, AX
+	JMP  gen_emit_copy_end
+
+emit_copy_three_standalone:
+	MOVB $0x02, SI // only the low byte of SI is set; only DL is stored below
+	LEAL -4(SI)(DX*4), DX // low byte = (length-1)<<2 | 2
+	MOVB DL, (AX)
+	MOVW CX, 1(AX) // 16-bit little-endian offset
+	ADDQ $0x03, BX
+	ADDQ $0x03, AX
+
+gen_emit_copy_end:
+	MOVQ BX, ret+40(FP) // return total bytes written
+	RET
+
+// func emitCopyNoRepeat(dst []byte, offset int, length int) int
+TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 // Writes copy chunks for (offset, length) to dst using only plain copy tags (no repeat chunks) and returns the bytes written. dst_len is never read here.
+	XORQ BX, BX // BX = bytes written so far (return value)
+	MOVQ dst_base+0(FP), AX // AX = write cursor in dst
+	MOVQ offset+24(FP), CX // CX = offset
+	MOVQ length+32(FP), DX // DX = remaining length
+
+	// emitCopy
+	CMPL CX, $0x00010000
+	JL   two_byte_offset_standalone_snappy // offset < 1<<16: two-byte offset forms are enough
+
+four_bytes_loop_back_standalone_snappy:
+	CMPL DX, $0x40
+	JLE  four_bytes_remain_standalone_snappy // length <= 64 fits in one 5-byte chunk
+	MOVB $0xff, (AX) // 0xff = 63<<2|3: 4-byte-offset copy of length 64
+	MOVL CX, 1(AX) // 32-bit little-endian offset
+	LEAL -64(DX), DX
+	ADDQ $0x05, BX
+	ADDQ $0x05, AX
+	CMPL DX, $0x04
+	JL   four_bytes_remain_standalone_snappy // fewer than 4 bytes left: emit them (if any) and finish
+	JMP  four_bytes_loop_back_standalone_snappy // otherwise keep emitting chunks
+
+four_bytes_remain_standalone_snappy:
+	TESTL DX, DX
+	JZ    gen_emit_copy_end_snappy // nothing left to encode
+	MOVB  $0x03, SI // only the low byte of SI is set; only DL is stored below, so SI's upper bits do not matter
+	LEAL  -4(SI)(DX*4), DX // low byte = (length-1)<<2 | 3
+	MOVB  DL, (AX)
+	MOVL  CX, 1(AX) // 32-bit little-endian offset
+	ADDQ  $0x05, BX
+	ADDQ  $0x05, AX
+	JMP   gen_emit_copy_end_snappy
+
+two_byte_offset_standalone_snappy:
+	CMPL DX, $0x40
+	JLE  two_byte_offset_short_standalone_snappy // length <= 64: a single chunk is enough
+	MOVB $0xee, (AX) // 0xee = 59<<2|2: 2-byte-offset copy of length 60
+	MOVW CX, 1(AX) // 16-bit little-endian offset
+	LEAL -60(DX), DX // length was > 64, so at least 5 bytes remain
+	ADDQ $0x03, AX
+	ADDQ $0x03, BX
+	JMP  two_byte_offset_standalone_snappy // loop until the remainder is <= 64
+
+two_byte_offset_short_standalone_snappy:
+	CMPL DX, $0x0c
+	JGE  emit_copy_three_standalone_snappy // length >= 12 does not fit the 2-byte form
+	CMPL CX, $0x00000800
+	JGE  emit_copy_three_standalone_snappy // offset >= 2048 does not fit the 2-byte form
+	MOVB $0x01, SI // only the low byte of SI is set; only DL is stored below
+	LEAL -16(SI)(DX*4), DX // low byte = (length-4)<<2 | 1
+	MOVB CL, 1(AX) // byte 1 = low 8 bits of offset
+	SHRL $0x08, CX
+	SHLL $0x05, CX // offset bits 8..10 moved to bits 5..7
+	ORL  CX, DX
+	MOVB DL, (AX)
+	ADDQ $0x02, BX
+	ADDQ $0x02, AX
+	JMP  gen_emit_copy_end_snappy
+
+emit_copy_three_standalone_snappy:
+	MOVB $0x02, SI // only the low byte of SI is set; only DL is stored below
+	LEAL -4(SI)(DX*4), DX // low byte = (length-1)<<2 | 2
+	MOVB DL, (AX)
+	MOVW CX, 1(AX) // 16-bit little-endian offset
+	ADDQ $0x03, BX
+	ADDQ $0x03, AX
+
+gen_emit_copy_end_snappy:
+	MOVQ BX, ret+40(FP) // return total bytes written
+	RET
+
+// func matchLen(a []byte, b []byte) int
+TEXT ·matchLen(SB), NOSPLIT, $0-56 // Returns the number of leading bytes that are equal in a and b, examining at most len(a) bytes. b_len is never read — presumably callers guarantee len(b) >= len(a); confirm.
+	MOVQ a_base+0(FP), AX // AX = &a[0]
+	MOVQ b_base+24(FP), CX // CX = &b[0]
+	MOVQ a_len+8(FP), DX // DX = bytes left to compare (handled with 32-bit ops below)
+
+	// matchLen
+	XORL SI, SI // SI = matched length so far
+	CMPL DX, $0x08
+	JL   matchlen_single_standalone // fewer than 8 bytes: compare byte by byte
+
+matchlen_loopback_standalone:
+	MOVQ  (AX)(SI*1), BX // load 8 bytes of a
+	XORQ  (CX)(SI*1), BX // XOR with 8 bytes of b: zero means all 8 are equal
+	TESTQ BX, BX
+	JZ    matchlen_loop_standalone
+	BSFQ  BX, BX // index of the lowest differing bit
+	SARQ  $0x03, BX // bit index / 8 = number of equal bytes in this word
+	LEAL  (SI)(BX*1), SI
+	JMP   gen_match_len_end
+
+matchlen_loop_standalone:
+	LEAL -8(DX), DX // 8 fewer bytes to compare
+	LEAL 8(SI), SI // 8 more bytes matched
+	CMPL DX, $0x08
+	JGE  matchlen_loopback_standalone
+
+matchlen_single_standalone:
+	TESTL DX, DX
+	JZ    gen_match_len_end // nothing left to compare
+
+matchlen_single_loopback_standalone:
+	MOVB (AX)(SI*1), BL
+	CMPB (CX)(SI*1), BL
+	JNE  gen_match_len_end // first mismatch: SI already holds the answer
+	LEAL 1(SI), SI
+	DECL DX
+	JNZ  matchlen_single_loopback_standalone
+
+gen_match_len_end:
+	MOVQ SI, ret+48(FP) // return matched length
+	RET
diff --git a/vendor/github.com/klauspost/compress/s2/s2.go b/vendor/github.com/klauspost/compress/s2/s2.go
new file mode 100644
index 00000000..89d69e96
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/s2.go
@@ -0,0 +1,139 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package s2 implements the S2 compression format.
+//
+// S2 is an extension of Snappy. Similar to Snappy, S2 is aimed at high throughput,
+// which is why it features concurrent compression for bigger payloads.
+//
+// Decoding is compatible with Snappy compressed content,
+// but content compressed with S2 cannot be decompressed by Snappy.
+//
+// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
+//
+// There are actually two S2 formats: block and stream. They are related,
+// but different: trying to decompress block-compressed data as a S2 stream
+// will fail, and vice versa. The block format is the Decode and Encode
+// functions and the stream format is the Reader and Writer types.
+//
+// A "better" compression option is available. This will trade some compression
+// speed for a better compression ratio.
+//
+// The block format, the more common case, is used when the complete size (the
+// number of bytes) of the original data is known upfront, at the time
+// compression starts. The stream format, also known as the framing format, is
+// for when that isn't always true.
+//
+// Blocks do not offer much data protection, so it is up to you to
+// add data validation of decompressed blocks.
+//
+// Streams perform CRC validation of the decompressed data.
+// Stream compression will also be performed on multiple CPU cores concurrently,
+// significantly improving throughput.
+package s2
+
+import (
+	"bytes"
+	"hash/crc32"
+)
+
+/*
+Each encoded block begins with the varint-encoded length of the decoded data,
+followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
+first byte of each chunk is broken into its 2 least and 6 most significant bits
+called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
+Zero means a literal tag. All other values mean a copy tag.
+
+For literal tags:
+  - If m < 60, the next 1 + m bytes are literal bytes.
+  - Otherwise, let n be the little-endian unsigned integer denoted by the next
+    m - 59 bytes. The next 1 + n bytes after that are literal bytes.
+
+For copy tags, length bytes are copied from offset bytes ago, in the style of
+Lempel-Ziv compression algorithms. In particular:
+  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
+    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
+    of the offset. The next byte is bits 0-7 of the offset.
+  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
+    The length is 1 + m. The offset is the little-endian unsigned integer
+    denoted by the next 2 bytes.
+  - For l == 3, the offset ranges in [0, 1<<32) and the length in
+    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
+    integer denoted by the next 4 bytes.
+*/
+const ( // chunk tags: the 2 least significant bits (l) of a chunk's first byte
+	tagLiteral = 0x00 // literal bytes follow
+	tagCopy1   = 0x01 // copy, offset in [0, 1<<11), 1 extra offset byte
+	tagCopy2   = 0x02 // copy, 2-byte little-endian offset
+	tagCopy4   = 0x03 // copy, 4-byte little-endian offset
+)
+
+const (
+	checksumSize     = 4 // bytes; presumably the width of the uint32 returned by crc — confirm
+	chunkHeaderSize  = 4 // bytes; same length as the "\xff\x06\x00\x00" prefix below
+	magicChunk       = "\xff\x06\x00\x00" + magicBody // S2 identifier: 0xff (chunkTypeStreamIdentifier), 3 more header bytes, then magicBody
+	magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy // same 4-byte header followed by the Snappy body
+	magicBodySnappy  = "sNaPpY" // 6-byte body
+	magicBody        = "S2sTwO" // 6-byte body
+
+	// maxBlockSize is the maximum size of the input to encodeBlock.
+	//
+	// For the framing format (Writer type instead of Encode function),
+	// this is the maximum uncompressed size of a block.
+	maxBlockSize = 4 << 20 // 4 MiB
+
+	// minBlockSize is the minimum size of block setting when creating a writer.
+	minBlockSize = 4 << 10 // 4 KiB
+
+	// defaultBlockSize is the default block size.
+	defaultBlockSize = 1 << 20 // 1 MiB
+
+	// maxSnappyBlockSize is the maximum snappy block size.
+	maxSnappyBlockSize = 1 << 16 // 64 KiB
+
+	obufHeaderLen = checksumSize + chunkHeaderSize // 8 bytes: checksum plus chunk header
+)
+
+const ( // chunk type identifiers — presumably those of the Snappy framing format referenced at crc; confirm
+	chunkTypeCompressedData   = 0x00
+	chunkTypeUncompressedData = 0x01
+	chunkTypePadding          = 0xfe
+	chunkTypeStreamIdentifier = 0xff // also the first byte of magicChunk and magicChunkSnappy
+)
+
+var crcTable = crc32.MakeTable(crc32.Castagnoli)
+
+// crc returns the CRC-32 (Castagnoli table) of b, rotated right by 15 bits and offset by
+// 0xa282ead8; see section 3 of https://github.com/google/snappy/blob/master/framing_format.txt
+func crc(b []byte) uint32 {
+	sum := crc32.Checksum(b, crcTable)
+	return (sum<<17 | sum>>15) + 0xa282ead8
+}
+
+// literalExtraSize returns the extra size of encoding n literals: 0 when n
+// is 0, 1 when n < 60, and otherwise 2 plus one for each of the thresholds
+// 1<<8, 1<<16 and 1<<24 that n has reached, giving at most 5.
+// n should be >= 0 and <= math.MaxUint32.
+func literalExtraSize(n int64) int64 {
+	if n == 0 {
+		return 0
+	}
+	if n < 60 {
+		return 1
+	}
+	// From here on the result is at least 2; every threshold that n has
+	// reached adds one more byte.
+	extra := int64(2)
+	for limit := int64(1 << 8); limit <= 1<<24 && n >= limit; limit <<= 8 {
+		extra++
+	}
+	return extra
+}
+
+// byter is implemented by values that expose their contents through Bytes.
+type byter interface{ Bytes() []byte }
+
+// Compile-time check that *bytes.Buffer satisfies byter.
+var _ byter = (*bytes.Buffer)(nil)