diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2')
17 files changed, 21860 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/.gitignore b/vendor/github.com/klauspost/compress/s2/.gitignore new file mode 100644 index 00000000..3a89c6e3 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/.gitignore @@ -0,0 +1,15 @@ +testdata/bench + +# These explicitly listed benchmark data files are for an obsolete version of +# snappy_test.go. +testdata/alice29.txt +testdata/asyoulik.txt +testdata/fireworks.jpeg +testdata/geo.protodata +testdata/html +testdata/html_x_4 +testdata/kppkn.gtb +testdata/lcet10.txt +testdata/paper-100k.pdf +testdata/plrabn12.txt +testdata/urls.10K diff --git a/vendor/github.com/klauspost/compress/s2/LICENSE b/vendor/github.com/klauspost/compress/s2/LICENSE new file mode 100644 index 00000000..1d2d645b --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. +Copyright (c) 2019 Klaus Post. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md new file mode 100644 index 00000000..81fad652 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -0,0 +1,717 @@ +# S2 Compression + +S2 is an extension of [Snappy](https://github.com/google/snappy). + +S2 is aimed for high throughput, which is why it features concurrent compression for bigger payloads. + +Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy. +This means that S2 can seamlessly replace Snappy without converting compressed content. + +S2 can produce Snappy compatible output, faster and better than Snappy. +If you want full benefit of the changes you should use s2 without Snappy compatibility. + +S2 is designed to have high throughput on content that cannot be compressed. +This is important, so you don't have to worry about spending CPU cycles on already compressed data. 
+ +## Benefits over Snappy + +* Better compression +* Adjustable compression (3 levels) +* Concurrent stream compression +* Faster decompression, even for Snappy compatible content +* Ability to quickly skip forward in compressed stream +* Compatible with reading Snappy compressed content +* Smaller block size overhead on incompressible blocks +* Block concatenation +* Uncompressed stream mode +* Automatic stream size padding +* Snappy compatible block compression + +## Drawbacks over Snappy + +* Not optimized for 32 bit systems. +* Streams use slightly more memory due to larger blocks and concurrency (configurable). + +# Usage + +Installation: `go get -u github.com/klauspost/compress/s2` + +Full package documentation: + +[![godoc][1]][2] + +[1]: https://godoc.org/github.com/klauspost/compress?status.svg +[2]: https://godoc.org/github.com/klauspost/compress/s2 + +## Compression + +```Go +func EncodeStream(src io.Reader, dst io.Writer) error { + enc := s2.NewWriter(dst) + _, err := io.Copy(enc, src) + if err != nil { + enc.Close() + return err + } + // Blocks until compression is done. + return enc.Close() +} +``` + +You should always call `enc.Close()`, otherwise you will leak resources and your encode will be incomplete. + +For the best throughput, you should attempt to reuse the `Writer` using the `Reset()` method. + +The Writer in S2 is always buffered, therefore `NewBufferedWriter` in Snappy can be replaced with `NewWriter` in S2. +It is possible to flush any buffered data using the `Flush()` method. +This will block until all data sent to the encoder has been written to the output. + +S2 also supports the `io.ReaderFrom` interface, which will consume all input from a reader. + +As a final method to compress data, if you have a single block of data you would like to have encoded as a stream, +a slightly more efficient method is to use the `EncodeBuffer` method. +This will take ownership of the buffer until the stream is closed. 
+ +```Go +func EncodeStream(src []byte, dst io.Writer) error { + enc := s2.NewWriter(dst) + // The encoder owns the buffer until Flush or Close is called. + err := enc.EncodeBuffer(src) + if err != nil { + enc.Close() + return err + } + // Blocks until compression is done. + return enc.Close() +} +``` + +Each call to `EncodeBuffer` will result in discrete blocks being created without buffering, +so it should only be used a single time per stream. +If you need to write several blocks, you should use the regular io.Writer interface. + + +## Decompression + +```Go +func DecodeStream(src io.Reader, dst io.Writer) error { + dec := s2.NewReader(src) + _, err := io.Copy(dst, dec) + return err +} +``` + +Similar to the Writer, a Reader can be reused using the `Reset` method. + +For the best possible throughput, there is an `EncodeBuffer(buf []byte)` function available. +However, it requires that the provided buffer isn't used after it is handed over to S2 and until the stream is flushed or closed. + +For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`. +Do however note that these functions (similar to Snappy) do not provide validation of data, +so data corruption may be undetected. Stream encoding provides CRC checks of data. + +It is possible to efficiently skip forward in a compressed stream using the `Skip()` method. +For big skips the decompressor is able to skip blocks without decompressing them. + +## Single Blocks + +Similar to Snappy S2 offers single block compression. +Blocks do not offer the same flexibility and safety as streams, +but may be preferable for very small payloads, less than 100K. + +Using a simple `dst := s2.Encode(nil, src)` will compress `src` and return the compressed result. +It is possible to provide a destination buffer. +If the buffer has a capacity of `s2.MaxEncodedLen(len(src))` it will be used. +If not, a new one will be allocated. 
+ +Alternatively `EncodeBetter`/`EncodeBest` can also be used for better, but slightly slower compression. + +Similarly to decompress a block you can use `dst, err := s2.Decode(nil, src)`. +Again an optional destination buffer can be supplied. +The `s2.DecodedLen(src)` can be used to get the minimum capacity needed. +If that is not satisfied a new buffer will be allocated. + +Block functions always operate on a single goroutine since they should only be used for small payloads. + +# Commandline tools + +Some very simple commandline tools are provided; `s2c` for compression and `s2d` for decompression. + +Binaries can be downloaded on the [Releases Page](https://github.com/klauspost/compress/releases). + +Installing them from source requires Go to be installed. To install them, use: + +`go install github.com/klauspost/compress/s2/cmd/s2c && go install github.com/klauspost/compress/s2/cmd/s2d` + +To build binaries to the current folder use: + +`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d` + + +## s2c + +``` +Usage: s2c [options] file1 file2 + +Compresses all files supplied as input separately. +Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'. +By default output files will be overwritten. +Use - as the only file name to read from stdin and write to stdout. + +Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt +Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt + +File names beginning with 'http://' and 'https://' will be downloaded and compressed. +Only http response code 200 is accepted. + +Options: + -bench int + Run benchmark n times. No output will be written + -blocksize string + Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M") + -c Write all output to stdout. 
Multiple input files will be concatenated + -cpu int + Compress using this amount of threads (default 32) + -faster + Compress faster, but with a minor compression loss + -help + Display help + -o string + Write output to another file. Single input file only + -pad string + Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1") + -q Don't write any output to terminal, except errors + -rm + Delete source file(s) after successful compression + -safe + Do not overwrite output files + -slower + Compress more, but a lot slower + -snappy + Generate Snappy compatible output stream + -verify + Verify written files + +``` + +## s2d + +``` +Usage: s2d [options] file1 file2 + +Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'. +Output file names have the extension removed. By default output files will be overwritten. +Use - as the only file name to read from stdin and write to stdout. + +Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt +Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt + +File names beginning with 'http://' and 'https://' will be downloaded and decompressed. +Extensions on downloaded files are ignored. Only http response code 200 is accepted. + +Options: + -bench int + Run benchmark n times. No output will be written + -c Write all output to stdout. Multiple input files will be concatenated + -help + Display help + -o string + Write output to another file. Single input file only + -q Don't write any output to terminal, except errors + -rm + Delete source file(s) after successful decompression + -safe + Do not overwrite output files + -verify + Verify files, but do not write output +``` + +## s2sx: self-extracting archives + +s2sx allows creating self-extracting archives with no dependencies. 
+ +By default, executables are created for the same platforms as the host os, +but this can be overridden with `-os` and `-arch` parameters. + +Extracted files have 0666 permissions, except when the untar option is used. + +``` +Usage: s2sx [options] file1 file2 + +Compresses all files supplied as input separately. +If files have '.s2' extension they are assumed to be compressed already. +Output files are written as 'filename.s2sx' and with '.exe' for windows targets. +If output is big, an additional file with ".more" is written. This must be included as well. +By default output files will be overwritten. + +Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt +Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt + +Options: + -arch string + Destination architecture (default "amd64") + -c Write all output to stdout. Multiple input files will be concatenated + -cpu int + Compress using this amount of threads (default 32) + -help + Display help + -max string + Maximum executable size. Rest will be written to another file. (default "1G") + -os string + Destination operating system (default "windows") + -q Don't write any output to terminal, except errors + -rm + Delete source file(s) after successful compression + -safe + Do not overwrite output files + -untar + Untar on destination +``` + +Available platforms are: + + * darwin-amd64 + * darwin-arm64 + * linux-amd64 + * linux-arm + * linux-arm64 + * linux-mips64 + * linux-ppc64le + * windows-386 + * windows-amd64 + +By default, there is a size limit of 1GB for the output executable. + +When this is exceeded the remaining file content is written to a file called +output+`.more`. This file must be included and +placed alongside the executable for a successful extraction. + +This file *must* have the same name as the executable, so if the executable is renamed, +so must the `.more` file. 
+ +This functionality is disabled with stdin/stdout. + +### Self-extracting TAR files + +If you wrap a TAR file you can specify `-untar` to make it untar on the destination host. + +Files are extracted to the current folder with the path specified in the tar file. + +Note that tar files are not validated before they are wrapped. + +For security reasons files that move below the root folder are not allowed. + +# Performance + +This section will focus on comparisons to Snappy. +This package is solely aimed at replacing Snappy as a high speed compression package. +If you are mainly looking for better compression [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) +gives better compression, but typically at speeds slightly below "better" mode in this package. + +Compression is increased compared to Snappy, mostly around 5-20% and the throughput is typically 25-40% increased (single threaded) compared to the Snappy Go implementation. + +Streams are concurrently compressed. The stream will be distributed among all available CPU cores for the best possible throughput. + +A "better" compression mode is also available. This allows to trade a bit of speed for a minor compression gain. +The content compressed in this mode is fully compatible with the standard decoder. 
+ +Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU): + +| File | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller | +|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------| +| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 12.70x | 10556 MB/s | 7.35% | 4.15x | 3455 MB/s | 12.79% | +| (1 CPU) | 1.14x | 948 MB/s | - | 0.42x | 349 MB/s | - | +| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x | 14484 MB/s | 31.60% | 10.09x | 8533 MB/s | 37.71% | +| (1 CPU) | 1.33x | 1127 MB/s | - | 0.70x | 589 MB/s | - | +| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12000 MB/s | -5.79% | 6.59x | 5223 MB/s | 5.80% | +| (1 CPU) | 1.11x | 877 MB/s | - | 0.47x | 370 MB/s | - | +| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 14.62x | 12116 MB/s | 15.90% | 5.35x | 4430 MB/s | 16.08% | +| (1 CPU) | 1.38x | 1146 MB/s | - | 0.38x | 312 MB/s | - | +| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 8.83x | 17579 MB/s | 43.86% | 6.54x | 13011 MB/s | 47.23% | +| (1 CPU) | 1.14x | 2259 MB/s | - | 0.74x | 1475 MB/s | - | +| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 16.72x | 14019 MB/s | 24.02% | 10.11x | 8477 MB/s | 30.48% | +| (1 CPU) | 1.24x | 1043 MB/s | - | 0.70x | 586 MB/s | - | +| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9254 MB/s | 1.84% | 6.75x | 4686 MB/s | 6.72% | +| (1 CPU) | 0.97x | 672 MB/s | - | 0.53x | 366 MB/s | - | +| sharnd.out.2gb | 2.11x | 12639 MB/s | 0.01% | 1.98x | 11833 MB/s | 0.01% | +| (1 CPU) | 0.93x | 5594 MB/s | - | 1.34x | 8030 MB/s | - | +| 
[enwik9](http://mattmahoney.net/dc/textdata.html) | 19.34x | 8220 MB/s | 3.98% | 7.87x | 3345 MB/s | 15.82% | +| (1 CPU) | 1.06x | 452 MB/s | - | 0.50x | 213 MB/s | - | +| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 10.48x | 6124 MB/s | 5.67% | 3.76x | 2197 MB/s | 12.60% | +| (1 CPU) | 0.97x | 568 MB/s | - | 0.46x | 271 MB/s | - | +| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 21.07x | 9020 MB/s | 6.36% | 6.91x | 2959 MB/s | 16.95% | +| (1 CPU) | 1.07x | 460 MB/s | - | 0.51x | 220 MB/s | - | + +### Legend + +* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core. +* `S2 throughput`: Throughput of S2 in MB/s. +* `S2 % smaller`: How many percent of the Snappy output size is S2 better. +* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy. +* `"better" throughput`: Throughput in MB/s when enabling "better" compression mode in S2. +* `"better" % smaller`: How many percent of the Snappy output size is S2 better when using "better" compression. + +There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads. + +Machine generated data gets by far the biggest compression boost, with size being reduced by up to 45% of Snappy size. + +The "better" compression mode sees a good improvement in all cases, but usually at a performance cost. + +Incompressible content (`sharnd.out.2gb`, 2GB random data) sees the smallest speedup. +This is likely dominated by synchronization overhead, which is confirmed by the fact that single threaded performance is higher (see above). + +## Decompression + +S2 attempts to create content that is also fast to decompress, except in "better" mode where the smallest representation is used. + +S2 vs Snappy **decompression** speed. Both operating on single core: + +| File | S2 Throughput | vs. Snappy | Better Throughput | vs. 
Snappy | +|-----------------------------------------------------------------------------------------------------|---------------|------------|-------------------|------------| +| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 2117 MB/s | 1.14x | 1738 MB/s | 0.94x | +| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 2401 MB/s | 1.25x | 2307 MB/s | 1.20x | +| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 2075 MB/s | 0.98x | 1764 MB/s | 0.83x | +| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 2967 MB/s | 1.05x | 2885 MB/s | 1.02x | +| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 4141 MB/s | 1.07x | 4184 MB/s | 1.08x | +| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 2264 MB/s | 1.12x | 2185 MB/s | 1.08x | +| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 1525 MB/s | 1.03x | 1347 MB/s | 0.91x | +| sharnd.out.2gb | 3813 MB/s | 0.79x | 3900 MB/s | 0.81x | +| [enwik9](http://mattmahoney.net/dc/textdata.html) | 1246 MB/s | 1.29x | 967 MB/s | 1.00x | +| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 1433 MB/s | 1.12x | 1203 MB/s | 0.94x | +| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 1284 MB/s | 1.32x | 1010 MB/s | 1.04x | + +### Legend + +* `S2 Throughput`: Decompression speed of S2 encoded content. +* `Better Throughput`: Decompression speed of S2 "better" encoded content. +* `vs Snappy`: Decompression speed of S2 "better" mode compared to Snappy and absolute speed. + + +While the decompression code hasn't changed, there is a significant speedup in decompression speed. +S2 prefers longer matches and will typically only find matches that are 6 bytes or longer. +While this reduces compression a bit, it improves decompression speed. 
+ +The "better" compression mode will actively look for shorter matches, which is why it has a decompression speed quite similar to Snappy. + +Without assembly decompression is also very fast; single goroutine decompression speed. No assembly: + +| File | S2 vs. Snappy | S2 throughput | +|--------------------------------|--------------|---------------| +| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s | +| 10gb.tar.s2 | 1.30x | 867.07 MB/s | +| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s | +| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s | +| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s | +| enwik9.s2 | 1.67x | 681.53 MB/s | +| adresser.json.s2 | 3.41x | 4230.53 MB/s | +| silesia.tar.s2 | 1.52x | 811.58 MB/s | + +Even though S2 typically compresses better than Snappy, decompression speed is always better. + +## Block compression + + +When compressing blocks no concurrent compression is performed, just as with Snappy. +This is because blocks are for smaller payloads and generally will not benefit from concurrent compression. + +An important change is that incompressible blocks will be at most 10 bytes bigger than the input. +In rare, worst case scenarios Snappy blocks could be significantly bigger than the input. + +### Mixed content blocks + +The most reliable benchmark is a wide dataset. +For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), +53927 files, total input size: 4,014,735,833 bytes. Single goroutine used. 
+ +| * | Input | Output | Reduction | MB/s | +|-------------------|------------|------------|-----------|--------| +| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** | +| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 | +| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 | +| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 | +| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 | +| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 | + +S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best". +"Better" mode provides the same compression speed as LZ4 with better compression ratio. + +When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression. + +As can be seen from the other benchmarks decompression should also be easier on the S2 generated output. + +Though they cannot be compared due to different decompression speeds here are the speed/size comparisons for +other Go compressors: + +| * | Input | Output | Reduction | MB/s | +|-------------------|------------|------------|-----------|--------| +| Zstd Fastest (Go) | 4014735833 | 794608518 | 80.21% | 236.04 | +| Zstd Best (Go) | 4014735833 | 704603356 | 82.45% | 35.63 | +| Deflate (Go) l1 | 4014735833 | 871294239 | 78.30% | 214.04 | +| Deflate (Go) l9 | 4014735833 | 730389060 | 81.81% | 41.17 | + +### Standard block compression + +Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns. +So individual benchmarks should only be seen as a guideline and the overall picture is more important. + +These micro-benchmarks are with data in cache and trained branch predictors. For a more realistic benchmark see the mixed content above. + +Block compression. Parallel benchmark running on 16 cores, 16 goroutines. + +AMD64 assembly is used for both S2 and Snappy. 
+ +| Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec | +|-----------------------|-------------|---------|--------------|-------------|-------------|-------------| +| html | 22843 | 21111 | 16246 MB/s | 17438 MB/s | 40972 MB/s | 49263 MB/s | +| urls.10K | 335492 | 287326 | 7943 MB/s | 9693 MB/s | 22523 MB/s | 26484 MB/s | +| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 273889 MB/s | 718321 MB/s | 827552 MB/s | +| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 17773 MB/s | 33691 MB/s | 52421 MB/s | +| paper-100k.pdf | 85304 | 84459 | 167546 MB/s | 101263 MB/s | 326905 MB/s | 291944 MB/s | +| html_x_4 | 92234 | 21113 | 15194 MB/s | 50670 MB/s | 30843 MB/s | 32217 MB/s | +| alice29.txt | 88034 | 85975 | 5936 MB/s | 6139 MB/s | 12882 MB/s | 20044 MB/s | +| asyoulik.txt | 77503 | 79650 | 5517 MB/s | 6366 MB/s | 12735 MB/s | 22806 MB/s | +| lcet10.txt | 234661 | 220670 | 6235 MB/s | 6067 MB/s | 14519 MB/s | 18697 MB/s | +| plrabn12.txt | 319267 | 317985 | 5159 MB/s | 5726 MB/s | 11923 MB/s | 19901 MB/s | +| geo.protodata | 23335 | 18690 | 21220 MB/s | 26529 MB/s | 56271 MB/s | 62540 MB/s | +| kppkn.gtb | 69526 | 65312 | 9732 MB/s | 8559 MB/s | 18491 MB/s | 18969 MB/s | +| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 15489 MB/s | 31883 MB/s | 38874 MB/s | +| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13000 MB/s | 48056 MB/s | 52341 MB/s | +| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12806 MB/s | 32378 MB/s | 46322 MB/s | +| alice29.txt (20000B) | 12686 | 13574 | 7733 MB/s | 11210 MB/s | 30566 MB/s | 58969 MB/s | + + +| Relative Perf | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed | +|-----------------------|-------------|------------------|----------|--------------| +| html | 22.31% | 7.58% | 1.07x | 1.20x | +| urls.10K | 47.78% | 14.36% | 1.22x | 1.18x | +| fireworks.jpeg | 99.95% | -0.05% | 0.78x | 1.15x | +| fireworks.jpeg (200B) | 73.00% | -6.16% | 2.00x | 1.56x | +| paper-100k.pdf | 83.30% | 0.99% | 0.60x 
| 0.89x | +| html_x_4 | 22.52% | 77.11% | 3.33x | 1.04x | +| alice29.txt | 57.88% | 2.34% | 1.03x | 1.56x | +| asyoulik.txt | 61.91% | -2.77% | 1.15x | 1.79x | +| lcet10.txt | 54.99% | 5.96% | 0.97x | 1.29x | +| plrabn12.txt | 66.26% | 0.40% | 1.11x | 1.67x | +| geo.protodata | 19.68% | 19.91% | 1.25x | 1.11x | +| kppkn.gtb | 37.72% | 6.06% | 0.88x | 1.03x | +| alice29.txt (128B) | 62.50% | -2.50% | 2.31x | 1.22x | +| alice29.txt (1000B) | 77.40% | 0.00% | 1.07x | 1.09x | +| alice29.txt (10000B) | 66.48% | -4.29% | 1.27x | 1.43x | +| alice29.txt (20000B) | 63.43% | -7.00% | 1.45x | 1.93x | + +Speed is generally at or above Snappy. Small blocks get a significant speedup, although at the expense of size. + +Decompression speed is better than Snappy, except in one case. + +Since payloads are very small the variance in terms of size is rather big, so they should only be seen as a general guideline. + +Size is on average around Snappy, but varies with content type. +In cases where compression is worse, it usually is compensated by a speed boost. + + +### Better compression + +Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns. +So individual benchmarks should only be seen as a guideline and the overall picture is more important. 
+ +| Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec | +|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------| +| html | 22843 | 19833 | 16246 MB/s | 7731 MB/s | 40972 MB/s | 40292 MB/s | +| urls.10K | 335492 | 253529 | 7943 MB/s | 3980 MB/s | 22523 MB/s | 20981 MB/s | +| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 9760 MB/s | 718321 MB/s | 823698 MB/s | +| fireworks.jpeg (200B) | 146 | 142 | 8869 MB/s | 594 MB/s | 33691 MB/s | 30101 MB/s | +| paper-100k.pdf | 85304 | 82915 | 167546 MB/s | 7470 MB/s | 326905 MB/s | 198869 MB/s | +| html_x_4 | 92234 | 19841 | 15194 MB/s | 23403 MB/s | 30843 MB/s | 30937 MB/s | +| alice29.txt | 88034 | 73218 | 5936 MB/s | 2945 MB/s | 12882 MB/s | 16611 MB/s | +| asyoulik.txt | 77503 | 66844 | 5517 MB/s | 2739 MB/s | 12735 MB/s | 14975 MB/s | +| lcet10.txt | 234661 | 190589 | 6235 MB/s | 3099 MB/s | 14519 MB/s | 16634 MB/s | +| plrabn12.txt | 319267 | 270828 | 5159 MB/s | 2600 MB/s | 11923 MB/s | 13382 MB/s | +| geo.protodata | 23335 | 18278 | 21220 MB/s | 11208 MB/s | 56271 MB/s | 57961 MB/s | +| kppkn.gtb | 69526 | 61851 | 9732 MB/s | 4556 MB/s | 18491 MB/s | 16524 MB/s | +| alice29.txt (128B) | 80 | 81 | 6691 MB/s | 529 MB/s | 31883 MB/s | 34225 MB/s | +| alice29.txt (1000B) | 774 | 748 | 12204 MB/s | 1943 MB/s | 48056 MB/s | 42068 MB/s | +| alice29.txt (10000B) | 6648 | 6234 | 10044 MB/s | 2949 MB/s | 32378 MB/s | 28813 MB/s | +| alice29.txt (20000B) | 12686 | 11584 | 7733 MB/s | 2822 MB/s | 30566 MB/s | 27315 MB/s | + + +| Relative Perf | Snappy size | Better size | Better Speed | Better dec | +|-----------------------|-------------|-------------|--------------|------------| +| html | 22.31% | 13.18% | 0.48x | 0.98x | +| urls.10K | 47.78% | 24.43% | 0.50x | 0.93x | +| fireworks.jpeg | 99.95% | -0.05% | 0.03x | 1.15x | +| fireworks.jpeg (200B) | 73.00% | 2.74% | 0.07x | 0.89x | +| paper-100k.pdf | 83.30% | 2.80% | 0.07x 
| 0.61x | +| html_x_4 | 22.52% | 78.49% | 0.04x | 1.00x | +| alice29.txt | 57.88% | 16.83% | 1.54x | 1.29x | +| asyoulik.txt | 61.91% | 13.75% | 0.50x | 1.18x | +| lcet10.txt | 54.99% | 18.78% | 0.50x | 1.15x | +| plrabn12.txt | 66.26% | 15.17% | 0.50x | 1.12x | +| geo.protodata | 19.68% | 21.67% | 0.50x | 1.03x | +| kppkn.gtb | 37.72% | 11.04% | 0.53x | 0.89x | +| alice29.txt (128B) | 62.50% | -1.25% | 0.47x | 1.07x | +| alice29.txt (1000B) | 77.40% | 3.36% | 0.08x | 0.88x | +| alice29.txt (10000B) | 66.48% | 6.23% | 0.16x | 0.89x | +| alice29.txt (20000B) | 63.43% | 8.69% | 0.29x | 0.89x | + +Except for the mostly incompressible JPEG image compression is better and usually in the +double digits in terms of percentage reduction over Snappy. + +The PDF sample shows a significant slowdown compared to Snappy, as this mode tries harder +to compress the data. Very small blocks are also not favorable for better compression, so throughput is way down. + +This mode aims to provide better compression at the expense of performance and achieves that +without a huge performance penalty, except on very small blocks. + +Decompression speed suffers a little compared to the regular S2 mode, +but still manages to be close to Snappy in spite of increased compression. + +# Best compression mode + +S2 offers a "best" compression mode. + +This will compress as much as possible with little regard to CPU usage. + +Mainly for offline compression, but where decompression speed should still +be high and compatible with other S2 compressed data. + +Some examples compared on 16 core CPU, amd64 assembly used: + +``` +* enwik10 +Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s +Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s +Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s + +* github-june-2days-2019.json +Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s +Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s +Best... 
6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s + +* nyc-taxi-data-10M.csv +Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s +Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s +Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s + +* 10gb.tar +Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s +Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s +Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s + +* consensus.db.10gb +Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s +Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s +Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s +``` + +Decompression speed should be around the same as using the 'better' compression mode. + +# Snappy Compatibility + +S2 now offers full compatibility with Snappy. + +This means that the efficient encoders of S2 can be used to generate fully Snappy compatible output. + +There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by +simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`. +This uses "better" mode for all operations. +If you would like more control, you can use the s2 package as described below: + +## Blocks + +Snappy compatible blocks can be generated with the S2 encoder. +Compression and speed are typically a bit better, and `MaxEncodedLen` is also smaller for smaller memory usage. Replace + +| Snappy | S2 replacement | +|----------------------------|-------------------------| +| snappy.Encode(...) | s2.EncodeSnappy(...) | +| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) | + +`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output. + +`s2.ConcatBlocks` is compatible with snappy blocks. 
+ +Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), +53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used: + +| Encoder | Size | MB/s | Reduction | +|-----------------------|------------|--------|-----------| +| snappy.Encode | 1128706759 | 725.59 | 71.89% | +| s2.EncodeSnappy | 1093823291 | 899.16 | 72.75% | +| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% | +| s2.EncodeSnappyBest | 944507998 | 66.00 | 76.47% | + +## Streams + +For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`. +All other options are available, but note that the block size limit is different for snappy. + +Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput: + +| File | snappy.NewWriter | S2 Snappy | S2 Snappy, Better | S2 Snappy, Best | +|-----------------------------|--------------------------|---------------------------|--------------------------|-------------------------| +| nyc-taxi-data-10M.csv | 1316042016 - 517.54MB/s | 1307003093 - 8406.29MB/s | 1174534014 - 4984.35MB/s | 1115904679 - 177.81MB/s | +| enwik10 | 5088294643 - 433.45MB/s | 5175840939 - 8454.52MB/s | 4560784526 - 4403.10MB/s | 4340299103 - 159.71MB/s | +| 10gb.tar | 6056946612 - 703.25MB/s | 6208571995 - 9035.75MB/s | 5741646126 - 2402.08MB/s | 5548973895 - 171.17MB/s | +| github-june-2days-2019.json | 1525176492 - 908.11MB/s | 1476519054 - 12625.93MB/s | 1400547532 - 6163.61MB/s | 1321887137 - 200.71MB/s | +| consensus.db.10gb | 5412897703 - 1054.38MB/s | 5354073487 - 12634.82MB/s | 5335069899 - 2472.23MB/s | 5201000954 - 166.32MB/s | + +# Decompression + +All decompression functions map directly to equivalent s2 functions. + +| Snappy | S2 replacement | +|------------------------|--------------------| +| snappy.Decode(...) | s2.Decode(...) | +| snappy.DecodedLen(...) | s2.DecodedLen(...) | +| snappy.NewReader(...)
| s2.NewReader(...) | + +Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip) +are also available for Snappy streams. + +If you know you are only decompressing snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize) +on your Reader will reduce memory consumption. + +# Concatenating blocks and streams. + +Concatenating streams will concatenate the output of both without recompressing them. +While this is inefficient in terms of compression it might be usable in certain scenarios. +The 10 byte 'stream identifier' of the second stream can optionally be stripped, but it is not a requirement. + +Blocks can be concatenated using the `ConcatBlocks` function. + +Snappy blocks/streams can safely be concatenated with S2 blocks and streams. + +# Format Extensions + +* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`. +* [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB). +* Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset. + +Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0. + +The length is specified by reading the 3-bit length specified in the tag and decode using this table: + +| Length | Actual Length | +|--------|----------------------| +| 0 | 4 | +| 1 | 5 | +| 2 | 6 | +| 3 | 7 | +| 4 | 8 | +| 5 | 8 + read 1 byte | +| 6 | 260 + read 2 bytes | +| 7 | 65540 + read 3 bytes | + +This allows any repeat offset + length to be represented by 2 to 5 bytes. + +Lengths are stored as little endian values. + +The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams. 
+ +Default streaming block size is 1MB. + +# LICENSE + +This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation. + +Use of this source code is governed by a BSD-style license that can be found in the LICENSE file. diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go new file mode 100644 index 00000000..d0ae5304 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode.go @@ -0,0 +1,565 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "encoding/binary" + "errors" + "io" +) + +var ( + // ErrCorrupt reports that the input is invalid. + ErrCorrupt = errors.New("s2: corrupt input") + // ErrCRC reports that the input failed CRC validation (streams only) + ErrCRC = errors.New("s2: corrupt input, crc mismatch") + // ErrTooLarge reports that the uncompressed length is too large. + ErrTooLarge = errors.New("s2: decoded block is too large") + // ErrUnsupported reports that the input isn't supported. + ErrUnsupported = errors.New("s2: unsupported input") +) + +// DecodedLen returns the length of the decoded block. +func DecodedLen(src []byte) (int, error) { + v, _, err := decodedLen(src) + return v, err +} + +// decodedLen returns the length of the decoded block and the number of bytes +// that the length header occupied. +func decodedLen(src []byte) (blockLen, headerLen int, err error) { + v, n := binary.Uvarint(src) + if n <= 0 || v > 0xffffffff { + return 0, 0, ErrCorrupt + } + + const wordSize = 32 << (^uint(0) >> 32 & 1) + if wordSize == 32 && v > 0x7fffffff { + return 0, 0, ErrTooLarge + } + return int(v), n, nil +} + +const ( + decodeErrCodeCorrupt = 1 +) + +// Decode returns the decoded form of src. 
The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +func Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if dLen <= cap(dst) { + dst = dst[:dLen] + } else { + dst = make([]byte, dLen) + } + if s2Decode(dst, src[s:]) != 0 { + return nil, ErrCorrupt + } + return dst, nil +} + +// NewReader returns a new Reader that decompresses from r, using the framing +// format described at +// https://github.com/google/snappy/blob/master/framing_format.txt with S2 changes. +func NewReader(r io.Reader, opts ...ReaderOption) *Reader { + nr := Reader{ + r: r, + maxBlock: maxBlockSize, + } + for _, opt := range opts { + if err := opt(&nr); err != nil { + nr.err = err + return &nr + } + } + nr.maxBufSize = MaxEncodedLen(nr.maxBlock) + checksumSize + if nr.lazyBuf > 0 { + nr.buf = make([]byte, MaxEncodedLen(nr.lazyBuf)+checksumSize) + } else { + nr.buf = make([]byte, MaxEncodedLen(defaultBlockSize)+checksumSize) + } + nr.paramsOK = true + return &nr +} + +// ReaderOption is an option for creating a decoder. +type ReaderOption func(*Reader) error + +// ReaderMaxBlockSize allows to control allocations if the stream +// has been compressed with a smaller WriterBlockSize, or with the default 1MB. +// Blocks must be this size or smaller to decompress, +// otherwise the decoder will return ErrUnsupported. +// +// For streams compressed with Snappy this can safely be set to 64KB (64 << 10). +// +// Default is the maximum limit of 4MB. +func ReaderMaxBlockSize(blockSize int) ReaderOption { + return func(r *Reader) error { + if blockSize > maxBlockSize || blockSize <= 0 { + return errors.New("s2: block size too large. 
Must be <= 4MB and > 0") + } + if r.lazyBuf == 0 && blockSize < defaultBlockSize { + r.lazyBuf = blockSize + } + r.maxBlock = blockSize + return nil + } +} + +// ReaderAllocBlock allows to control upfront stream allocations +// and not allocate for frames bigger than this initially. +// If frames bigger than this is seen a bigger buffer will be allocated. +// +// Default is 1MB, which is default output size. +func ReaderAllocBlock(blockSize int) ReaderOption { + return func(r *Reader) error { + if blockSize > maxBlockSize || blockSize < 1024 { + return errors.New("s2: invalid ReaderAllocBlock. Must be <= 4MB and >= 1024") + } + r.lazyBuf = blockSize + return nil + } +} + +// Reader is an io.Reader that can read Snappy-compressed bytes. +type Reader struct { + r io.Reader + err error + decoded []byte + buf []byte + // decoded[i:j] contains decoded bytes that have not yet been passed on. + i, j int + // maximum block size allowed. + maxBlock int + // maximum expected buffer size. + maxBufSize int + // alloc a buffer this size if > 0. + lazyBuf int + readHeader bool + paramsOK bool + snappyFrame bool +} + +// ensureBufferSize will ensure that the buffer can take at least n bytes. +// If false is returned the buffer exceeds maximum allowed size. +func (r *Reader) ensureBufferSize(n int) bool { + if len(r.buf) >= n { + return true + } + if n > r.maxBufSize { + r.err = ErrCorrupt + return false + } + // Realloc buffer. + r.buf = make([]byte, n) + return true +} + +// Reset discards any buffered data, resets all state, and switches the Snappy +// reader to read from r. This permits reusing a Reader rather than allocating +// a new one. 
+func (r *Reader) Reset(reader io.Reader) { + if !r.paramsOK { + return + } + r.r = reader + r.err = nil + r.i = 0 + r.j = 0 + r.readHeader = false +} + +func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) { + if _, r.err = io.ReadFull(r.r, p); r.err != nil { + if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + } + return false + } + return true +} + +// skipN will skip n bytes. +// If the supplied reader supports seeking, that is used. +// tmp is used as a temporary buffer for reading. +// The supplied slice does not need to be the size of the read. +func (r *Reader) skipN(tmp []byte, n int, allowEOF bool) (ok bool) { + if rs, ok := r.r.(io.ReadSeeker); ok { + _, err := rs.Seek(int64(n), io.SeekCurrent) + if err == nil { + return true + } + if err == io.ErrUnexpectedEOF || (err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + return false + } + } + for n > 0 { + if n < len(tmp) { + tmp = tmp[:n] + } + if _, r.err = io.ReadFull(r.r, tmp); r.err != nil { + if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + } + return false + } + n -= len(tmp) + } + return true +} + +// Read satisfies the io.Reader interface. +func (r *Reader) Read(p []byte) (int, error) { + if r.err != nil { + return 0, r.err + } + for { + if r.i < r.j { + n := copy(p, r.decoded[r.i:r.j]) + r.i += n + return n, nil + } + if !r.readFull(r.buf[:4], true) { + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + // Section 4.2. Compressed data (chunk type 0x00).
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return 0, r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > len(r.decoded) { + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + r.decoded = make([]byte, n) + } + if _, err := Decode(r.decoded, buf); err != nil { + r.err = err + return 0, r.err + } + if crc(r.decoded[:n]) != checksum { + r.err = ErrCRC + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeUncompressedData: + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return 0, r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf. + n := chunkLen - checksumSize + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + if n > len(r.decoded) { + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + r.decoded = make([]byte, n) + } + if !r.readFull(r.decoded[:n], false) { + return 0, r.err + } + if crc(r.decoded[:n]) != checksum { + r.err = ErrCRC + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). 
+ if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return 0, r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return 0, r.err + } + if string(r.buf[:len(magicBody)]) != magicBody { + if string(r.buf[:len(magicBody)]) != magicBodySnappy { + r.err = ErrCorrupt + return 0, r.err + } else { + r.snappyFrame = true + } + } else { + r.snappyFrame = false + } + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + r.err = ErrUnsupported + return 0, r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if chunkLen > maxBlockSize { + r.err = ErrUnsupported + return 0, r.err + } + + if !r.skipN(r.buf, chunkLen, false) { + return 0, r.err + } + } +} + +// Skip will skip n bytes forward in the decompressed output. +// For larger skips this consumes less CPU and is faster than reading output and discarding it. +// CRC is not checked on skipped blocks. +// io.ErrUnexpectedEOF is returned if the stream ends before all bytes have been skipped. +// If a decoding error is encountered subsequent calls to Read will also fail. +func (r *Reader) Skip(n int64) error { + if n < 0 { + return errors.New("attempted negative skip") + } + if r.err != nil { + return r.err + } + + for n > 0 { + if r.i < r.j { + // Skip in buffer. + // decoded[i:j] contains decoded bytes that have not yet been passed on. + left := int64(r.j - r.i) + if left >= n { + r.i += int(n) + return nil + } + n -= int64(r.j - r.i) + r.i, r.j = 0, 0 + } + + // Buffer empty; read blocks until we have content. 
+ if !r.readFull(r.buf[:4], true) { + if r.err == io.EOF { + r.err = io.ErrUnexpectedEOF + } + return r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + dLen, err := DecodedLen(buf) + if err != nil { + r.err = err + return r.err + } + if dLen > r.maxBlock { + r.err = ErrCorrupt + return r.err + } + // Check if destination is within this block + if int64(dLen) > n { + if len(r.decoded) < dLen { + r.decoded = make([]byte, dLen) + } + if _, err := Decode(r.decoded, buf); err != nil { + r.err = err + return r.err + } + if crc(r.decoded[:dLen]) != checksum { + r.err = ErrCorrupt + return r.err + } + } else { + // Skip block completely + n -= int64(dLen) + dLen = 0 + } + r.i, r.j = 0, dLen + continue + case chunkTypeUncompressedData: + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf, false) { + return r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf.
+ n2 := chunkLen - checksumSize + if n2 > len(r.decoded) { + if n2 > r.maxBlock { + r.err = ErrCorrupt + return r.err + } + r.decoded = make([]byte, n2) + } + if !r.readFull(r.decoded[:n2], false) { + return r.err + } + if int64(n2) < n { + if crc(r.decoded[:n2]) != checksum { + r.err = ErrCorrupt + return r.err + } + } + r.i, r.j = 0, n2 + continue + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return r.err + } + if string(r.buf[:len(magicBody)]) != magicBody { + if string(r.buf[:len(magicBody)]) != magicBodySnappy { + r.err = ErrCorrupt + return r.err + } + } + + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + r.err = ErrUnsupported + return r.err + } + if chunkLen > maxBlockSize { + r.err = ErrUnsupported + return r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if !r.skipN(r.buf, chunkLen, false) { + return r.err + } + } + return nil +} + +// ReadByte satisfies the io.ByteReader interface. +func (r *Reader) ReadByte() (byte, error) { + if r.err != nil { + return 0, r.err + } + if r.i < r.j { + c := r.decoded[r.i] + r.i++ + return c, nil + } + var tmp [1]byte + for i := 0; i < 10; i++ { + n, err := r.Read(tmp[:]) + if err != nil { + return 0, err + } + if n == 1 { + return tmp[0], nil + } + } + return 0, io.ErrNoProgress +} diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s new file mode 100644 index 00000000..9b105e03 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s @@ -0,0 +1,568 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" + +#define R_TMP0 AX +#define R_TMP1 BX +#define R_LEN CX +#define R_OFF DX +#define R_SRC SI +#define R_DST DI +#define R_DBASE R8 +#define R_DLEN R9 +#define R_DEND R10 +#define R_SBASE R11 +#define R_SLEN R12 +#define R_SEND R13 +#define R_TMP2 R14 +#define R_TMP3 R15 + +// The asm code generally follows the pure Go code in decode_other.go, except +// where marked with a "!!!". + +// func decode(dst, src []byte) int +// +// All local variables fit into registers. The non-zero stack size is only to +// spill registers and push args when issuing a CALL. The register allocation: +// - R_TMP0 scratch +// - R_TMP1 scratch +// - R_LEN length or x (shared) +// - R_OFF offset +// - R_SRC &src[s] +// - R_DST &dst[d] +// + R_DBASE dst_base +// + R_DLEN dst_len +// + R_DEND dst_base + dst_len +// + R_SBASE src_base +// + R_SLEN src_len +// + R_SEND src_base + src_len +// - R_TMP2 used by doCopy +// - R_TMP3 used by doCopy +// +// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the +// function, and after a CALL returns, and are not otherwise modified. +// +// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. +// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. +TEXT ·s2Decode(SB), NOSPLIT, $48-56 + // Initialize R_SRC, R_DST and R_DBASE-R_SEND. 
+ MOVQ dst_base+0(FP), R_DBASE + MOVQ dst_len+8(FP), R_DLEN + MOVQ R_DBASE, R_DST + MOVQ R_DBASE, R_DEND + ADDQ R_DLEN, R_DEND + MOVQ src_base+24(FP), R_SBASE + MOVQ src_len+32(FP), R_SLEN + MOVQ R_SBASE, R_SRC + MOVQ R_SBASE, R_SEND + ADDQ R_SLEN, R_SEND + XORQ R_OFF, R_OFF + +loop: + // for s < len(src) + CMPQ R_SRC, R_SEND + JEQ end + + // R_LEN = uint32(src[s]) + // + // switch src[s] & 0x03 + MOVBLZX (R_SRC), R_LEN + MOVL R_LEN, R_TMP1 + ANDL $3, R_TMP1 + CMPL R_TMP1, $1 + JAE tagCopy + + // ---------------------------------------- + // The code below handles literal tags. + + // case tagLiteral: + // x := uint32(src[s] >> 2) + // switch + SHRL $2, R_LEN + CMPL R_LEN, $60 + JAE tagLit60Plus + + // case x < 60: + // s++ + INCQ R_SRC + +doLit: + // This is the end of the inner "switch", when we have a literal tag. + // + // We assume that R_LEN == x and x fits in a uint32, where x is the variable + // used in the pure Go decode_other.go code. + + // length = int(x) + 1 + // + // Unlike the pure Go code, we don't need to check if length <= 0 because + // R_LEN can hold 64 bits, so the increment cannot overflow. + INCQ R_LEN + + // Prepare to check if copying length bytes will run past the end of dst or + // src. + // + // R_TMP0 = len(dst) - d + // R_TMP1 = len(src) - s + MOVQ R_DEND, R_TMP0 + SUBQ R_DST, R_TMP0 + MOVQ R_SEND, R_TMP1 + SUBQ R_SRC, R_TMP1 + + // !!! Try a faster technique for short (16 or fewer bytes) copies. + // + // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { + // goto callMemmove // Fall back on calling runtime·memmove. + // } + // + // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s + // against 21 instead of 16, because it cannot assume that all of its input + // is contiguous in memory and so it needs to leave enough source bytes to + // read the next tag without refilling buffers, but Go's Decode assumes + // contiguousness (the src argument is a []byte). 
+ CMPQ R_LEN, $16 + JGT callMemmove + CMPQ R_TMP0, $16 + JLT callMemmove + CMPQ R_TMP1, $16 + JLT callMemmove + + // !!! Implement the copy from src to dst as a 16-byte load and store. + // (Decode's documentation says that dst and src must not overlap.) + // + // This always copies 16 bytes, instead of only length bytes, but that's + // OK. If the input is a valid Snappy encoding then subsequent iterations + // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a + // non-nil error), so the overrun will be ignored. + // + // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or + // 16-byte loads and stores. This technique probably wouldn't be as + // effective on architectures that are fussier about alignment. + MOVOU 0(R_SRC), X0 + MOVOU X0, 0(R_DST) + + // d += length + // s += length + ADDQ R_LEN, R_DST + ADDQ R_LEN, R_SRC + JMP loop + +callMemmove: + // if length > len(dst)-d || length > len(src)-s { etc } + CMPQ R_LEN, R_TMP0 + JGT errCorrupt + CMPQ R_LEN, R_TMP1 + JGT errCorrupt + + // copy(dst[d:], src[s:s+length]) + // + // This means calling runtime·memmove(&dst[d], &src[s], length), so we push + // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those + // three registers to the stack, to save local variables across the CALL. + MOVQ R_DST, 0(SP) + MOVQ R_SRC, 8(SP) + MOVQ R_LEN, 16(SP) + MOVQ R_DST, 24(SP) + MOVQ R_SRC, 32(SP) + MOVQ R_LEN, 40(SP) + MOVQ R_OFF, 48(SP) + CALL runtime·memmove(SB) + + // Restore local variables: unspill registers from the stack and + // re-calculate R_DBASE-R_SEND. 
+ MOVQ 24(SP), R_DST + MOVQ 32(SP), R_SRC + MOVQ 40(SP), R_LEN + MOVQ 48(SP), R_OFF + MOVQ dst_base+0(FP), R_DBASE + MOVQ dst_len+8(FP), R_DLEN + MOVQ R_DBASE, R_DEND + ADDQ R_DLEN, R_DEND + MOVQ src_base+24(FP), R_SBASE + MOVQ src_len+32(FP), R_SLEN + MOVQ R_SBASE, R_SEND + ADDQ R_SLEN, R_SEND + + // d += length + // s += length + ADDQ R_LEN, R_DST + ADDQ R_LEN, R_SRC + JMP loop + +tagLit60Plus: + // !!! This fragment does the + // + // s += x - 58; if uint(s) > uint(len(src)) { etc } + // + // checks. In the asm version, we code it once instead of once per switch case. + ADDQ R_LEN, R_SRC + SUBQ $58, R_SRC + CMPQ R_SRC, R_SEND + JA errCorrupt + + // case x == 60: + CMPL R_LEN, $61 + JEQ tagLit61 + JA tagLit62Plus + + // x = uint32(src[s-1]) + MOVBLZX -1(R_SRC), R_LEN + JMP doLit + +tagLit61: + // case x == 61: + // x = uint32(src[s-2]) | uint32(src[s-1])<<8 + MOVWLZX -2(R_SRC), R_LEN + JMP doLit + +tagLit62Plus: + CMPL R_LEN, $62 + JA tagLit63 + + // case x == 62: + // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + // We read one byte, safe to read one back, since we are just reading tag. + // x = binary.LittleEndian.Uint32(src[s-1:]) >> 8 + MOVL -4(R_SRC), R_LEN + SHRL $8, R_LEN + JMP doLit + +tagLit63: + // case x == 63: + // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + MOVL -4(R_SRC), R_LEN + JMP doLit + +// The code above handles literal tags. +// ---------------------------------------- +// The code below handles copy tags. 
+ +tagCopy4: + // case tagCopy4: + // s += 5 + ADDQ $5, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = 1 + int(src[s-5])>>2 + SHRQ $2, R_LEN + INCQ R_LEN + + // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + MOVLQZX -4(R_SRC), R_OFF + JMP doCopy + +tagCopy2: + // case tagCopy2: + // s += 3 + ADDQ $3, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = 1 + int(src[s-3])>>2 + SHRQ $2, R_LEN + INCQ R_LEN + + // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + MOVWQZX -2(R_SRC), R_OFF + JMP doCopy + +tagCopy: + // We have a copy tag. We assume that: + // - R_TMP1 == src[s] & 0x03 + // - R_LEN == src[s] + CMPQ R_TMP1, $2 + JEQ tagCopy2 + JA tagCopy4 + + // case tagCopy1: + // s += 2 + ADDQ $2, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + // length = 4 + int(src[s-2])>>2&0x7 + MOVBQZX -1(R_SRC), R_TMP1 + MOVQ R_LEN, R_TMP0 + SHRQ $2, R_LEN + ANDQ $0xe0, R_TMP0 + ANDQ $7, R_LEN + SHLQ $3, R_TMP0 + ADDQ $4, R_LEN + ORQ R_TMP1, R_TMP0 + + // check if repeat code, ZF set by ORQ. + JZ repeatCode + + // This is a regular copy, transfer our temporary value to R_OFF (length) + MOVQ R_TMP0, R_OFF + JMP doCopy + +// This is a repeat code. +repeatCode: + // If length < 9, reuse last offset, with the length already calculated. + CMPQ R_LEN, $9 + JL doCopyRepeat + + // Read additional bytes for length. + JE repeatLen1 + + // Rare, so the extra branch shouldn't hurt too much. + CMPQ R_LEN, $10 + JE repeatLen2 + JMP repeatLen3 + +// Read repeat lengths. 
+repeatLen1: + // s ++ + ADDQ $1, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = src[s-1] + 8 + MOVBQZX -1(R_SRC), R_LEN + ADDL $8, R_LEN + JMP doCopyRepeat + +repeatLen2: + // s +=2 + ADDQ $2, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8) + MOVWQZX -2(R_SRC), R_LEN + ADDL $260, R_LEN + JMP doCopyRepeat + +repeatLen3: + // s +=3 + ADDQ $3, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16) + // Read one byte further back (just part of the tag, shifted out) + MOVL -4(R_SRC), R_LEN + SHRL $8, R_LEN + ADDL $65540, R_LEN + JMP doCopyRepeat + +doCopy: + // This is the end of the outer "switch", when we have a copy tag. + // + // We assume that: + // - R_LEN == length && R_LEN > 0 + // - R_OFF == offset + + // if d < offset { etc } + MOVQ R_DST, R_TMP1 + SUBQ R_DBASE, R_TMP1 + CMPQ R_TMP1, R_OFF + JLT errCorrupt + + // Repeat values can skip the test above, since any offset > 0 will be in dst. +doCopyRepeat: + // if offset <= 0 { etc } + CMPQ R_OFF, $0 + JLE errCorrupt + + // if length > len(dst)-d { etc } + MOVQ R_DEND, R_TMP1 + SUBQ R_DST, R_TMP1 + CMPQ R_LEN, R_TMP1 + JGT errCorrupt + + // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length + // + // Set: + // - R_TMP2 = len(dst)-d + // - R_TMP3 = &dst[d-offset] + MOVQ R_DEND, R_TMP2 + SUBQ R_DST, R_TMP2 + MOVQ R_DST, R_TMP3 + SUBQ R_OFF, R_TMP3 + + // !!! Try a faster technique for short (16 or fewer bytes) forward copies. + // + // First, try using two 8-byte load/stores, similar to the doLit technique + // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is + // still OK if offset >= 8. 
Note that this has to be two 8-byte load/stores + // and not one 16-byte load/store, and the first store has to be before the + // second load, due to the overlap if offset is in the range [8, 16). + // + // if length > 16 || offset < 8 || len(dst)-d < 16 { + // goto slowForwardCopy + // } + // copy 16 bytes + // d += length + CMPQ R_LEN, $16 + JGT slowForwardCopy + CMPQ R_OFF, $8 + JLT slowForwardCopy + CMPQ R_TMP2, $16 + JLT slowForwardCopy + MOVQ 0(R_TMP3), R_TMP0 + MOVQ R_TMP0, 0(R_DST) + MOVQ 8(R_TMP3), R_TMP1 + MOVQ R_TMP1, 8(R_DST) + ADDQ R_LEN, R_DST + JMP loop + +slowForwardCopy: + // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we + // can still try 8-byte load stores, provided we can overrun up to 10 extra + // bytes. As above, the overrun will be fixed up by subsequent iterations + // of the outermost loop. + // + // The C++ snappy code calls this technique IncrementalCopyFastPath. Its + // commentary says: + // + // ---- + // + // The main part of this loop is a simple copy of eight bytes at a time + // until we've copied (at least) the requested amount of bytes. However, + // if d and d-offset are less than eight bytes apart (indicating a + // repeating pattern of length < 8), we first need to expand the pattern in + // order to get the correct results. For instance, if the buffer looks like + // this, with the eight-byte <d-offset> and <d> patterns marked as + // intervals: + // + // abxxxxxxxxxxxx + // [------] d-offset + // [------] d + // + // a single eight-byte copy from <d-offset> to <d> will repeat the pattern + // once, after which we can move <d> two bytes without moving <d-offset>: + // + // ababxxxxxxxxxx + // [------] d-offset + // [------] d + // + // and repeat the exercise until the two no longer overlap. + // + // This allows us to do very well in the special case of one single byte + // repeated many times, without taking a big hit for more general cases. 
+ // + // The worst case of extra writing past the end of the match occurs when + // offset == 1 and length == 1; the last copy will read from byte positions + // [0..7] and write to [4..11], whereas it was only supposed to write to + // position 1. Thus, ten excess bytes. + // + // ---- + // + // That "10 byte overrun" worst case is confirmed by Go's + // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy + // and finishSlowForwardCopy algorithm. + // + // if length > len(dst)-d-10 { + // goto verySlowForwardCopy + // } + SUBQ $10, R_TMP2 + CMPQ R_LEN, R_TMP2 + JGT verySlowForwardCopy + + // We want to keep the offset, so we use R_TMP2 from here. + MOVQ R_OFF, R_TMP2 + +makeOffsetAtLeast8: + // !!! As above, expand the pattern so that offset >= 8 and we can use + // 8-byte load/stores. + // + // for offset < 8 { + // copy 8 bytes from dst[d-offset:] to dst[d:] + // length -= offset + // d += offset + // offset += offset + // // The two previous lines together means that d-offset, and therefore + // // R_TMP3, is unchanged. + // } + CMPQ R_TMP2, $8 + JGE fixUpSlowForwardCopy + MOVQ (R_TMP3), R_TMP1 + MOVQ R_TMP1, (R_DST) + SUBQ R_TMP2, R_LEN + ADDQ R_TMP2, R_DST + ADDQ R_TMP2, R_TMP2 + JMP makeOffsetAtLeast8 + +fixUpSlowForwardCopy: + // !!! Add length (which might be negative now) to d (implied by R_DST being + // &dst[d]) so that d ends up at the right place when we jump back to the + // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if + // length is positive, copying the remaining length bytes will write to the + // right place. + MOVQ R_DST, R_TMP0 + ADDQ R_LEN, R_DST + +finishSlowForwardCopy: + // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative + // length means that we overrun, but as above, that will be fixed up by + // subsequent iterations of the outermost loop. 
+ CMPQ R_LEN, $0 + JLE loop + MOVQ (R_TMP3), R_TMP1 + MOVQ R_TMP1, (R_TMP0) + ADDQ $8, R_TMP3 + ADDQ $8, R_TMP0 + SUBQ $8, R_LEN + JMP finishSlowForwardCopy + +verySlowForwardCopy: + // verySlowForwardCopy is a simple implementation of forward copy. In C + // parlance, this is a do/while loop instead of a while loop, since we know + // that length > 0. In Go syntax: + // + // for { + // dst[d] = dst[d - offset] + // d++ + // length-- + // if length == 0 { + // break + // } + // } + MOVB (R_TMP3), R_TMP1 + MOVB R_TMP1, (R_DST) + INCQ R_TMP3 + INCQ R_DST + DECQ R_LEN + JNZ verySlowForwardCopy + JMP loop + +// The code above handles copy tags. +// ---------------------------------------- + +end: + // This is the end of the "for s < len(src)". + // + // if d != len(dst) { etc } + CMPQ R_DST, R_DEND + JNE errCorrupt + + // return 0 + MOVQ $0, ret+48(FP) + RET + +errCorrupt: + // return decodeErrCodeCorrupt + MOVQ $1, ret+48(FP) + RET diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s new file mode 100644 index 00000000..4b63d508 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s @@ -0,0 +1,574 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" + +#define R_TMP0 R2 +#define R_TMP1 R3 +#define R_LEN R4 +#define R_OFF R5 +#define R_SRC R6 +#define R_DST R7 +#define R_DBASE R8 +#define R_DLEN R9 +#define R_DEND R10 +#define R_SBASE R11 +#define R_SLEN R12 +#define R_SEND R13 +#define R_TMP2 R14 +#define R_TMP3 R15 + +// TEST_SRC will check if R_SRC is <= SRC_END +#define TEST_SRC() \ + CMP R_SEND, R_SRC \ + BGT errCorrupt + +// MOVD R_SRC, R_TMP1 +// SUB R_SBASE, R_TMP1, R_TMP1 +// CMP R_SLEN, R_TMP1 +// BGT errCorrupt + +// The asm code generally follows the pure Go code in decode_other.go, except +// where marked with a "!!!". + +// func decode(dst, src []byte) int +// +// All local variables fit into registers. The non-zero stack size is only to +// spill registers and push args when issuing a CALL. The register allocation: +// - R_TMP0 scratch +// - R_TMP1 scratch +// - R_LEN length or x +// - R_OFF offset +// - R_SRC &src[s] +// - R_DST &dst[d] +// + R_DBASE dst_base +// + R_DLEN dst_len +// + R_DEND dst_base + dst_len +// + R_SBASE src_base +// + R_SLEN src_len +// + R_SEND src_base + src_len +// - R_TMP2 used by doCopy +// - R_TMP3 used by doCopy +// +// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the +// function, and after a CALL returns, and are not otherwise modified. +// +// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. +// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. +TEXT ·s2Decode(SB), NOSPLIT, $56-64 + // Initialize R_SRC, R_DST and R_DBASE-R_SEND. 
+ MOVD dst_base+0(FP), R_DBASE + MOVD dst_len+8(FP), R_DLEN + MOVD R_DBASE, R_DST + MOVD R_DBASE, R_DEND + ADD R_DLEN, R_DEND, R_DEND + MOVD src_base+24(FP), R_SBASE + MOVD src_len+32(FP), R_SLEN + MOVD R_SBASE, R_SRC + MOVD R_SBASE, R_SEND + ADD R_SLEN, R_SEND, R_SEND + MOVD $0, R_OFF + +loop: + // for s < len(src) + CMP R_SEND, R_SRC + BEQ end + + // R_LEN = uint32(src[s]) + // + // switch src[s] & 0x03 + MOVBU (R_SRC), R_LEN + MOVW R_LEN, R_TMP1 + ANDW $3, R_TMP1 + MOVW $1, R1 + CMPW R1, R_TMP1 + BGE tagCopy + + // ---------------------------------------- + // The code below handles literal tags. + + // case tagLiteral: + // x := uint32(src[s] >> 2) + // switch + MOVW $60, R1 + LSRW $2, R_LEN, R_LEN + CMPW R_LEN, R1 + BLS tagLit60Plus + + // case x < 60: + // s++ + ADD $1, R_SRC, R_SRC + +doLit: + // This is the end of the inner "switch", when we have a literal tag. + // + // We assume that R_LEN == x and x fits in a uint32, where x is the variable + // used in the pure Go decode_other.go code. + + // length = int(x) + 1 + // + // Unlike the pure Go code, we don't need to check if length <= 0 because + // R_LEN can hold 64 bits, so the increment cannot overflow. + ADD $1, R_LEN, R_LEN + + // Prepare to check if copying length bytes will run past the end of dst or + // src. + // + // R_TMP0 = len(dst) - d + // R_TMP1 = len(src) - s + MOVD R_DEND, R_TMP0 + SUB R_DST, R_TMP0, R_TMP0 + MOVD R_SEND, R_TMP1 + SUB R_SRC, R_TMP1, R_TMP1 + + // !!! Try a faster technique for short (16 or fewer bytes) copies. + // + // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { + // goto callMemmove // Fall back on calling runtime·memmove. + // } + // + // The C++ snappy code calls this TryFastAppend. 
It also checks len(src)-s + // against 21 instead of 16, because it cannot assume that all of its input + // is contiguous in memory and so it needs to leave enough source bytes to + // read the next tag without refilling buffers, but Go's Decode assumes + // contiguousness (the src argument is a []byte). + CMP $16, R_LEN + BGT callMemmove + CMP $16, R_TMP0 + BLT callMemmove + CMP $16, R_TMP1 + BLT callMemmove + + // !!! Implement the copy from src to dst as a 16-byte load and store. + // (Decode's documentation says that dst and src must not overlap.) + // + // This always copies 16 bytes, instead of only length bytes, but that's + // OK. If the input is a valid Snappy encoding then subsequent iterations + // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a + // non-nil error), so the overrun will be ignored. + // + // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or + // 16-byte loads and stores. This technique probably wouldn't be as + // effective on architectures that are fussier about alignment. + LDP 0(R_SRC), (R_TMP2, R_TMP3) + STP (R_TMP2, R_TMP3), 0(R_DST) + + // d += length + // s += length + ADD R_LEN, R_DST, R_DST + ADD R_LEN, R_SRC, R_SRC + B loop + +callMemmove: + // if length > len(dst)-d || length > len(src)-s { etc } + CMP R_TMP0, R_LEN + BGT errCorrupt + CMP R_TMP1, R_LEN + BGT errCorrupt + + // copy(dst[d:], src[s:s+length]) + // + // This means calling runtime·memmove(&dst[d], &src[s], length), so we push + // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those + // three registers to the stack, to save local variables across the CALL. + MOVD R_DST, 8(RSP) + MOVD R_SRC, 16(RSP) + MOVD R_LEN, 24(RSP) + MOVD R_DST, 32(RSP) + MOVD R_SRC, 40(RSP) + MOVD R_LEN, 48(RSP) + MOVD R_OFF, 56(RSP) + CALL runtime·memmove(SB) + + // Restore local variables: unspill registers from the stack and + // re-calculate R_DBASE-R_SEND. 
+ MOVD 32(RSP), R_DST + MOVD 40(RSP), R_SRC + MOVD 48(RSP), R_LEN + MOVD 56(RSP), R_OFF + MOVD dst_base+0(FP), R_DBASE + MOVD dst_len+8(FP), R_DLEN + MOVD R_DBASE, R_DEND + ADD R_DLEN, R_DEND, R_DEND + MOVD src_base+24(FP), R_SBASE + MOVD src_len+32(FP), R_SLEN + MOVD R_SBASE, R_SEND + ADD R_SLEN, R_SEND, R_SEND + + // d += length + // s += length + ADD R_LEN, R_DST, R_DST + ADD R_LEN, R_SRC, R_SRC + B loop + +tagLit60Plus: + // !!! This fragment does the + // + // s += x - 58; if uint(s) > uint(len(src)) { etc } + // + // checks. In the asm version, we code it once instead of once per switch case. + ADD R_LEN, R_SRC, R_SRC + SUB $58, R_SRC, R_SRC + TEST_SRC() + + // case x == 60: + MOVW $61, R1 + CMPW R1, R_LEN + BEQ tagLit61 + BGT tagLit62Plus + + // x = uint32(src[s-1]) + MOVBU -1(R_SRC), R_LEN + B doLit + +tagLit61: + // case x == 61: + // x = uint32(src[s-2]) | uint32(src[s-1])<<8 + MOVHU -2(R_SRC), R_LEN + B doLit + +tagLit62Plus: + CMPW $62, R_LEN + BHI tagLit63 + + // case x == 62: + // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + MOVHU -3(R_SRC), R_LEN + MOVBU -1(R_SRC), R_TMP1 + ORR R_TMP1<<16, R_LEN + B doLit + +tagLit63: + // case x == 63: + // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + MOVWU -4(R_SRC), R_LEN + B doLit + + // The code above handles literal tags. + // ---------------------------------------- + // The code below handles copy tags. 
+ +tagCopy4: + // case tagCopy4: + // s += 5 + ADD $5, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + MOVD R_SRC, R_TMP1 + SUB R_SBASE, R_TMP1, R_TMP1 + CMP R_SLEN, R_TMP1 + BGT errCorrupt + + // length = 1 + int(src[s-5])>>2 + MOVD $1, R1 + ADD R_LEN>>2, R1, R_LEN + + // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + MOVWU -4(R_SRC), R_OFF + B doCopy + +tagCopy2: + // case tagCopy2: + // s += 3 + ADD $3, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = 1 + int(src[s-3])>>2 + MOVD $1, R1 + ADD R_LEN>>2, R1, R_LEN + + // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + MOVHU -2(R_SRC), R_OFF + B doCopy + +tagCopy: + // We have a copy tag. We assume that: + // - R_TMP1 == src[s] & 0x03 + // - R_LEN == src[s] + CMP $2, R_TMP1 + BEQ tagCopy2 + BGT tagCopy4 + + // case tagCopy1: + // s += 2 + ADD $2, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + // Calculate offset in R_TMP0 in case it is a repeat. + MOVD R_LEN, R_TMP0 + AND $0xe0, R_TMP0 + MOVBU -1(R_SRC), R_TMP1 + ORR R_TMP0<<3, R_TMP1, R_TMP0 + + // length = 4 + int(src[s-2])>>2&0x7 + MOVD $7, R1 + AND R_LEN>>2, R1, R_LEN + ADD $4, R_LEN, R_LEN + + // check if repeat code with offset 0. + CMP $0, R_TMP0 + BEQ repeatCode + + // This is a regular copy, transfer our temporary value to R_OFF (offset) + MOVD R_TMP0, R_OFF + B doCopy + + // This is a repeat code. +repeatCode: + // If length < 9, reuse last offset, with the length already calculated. 
+ CMP $9, R_LEN + BLT doCopyRepeat + BEQ repeatLen1 + CMP $10, R_LEN + BEQ repeatLen2 + +repeatLen3: + // s +=3 + ADD $3, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540 + MOVBU -1(R_SRC), R_TMP0 + MOVHU -3(R_SRC), R_LEN + ORR R_TMP0<<16, R_LEN, R_LEN + ADD $65540, R_LEN, R_LEN + B doCopyRepeat + +repeatLen2: + // s +=2 + ADD $2, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260 + MOVHU -2(R_SRC), R_LEN + ADD $260, R_LEN, R_LEN + B doCopyRepeat + +repeatLen1: + // s +=1 + ADD $1, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = src[s-1] + 8 + MOVBU -1(R_SRC), R_LEN + ADD $8, R_LEN, R_LEN + B doCopyRepeat + +doCopy: + // This is the end of the outer "switch", when we have a copy tag. + // + // We assume that: + // - R_LEN == length && R_LEN > 0 + // - R_OFF == offset + + // if d < offset { etc } + MOVD R_DST, R_TMP1 + SUB R_DBASE, R_TMP1, R_TMP1 + CMP R_OFF, R_TMP1 + BLT errCorrupt + + // Repeat values can skip the test above, since any offset > 0 will be in dst. +doCopyRepeat: + + // if offset <= 0 { etc } + CMP $0, R_OFF + BLE errCorrupt + + // if length > len(dst)-d { etc } + MOVD R_DEND, R_TMP1 + SUB R_DST, R_TMP1, R_TMP1 + CMP R_TMP1, R_LEN + BGT errCorrupt + + // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length + // + // Set: + // - R_TMP2 = len(dst)-d + // - R_TMP3 = &dst[d-offset] + MOVD R_DEND, R_TMP2 + SUB R_DST, R_TMP2, R_TMP2 + MOVD R_DST, R_TMP3 + SUB R_OFF, R_TMP3, R_TMP3 + + // !!! Try a faster technique for short (16 or fewer bytes) forward copies. + // + // First, try using two 8-byte load/stores, similar to the doLit technique + // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is + // still OK if offset >= 8. 
Note that this has to be two 8-byte load/stores + // and not one 16-byte load/store, and the first store has to be before the + // second load, due to the overlap if offset is in the range [8, 16). + // + // if length > 16 || offset < 8 || len(dst)-d < 16 { + // goto slowForwardCopy + // } + // copy 16 bytes + // d += length + CMP $16, R_LEN + BGT slowForwardCopy + CMP $8, R_OFF + BLT slowForwardCopy + CMP $16, R_TMP2 + BLT slowForwardCopy + MOVD 0(R_TMP3), R_TMP0 + MOVD R_TMP0, 0(R_DST) + MOVD 8(R_TMP3), R_TMP1 + MOVD R_TMP1, 8(R_DST) + ADD R_LEN, R_DST, R_DST + B loop + +slowForwardCopy: + // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we + // can still try 8-byte load stores, provided we can overrun up to 10 extra + // bytes. As above, the overrun will be fixed up by subsequent iterations + // of the outermost loop. + // + // The C++ snappy code calls this technique IncrementalCopyFastPath. Its + // commentary says: + // + // ---- + // + // The main part of this loop is a simple copy of eight bytes at a time + // until we've copied (at least) the requested amount of bytes. However, + // if d and d-offset are less than eight bytes apart (indicating a + // repeating pattern of length < 8), we first need to expand the pattern in + // order to get the correct results. For instance, if the buffer looks like + // this, with the eight-byte <d-offset> and <d> patterns marked as + // intervals: + // + // abxxxxxxxxxxxx + // [------] d-offset + // [------] d + // + // a single eight-byte copy from <d-offset> to <d> will repeat the pattern + // once, after which we can move <d> two bytes without moving <d-offset>: + // + // ababxxxxxxxxxx + // [------] d-offset + // [------] d + // + // and repeat the exercise until the two no longer overlap. + // + // This allows us to do very well in the special case of one single byte + // repeated many times, without taking a big hit for more general cases. 
+ // + // The worst case of extra writing past the end of the match occurs when + // offset == 1 and length == 1; the last copy will read from byte positions + // [0..7] and write to [4..11], whereas it was only supposed to write to + // position 1. Thus, ten excess bytes. + // + // ---- + // + // That "10 byte overrun" worst case is confirmed by Go's + // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy + // and finishSlowForwardCopy algorithm. + // + // if length > len(dst)-d-10 { + // goto verySlowForwardCopy + // } + SUB $10, R_TMP2, R_TMP2 + CMP R_TMP2, R_LEN + BGT verySlowForwardCopy + + // We want to keep the offset, so we use R_TMP2 from here. + MOVD R_OFF, R_TMP2 + +makeOffsetAtLeast8: + // !!! As above, expand the pattern so that offset >= 8 and we can use + // 8-byte load/stores. + // + // for offset < 8 { + // copy 8 bytes from dst[d-offset:] to dst[d:] + // length -= offset + // d += offset + // offset += offset + // // The two previous lines together means that d-offset, and therefore + // // R_TMP3, is unchanged. + // } + CMP $8, R_TMP2 + BGE fixUpSlowForwardCopy + MOVD (R_TMP3), R_TMP1 + MOVD R_TMP1, (R_DST) + SUB R_TMP2, R_LEN, R_LEN + ADD R_TMP2, R_DST, R_DST + ADD R_TMP2, R_TMP2, R_TMP2 + B makeOffsetAtLeast8 + +fixUpSlowForwardCopy: + // !!! Add length (which might be negative now) to d (implied by R_DST being + // &dst[d]) so that d ends up at the right place when we jump back to the + // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if + // length is positive, copying the remaining length bytes will write to the + // right place. + MOVD R_DST, R_TMP0 + ADD R_LEN, R_DST, R_DST + +finishSlowForwardCopy: + // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative + // length means that we overrun, but as above, that will be fixed up by + // subsequent iterations of the outermost loop. 
+ MOVD $0, R1 + CMP R1, R_LEN + BLE loop + MOVD (R_TMP3), R_TMP1 + MOVD R_TMP1, (R_TMP0) + ADD $8, R_TMP3, R_TMP3 + ADD $8, R_TMP0, R_TMP0 + SUB $8, R_LEN, R_LEN + B finishSlowForwardCopy + +verySlowForwardCopy: + // verySlowForwardCopy is a simple implementation of forward copy. In C + // parlance, this is a do/while loop instead of a while loop, since we know + // that length > 0. In Go syntax: + // + // for { + // dst[d] = dst[d - offset] + // d++ + // length-- + // if length == 0 { + // break + // } + // } + MOVB (R_TMP3), R_TMP1 + MOVB R_TMP1, (R_DST) + ADD $1, R_TMP3, R_TMP3 + ADD $1, R_DST, R_DST + SUB $1, R_LEN, R_LEN + CBNZ R_LEN, verySlowForwardCopy + B loop + + // The code above handles copy tags. + // ---------------------------------------- + +end: + // This is the end of the "for s < len(src)". + // + // if d != len(dst) { etc } + CMP R_DEND, R_DST + BNE errCorrupt + + // return 0 + MOVD $0, ret+48(FP) + RET + +errCorrupt: + // return decodeErrCodeCorrupt + MOVD $1, R_TMP0 + MOVD R_TMP0, ret+48(FP) + RET diff --git a/vendor/github.com/klauspost/compress/s2/decode_asm.go b/vendor/github.com/klauspost/compress/s2/decode_asm.go new file mode 100644 index 00000000..cb3576ed --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_asm.go @@ -0,0 +1,17 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (amd64 || arm64) && !appengine && gc && !noasm +// +build amd64 arm64 +// +build !appengine +// +build gc +// +build !noasm + +package s2 + +// decode has the same semantics as in decode_other.go. 
+// +//go:noescape +func s2Decode(dst, src []byte) int diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go new file mode 100644 index 00000000..1074ebd2 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_other.go @@ -0,0 +1,267 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (!amd64 && !arm64) || appengine || !gc || noasm +// +build !amd64,!arm64 appengine !gc noasm + +package s2 + +import ( + "fmt" + "strconv" +) + +// decode writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. +// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. +func s2Decode(dst, src []byte) int { + const debug = false + if debug { + fmt.Println("Starting decode, dst len:", len(dst)) + } + var d, s, length int + offset := 0 + + // As long as we can read at least 5 bytes... 
+ for s < len(src)-5 { + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + x = uint32(src[s-1]) + case x == 61: + s += 3 + x = uint32(src[s-2]) | uint32(src[s-1])<<8 + case x == 62: + s += 4 + x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + case x == 63: + s += 5 + x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + } + length = int(x) + 1 + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + length = int(src[s-2]) >> 2 & 0x7 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + s += 1 + length = int(uint32(src[s-1])) + 4 + case 6: + s += 2 + length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) + case 7: + s += 3 + length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + s += 3 + length = 1 + int(src[s-3])>>2 + offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + + case tagCopy4: + s += 5 + length = 1 + int(src[s-5])>>2 + offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + } + + if offset <= 0 || d < offset || length > len(dst)-d { + return decodeErrCodeCorrupt + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. 
+ // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + // Remaining with extra checks... + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-1]) + case x == 61: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-2]) | uint32(src[s-1])<<8 + case x == 62: + s += 4 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + case x == 63: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. 
+ return decodeErrCodeCorrupt + } + x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + } + length = int(x) + 1 + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(src[s-2]) >> 2 & 0x7 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + s += 1 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-1])) + 4 + case 6: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) + case 7: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + + case tagCopy4: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. 
+ return decodeErrCodeCorrupt + } + length = 1 + int(src[s-5])>>2 + offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + } + + if offset <= 0 || d < offset || length > len(dst)-d { + return decodeErrCodeCorrupt + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + if d != len(dst) { + return decodeErrCodeCorrupt + } + return 0 +} diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go new file mode 100644 index 00000000..aa8b108d --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode.go @@ -0,0 +1,1172 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "crypto/rand" + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "math/bits" + "runtime" + "sync" +) + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. 
+// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func Encode(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlock(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeBetter returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBetter compresses better than Encode but typically with a +// 10-40% speed decrease on both compression and decompression. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeBetter(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
+ d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlockBetter(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeBest returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBest compresses as good as reasonably possible but with a +// big speed decrease. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeBest(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlockBest(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappy returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. 
It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappy(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappyBetter returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. 
+func EncodeSnappyBetter(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockBetterSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappyBest returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappyBest(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
+ d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockBestSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// ConcatBlocks will concatenate the supplied blocks and append them to the supplied destination. +// If the destination is nil or too small, a new will be allocated. +// The blocks are not validated, so garbage in = garbage out. +// dst may not overlap block data. +// Any data in dst is preserved as is, so it will not be considered a block. +func ConcatBlocks(dst []byte, blocks ...[]byte) ([]byte, error) { + totalSize := uint64(0) + compSize := 0 + for _, b := range blocks { + l, hdr, err := decodedLen(b) + if err != nil { + return nil, err + } + totalSize += uint64(l) + compSize += len(b) - hdr + } + if totalSize == 0 { + dst = append(dst, 0) + return dst, nil + } + if totalSize > math.MaxUint32 { + return nil, ErrTooLarge + } + var tmp [binary.MaxVarintLen32]byte + hdrSize := binary.PutUvarint(tmp[:], totalSize) + wantSize := hdrSize + compSize + + if cap(dst)-len(dst) < wantSize { + dst = append(make([]byte, 0, wantSize+len(dst)), dst...) + } + dst = append(dst, tmp[:hdrSize]...) + for _, b := range blocks { + _, hdr, err := decodedLen(b) + if err != nil { + return nil, err + } + dst = append(dst, b[hdr:]...) + } + return dst, nil +} + +// inputMargin is the minimum number of extra input bytes to keep, inside +// encodeBlock's inner loop. On some architectures, this margin lets us +// implement a fast path for emitLiteral, where the copy of short (<= 16 byte) +// literals can be implemented as a single load to and store from a 16-byte +// register. 
That literal's actual length can be as short as 1 byte, so this +// can copy up to 15 bytes too much, but that's OK as subsequent iterations of +// the encoding loop will fix up the copy overrun, and this inputMargin ensures +// that we don't overrun the dst and src buffers. +const inputMargin = 8 + +// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that +// will be accepted by the encoder. +const minNonLiteralBlockSize = 32 + +// MaxBlockSize is the maximum value where MaxEncodedLen will return a valid block size. +// Blocks this big are highly discouraged, though. +const MaxBlockSize = math.MaxUint32 - binary.MaxVarintLen32 - 5 + +// MaxEncodedLen returns the maximum length of a snappy block, given its +// uncompressed length. +// +// It will return a negative value if srcLen is too large to encode. +// 32 bit platforms will have lower thresholds for rejecting big content. +func MaxEncodedLen(srcLen int) int { + n := uint64(srcLen) + if n > 0xffffffff { + // Also includes negative. + return -1 + } + // Size of the varint encoded block size. + n = n + uint64((bits.Len64(n)+7)/7) + + // Add maximum size of encoding block as literals. + n += uint64(literalExtraSize(int64(srcLen))) + if n > 0xffffffff { + return -1 + } + return int(n) +} + +var errClosed = errors.New("s2: Writer is closed") + +// NewWriter returns a new Writer that compresses to w, using the +// framing format described at +// https://github.com/google/snappy/blob/master/framing_format.txt +// +// Users must call Close to guarantee all data has been forwarded to +// the underlying io.Writer and that resources are released. +// They may also call Flush zero or more times before calling Close. 
+func NewWriter(w io.Writer, opts ...WriterOption) *Writer { + w2 := Writer{ + blockSize: defaultBlockSize, + concurrency: runtime.GOMAXPROCS(0), + randSrc: rand.Reader, + level: levelFast, + } + for _, opt := range opts { + if err := opt(&w2); err != nil { + w2.errState = err + return &w2 + } + } + w2.obufLen = obufHeaderLen + MaxEncodedLen(w2.blockSize) + w2.paramsOK = true + w2.ibuf = make([]byte, 0, w2.blockSize) + w2.buffers.New = func() interface{} { + return make([]byte, w2.obufLen) + } + w2.Reset(w) + return &w2 +} + +// Writer is an io.Writer that can write Snappy-compressed bytes. +type Writer struct { + errMu sync.Mutex + errState error + + // ibuf is a buffer for the incoming (uncompressed) bytes. + ibuf []byte + + blockSize int + obufLen int + concurrency int + written int64 + output chan chan result + buffers sync.Pool + pad int + + writer io.Writer + randSrc io.Reader + writerWg sync.WaitGroup + + // wroteStreamHeader is whether we have written the stream header. + wroteStreamHeader bool + paramsOK bool + snappy bool + flushOnWrite bool + level uint8 +} + +const ( + levelUncompressed = iota + 1 + levelFast + levelBetter + levelBest +) + +type result []byte + +// err returns the previously set error. +// If no error has been set it is set to err if not nil. +func (w *Writer) err(err error) error { + w.errMu.Lock() + errSet := w.errState + if errSet == nil && err != nil { + w.errState = err + errSet = err + } + w.errMu.Unlock() + return errSet +} + +// Reset discards the writer's state and switches the Snappy writer to write to w. +// This permits reusing a Writer rather than allocating a new one. +func (w *Writer) Reset(writer io.Writer) { + if !w.paramsOK { + return + } + // Close previous writer, if any. + if w.output != nil { + close(w.output) + w.writerWg.Wait() + w.output = nil + } + w.errState = nil + w.ibuf = w.ibuf[:0] + w.wroteStreamHeader = false + w.written = 0 + w.writer = writer + // If we didn't get a writer, stop here. 
+ if writer == nil { + return + } + // If no concurrency requested, don't spin up writer goroutine. + if w.concurrency == 1 { + return + } + + toWrite := make(chan chan result, w.concurrency) + w.output = toWrite + w.writerWg.Add(1) + + // Start a writer goroutine that will write all output in order. + go func() { + defer w.writerWg.Done() + + // Get a queued write. + for write := range toWrite { + // Wait for the data to be available. + in := <-write + if len(in) > 0 { + if w.err(nil) == nil { + // Don't expose data from previous buffers. + toWrite := in[:len(in):len(in)] + // Write to output. + n, err := writer.Write(toWrite) + if err == nil && n != len(toWrite) { + err = io.ErrShortBuffer + } + _ = w.err(err) + w.written += int64(n) + } + } + if cap(in) >= w.obufLen { + w.buffers.Put([]byte(in)) + } + // close the incoming write request. + // This can be used for synchronizing flushes. + close(write) + } + }() +} + +// Write satisfies the io.Writer interface. +func (w *Writer) Write(p []byte) (nRet int, errRet error) { + if w.flushOnWrite { + return w.write(p) + } + // If we exceed the input buffer size, start writing + for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err(nil) == nil { + var n int + if len(w.ibuf) == 0 { + // Large write, empty buffer. + // Write directly from p to avoid copy. + n, _ = w.write(p) + } else { + n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) + w.ibuf = w.ibuf[:len(w.ibuf)+n] + w.write(w.ibuf) + w.ibuf = w.ibuf[:0] + } + nRet += n + p = p[n:] + } + if err := w.err(nil); err != nil { + return nRet, err + } + // p should always be able to fit into w.ibuf now. + n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) + w.ibuf = w.ibuf[:len(w.ibuf)+n] + nRet += n + return nRet, nil +} + +// ReadFrom implements the io.ReaderFrom interface. +// Using this is typically more efficient since it avoids a memory copy. +// ReadFrom reads data from r until EOF or error. +// The return value n is the number of bytes read. 
+// Any error except io.EOF encountered during the read is also returned. +func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) { + if len(w.ibuf) > 0 { + err := w.Flush() + if err != nil { + return 0, err + } + } + if br, ok := r.(byter); ok { + buf := br.Bytes() + if err := w.EncodeBuffer(buf); err != nil { + return 0, err + } + return int64(len(buf)), w.Flush() + } + for { + inbuf := w.buffers.Get().([]byte)[:w.blockSize+obufHeaderLen] + n2, err := io.ReadFull(r, inbuf[obufHeaderLen:]) + if err != nil { + if err == io.ErrUnexpectedEOF { + err = io.EOF + } + if err != io.EOF { + return n, w.err(err) + } + } + if n2 == 0 { + break + } + n += int64(n2) + err2 := w.writeFull(inbuf[:n2+obufHeaderLen]) + if w.err(err2) != nil { + break + } + + if err != nil { + // We got EOF and wrote everything + break + } + } + + return n, w.err(nil) +} + +// EncodeBuffer will add a buffer to the stream. +// This is the fastest way to encode a stream, +// but the input buffer cannot be written to by the caller +// until Flush or Close has been called when concurrency != 1. +// +// If you cannot control that, use the regular Write function. +// +// Note that input is not buffered. +// This means that each write will result in discrete blocks being created. +// For buffered writes, use the regular Write function. +func (w *Writer) EncodeBuffer(buf []byte) (err error) { + if err := w.err(nil); err != nil { + return err + } + + if w.flushOnWrite { + _, err := w.write(buf) + return err + } + // Flush queued data first. + if len(w.ibuf) > 0 { + err := w.Flush() + if err != nil { + return err + } + } + if w.concurrency == 1 { + _, err := w.writeSync(buf) + return err + } + + // Spawn goroutine and write block to output channel. + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- []byte(magicChunkSnappy) + } else { + hWriter <- []byte(magicChunk) + } + } + + for len(buf) > 0 { + // Cut input. 
+ uncompressed := buf + if len(uncompressed) > w.blockSize { + uncompressed = uncompressed[:w.blockSize] + } + buf = buf[len(uncompressed):] + // Get an output buffer. + obuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // copy uncompressed + copy(obuf[obufHeaderLen:], uncompressed) + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. 
+ output <- obuf + }() + } + return nil +} + +func (w *Writer) encodeBlock(obuf, uncompressed []byte) int { + if w.snappy { + switch w.level { + case levelFast: + return encodeBlockSnappy(obuf, uncompressed) + case levelBetter: + return encodeBlockBetterSnappy(obuf, uncompressed) + case levelBest: + return encodeBlockBestSnappy(obuf, uncompressed) + } + return 0 + } + switch w.level { + case levelFast: + return encodeBlock(obuf, uncompressed) + case levelBetter: + return encodeBlockBetter(obuf, uncompressed) + case levelBest: + return encodeBlockBest(obuf, uncompressed) + } + return 0 +} + +func (w *Writer) write(p []byte) (nRet int, errRet error) { + if err := w.err(nil); err != nil { + return 0, err + } + if w.concurrency == 1 { + return w.writeSync(p) + } + + // Spawn goroutine and write block to output channel. + for len(p) > 0 { + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- []byte(magicChunkSnappy) + } else { + hWriter <- []byte(magicChunk) + } + } + + var uncompressed []byte + if len(p) > w.blockSize { + uncompressed, p = p[:w.blockSize], p[w.blockSize:] + } else { + uncompressed, p = p, nil + } + + // Copy input. + // If the block is incompressible, this is used for the result. + inbuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] + obuf := w.buffers.Get().([]byte)[:w.obufLen] + copy(inbuf[obufHeaderLen:], uncompressed) + uncompressed = inbuf[obufHeaderLen:] + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. 
+ if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // Use input as output. + obuf, inbuf = inbuf, obuf + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + output <- obuf + + // Put unused buffer back in pool. + w.buffers.Put(inbuf) + }() + nRet += len(uncompressed) + } + return nRet, nil +} + +// writeFull is a special version of write that will always write the full buffer. +// Data to be compressed should start at offset obufHeaderLen and fill the remainder of the buffer. +// The data will be written as a single block. +// The caller is not allowed to use inbuf after this function has been called. +func (w *Writer) writeFull(inbuf []byte) (errRet error) { + if err := w.err(nil); err != nil { + return err + } + + if w.concurrency == 1 { + _, err := w.writeSync(inbuf[obufHeaderLen:]) + return err + } + + // Spawn goroutine and write block to output channel. + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- []byte(magicChunkSnappy) + } else { + hWriter <- []byte(magicChunk) + } + } + + // Get an output buffer. + obuf := w.buffers.Get().([]byte)[:w.obufLen] + uncompressed := inbuf[obufHeaderLen:] + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. 
+ n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // Use input as output. + obuf, inbuf = inbuf, obuf + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + output <- obuf + + // Put unused buffer back in pool. + w.buffers.Put(inbuf) + }() + return nil +} + +func (w *Writer) writeSync(p []byte) (nRet int, errRet error) { + if err := w.err(nil); err != nil { + return 0, err + } + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + var n int + var err error + if w.snappy { + n, err = w.writer.Write([]byte(magicChunkSnappy)) + } else { + n, err = w.writer.Write([]byte(magicChunk)) + } + if err != nil { + return 0, w.err(err) + } + if n != len(magicChunk) { + return 0, w.err(io.ErrShortWrite) + } + w.written += int64(n) + } + + for len(p) > 0 { + var uncompressed []byte + if len(p) > w.blockSize { + uncompressed, p = p[:w.blockSize], p[w.blockSize:] + } else { + uncompressed, p = p, nil + } + + obuf := w.buffers.Get().([]byte)[:w.obufLen] + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. 
+ n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + obuf = obuf[:8] + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + n, err := w.writer.Write(obuf) + if err != nil { + return 0, w.err(err) + } + if n != len(obuf) { + return 0, w.err(io.ErrShortWrite) + } + w.written += int64(n) + if chunkType == chunkTypeUncompressedData { + // Write uncompressed data. + n, err := w.writer.Write(uncompressed) + if err != nil { + return 0, w.err(err) + } + if n != len(uncompressed) { + return 0, w.err(io.ErrShortWrite) + } + w.written += int64(n) + } + w.buffers.Put(obuf) + // Queue final output. + nRet += len(uncompressed) + } + return nRet, nil +} + +// Flush flushes the Writer to its underlying io.Writer. +// This does not apply padding. +func (w *Writer) Flush() error { + if err := w.err(nil); err != nil { + return err + } + + // Queue any data still in input buffer. + if len(w.ibuf) != 0 { + if !w.wroteStreamHeader { + _, err := w.writeSync(w.ibuf) + w.ibuf = w.ibuf[:0] + return w.err(err) + } else { + _, err := w.write(w.ibuf) + w.ibuf = w.ibuf[:0] + err = w.err(err) + if err != nil { + return err + } + } + } + if w.output == nil { + return w.err(nil) + } + + // Send empty buffer + res := make(chan result) + w.output <- res + // Block until this has been picked up. + res <- nil + // When it is closed, we have flushed. + <-res + return w.err(nil) +} + +// Close calls Flush and then closes the Writer. +// Calling Close multiple times is ok. 
+func (w *Writer) Close() error { + err := w.Flush() + if w.output != nil { + close(w.output) + w.writerWg.Wait() + w.output = nil + } + if w.err(nil) == nil && w.writer != nil && w.pad > 0 { + add := calcSkippableFrame(w.written, int64(w.pad)) + frame, err := skippableFrame(w.ibuf[:0], add, w.randSrc) + if err = w.err(err); err != nil { + return err + } + _, err2 := w.writer.Write(frame) + _ = w.err(err2) + } + _ = w.err(errClosed) + if err == errClosed { + return nil + } + return err +} + +const skippableFrameHeader = 4 + +// calcSkippableFrame will return a total size to be added for written +// to be divisible by multiple. +// The value will always be > skippableFrameHeader. +// The function will panic if written < 0 or wantMultiple <= 0. +func calcSkippableFrame(written, wantMultiple int64) int { + if wantMultiple <= 0 { + panic("wantMultiple <= 0") + } + if written < 0 { + panic("written < 0") + } + leftOver := written % wantMultiple + if leftOver == 0 { + return 0 + } + toAdd := wantMultiple - leftOver + for toAdd < skippableFrameHeader { + toAdd += wantMultiple + } + return int(toAdd) +} + +// skippableFrame will add a skippable frame with a total size of bytes. +// total should be >= skippableFrameHeader and < maxBlockSize + skippableFrameHeader +func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) { + if total == 0 { + return dst, nil + } + if total < skippableFrameHeader { + return dst, fmt.Errorf("s2: requested skippable frame (%d) < 4", total) + } + if int64(total) >= maxBlockSize+skippableFrameHeader { + return dst, fmt.Errorf("s2: requested skippable frame (%d) >= max 1<<24", total) + } + // Chunk type 0xfe "Section 4.4 Padding (chunk type 0xfe)" + dst = append(dst, chunkTypePadding) + f := uint32(total - skippableFrameHeader) + // Add chunk length. + dst = append(dst, uint8(f), uint8(f>>8), uint8(f>>16)) + // Add data + start := len(dst) + dst = append(dst, make([]byte, f)...) 
+ _, err := io.ReadFull(r, dst[start:]) + return dst, err +} + +// WriterOption is an option for creating a encoder. +type WriterOption func(*Writer) error + +// WriterConcurrency will set the concurrency, +// meaning the maximum number of decoders to run concurrently. +// The value supplied must be at least 1. +// By default this will be set to GOMAXPROCS. +func WriterConcurrency(n int) WriterOption { + return func(w *Writer) error { + if n <= 0 { + return errors.New("concurrency must be at least 1") + } + w.concurrency = n + return nil + } +} + +// WriterBetterCompression will enable better compression. +// EncodeBetter compresses better than Encode but typically with a +// 10-40% speed decrease on both compression and decompression. +func WriterBetterCompression() WriterOption { + return func(w *Writer) error { + w.level = levelBetter + return nil + } +} + +// WriterBestCompression will enable better compression. +// EncodeBetter compresses better than Encode but typically with a +// big speed decrease on compression. +func WriterBestCompression() WriterOption { + return func(w *Writer) error { + w.level = levelBest + return nil + } +} + +// WriterUncompressed will bypass compression. +// The stream will be written as uncompressed blocks only. +// If concurrency is > 1 CRC and output will still be done async. +func WriterUncompressed() WriterOption { + return func(w *Writer) error { + w.level = levelUncompressed + return nil + } +} + +// WriterBlockSize allows to override the default block size. +// Blocks will be this size or smaller. +// Minimum size is 4KB and and maximum size is 4MB. +// +// Bigger blocks may give bigger throughput on systems with many cores, +// and will increase compression slightly, but it will limit the possible +// concurrency for smaller payloads for both encoding and decoding. +// Default block size is 1MB. +// +// When writing Snappy compatible output using WriterSnappyCompat, +// the maximum block size is 64KB. 
+func WriterBlockSize(n int) WriterOption { + return func(w *Writer) error { + if w.snappy && n > maxSnappyBlockSize || n < minBlockSize { + return errors.New("s2: block size too large. Must be <= 64K and >=4KB on for snappy compatible output") + } + if n > maxBlockSize || n < minBlockSize { + return errors.New("s2: block size too large. Must be <= 4MB and >=4KB") + } + w.blockSize = n + return nil + } +} + +// WriterPadding will add padding to all output so the size will be a multiple of n. +// This can be used to obfuscate the exact output size or make blocks of a certain size. +// The contents will be a skippable frame, so it will be invisible by the decoder. +// n must be > 0 and <= 4MB. +// The padded area will be filled with data from crypto/rand.Reader. +// The padding will be applied whenever Close is called on the writer. +func WriterPadding(n int) WriterOption { + return func(w *Writer) error { + if n <= 0 { + return fmt.Errorf("s2: padding must be at least 1") + } + // No need to waste our time. + if n == 1 { + w.pad = 0 + } + if n > maxBlockSize { + return fmt.Errorf("s2: padding must less than 4MB") + } + w.pad = n + return nil + } +} + +// WriterPaddingSrc will get random data for padding from the supplied source. +// By default crypto/rand is used. +func WriterPaddingSrc(reader io.Reader) WriterOption { + return func(w *Writer) error { + w.randSrc = reader + return nil + } +} + +// WriterSnappyCompat will write snappy compatible output. +// The output can be decompressed using either snappy or s2. +// If block size is more than 64KB it is set to that. +func WriterSnappyCompat() WriterOption { + return func(w *Writer) error { + w.snappy = true + if w.blockSize > 64<<10 { + // We choose 8 bytes less than 64K, since that will make literal emits slightly more effective. + // And allows us to skip some size checks. + w.blockSize = (64 << 10) - 8 + } + return nil + } +} + +// WriterFlushOnWrite will compress blocks on each call to the Write function. 
+// +// This is quite inefficient as blocks size will depend on the write size. +// +// Use WriterConcurrency(1) to also make sure that output is flushed. +// When Write calls return, otherwise they will be written when compression is done. +func WriterFlushOnWrite() WriterOption { + return func(w *Writer) error { + w.flushOnWrite = true + return nil + } +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go new file mode 100644 index 00000000..8b16c38a --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_all.go @@ -0,0 +1,456 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "bytes" + "encoding/binary" + "math/bits" +) + +func load32(b []byte, i int) uint32 { + return binary.LittleEndian.Uint32(b[i:]) +} + +func load64(b []byte, i int) uint64 { + return binary.LittleEndian.Uint64(b[i:]) +} + +// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash6(u uint64, h uint8) uint32 { + const prime6bytes = 227718039650203 + return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) +} + +func encodeGo(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
+ d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlockGo(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + + debug = false + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. 
+ const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. 
Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +func encodeBlockSnappyGo(dst, src []byte) (d int) { + // Initialize the hash table. 
+ const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. 
+ const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. 
+ for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go new file mode 100644 index 00000000..e612225f --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go @@ -0,0 +1,142 @@ +//go:build !appengine && !noasm && gc +// +build !appengine,!noasm,gc + +package s2 + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
+// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlock(dst, src []byte) (d int) { + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + + if len(src) >= 4<<20 { + return encodeBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeBlockAsm4MB(dst, src) + } + if len(src) >= limit10B { + return encodeBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockAsm8B(dst, src) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetter(dst, src []byte) (d int) { + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + + if len(src) > 4<<20 { + return encodeBetterBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeBetterBlockAsm4MB(dst, src) + } + if len(src) >= limit10B { + return encodeBetterBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeBetterBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBetterBlockAsm8B(dst, src) +} + +// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
+// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockSnappy(dst, src []byte) (d int) { + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + if len(src) >= 64<<10 { + return encodeSnappyBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeSnappyBlockAsm64K(dst, src) + } + if len(src) >= limit10B { + return encodeSnappyBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeSnappyBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeSnappyBlockAsm8B(dst, src) +} + +// encodeBlockBetterSnappy encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappy(dst, src []byte) (d int) { + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... 
+ limit8B = 512 + ) + if len(src) >= 64<<10 { + return encodeSnappyBetterBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeSnappyBetterBlockAsm64K(dst, src) + } + if len(src) >= limit10B { + return encodeSnappyBetterBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeSnappyBetterBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeSnappyBetterBlockAsm8B(dst, src) +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go new file mode 100644 index 00000000..44803477 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_best.go @@ -0,0 +1,604 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "fmt" + "math/bits" +) + +// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBest(dst, src []byte) (d int) { + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + + inputMargin = 8 + 2 + ) + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + var lTable [maxLTableSize]uint64 + var sTable [maxSTableSize]uint64 + + // Bail if we can't compress to at least this. 
+ dstLimit := len(src) - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + const lowbitMask = 0xffffffff + getCur := func(x uint64) int { + return int(x & lowbitMask) + } + getPrev := func(x uint64) int { + return int(x >> 32) + } + const maxSkip = 64 + + for { + type match struct { + offset int + s int + length int + score int + rep bool + } + var best match + for { + // Next src position to check + nextS := (s-nextEmit)>>8 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + if nextS > sLimit { + goto emitRemainder + } + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. + score := m.length - m.s + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + if m.rep { + return score - emitRepeatSize(offset, m.length) + } + return score - emitCopySize(offset, m.length) + } + + matchAt := func(offset, s int, first uint32, rep bool) match { + if best.length != 0 && best.s-best.offset == s-offset { + // Don't retest if we have the same offset. 
+ return match{offset: offset, s: s} + } + if load32(src, offset) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + offset, rep: rep} + s += 4 + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + m.length -= offset + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. + m.length = 0 + } + return m + } + + bestOf := func(a, b match) match { + if b.length == 0 { + return a + } + if a.length == 0 { + return b + } + as := a.score + b.s + bs := b.score + a.s + if as >= bs { + return a + } + return b + } + + best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false)) + best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false)) + + { + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true)) + if best.length > 0 { + // s+1 + nextShort := sTable[hash4(cv>>8, sTableBits)] + s := s + 1 + cv := load64(src, s) + nextLong := lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) + // Repeat at + 2 + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true)) + + // s+2 + if true { + nextShort = sTable[hash4(cv>>8, sTableBits)] + s++ + cv = load64(src, s) + nextLong = lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) + best = bestOf(best, 
matchAt(getPrev(nextLong), s, uint32(cv), false)) + } + // Search for a match at best match end, see if that is better. + if sAt := best.s + best.length; sAt < sLimit { + sBack := best.s + backL := best.length + // Load initial values + cv = load64(src, sBack) + // Search for mismatch + next := lTable[hash8(load64(src, sAt), lTableBits)] + //next := sTable[hash4(load64(src, sAt), sTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + } + } + } + + // Update table + lTable[hashL] = uint64(s) | candidateL<<32 + sTable[hashS] = uint64(s) | candidateS<<32 + + if best.length > 0 { + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards, not needed for repeats... + s = best.s + if !best.rep { + for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] { + best.offset-- + best.length++ + s-- + } + } + if false && best.offset >= s { + panic(fmt.Errorf("t %d >= s %d", best.offset, s)) + } + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := s - best.offset + + s += best.length + + if offset > 65535 && s-base <= 5 && !best.rep { + // Bail if the match is equal or worse to the encoding. + s = best.s + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + if best.rep { + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], offset, best.length) + } else { + // First match, cannot be repeat. 
+ d += emitCopy(dst[d:], offset, best.length) + } + } else { + d += emitCopy(dst[d:], offset, best.length) + } + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Fill tables... + for i := best.s + 1; i < s; i++ { + cv0 := load64(src, i) + long0 := hash8(cv0, lTableBits) + short0 := hash4(cv0, sTableBits) + lTable[long0] = uint64(i) | lTable[long0]<<32 + sTable[short0] = uint64(i) | sTable[short0]<<32 + } + cv = load64(src, s) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBestSnappy(dst, src []byte) (d int) { + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + + inputMargin = 8 + 2 + ) + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + var lTable [maxLTableSize]uint64 + var sTable [maxSTableSize]uint64 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - 5 + + // nextEmit is where in src the next emitLiteral should start from. 
+ nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + const lowbitMask = 0xffffffff + getCur := func(x uint64) int { + return int(x & lowbitMask) + } + getPrev := func(x uint64) int { + return int(x >> 32) + } + const maxSkip = 64 + + for { + type match struct { + offset int + s int + length int + score int + } + var best match + for { + // Next src position to check + nextS := (s-nextEmit)>>8 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + if nextS > sLimit { + goto emitRemainder + } + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. + score := m.length - m.s + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + + return score - emitCopySize(offset, m.length) + } + + matchAt := func(offset, s int, first uint32) match { + if best.length != 0 && best.s-best.offset == s-offset { + // Don't retest if we have the same offset. + return match{offset: offset, s: s} + } + if load32(src, offset) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + offset} + s += 4 + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + m.length -= offset + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. 
+ m.length = 0 + } + return m + } + + bestOf := func(a, b match) match { + if b.length == 0 { + return a + } + if a.length == 0 { + return b + } + as := a.score + b.s + bs := b.score + a.s + if as >= bs { + return a + } + return b + } + + best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv))) + + { + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) + if best.length > 0 { + // s+1 + nextShort := sTable[hash4(cv>>8, sTableBits)] + s := s + 1 + cv := load64(src, s) + nextLong := lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) + // Repeat at + 2 + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) + + // s+2 + if true { + nextShort = sTable[hash4(cv>>8, sTableBits)] + s++ + cv = load64(src, s) + nextLong = lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) + } + // Search for a match at best match end, see if that is better. 
+ if sAt := best.s + best.length; sAt < sLimit { + sBack := best.s + backL := best.length + // Load initial values + cv = load64(src, sBack) + // Search for mismatch + next := lTable[hash8(load64(src, sAt), lTableBits)] + //next := sTable[hash4(load64(src, sAt), sTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + } + } + } + + // Update table + lTable[hashL] = uint64(s) | candidateL<<32 + sTable[hashS] = uint64(s) | candidateS<<32 + + if best.length > 0 { + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards, not needed for repeats... + s = best.s + if true { + for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] { + best.offset-- + best.length++ + s-- + } + } + if false && best.offset >= s { + panic(fmt.Errorf("t %d >= s %d", best.offset, s)) + } + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := s - best.offset + + s += best.length + + if offset > 65535 && s-base <= 5 { + // Bail if the match is equal or worse to the encoding. + s = best.s + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, best.length) + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Fill tables... + for i := best.s + 1; i < s; i++ { + cv0 := load64(src, i) + long0 := hash8(cv0, lTableBits) + short0 := hash4(cv0, sTableBits) + lTable[long0] = uint64(i) | lTable[long0]<<32 + sTable[short0] = uint64(i) | sTable[short0]<<32 + } + cv = load64(src, s) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. 
+ if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// emitCopySize returns the size to encode the offset+length +// +// It assumes that: +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopySize(offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitRepeatSize(offset, length) + } + i = 5 + } + if length == 0 { + return i + } + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + emitRepeatSize(offset, length-60) + } + if length >= 12 || offset >= 2048 { + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + return 2 +} + +// emitRepeatSize returns the number of bytes required to encode a repeat. +// Length must be at least 4 and < 1<<24 +func emitRepeatSize(offset, length int) int { + // Repeat offset, make length cheaper + if length <= 4+4 || (length < 8+4 && offset < 2048) { + return 2 + } + if length < (1<<8)+4+4 { + return 3 + } + if length < (1<<16)+(1<<8)+4 { + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= (1 << 16) - 4 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + if left > 0 { + return 5 + emitRepeatSize(offset, left) + } + return 5 +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go new file mode 100644 index 00000000..943215b8 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_better.go @@ -0,0 +1,431 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package s2 + +import ( + "math/bits" +) + +// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4(u uint64, h uint8) uint32 { + const prime4bytes = 2654435761 + return (uint32(u) * prime4bytes) >> ((32 - h) & 31) +} + +// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash5(u uint64, h uint8) uint32 { + const prime5bytes = 889523592379 + return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63)) +} + +// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash7(u uint64, h uint8) uint32 { + const prime7bytes = 58295818150454627 + return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63)) +} + +// hash8 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash8(u uint64, h uint8) uint32 { + const prime8bytes = 0xcf1bbcdcb7a56463 + return uint32((u * prime8bytes) >> ((64 - h) & 63)) +} + +// encodeBlockBetterGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterGo(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + // Initialize the hash tables. + const ( + // Long hash matches. 
+ lTableBits = 16 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 0 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>7 + 1 + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + // Check repeat at offset checkRep. + const checkRep = 1 + if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. 
+ d += emitCopy(dst[d:], repeat, s-base) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidateL) { + break + } + + // Check our short candidate + if uint32(cv) == load32(src, candidateS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if repeat == offset { + d += emitRepeat(dst[d:], offset, s-base) + } else { + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. 
+ return 0 + } + // Index match start+1 (long) and start+2 (short) + index0 := base + 1 + // Index match end-2 (long) and end-1 (short) + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + cv = load64(src, s) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) + lTable[hash7(cv1, lTableBits)] = uint32(index1) + lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 16 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. 
+ nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 0 + const maxSkip = 100 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = (s-nextEmit)>>7 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + if uint32(cv) == load32(src, candidateL) { + break + } + + // Check our short candidate + if uint32(cv) == load32(src, candidateS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. 
+ s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, s-base) + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Index match start+1 (long) and start+2 (short) + index0 := base + 1 + // Index match end-2 (long) and end-1 (short) + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + cv = load64(src, s) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) + lTable[hash7(cv1, lTableBits)] = uint32(index1) + lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go new file mode 100644 index 00000000..43d43534 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -0,0 +1,298 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +package s2 + +import ( + "math/bits" +) + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
+// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlock(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockGo(dst, src) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetter(dst, src []byte) (d int) { + return encodeBlockBetterGo(dst, src) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetterSnappy(dst, src []byte) (d int) { + return encodeBlockBetterSnappyGo(dst, src) +} + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockSnappy(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockSnappyGo(dst, src) +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. 
+// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +func emitLiteral(dst, lit []byte) int { + if len(lit) == 0 { + return 0 + } + const num = 63<<2 | tagLiteral + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[1] = uint8(n) + dst[0] = 60<<2 | tagLiteral + i = 2 + case n < 1<<16: + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 61<<2 | tagLiteral + i = 3 + case n < 1<<24: + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 62<<2 | tagLiteral + i = 4 + default: + dst[4] = uint8(n >> 24) + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 63<<2 | tagLiteral + i = 5 + } + return i + copy(dst[i:], lit) +} + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +// Length must be at least 4 and < 1<<24 +func emitRepeat(dst []byte, offset, length int) int { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + return 2 + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + return 2 + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + return 3 + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + return 5 + emitRepeat(dst[5:], offset, left) + } + return 5 +} + +// emitCopy writes a copy chunk and returns the 
number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopy(dst []byte, offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + // Emit a length 64 copy, encoded as 5 bytes. + dst[4] = uint8(offset >> 24) + dst[3] = uint8(offset >> 16) + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 63<<2 | tagCopy4 + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitRepeat(dst[5:], offset, length) + } + i = 5 + } + if length == 0 { + return i + } + // Emit a copy, offset encoded as 4 bytes. + dst[i+0] = uint8(length-1)<<2 | tagCopy4 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + dst[i+3] = uint8(offset >> 16) + dst[i+4] = uint8(offset >> 24) + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + emitRepeat(dst[3:], offset, length) + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 +} + +// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopyNoRepeat(dst []byte, offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + // Emit a length 64 copy, encoded as 5 bytes. 
+ dst[4] = uint8(offset >> 24) + dst[3] = uint8(offset >> 16) + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 63<<2 | tagCopy4 + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitCopyNoRepeat(dst[5:], offset, length) + } + i = 5 + } + if length == 0 { + return i + } + // Emit a copy, offset encoded as 4 bytes. + dst[i+0] = uint8(length-1)<<2 | tagCopy4 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + dst[i+3] = uint8(offset >> 16) + dst[i+4] = uint8(offset >> 24) + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + emitCopyNoRepeat(dst[3:], offset, length) + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 +} + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// len(a) <= len(b) +// +func matchLen(a []byte, b []byte) int { + b = b[:len(a)] + var checked int + if len(a) > 4 { + // Try 4 bytes first + if diff := load32(a, 0) ^ load32(b, 0); diff != 0 { + return bits.TrailingZeros32(diff) >> 3 + } + // Switch to 8 byte matching. 
+ checked = 4 + a = a[4:] + b = b[4:] + for len(a) >= 8 { + b = b[:len(a)] + if diff := load64(a, 0) ^ load64(b, 0); diff != 0 { + return checked + (bits.TrailingZeros64(diff) >> 3) + } + checked += 8 + a = a[8:] + b = b[8:] + } + } + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + return int(i) + checked + } + } + return len(a) + checked +} diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go new file mode 100644 index 00000000..c8cf7b69 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -0,0 +1,189 @@ +// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. + +//go:build !appengine && !noasm && gc +// +build !appengine,!noasm,gc + +package s2 + +// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm(dst []byte, src []byte) int + +// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm4MB(dst []byte, src []byte) int + +// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm12B(dst []byte, src []byte) int + +// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. 
+// +//go:noescape +func encodeBlockAsm10B(dst []byte, src []byte) int + +// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm8B(dst []byte, src []byte) int + +// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm(dst []byte, src []byte) int + +// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm4MB(dst []byte, src []byte) int + +// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm12B(dst []byte, src []byte) int + +// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm10B(dst []byte, src []byte) int + +// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm8B(dst []byte, src []byte) int + +// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. 
+// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm(dst []byte, src []byte) int + +// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65535 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm64K(dst []byte, src []byte) int + +// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm12B(dst []byte, src []byte) int + +// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm10B(dst []byte, src []byte) int + +// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm8B(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65535 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. 
+// +//go:noescape +func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int + +// emitLiteral writes a literal chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes with margin of 0 bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +// +//go:noescape +func emitLiteral(dst []byte, lit []byte) int + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +// Length must be at least 4 and < 1<<32 +// +//go:noescape +func emitRepeat(dst []byte, offset int, length int) int + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +// +//go:noescape +func emitCopy(dst []byte, offset int, length int) int + +// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. 
+// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +// +//go:noescape +func emitCopyNoRepeat(dst []byte, offset int, length int) int + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// len(a) <= len(b) +// +//go:noescape +func matchLen(a []byte, b []byte) int diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s new file mode 100644 index 00000000..1ac65a0e --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -0,0 +1,15678 @@ +// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. + +// +build !appengine +// +build !noasm +// +build gc + +#include "textflag.h" + +// func encodeBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 
24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm + +repeat_extend_back_loop_encodeBlockAsm: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm + +repeat_extend_back_end_encodeBlockAsm: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_repeat_emit_encodeBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeBlockAsm + +four_bytes_repeat_emit_encodeBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBlockAsm + +three_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm + +two_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm + JMP memmove_long_repeat_emit_encodeBlockAsm + +one_byte_repeat_emit_encodeBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE 
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +memmove_long_repeat_emit_encodeBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back + 
+emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm + +matchlen_loopback_repeat_extend_encodeBlockAsm: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_loop_repeat_extend_encodeBlockAsm: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm + +matchlen_single_repeat_extend_encodeBlockAsm: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm + +matchlen_single_loopback_repeat_extend_encodeBlockAsm: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm + +repeat_extend_forward_end_encodeBlockAsm: + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm + + // emitRepeat +emit_repeat_again_match_repeat_encodeBlockAsm: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm + +cant_repeat_two_offset_match_repeat_encodeBlockAsm: + CMPL SI, 
$0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm + CMPL SI, $0x00010100 + JLT repeat_four_match_repeat_encodeBlockAsm + CMPL SI, $0x0100ffff + JLT repeat_five_match_repeat_encodeBlockAsm + LEAL -16842747(SI), SI + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_repeat_encodeBlockAsm + +repeat_five_match_repeat_encodeBlockAsm: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_match_repeat_encodeBlockAsm: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_match_repeat_encodeBlockAsm: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_match_repeat_encodeBlockAsm: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_match_repeat_encodeBlockAsm: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_as_copy_encodeBlockAsm: + // emitCopy + CMPL DI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeBlockAsm: + CMPL SI, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm + MOVB $0xff, (AX) + MOVL DI, 1(AX) + LEAL -64(SI), SI + ADDQ $0x05, AX + CMPL SI, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + 
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL SI, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy + LEAL -16842747(SI), SI + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm + +four_bytes_remain_repeat_as_copy_encodeBlockAsm: + TESTL SI, SI + JZ repeat_end_emit_encodeBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +two_byte_offset_repeat_as_copy_encodeBlockAsm: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: + MOVL 
SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL SI, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short + LEAL -16842747(SI), SI + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm + CMPL DI, $0x00000800 + JGE 
emit_copy_three_repeat_as_copy_encodeBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +emit_copy_three_repeat_as_copy_encodeBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm + +no_repeat_found_encodeBlockAsm: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm + +candidate3_match_encodeBlockAsm: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm + +candidate2_match_encodeBlockAsm: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm + +match_extend_back_loop_encodeBlockAsm: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm + JMP match_extend_back_loop_encodeBlockAsm + +match_extend_back_end_encodeBlockAsm: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm + CMPL R8, $0x00010000 + JLT 
three_bytes_match_emit_encodeBlockAsm + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeBlockAsm + MOVB $0xfc, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBlockAsm + +four_bytes_match_emit_encodeBlockAsm: + MOVL R8, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW R8, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBlockAsm + +three_bytes_match_emit_encodeBlockAsm: + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm + +two_bytes_match_emit_encodeBlockAsm: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm + JMP memmove_long_match_emit_encodeBlockAsm + +one_byte_match_emit_encodeBlockAsm: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 
-32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm + +memmove_long_match_emit_encodeBlockAsm: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm: +match_nolit_loop_encodeBlockAsm: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm + +matchlen_loopback_match_nolit_encodeBlockAsm: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm + +matchlen_loop_match_nolit_encodeBlockAsm: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE 
matchlen_loopback_match_nolit_encodeBlockAsm + +matchlen_single_match_nolit_encodeBlockAsm: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm + +matchlen_single_loopback_match_nolit_encodeBlockAsm: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm + +match_nolit_end_encodeBlockAsm: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm + +four_bytes_loop_back_match_nolit_encodeBlockAsm: + CMPL R10, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(R10), R10 + ADDQ $0x05, AX + CMPL R10, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm + + // emitRepeat +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy + CMPL R10, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy + LEAL -16842747(R10), R10 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBlockAsm_emit_copy: + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX 
+ JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm + +four_bytes_remain_match_nolit_encodeBlockAsm: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_offset_match_nolit_encodeBlockAsm: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short + CMPL R10, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short + LEAL -16842747(R10), R10 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP 
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + JMP two_byte_offset_match_nolit_encodeBlockAsm + +two_byte_offset_short_match_nolit_encodeBlockAsm: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy_three_match_nolit_encodeBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + 
IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm + INCL CX + JMP search_loop_encodeBlockAsm + +emit_remainder_encodeBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeBlockAsm + +four_bytes_emit_remainder_encodeBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBlockAsm + +three_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm + +two_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm + JMP memmove_long_emit_remainder_encodeBlockAsm + +one_byte_emit_remainder_encodeBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 + CMPQ BX, $0x10 + JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +memmove_long_emit_remainder_encodeBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: + 
MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm4MB(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm4MB(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm4MB: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm4MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm4MB: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm4MB + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm4MB + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm4MB + 
+repeat_extend_back_loop_encodeBlockAsm4MB: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm4MB + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm4MB + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm4MB + +repeat_extend_back_end_encodeBlockAsm4MB: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm4MB + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm4MB + CMPL SI, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm4MB + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +three_bytes_repeat_emit_encodeBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +two_bytes_repeat_emit_encodeBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm4MB + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +one_byte_repeat_emit_encodeBlockAsm4MB: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (R10), R11 + 
MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm4MB: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB + +memmove_long_repeat_emit_encodeBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm4MB: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + 
LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm4MB + +matchlen_loopback_repeat_extend_encodeBlockAsm4MB: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_loop_repeat_extend_encodeBlockAsm4MB: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB + +matchlen_single_repeat_extend_encodeBlockAsm4MB: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm4MB + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB + +repeat_extend_forward_end_encodeBlockAsm4MB: + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm4MB + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB + +cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm4MB + CMPL SI, $0x00010100 + JLT repeat_four_match_repeat_encodeBlockAsm4MB + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_match_repeat_encodeBlockAsm4MB: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_match_repeat_encodeBlockAsm4MB: + LEAL -4(SI), SI + MOVW $0x0015, 
(AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_match_repeat_encodeBlockAsm4MB: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_match_repeat_encodeBlockAsm4MB: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_as_copy_encodeBlockAsm4MB: + // emitCopy + CMPL DI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB + +four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB: + CMPL SI, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL DI, 1(AX) + LEAL -64(SI), SI + ADDQ $0x05, AX + CMPL SI, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, SI + 
ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB + +four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: + TESTL SI, SI + JZ repeat_end_emit_encodeBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + 
SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm4MB: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm4MB + +no_repeat_found_encodeBlockAsm4MB: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm4MB + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm4MB + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm4MB + +candidate3_match_encodeBlockAsm4MB: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm4MB + +candidate2_match_encodeBlockAsm4MB: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm4MB: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm4MB + +match_extend_back_loop_encodeBlockAsm4MB: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm4MB + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm4MB + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm4MB + 
JMP match_extend_back_loop_encodeBlockAsm4MB + +match_extend_back_end_encodeBlockAsm4MB: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 4(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm4MB: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm4MB + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm4MB + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm4MB + MOVL R8, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW R8, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +three_bytes_match_emit_encodeBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +two_bytes_match_emit_encodeBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm4MB + JMP memmove_long_match_emit_encodeBlockAsm4MB + +one_byte_match_emit_encodeBlockAsm4MB: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm4MB: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, 
-8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm4MB: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm4MB + +memmove_long_match_emit_encodeBlockAsm4MB: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm4MB: +match_nolit_loop_encodeBlockAsm4MB: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ 
(DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm4MB + +matchlen_loopback_match_nolit_encodeBlockAsm4MB: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm4MB + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm4MB + +matchlen_loop_match_nolit_encodeBlockAsm4MB: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB + +matchlen_single_match_nolit_encodeBlockAsm4MB: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm4MB + +matchlen_single_loopback_match_nolit_encodeBlockAsm4MB: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm4MB + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB + +match_nolit_end_encodeBlockAsm4MB: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm4MB + +four_bytes_loop_back_match_nolit_encodeBlockAsm4MB: + CMPL R10, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(R10), R10 + ADDQ $0x05, AX + CMPL R10, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm4MB + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 
4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBlockAsm4MB: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +two_byte_offset_match_nolit_encodeBlockAsm4MB: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, 
(AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + JMP two_byte_offset_match_nolit_encodeBlockAsm4MB + +two_byte_offset_short_match_nolit_encodeBlockAsm4MB: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm4MB + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm4MB + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +emit_copy_three_match_nolit_encodeBlockAsm4MB: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm4MB: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm4MB + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm4MB: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), 
R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm4MB + INCL CX + JMP search_loop_encodeBlockAsm4MB + +emit_remainder_encodeBlockAsm4MB: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm4MB: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +three_bytes_emit_remainder_encodeBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +two_bytes_emit_remainder_encodeBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +one_byte_emit_remainder_encodeBlockAsm4MB: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8: + MOVQ (CX), SI + MOVQ 
SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm4MB: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB + +memmove_long_emit_remainder_encodeBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, 
-16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm4MB: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm12B(SB), $16408-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000080, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm12B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x18, R11 + IMULQ R9, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm12B + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm12B + +repeat_extend_back_loop_encodeBlockAsm12B: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm12B + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm12B + 
+repeat_extend_back_end_encodeBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm12B + +two_bytes_repeat_emit_encodeBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm12B + JMP memmove_long_repeat_emit_encodeBlockAsm12B + +one_byte_repeat_emit_encodeBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 
-32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +memmove_long_repeat_emit_encodeBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm12B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm12B + +matchlen_loopback_repeat_extend_encodeBlockAsm12B: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm12B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_loop_repeat_extend_encodeBlockAsm12B: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE 
matchlen_loopback_repeat_extend_encodeBlockAsm12B + +matchlen_single_repeat_extend_encodeBlockAsm12B: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm12B + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B + +repeat_extend_forward_end_encodeBlockAsm12B: + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm12B + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm12B + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm12B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm12B + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_match_repeat_encodeBlockAsm12B: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_match_repeat_encodeBlockAsm12B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_match_repeat_encodeBlockAsm12B: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_as_copy_encodeBlockAsm12B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBlockAsm12B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE 
repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm12B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm12B + +no_repeat_found_encodeBlockAsm12B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm12B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ 
candidate2_match_encodeBlockAsm12B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm12B + +candidate3_match_encodeBlockAsm12B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm12B + +candidate2_match_encodeBlockAsm12B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm12B + +match_extend_back_loop_encodeBlockAsm12B: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm12B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm12B + JMP match_extend_back_loop_encodeBlockAsm12B + +match_extend_back_end_encodeBlockAsm12B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm12B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm12B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm12B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm12B + +two_bytes_match_emit_encodeBlockAsm12B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm12B + JMP memmove_long_match_emit_encodeBlockAsm12B + +one_byte_match_emit_encodeBlockAsm12B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm12B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE 
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm12B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +memmove_long_match_emit_encodeBlockAsm12B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU 
-32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm12B: +match_nolit_loop_encodeBlockAsm12B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm12B + +matchlen_loopback_match_nolit_encodeBlockAsm12B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm12B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm12B + +matchlen_loop_match_nolit_encodeBlockAsm12B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm12B + +matchlen_single_match_nolit_encodeBlockAsm12B: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeBlockAsm12B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm12B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B + +match_nolit_end_encodeBlockAsm12B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm12B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL SI, $0x00000800 + JLT 
repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP two_byte_offset_match_nolit_encodeBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBlockAsm12B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm12B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +emit_copy_three_match_nolit_encodeBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm12B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x18, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + SHLQ $0x18, SI + IMULQ R9, SI + SHRQ $0x34, 
SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm12B + INCL CX + JMP search_loop_encodeBlockAsm12B + +emit_remainder_encodeBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm12B + +two_bytes_emit_remainder_encodeBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm12B + JMP memmove_long_emit_remainder_encodeBlockAsm12B + +one_byte_emit_remainder_encodeBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP 
memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +memmove_long_emit_remainder_encodeBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT 
·encodeBlockAsm10B(SB), $4120-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000020, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm10B + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm10B + +repeat_extend_back_loop_encodeBlockAsm10B: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm10B + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm10B + +repeat_extend_back_end_encodeBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT 
one_byte_repeat_emit_encodeBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm10B + +two_bytes_repeat_emit_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm10B + JMP memmove_long_repeat_emit_encodeBlockAsm10B + +one_byte_repeat_emit_encodeBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm10B + +memmove_long_repeat_emit_encodeBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong 
+ MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm10B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm10B + +matchlen_loopback_repeat_extend_encodeBlockAsm10B: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm10B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_loop_repeat_extend_encodeBlockAsm10B: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B + +matchlen_single_repeat_extend_encodeBlockAsm10B: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: + MOVB (R10)(R12*1), R11 + CMPB 
(SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm10B + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B + +repeat_extend_forward_end_encodeBlockAsm10B: + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm10B + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm10B + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm10B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm10B + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_match_repeat_encodeBlockAsm10B: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_match_repeat_encodeBlockAsm10B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_match_repeat_encodeBlockAsm10B: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_as_copy_encodeBlockAsm10B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBlockAsm10B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + 
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +emit_copy_three_repeat_as_copy_encodeBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm10B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm10B + +no_repeat_found_encodeBlockAsm10B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm10B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm10B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm10B + +candidate3_match_encodeBlockAsm10B: + ADDL $0x02, CX + JMP 
candidate_match_encodeBlockAsm10B + +candidate2_match_encodeBlockAsm10B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm10B + +match_extend_back_loop_encodeBlockAsm10B: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm10B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm10B + JMP match_extend_back_loop_encodeBlockAsm10B + +match_extend_back_end_encodeBlockAsm10B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm10B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm10B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm10B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm10B + +two_bytes_match_emit_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm10B + JMP memmove_long_match_emit_encodeBlockAsm10B + +one_byte_match_emit_encodeBlockAsm10B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm10B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 + 
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm10B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm10B + +memmove_long_match_emit_encodeBlockAsm10B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 
16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm10B: +match_nolit_loop_encodeBlockAsm10B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm10B + +matchlen_loopback_match_nolit_encodeBlockAsm10B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm10B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm10B + +matchlen_loop_match_nolit_encodeBlockAsm10B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm10B + +matchlen_single_match_nolit_encodeBlockAsm10B: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeBlockAsm10B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm10B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B + +match_nolit_end_encodeBlockAsm10B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm10B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short + LEAL -256(R10), R10 
+ MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP two_byte_offset_match_nolit_encodeBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBlockAsm10B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm10B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +emit_copy_three_match_nolit_encodeBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm10B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm10B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x36, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm10B + INCL CX + JMP search_loop_encodeBlockAsm10B + +emit_remainder_encodeBlockAsm10B: + MOVQ 
src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +two_bytes_emit_remainder_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm10B + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +one_byte_emit_remainder_encodeBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP 
memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm10B + +memmove_long_emit_remainder_encodeBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm8B(SB), $1048-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000008, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + 
MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm8B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm8B + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm8B + +repeat_extend_back_loop_encodeBlockAsm8B: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm8B + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm8B + +repeat_extend_back_end_encodeBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +two_bytes_repeat_emit_encodeBlockAsm8B: + MOVB 
$0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm8B + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +one_byte_repeat_emit_encodeBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm8B + +memmove_long_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA 
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm8B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm8B + +matchlen_loopback_repeat_extend_encodeBlockAsm8B: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm8B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_loop_repeat_extend_encodeBlockAsm8B: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B + +matchlen_single_repeat_extend_encodeBlockAsm8B: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm8B + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B + +repeat_extend_forward_end_encodeBlockAsm8B: + ADDL R12, CX + MOVL CX, SI + SUBL DI, 
SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm8B + + // emitRepeat + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm8B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm8B + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_match_repeat_encodeBlockAsm8B: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_match_repeat_encodeBlockAsm8B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_as_copy_encodeBlockAsm8B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBlockAsm8B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, SI 
+ ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm8B + +no_repeat_found_encodeBlockAsm8B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm8B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm8B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm8B + +candidate3_match_encodeBlockAsm8B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm8B + +candidate2_match_encodeBlockAsm8B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm8B + +match_extend_back_loop_encodeBlockAsm8B: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm8B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm8B + JMP match_extend_back_loop_encodeBlockAsm8B + +match_extend_back_end_encodeBlockAsm8B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL 
match_dst_size_check_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm8B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm8B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm8B + +two_bytes_match_emit_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm8B + JMP memmove_long_match_emit_encodeBlockAsm8B + +one_byte_match_emit_encodeBlockAsm8B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + 
MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm8B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm8B + +memmove_long_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm8B: +match_nolit_loop_encodeBlockAsm8B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm8B + +matchlen_loopback_match_nolit_encodeBlockAsm8B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm8B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm8B + +matchlen_loop_match_nolit_encodeBlockAsm8B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + 
CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm8B + +matchlen_single_match_nolit_encodeBlockAsm8B: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeBlockAsm8B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm8B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B + +match_nolit_end_encodeBlockAsm8B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm8B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, SI + LEAL -4(R10), R10 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBlockAsm8B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + 
MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +emit_copy_three_match_nolit_encodeBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm8B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm8B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x38, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x38, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm8B + INCL CX + JMP search_loop_encodeBlockAsm8B + +emit_remainder_encodeBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +two_bytes_emit_remainder_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm8B + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +one_byte_emit_remainder_encodeBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // 
genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm8B + +memmove_long_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA 
emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JLE check_maxskip_ok_encodeBetterBlockAsm + LEAL 100(CX), SI + JMP check_maxskip_cont_encodeBetterBlockAsm + +check_maxskip_ok_encodeBetterBlockAsm: + LEAL 1(CX)(SI*1), SI + +check_maxskip_cont_encodeBetterBlockAsm: + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL 
CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm + +candidateS_match_encodeBetterBlockAsm: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm + +match_extend_back_loop_encodeBetterBlockAsm: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm + JMP match_extend_back_loop_encodeBetterBlockAsm + +match_extend_back_end_encodeBetterBlockAsm: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm + +matchlen_loopback_match_nolit_encodeBetterBlockAsm: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_loop_match_nolit_encodeBetterBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm + +matchlen_single_match_nolit_encodeBetterBlockAsm: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm + 
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm + +match_nolit_end_encodeBetterBlockAsm: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm + CMPL R12, $0x01 + JG match_length_ok_encodeBetterBlockAsm + CMPL R8, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm + +match_length_ok_encodeBetterBlockAsm: + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_encodeBetterBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_match_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +four_bytes_match_emit_encodeBetterBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +three_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +two_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm + +one_byte_match_emit_encodeBetterBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE 
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm + +memmove_long_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy + CMPL R8, $0x00010000 + JL two_byte_offset_match_nolit_encodeBetterBlockAsm + +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: + CMPL R12, $0x40 + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL R8, 1(AX) + LEAL -64(R12), R12 + ADDQ $0x05, AX + CMPL R12, $0x04 + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R12, $0x0100ffff + JLT 
repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy + LEAL -16842747(R12), R12 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm + +four_bytes_remain_match_nolit_encodeBetterBlockAsm: + TESTL R12, R12 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +two_byte_offset_match_nolit_encodeBetterBlockAsm: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE 
repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R12, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short + LEAL -16842747(R12), R12 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm: + CMPL R12, $0x0c + JGE 
emit_copy_three_match_nolit_encodeBetterBlockAsm + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy_three_match_nolit_encodeBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +match_is_repeat_encodeBetterBlockAsm: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +four_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +three_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +two_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +one_byte_match_emit_repeat_encodeBetterBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm + +memmove_long_match_emit_repeat_encodeBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, 
R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat +emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm + CMPL R12, $0x0100ffff + JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm + LEAL -16842747(R12), R12 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm + +repeat_five_match_nolit_repeat_encodeBetterBlockAsm: + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW 
$0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_repeat_encodeBetterBlockAsm: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 
262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm + +emit_remainder_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +four_bytes_emit_remainder_encodeBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +three_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +two_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +one_byte_emit_remainder_encodeBetterBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm + +memmove_long_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + 
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int +// Requires: SSE2 +// +// Returns the AX write cursor minus dst_base once the remainder has been +// emitted, or 0 from any of the three checks against the limit kept at (SP). +TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 + MOVQ dst_base+0(FP), AX + // Zero 0xa00 blocks of 0x80 bytes (327680 bytes) starting at 24(SP). + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm4MB: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm4MB + // Frame slots: 12(SP)=0, 8(SP)=len(src)-8, 16(SP)=0, and + // (SP)=dst_base plus (len(src) - 6 - len(src)>>5). CX starts at 1; DX holds src_base. + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm4MB: + // SI = CX+1+((CX-12(SP))>>7), or CX+100 when the shifted value exceeds 0x63. + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JLE check_maxskip_ok_encodeBetterBlockAsm4MB + LEAL 100(CX), SI + JMP check_maxskip_cont_encodeBetterBlockAsm4MB + +check_maxskip_ok_encodeBetterBlockAsm4MB: + LEAL 1(CX)(SI*1), SI + +check_maxskip_cont_encodeBetterBlockAsm4MB: + // Leave the search once SI reaches the bound in 8(SP). + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm4MB + // DI = 8 src bytes at CX, hashed two ways to index the tables at 24(SP) and + // 262168(SP); SI is parked in 20(SP) and reloaded into CX when neither candidate matches. + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ 
$0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm4MB + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm4MB + +candidateS_match_encodeBetterBlockAsm4MB: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm4MB + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm4MB: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm4MB + +match_extend_back_loop_encodeBetterBlockAsm4MB: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm4MB + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm4MB + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm4MB + JMP match_extend_back_loop_encodeBetterBlockAsm4MB + +match_extend_back_end_encodeBetterBlockAsm4MB: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 4(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm4MB: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + // R12 counts equal bytes at (R9) and (R10); R8 is len(src) minus CX, the bytes left to compare. + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB + +matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB + // Index of the lowest differing bit, shifted right by 3, is the equal-byte count within this 8-byte word. + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP 
match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB + +matchlen_single_match_nolit_encodeBetterBlockAsm4MB: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm4MB + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB + +match_nolit_end_encodeBetterBlockAsm4MB: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + // R8 = CX - SI; when it equals the value saved in 16(SP) the repeat path is taken. + // Otherwise a match with R12 <= 1 and R8 > 0xffff is dropped and the search resumes at 20(SP) plus one. + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm4MB + CMPL R12, $0x01 + JG match_length_ok_encodeBetterBlockAsm4MB + CMPL R8, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm4MB + +match_length_ok_encodeBetterBlockAsm4MB: + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + // SI = R9-1 (R9 bytes are copied from R10 afterwards) selects the header: below 0x3c a + // single byte SI<<2, otherwise tag 0xf0, 0xf4 or 0xf8 followed by 1, 2 or 3 bytes of SI. + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm4MB + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm4MB + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_encodeBetterBlockAsm4MB + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +three_bytes_match_emit_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +two_bytes_match_emit_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm4MB + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +one_byte_match_emit_encodeBetterBlockAsm4MB: + SHLB $0x02, SI 
+ MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + // Copies R9 bytes from (R10) to (AX) with head and tail moves that may overlap; + // the move_4 case always transfers exactly 4 bytes. SI already holds AX plus R9. + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB + +memmove_long_match_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + // X0-X3 capture the first and last 32 bytes up front; they are stored only after the + // 32-byte loop, which indexes with R14 starting at 0x40 - (AX & 0x1f). + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU 
-16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm4MB: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy + // R8 >= 0x10000 takes the 5-byte form (tag byte, then R8 as 32 bits); smaller R8 goes to two_byte_offset. + CMPL R8, $0x00010000 + JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB + +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB: + CMPL R12, $0x40 + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB + MOVB $0xff, (AX) + MOVL R8, 1(AX) + LEAL -64(R12), R12 + ADDQ $0x05, AX + CMPL R12, $0x04 + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB + + // emitRepeat + // SI keeps the incoming R12 for the range tests; R12 is reduced by 4 before it is encoded. + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL R12, 
$0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: + TESTL R12, R12 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + 
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + 
+match_is_repeat_encodeBetterBlockAsm4MB: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB + +three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB + +two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB + +one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + 
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB + +memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back + 
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 
+ ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm4MB + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm4MB: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm4MB + +emit_remainder_encodeBetterBlockAsm4MB: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm4MB: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW 
DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +three_bytes_emit_remainder_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +two_bytes_emit_remainder_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +one_byte_emit_remainder_encodeBetterBlockAsm4MB: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP 
memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB + +memmove_long_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: + // Result is the number of bytes written: AX minus dst_base. + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000280, CX + LEAQ 24(SP), DX + PXOR X0, X0 + 
+zero_loop_encodeBetterBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm12B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 65560(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 65560(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm12B + +candidateS_match_encodeBetterBlockAsm12B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm12B + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm12B + +match_extend_back_loop_encodeBetterBlockAsm12B: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm12B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm12B + JMP match_extend_back_loop_encodeBetterBlockAsm12B + 
+match_extend_back_end_encodeBetterBlockAsm12B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm12B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm12B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm12B + +matchlen_loop_match_nolit_encodeBetterBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B + +matchlen_single_match_nolit_encodeBetterBlockAsm12B: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm12B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B + +match_nolit_end_encodeBetterBlockAsm12B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm12B + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +two_bytes_match_emit_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 
1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +one_byte_match_emit_encodeBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm12B: + MOVQ SI, AX + JMP 
emit_literal_done_match_emit_encodeBetterBlockAsm12B + +memmove_long_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm12B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm12B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + CMPL R12, 
$0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeBetterBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +match_is_repeat_encodeBetterBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP 
memmove_long_match_emit_repeat_encodeBetterBlockAsm12B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B + +one_byte_match_emit_repeat_encodeBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU 
(R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B + 
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm12B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x34, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 65560(SP)(R11*4) + MOVL R15, 65560(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 65560(SP)(R11*4) + 
MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm12B + +emit_remainder_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +two_bytes_emit_remainder_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +one_byte_emit_remainder_encodeBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL 
-4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + +memmove_long_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x000000a0, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 16408(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 16408(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm10B + +candidateS_match_encodeBetterBlockAsm10B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm10B + 
DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm10B + +match_extend_back_loop_encodeBetterBlockAsm10B: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm10B + JMP match_extend_back_loop_encodeBetterBlockAsm10B + +match_extend_back_end_encodeBetterBlockAsm10B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm10B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm10B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm10B + +matchlen_loop_match_nolit_encodeBetterBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B + +matchlen_single_match_nolit_encodeBetterBlockAsm10B: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm10B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B + +match_nolit_end_encodeBetterBlockAsm10B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm10B + MOVL 
R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +two_bytes_match_emit_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +one_byte_match_emit_encodeBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + 
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B + +memmove_long_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm10B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm10B: + CMPL R12, $0x40 + JLE 
two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeBetterBlockAsm10B: + MOVB $0x02, BL + LEAL 
-4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +match_is_repeat_encodeBetterBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B + +one_byte_match_emit_repeat_encodeBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + 
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, 
R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x36, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 16408(SP)(R11*4) + MOVL R15, 16408(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 16408(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm10B + +emit_remainder_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +two_bytes_emit_remainder_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +one_byte_emit_remainder_encodeBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ BX, $0x08 + JB 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + +memmove_long_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000028, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm8B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + 
SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 4120(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 4120(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm8B + +candidateS_match_encodeBetterBlockAsm8B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm8B + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm8B + +match_extend_back_loop_encodeBetterBlockAsm8B: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm8B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm8B + JMP match_extend_back_loop_encodeBetterBlockAsm8B + +match_extend_back_end_encodeBetterBlockAsm8B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm8B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm8B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_loop_match_nolit_encodeBetterBlockAsm8B: + LEAL 
-8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B + +matchlen_single_match_nolit_encodeBetterBlockAsm8B: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm8B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B + +match_nolit_end_encodeBetterBlockAsm8B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm8B + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +two_bytes_match_emit_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +one_byte_match_emit_encodeBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + 
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B + +memmove_long_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back + 
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm8B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm8B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: + CMPL R12, $0x0c 
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeBetterBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +match_is_repeat_encodeBetterBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +one_byte_match_emit_repeat_encodeBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) 
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R11 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back + 
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm8B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 
1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x38, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 4120(SP)(R11*4) + MOVL R15, 4120(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 4120(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm8B + +emit_remainder_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +two_bytes_emit_remainder_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +one_byte_emit_remainder_encodeBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ 
BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + +memmove_long_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBlockAsm(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + 
SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm + +repeat_extend_back_loop_encodeSnappyBlockAsm: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm + +repeat_extend_back_end_encodeSnappyBlockAsm: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_repeat_emit_encodeSnappyBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_repeat_emit_encodeSnappyBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +four_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVL SI, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +three_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +two_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm + JMP 
memmove_long_repeat_emit_encodeSnappyBlockAsm + +one_byte_repeat_emit_encodeSnappyBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm + +memmove_long_repeat_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ 
-32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm + +matchlen_single_repeat_extend_encodeSnappyBlockAsm: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm + +repeat_extend_forward_end_encodeSnappyBlockAsm: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy + CMPL DI, 
$0x00010000 + JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: + CMPL SI, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0xff, (AX) + MOVL DI, 1(AX) + LEAL -64(SI), SI + ADDQ $0x05, AX + CMPL SI, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm + +four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: + TESTL SI, SI + JZ repeat_end_emit_encodeSnappyBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeSnappyBlockAsm + +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm + +no_repeat_found_encodeSnappyBlockAsm: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm + 
+candidate3_match_encodeSnappyBlockAsm: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm + +candidate2_match_encodeSnappyBlockAsm: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm + +match_extend_back_loop_encodeSnappyBlockAsm: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm + JMP match_extend_back_loop_encodeSnappyBlockAsm + +match_extend_back_end_encodeSnappyBlockAsm: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeSnappyBlockAsm + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeSnappyBlockAsm + MOVB $0xfc, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +four_bytes_match_emit_encodeSnappyBlockAsm: + MOVL R8, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW R8, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +three_bytes_match_emit_encodeSnappyBlockAsm: + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +two_bytes_match_emit_encodeSnappyBlockAsm: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm + 
JMP memmove_long_match_emit_encodeSnappyBlockAsm + +one_byte_match_emit_encodeSnappyBlockAsm: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm + +memmove_long_match_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), 
R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm: +match_nolit_loop_encodeSnappyBlockAsm: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm + +matchlen_loop_match_nolit_encodeSnappyBlockAsm: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm + +matchlen_single_match_nolit_encodeSnappyBlockAsm: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm + +match_nolit_end_encodeSnappyBlockAsm: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy + CMPL SI, 
$0x00010000 + JL two_byte_offset_match_nolit_encodeSnappyBlockAsm + +four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: + CMPL R10, $0x40 + JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(R10), R10 + ADDQ $0x05, AX + CMPL R10, $0x04 + JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm + +four_bytes_remain_match_nolit_encodeSnappyBlockAsm: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm + +two_byte_offset_match_nolit_encodeSnappyBlockAsm: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm + +emit_copy_three_match_nolit_encodeSnappyBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + 
MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm + INCL CX + JMP search_loop_encodeSnappyBlockAsm + +emit_remainder_encodeSnappyBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeSnappyBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +four_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +three_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +two_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +one_byte_emit_remainder_encodeSnappyBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 + 
CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm + +memmove_long_emit_remainder_encodeSnappyBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back + 
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ $0x20, R8
+	CMPQ BX, R8
+	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
+// Requires: SSE2
+//
+// Encodes src into dst as a sequence of literal and copy elements and
+// returns the number of bytes written to dst. It returns 0 as soon as
+// the output cursor would reach the limit stored at 0(SP)
+// (dst_base + src_len - 9 - src_len/32).
+// NOTE(review): 0 presumably tells the caller the block did not
+// compress enough - confirm against the Go caller.
+// NOTE(review): literal lengths and copy offsets are written as at most
+// 16-bit values, so this presumably requires len(src) <= 64 KiB - confirm
+// against the caller.
+//
+// Stack frame usage:
+//    0(SP)  output limit pointer (8 bytes)
+//    8(SP)  src_len - 8, the position at which searching stops
+//   12(SP)  index of the first src byte not yet emitted
+//   16(SP)  offset of the most recent match (starts at 1)
+//   20(SP)  position to resume searching from after a miss
+//   24(SP)  hash table, 65536 bytes of 4-byte src indices
+//
+// Long-lived registers: AX = dst write cursor, CX = current src index,
+// DX = src base (reused as scratch once emit_remainder is reached).
+TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000200, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+	// Clear the hash table: 0x200 iterations of 128 bytes = 65536 bytes.
+zero_loop_encodeSnappyBlockAsm64K:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0, 112(DX)
+	ADDQ $0x80, DX
+	DECQ CX
+	JNZ zero_loop_encodeSnappyBlockAsm64K
+	// Initialise the stack slots described in the header comment.
+	MOVL $0x00000000, 12(SP)
+	MOVQ src_len+32(FP), CX
+	LEAQ -9(CX), DX
+	LEAQ -8(CX), SI
+	MOVL SI, 8(SP)
+	SHRQ $0x05, CX
+	SUBL CX, DX
+	LEAQ (AX)(DX*1), DX
+	MOVQ DX, (SP)
+	MOVL $0x00000001, CX
+	MOVL CX, 16(SP)
+	MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm64K:
+	// Next position to try on a miss: CX + 4 + (CX - 12(SP))>>6, so the
+	// stride grows with the distance since the last emitted byte.
+	MOVL CX, SI
+	SUBL 12(SP), SI
+	SHRL $0x06, SI
+	LEAL 4(CX)(SI*1), SI
+	CMPL SI, 8(SP)
+	JGE emit_remainder_encodeSnappyBlockAsm64K
+	MOVQ (DX)(CX*1), DI
+	MOVL SI, 20(SP)
+	// Hash the 8 bytes loaded at CX twice: R10 from byte 0, R11 from byte 1.
+	// SHLQ $0x10 drops the top two bytes so six bytes feed the multiply;
+	// SHRQ $0x32 keeps the top 14 bits as the table index.
+	MOVQ $0x0000cf1bbcdcbf9b, R9
+	MOVQ DI, R10
+	MOVQ DI, R11
+	SHRQ $0x08, R11
+	SHLQ $0x10, R10
+	IMULQ R9, R10
+	SHRQ $0x32, R10
+	SHLQ $0x10, R11
+	IMULQ R9, R11
+	SHRQ $0x32, R11
+	// Fetch the previous table entries (SI, R8) and replace them with
+	// CX and CX+1.
+	MOVL 24(SP)(R10*4), SI
+	MOVL 24(SP)(R11*4), R8
+	MOVL CX, 24(SP)(R10*4)
+	LEAL 1(CX), R10
+	MOVL R10, 24(SP)(R11*4)
+	// R10 = table index for the bytes starting at CX+2.
+	MOVQ DI, R10
+	SHRQ $0x10, R10
+	SHLQ $0x10, R10
+	IMULQ R9, R10
+	SHRQ $0x32, R10
+	// Repeat check: compare the 4 bytes at src[CX+1-offset] with the
+	// 4 bytes at src[CX+1] (bytes 1..4 of the loaded word).
+	MOVL CX, R9
+	SUBL 16(SP), R9
+	MOVL 1(DX)(R9*1), R11
+	MOVQ DI, R9
+	SHRQ $0x08, R9
+	CMPL R9, R11
+	JNE no_repeat_found_encodeSnappyBlockAsm64K
+	// Repeat found at CX+1. DI = match start, R8 = its earlier copy.
+	LEAL 1(CX), DI
+	MOVL 12(SP), SI
+	MOVL DI, R8
+	SUBL 16(SP), R8
+	JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
+
+	// Extend the match backwards while the preceding bytes are equal,
+	// DI stays above 12(SP) and R8 has not reached 0.
+repeat_extend_back_loop_encodeSnappyBlockAsm64K:
+	CMPL DI, SI
+	JLE repeat_extend_back_end_encodeSnappyBlockAsm64K
+	MOVB -1(DX)(R8*1), BL
+	MOVB -1(DX)(DI*1), R9
+	CMPB BL, R9
+	JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
+	LEAL -1(DI), DI
+	DECL R8
+	JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
+
+repeat_extend_back_end_encodeSnappyBlockAsm64K:
+	// Emit src[12(SP):DI] as a literal. R9 = source pointer, R8 = length,
+	// SI = length-1. The header is 0xf4 plus a 16-bit length-1, 0xf0 plus
+	// an 8-bit length-1, or the single byte (length-1)<<2 when
+	// length-1 < 0x3c.
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
+	MOVL DI, R8
+	MOVL DI, 12(SP)
+	LEAQ (DX)(SI*1), R9
+	SUBL SI, R8
+	LEAL -1(R8), SI
+	CMPL SI, $0x3c
+	JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K
+	CMPL SI, $0x00000100
+	JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K
+	MOVB $0xf4, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
+	MOVB $0xf0, (AX)
+	MOVB SI, 1(AX)
+	ADDQ $0x02, AX
+	// Lengths with length-1 < 0x40 still use the short copy below.
+	CMPL SI, $0x40
+	JL memmove_repeat_emit_encodeSnappyBlockAsm64K
+	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
+
+one_byte_repeat_emit_encodeSnappyBlockAsm64K:
+	SHLB $0x02, SI
+	MOVB SI, (AX)
+	ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm64K:
+	// SI = dst cursor after the literal bytes.
+	LEAQ (AX)(R8*1), SI
+
+	// genMemMoveShort
+	// Copies R8 bytes with overlapping fixed-size moves chosen by length.
+	// The <=8 case always moves one full 8-byte word; the cursor still
+	// advances by exactly R8 (see memmove_end_copy below).
+	CMPQ R8, $0x08
+	JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
+	CMPQ R8, $0x10
+	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
+	CMPQ R8, $0x20
+	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
+	JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
+	MOVQ (R9), R10
+	MOVQ R10, (AX)
+	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
+	MOVQ (R9), R10
+	MOVQ -8(R9)(R8*1), R9
+	MOVQ R10, (AX)
+	MOVQ R9, -8(AX)(R8*1)
+	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
+	MOVOU (R9), X0
+	MOVOU -16(R9)(R8*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R8*1)
+	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
+	MOVQ SI, AX
+	JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
+	LEAQ (AX)(R8*1), SI
+
+	// genMemMoveLong
+	// The first and last 32 source bytes are loaded into X0-X3 up front and
+	// stored last with unaligned moves; the loops in between copy 32 bytes
+	// per iteration using MOVOA stores, starting at offset
+	// R12 = 64 - (AX & 31).
+	MOVOU (R9), X0
+	MOVOU 16(R9), X1
+	MOVOU -32(R9)(R8*1), X2
+	MOVOU -16(R9)(R8*1), X3
+	MOVQ R8, R11
+	SHRQ $0x05, R11
+	MOVQ AX, R10
+	ANDL $0x0000001f, R10
+	MOVQ $0x00000040, R12
+	SUBQ R10, R12
+	DECQ R11
+	JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	LEAQ -32(R9)(R12*1), R10
+	LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ $0x20, R13
+	ADDQ $0x20, R10
+	ADDQ $0x20, R12
+	DECQ R11
+	JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+	MOVOU -32(R9)(R12*1), X4
+	MOVOU -16(R9)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ $0x20, R12
+	CMPQ R8, R12
+	JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R8*1)
+	MOVOU X3, -16(AX)(R8*1)
+	MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
+	// Step past the 4 bytes already compared at CX+1 (CX += 5) and set up
+	// forward extension: R9 = src+CX, SI = src+CX-offset, R8 = bytes left.
+	ADDL $0x05, CX
+	MOVL CX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R8
+	SUBL CX, R8
+	LEAQ (DX)(CX*1), R9
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	// R11 = number of equal bytes. Compares 8 bytes at a time (XOR, then BSF
+	// and >>3 to locate the first differing byte), then the tail byte by byte.
+	XORL R11, R11
+	CMPL R8, $0x08
+	JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
+	MOVQ (R9)(R11*1), R10
+	XORQ (SI)(R11*1), R10
+	TESTQ R10, R10
+	JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
+	BSFQ R10, R10
+	SARQ $0x03, R10
+	LEAL (R11)(R10*1), R11
+	JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
+	LEAL -8(R8), R8
+	LEAL 8(R11), R11
+	CMPL R8, $0x08
+	JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm64K:
+	TESTL R8, R8
+	JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K:
+	MOVB (R9)(R11*1), R10
+	CMPB (SI)(R11*1), R10
+	JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
+	LEAL 1(R11), R11
+	DECL R8
+	JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K
+
+repeat_extend_forward_end_encodeSnappyBlockAsm64K:
+	// CX = end of match, SI = match length (CX - DI), DI = offset.
+	ADDL R11, CX
+	MOVL CX, SI
+	SUBL DI, SI
+	MOVL 16(SP), DI
+
+	// emitCopy
+	// While length > 64, emit a 3-byte copy of length 60
+	// (tag 0xee = 2 | 59<<2, then the 16-bit offset).
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
+	CMPL SI, $0x40
+	JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
+	MOVB $0xee, (AX)
+	MOVW DI, 1(AX)
+	LEAL -60(SI), SI
+	ADDQ $0x03, AX
+	JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
+	// The 2-byte form is used only when length < 12 and offset < 2048:
+	// tag byte = low 8 bits of 1 | (length-4)<<2 | (offset>>8)<<5,
+	// followed by the low offset byte.
+	CMPL SI, $0x0c
+	JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
+	CMPL DI, $0x00000800
+	JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
+	MOVB $0x01, BL
+	LEAL -16(BX)(SI*4), SI
+	MOVB DI, 1(AX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL DI, SI
+	MOVB SI, (AX)
+	ADDQ $0x02, AX
+	JMP repeat_end_emit_encodeSnappyBlockAsm64K
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
+	// 3-byte form: tag byte = low 8 bits of 2 | (length-1)<<2, then the
+	// 16-bit offset.
+	MOVB $0x02, BL
+	LEAL -4(BX)(SI*4), SI
+	MOVB SI, (AX)
+	MOVW DI, 1(AX)
+	ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm64K:
+	// Everything up to CX has now been emitted.
+	MOVL CX, 12(SP)
+	JMP search_loop_encodeSnappyBlockAsm64K
+
+no_repeat_found_encodeSnappyBlockAsm64K:
+	// Test up to three candidates from the hash table against the 4 bytes
+	// at CX, CX+1 and CX+2 (DI is shifted right one byte between tests).
+	CMPL (DX)(SI*1), DI
+	JEQ candidate_match_encodeSnappyBlockAsm64K
+	SHRQ $0x08, DI
+	MOVL 24(SP)(R10*4), SI
+	LEAL 2(CX), R9
+	CMPL (DX)(R8*1), DI
+	JEQ candidate2_match_encodeSnappyBlockAsm64K
+	MOVL R9, 24(SP)(R10*4)
+	SHRQ $0x08, DI
+	CMPL (DX)(SI*1), DI
+	JEQ candidate3_match_encodeSnappyBlockAsm64K
+	// No candidate matched: continue from the position saved in 20(SP).
+	MOVL 20(SP), CX
+	JMP search_loop_encodeSnappyBlockAsm64K
+
+candidate3_match_encodeSnappyBlockAsm64K:
+	ADDL $0x02, CX
+	JMP candidate_match_encodeSnappyBlockAsm64K
+
+candidate2_match_encodeSnappyBlockAsm64K:
+	MOVL R9, 24(SP)(R10*4)
+	INCL CX
+	MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm64K:
+	// CX = match position, SI = candidate position. Extend backwards while
+	// the preceding bytes are equal, CX stays above 12(SP) and SI is not 0.
+	MOVL 12(SP), DI
+	TESTL SI, SI
+	JZ match_extend_back_end_encodeSnappyBlockAsm64K
+
+match_extend_back_loop_encodeSnappyBlockAsm64K:
+	CMPL CX, DI
+	JLE match_extend_back_end_encodeSnappyBlockAsm64K
+	MOVB -1(DX)(SI*1), BL
+	MOVB -1(DX)(CX*1), R8
+	CMPB BL, R8
+	JNE match_extend_back_end_encodeSnappyBlockAsm64K
+	LEAL -1(CX), CX
+	DECL SI
+	JZ match_extend_back_end_encodeSnappyBlockAsm64K
+	JMP match_extend_back_loop_encodeSnappyBlockAsm64K
+
+match_extend_back_end_encodeSnappyBlockAsm64K:
+	// Return 0 unless AX + 3 + pending literal length is below the limit.
+	MOVL CX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(AX)(DI*1), DI
+	CMPQ DI, (SP)
+	JL match_dst_size_check_encodeSnappyBlockAsm64K
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_dst_size_check_encodeSnappyBlockAsm64K:
+	// Emit src[12(SP):CX] as a literal, using the same header forms as the
+	// repeat path. DI = source pointer, R9 = length, R8 = length-1.
+	MOVL CX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (DX)(R8*1), DI
+	SUBL R8, R9
+	LEAL -1(R9), R8
+	CMPL R8, $0x3c
+	JLT one_byte_match_emit_encodeSnappyBlockAsm64K
+	CMPL R8, $0x00000100
+	JLT two_bytes_match_emit_encodeSnappyBlockAsm64K
+	MOVB $0xf4, (AX)
+	MOVW R8, 1(AX)
+	ADDQ $0x03, AX
+	JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
+
+two_bytes_match_emit_encodeSnappyBlockAsm64K:
+	MOVB $0xf0, (AX)
+	MOVB R8, 1(AX)
+	ADDQ $0x02, AX
+	CMPL R8, $0x40
+	JL memmove_match_emit_encodeSnappyBlockAsm64K
+	JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
+
+one_byte_match_emit_encodeSnappyBlockAsm64K:
+	SHLB $0x02, R8
+	MOVB R8, (AX)
+	ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm64K:
+	// R8 = dst cursor after the literal bytes.
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveShort
+	// Same length-bucketed overlapping copy as in the repeat path.
+	CMPQ R9, $0x08
+	JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
+	CMPQ R9, $0x10
+	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
+	JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
+	MOVQ (DI), R10
+	MOVQ R10, (AX)
+	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (AX)
+	MOVQ DI, -8(AX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
+	MOVQ R8, AX
+	JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
+
+memmove_long_match_emit_encodeSnappyBlockAsm64K:
+	LEAQ (AX)(R9*1), R8
+
+	// genMemMoveLong
+	// Same head/tail-in-registers plus 32-byte loop copy as in the repeat path.
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ R9, R11
+	SHRQ $0x05, R11
+	MOVQ AX, R10
+	ANDL $0x0000001f, R10
+	MOVQ $0x00000040, R12
+	SUBQ R10, R12
+	DECQ R11
+	JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	LEAQ -32(DI)(R12*1), R10
+	LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ $0x20, R13
+	ADDQ $0x20, R10
+	ADDQ $0x20, R12
+	DECQ R11
+	JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(AX)(R12*1)
+	MOVOA X5, -16(AX)(R12*1)
+	ADDQ $0x20, R12
+	CMPQ R9, R12
+	JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(R9*1)
+	MOVOU X3, -16(AX)(R9*1)
+	MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
+match_nolit_loop_encodeSnappyBlockAsm64K:
+	// Record the offset (CX - SI) in 16(SP), skip the 4 bytes already known
+	// to match, and measure how far the match continues.
+	MOVL CX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	ADDL $0x04, CX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL CX, DI
+	LEAQ (DX)(CX*1), R8
+	LEAQ (DX)(SI*1), SI
+
+	// matchLen
+	// R10 = number of equal bytes; 8 at a time, then byte by byte.
+	XORL R10, R10
+	CMPL DI, $0x08
+	JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
+	MOVQ (R8)(R10*1), R9
+	XORQ (SI)(R10*1), R9
+	TESTQ R9, R9
+	JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
+	BSFQ R9, R9
+	SARQ $0x03, R9
+	LEAL (R10)(R9*1), R10
+	JMP match_nolit_end_encodeSnappyBlockAsm64K
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	CMPL DI, $0x08
+	JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm64K:
+	TESTL DI, DI
+	JZ match_nolit_end_encodeSnappyBlockAsm64K
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE match_nolit_end_encodeSnappyBlockAsm64K
+	LEAL 1(R10), R10
+	DECL DI
+	JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K
+
+match_nolit_end_encodeSnappyBlockAsm64K:
+	// CX = end of match, SI = offset, R10 = total match length (extension
+	// plus the 4 skipped bytes). 12(SP) is advanced to CX.
+	ADDL R10, CX
+	MOVL 16(SP), SI
+	ADDL $0x04, R10
+	MOVL CX, 12(SP)
+
+	// emitCopy
+	// Same encoding as the repeat path: length-60 chunks with tag 0xee while
+	// length > 64, then one 2-byte or 3-byte copy element.
+two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
+	CMPL R10, $0x40
+	JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
+	MOVB $0xee, (AX)
+	MOVW SI, 1(AX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, AX
+	JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
+	CMPL R10, $0x0c
+	JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
+	CMPL SI, $0x00000800
+	JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
+	MOVB $0x01, BL
+	LEAL -16(BX)(R10*4), R10
+	MOVB SI, 1(AX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL SI, R10
+	MOVB R10, (AX)
+	ADDQ $0x02, AX
+	JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
+	MOVB $0x02, BL
+	LEAL -4(BX)(R10*4), R10
+	MOVB R10, (AX)
+	MOVW SI, 1(AX)
+	ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
+	// Stop searching at the limit in 8(SP); return 0 if the output cursor
+	// has reached the limit in 0(SP).
+	CMPL CX, 8(SP)
+	JGE emit_remainder_encodeSnappyBlockAsm64K
+	MOVQ -2(DX)(CX*1), DI
+	CMPQ AX, (SP)
+	JL match_nolit_dst_ok_encodeSnappyBlockAsm64K
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm64K:
+	// Index positions CX-2 and CX in the hash table, then test the previous
+	// entry for CX: if its 4 bytes equal those at CX, start another match
+	// immediately without emitting a literal.
+	MOVQ $0x0000cf1bbcdcbf9b, R9
+	MOVQ DI, R8
+	SHRQ $0x10, DI
+	MOVQ DI, SI
+	SHLQ $0x10, R8
+	IMULQ R9, R8
+	SHRQ $0x32, R8
+	SHLQ $0x10, SI
+	IMULQ R9, SI
+	SHRQ $0x32, SI
+	LEAL -2(CX), R9
+	LEAQ 24(SP)(SI*4), R10
+	MOVL (R10), SI
+	MOVL R9, 24(SP)(R8*4)
+	MOVL CX, (R10)
+	CMPL (DX)(SI*1), DI
+	JEQ match_nolit_loop_encodeSnappyBlockAsm64K
+	INCL CX
+	JMP search_loop_encodeSnappyBlockAsm64K
+
+emit_remainder_encodeSnappyBlockAsm64K:
+	// Return 0 unless AX + 3 + (src_len - 12(SP)) is below the output limit.
+	MOVQ src_len+32(FP), CX
+	SUBL 12(SP), CX
+	LEAQ 3(AX)(CX*1), CX
+	CMPQ CX, (SP)
+	JL emit_remainder_ok_encodeSnappyBlockAsm64K
+	MOVQ $0x00000000, ret+48(FP)
+	RET
+
+emit_remainder_ok_encodeSnappyBlockAsm64K:
+	// Emit src[12(SP):src_len] as the final literal. CX = source pointer,
+	// SI = length; DX stops being the src base here and holds length-1.
+	MOVQ src_len+32(FP), CX
+	MOVL 12(SP), BX
+	CMPL BX, CX
+	JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
+	MOVL CX, SI
+	MOVL CX, 12(SP)
+	LEAQ (DX)(BX*1), CX
+	SUBL BX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K
+	CMPL DX, $0x00000100
+	JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K
+	MOVB $0xf4, (AX)
+	MOVW DX, 1(AX)
+	ADDQ $0x03, AX
+	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
+	MOVB $0xf0, (AX)
+	MOVB DL, 1(AX)
+	ADDQ $0x02, AX
+	CMPL DX, $0x40
+	JL memmove_emit_remainder_encodeSnappyBlockAsm64K
+	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
+
+one_byte_emit_remainder_encodeSnappyBlockAsm64K:
+	SHLB $0x02, DL
+	MOVB DL, (AX)
+	ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm64K:
+	// DX = dst cursor after the literal bytes, BX = length.
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	// Same length-bucketed overlapping copy as in the earlier literal paths.
+	CMPQ BX, $0x08
+	JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8
+	CMPQ BX, $0x10
+	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
+	JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8:
+	MOVQ (CX), SI
+	MOVQ SI, (AX)
+	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
+	MOVQ (CX), SI
+	MOVQ -8(CX)(BX*1), CX
+	MOVQ SI, (AX)
+	MOVQ CX, -8(AX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
+	MOVOU (CX), X0
+	MOVOU -16(CX)(BX*1), X1
+	MOVOU X0, (AX)
+	MOVOU X1, -16(AX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
+	MOVQ DX, AX
+	JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
+	LEAQ (AX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	// Same head/tail-in-registers plus 32-byte loop copy as in the earlier
+	// literal paths.
+	MOVOU (CX), X0
+	MOVOU 16(CX), X1
+	MOVOU -32(CX)(BX*1), X2
+	MOVOU -16(CX)(BX*1), X3
+	MOVQ BX, DI
+	SHRQ $0x05, DI
+	MOVQ AX, SI
+	ANDL $0x0000001f, SI
+	MOVQ $0x00000040, R8
+	SUBQ SI, R8
+	DECQ DI
+	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	LEAQ -32(CX)(R8*1), SI
+	LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ $0x20, R9
+	ADDQ $0x20, SI
+	ADDQ $0x20, R8
+	DECQ DI
+	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+	MOVOU -32(CX)(R8*1), X4
+	MOVOU -16(CX)(R8*1), X5
+	MOVOA X4, -32(AX)(R8*1)
+	MOVOA X5, -16(AX)(R8*1)
+	ADDQ $0x20, R8
+	CMPQ BX, R8
+	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (AX)
+	MOVOU X1, 16(AX)
+	MOVOU X2, -32(AX)(BX*1)
+	MOVOU X3, -16(AX)(BX*1)
+	MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
+	// Return the number of bytes written: dst cursor minus dst_base.
+	MOVQ dst_base+0(FP), CX
+	SUBQ CX, AX
+	MOVQ AX, ret+48(FP)
+	RET
+
+// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
+	MOVQ dst_base+0(FP), AX
+	MOVQ $0x00000080, CX
+	LEAQ 24(SP), DX
+	PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm12B:
+	MOVOU X0, (DX)
+	MOVOU X0, 16(DX)
+	MOVOU X0, 32(DX)
+	MOVOU X0, 48(DX)
+	MOVOU X0, 64(DX)
+	MOVOU X0, 80(DX)
+	MOVOU X0, 96(DX)
+	MOVOU X0,
112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm12B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x18, R11 + IMULQ R9, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm12B + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm12B + +repeat_extend_back_loop_encodeSnappyBlockAsm12B: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm12B + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm12B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B + +repeat_extend_back_end_encodeSnappyBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + 
+two_bytes_repeat_emit_encodeSnappyBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm12B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + +one_byte_repeat_emit_encodeSnappyBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B + +memmove_long_repeat_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ 
$0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B + +matchlen_single_repeat_extend_encodeSnappyBlockAsm12B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE 
repeat_extend_forward_end_encodeSnappyBlockAsm12B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B + +repeat_extend_forward_end_encodeSnappyBlockAsm12B: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm12B: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm12B + +no_repeat_found_encodeSnappyBlockAsm12B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm12B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm12B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm12B + +candidate3_match_encodeSnappyBlockAsm12B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm12B + +candidate2_match_encodeSnappyBlockAsm12B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm12B + 
+match_extend_back_loop_encodeSnappyBlockAsm12B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm12B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm12B + JMP match_extend_back_loop_encodeSnappyBlockAsm12B + +match_extend_back_end_encodeSnappyBlockAsm12B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm12B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm12B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm12B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + +two_bytes_match_emit_encodeSnappyBlockAsm12B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + +one_byte_match_emit_encodeSnappyBlockAsm12B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP 
memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B + +memmove_long_match_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 
-32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm12B: +match_nolit_loop_encodeSnappyBlockAsm12B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B + +matchlen_single_match_nolit_encodeSnappyBlockAsm12B: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm12B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B + +match_nolit_end_encodeSnappyBlockAsm12B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + 
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm12B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x18, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + SHLQ $0x18, SI + IMULQ R9, SI + SHRQ $0x34, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm12B + INCL CX + JMP search_loop_encodeSnappyBlockAsm12B + +emit_remainder_encodeSnappyBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + +two_bytes_emit_remainder_encodeSnappyBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + +one_byte_emit_remainder_encodeSnappyBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm12B: + 
LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + +memmove_long_emit_remainder_encodeSnappyBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + 
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000020, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + 
MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm10B + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm10B + +repeat_extend_back_loop_encodeSnappyBlockAsm10B: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm10B + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm10B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B + +repeat_extend_back_end_encodeSnappyBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + +two_bytes_repeat_emit_encodeSnappyBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm10B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + +one_byte_repeat_emit_encodeSnappyBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B + +memmove_long_repeat_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, 
-32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B + +matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B + +repeat_extend_forward_end_encodeSnappyBlockAsm10B: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + CMPL DI, 
$0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm10B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm10B: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm10B + +no_repeat_found_encodeSnappyBlockAsm10B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm10B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm10B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm10B + +candidate3_match_encodeSnappyBlockAsm10B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm10B + +candidate2_match_encodeSnappyBlockAsm10B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm10B + +match_extend_back_loop_encodeSnappyBlockAsm10B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm10B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBlockAsm10B + +match_extend_back_end_encodeSnappyBlockAsm10B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm10B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B + MOVL DI, R9 + MOVL 
DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm10B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm10B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + +two_bytes_match_emit_encodeSnappyBlockAsm10B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + +one_byte_match_emit_encodeSnappyBlockAsm10B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + 
+memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B + +memmove_long_match_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm10B: +match_nolit_loop_encodeSnappyBlockAsm10B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: + 
LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B + +matchlen_single_match_nolit_encodeSnappyBlockAsm10B: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm10B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B + +match_nolit_end_encodeSnappyBlockAsm10B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm10B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm10B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x36, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) 
+ CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm10B + INCL CX + JMP search_loop_encodeSnappyBlockAsm10B + +emit_remainder_encodeSnappyBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + +two_bytes_emit_remainder_encodeSnappyBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + +one_byte_emit_remainder_encodeSnappyBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, 
-8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + +memmove_long_emit_remainder_encodeSnappyBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, 
ret+48(FP) + RET + +// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000008, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm8B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm8B + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm8B + +repeat_extend_back_loop_encodeSnappyBlockAsm8B: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm8B + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm8B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B + +repeat_extend_back_end_encodeSnappyBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ 
emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + +two_bytes_repeat_emit_encodeSnappyBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm8B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + +one_byte_repeat_emit_encodeSnappyBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 
-32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B + +memmove_long_repeat_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: + LEAL -8(R8), 
R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B + +matchlen_single_repeat_extend_encodeSnappyBlockAsm8B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B + +repeat_extend_forward_end_encodeSnappyBlockAsm8B: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm8B + +no_repeat_found_encodeSnappyBlockAsm8B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm8B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm8B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm8B + +candidate3_match_encodeSnappyBlockAsm8B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm8B + 
+candidate2_match_encodeSnappyBlockAsm8B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm8B + +match_extend_back_loop_encodeSnappyBlockAsm8B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm8B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBlockAsm8B + +match_extend_back_end_encodeSnappyBlockAsm8B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm8B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm8B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm8B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + +two_bytes_match_emit_encodeSnappyBlockAsm8B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + +one_byte_match_emit_encodeSnappyBlockAsm8B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B + +memmove_long_match_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, 
-16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm8B: +match_nolit_loop_encodeSnappyBlockAsm8B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B + +matchlen_single_match_nolit_encodeSnappyBlockAsm8B: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm8B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B + +match_nolit_end_encodeSnappyBlockAsm8B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + 
SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm8B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm8B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x38, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x38, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm8B + INCL CX + JMP search_loop_encodeSnappyBlockAsm8B + +emit_remainder_encodeSnappyBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B + +two_bytes_emit_remainder_encodeSnappyBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B + +one_byte_emit_remainder_encodeSnappyBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX 
+ +memmove_emit_remainder_encodeSnappyBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + +memmove_long_emit_remainder_encodeSnappyBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + 
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET +
+// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+//
+// Encodes src into dst as a stream of literal and copy elements and returns
+// the number of bytes written to dst. Returns 0 whenever one of its output
+// checks finds the (projected) write position at or beyond the limit
+// dst_base + len(src) - 9 - len(src)/32.
+//
+// dst_len and dst_cap are never read, so the caller has to supply a dst that
+// is large enough. NOTE(review): literals of 8 bytes or fewer are copied with
+// a full 8-byte move, so src and dst presumably need slack past the data;
+// confirm against the Go caller.
+//
+// Stack frame:
+//   0(SP)       dst limit (pointer)
+//   8(SP)       search limit, len(src) - 8
+//   12(SP)      nextEmit: src index of the first byte not yet emitted
+//   16(SP)      offset of the most recent match (written, never read here)
+//   20(SP)      nextS: src index the search continues at after a miss
+//   24(SP)      long table, 65536 32-bit entries, indexed by a 16-bit hash of 7 bytes
+//   262168(SP)  short table, 16384 32-bit entries, indexed by a 14-bit hash of 4 bytes
+//
+// Registers held across the main loop: AX = dst write pointer, CX = current
+// src index (s), DX = src base pointer.
+TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
+ MOVQ dst_base+0(FP), AX
+ // Zero both hash tables: 0xa00 iterations of 128 bytes = 327680 bytes from 24(SP).
+ MOVQ $0x00000a00, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm
+ // nextEmit = 0; search limit = len(src) - 8;
+ // dst limit = dst + len(src) - 9 - len(src)>>5.
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ // Start searching at s = 1 with no previous offset; DX becomes the src base.
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBetterBlockAsm:
+ // nextS = s + 1 + min((s - nextEmit) >> 7, 99): the step grows with the
+ // number of bytes scanned since the last emit and is capped at 100.
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x07, SI
+ CMPL SI, $0x63
+ JLE check_maxskip_ok_encodeSnappyBetterBlockAsm
+ LEAL 100(CX), SI
+ JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
+
+check_maxskip_ok_encodeSnappyBetterBlockAsm:
+ LEAL 1(CX)(SI*1), SI
+
+check_maxskip_cont_encodeSnappyBetterBlockAsm:
+ // Stop searching once nextS reaches the search limit.
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm
+ // DI = 8 bytes at src[s]. R10 = 16-bit hash of its low 7 bytes (long table),
+ // R11 = 14-bit hash of its low 4 bytes (short table).
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x32, R11
+ // Load both candidates (SI = long, R8 = short), then record s in both tables.
+ MOVL 24(SP)(R10*4), SI
+ MOVL 262168(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 262168(SP)(R11*4)
+ // A candidate is accepted when its first 4 bytes equal the 4 bytes at s.
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm
+ // No match: continue at nextS.
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm
+
+candidateS_match_encodeSnappyBetterBlockAsm:
+ // The short-table candidate matched. Look up the long table for s+1
+ // (storing s+1 there) and use that candidate if its 4 bytes match the bytes
+ // at s+1; otherwise go back to s with the short candidate from R8.
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm:
+ // Extend the match backwards while the preceding bytes agree, stopping at
+ // nextEmit (DI) or when the candidate index reaches 0.
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
+
+match_extend_back_end_encodeSnappyBetterBlockAsm:
+ // Return 0 unless dst + pending literal length + 5 is below the dst limit.
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 5(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm:
+ // DI = match start. Advance both positions by 4, then count further equal
+ // bytes in R12; R8 = bytes left in src, R9/R10 = the two compare pointers.
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
+ // Compare 8 bytes at a time; on a difference the index of the lowest set
+ // bit of the XOR, divided by 8, is the number of equal leading bytes.
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm:
+ // Fewer than 8 bytes left: compare byte by byte.
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm
+
+match_nolit_end_encodeSnappyBetterBlockAsm:
+ // R8 = match offset (distance between the two positions).
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ // A match extended by at most 1 byte (length <= 5) whose offset is above
+ // 65535 is dropped; the search resumes at nextS + 1.
+ CMPL R12, $0x01
+ JG match_length_ok_encodeSnappyBetterBlockAsm
+ CMPL R8, $0x0000ffff
+ JLE match_length_ok_encodeSnappyBetterBlockAsm
+ MOVL 20(SP), CX
+ INCL CX
+ JMP search_loop_encodeSnappyBetterBlockAsm
+
+match_length_ok_encodeSnappyBetterBlockAsm:
+ // Store the offset, then emit src[nextEmit:DI] as a literal if it is not
+ // empty. R10 = literal source, R9 = literal length, SI = length - 1.
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ // Literal header: length-1 below 60 is stored in the tag byte itself;
+ // otherwise tag 0xf0/0xf4/0xf8/0xfc is followed by 1/2/3/4 bytes of length-1.
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm
+ CMPL SI, $0x01000000
+ JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm
+ MOVB $0xfc, (AX)
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+four_bytes_match_emit_encodeSnappyBetterBlockAsm:
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+three_bytes_match_emit_encodeSnappyBetterBlockAsm:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ // Literals of up to 64 bytes take the short copy, longer ones the long copy.
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm:
+ // SI = dst pointer just past the literal.
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ // Dispatch on the length in R9; every case uses (possibly overlapping)
+ // moves anchored at the start and at the end of the literal.
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
+ // Lengths up to 8 use one 8-byte move, so bytes past a shorter literal are
+ // read and written too; AX is set to the exact end (SI) afterwards.
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ // The first and last 32 bytes are loaded up front and stored last with
+ // unaligned moves. In between, the forward loop copies 32 bytes per
+ // iteration; its first store lands at AX + 32 - (AX & 31), so the MOVOA
+ // stores are 32-byte aligned.
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
+ // CX = index just past the match, R12 = full match length (extension + 4),
+ // nextEmit = end of the match.
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+ // Offsets of 65536 and above use the 5-byte form (tag + 32-bit offset).
+ CMPL R8, $0x00010000
+ JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
+
+four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
+ // While more than 64 bytes remain: write tag 0xff plus the offset and
+ // subtract 64 from the remaining length.
+ CMPL R12, $0x40
+ JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
+ MOVB $0xff, (AX)
+ MOVL R8, 1(AX)
+ LEAL -64(R12), R12
+ ADDQ $0x05, AX
+ CMPL R12, $0x04
+ JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
+ JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
+
+four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
+ // Remaining length, if any: tag byte = 3 + 4*(length - 1), then the
+ // 32-bit offset.
+ TESTL R12, R12
+ JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+ MOVB $0x03, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
+ // While more than 64 bytes remain: write tag 0xee plus the 16-bit offset
+ // and subtract 60 from the remaining length.
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
+ // Length below 12 and offset below 2048 fit the 2-byte form:
+ // tag byte = 1 + 4*(length - 4) | (offset >> 8) << 5, then the low offset byte.
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
+ // 3-byte form: tag byte = 2 + 4*(length - 1), then the 16-bit offset.
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
+ // Finish if s reached the search limit; return 0 if dst reached its limit.
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
+ // Index positions inside the match before searching again, with "start"
+ // the match start (DI before the increment) and "end" the index in CX:
+ //   long table:  start+1, start+2, end-2, end-1
+ //   short table: start+2, start+3, end-1
+ MOVQ $0x00cf1bbcdcbfa563, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x32, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 262168(SP)(R11*4)
+ MOVL R15, 262168(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 262168(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm
+
+emit_remainder_encodeSnappyBetterBlockAsm:
+ // Emit src[nextEmit:] as the final literal. Return 0 unless
+ // dst + remaining length + 5 is below the dst limit.
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 5(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm:
+ // Nothing to do when nextEmit == len(src). Otherwise CX = literal source,
+ // SI = length, DX = length - 1 (the src base held in DX is overwritten).
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ // Literal header, same layout as for literals emitted before a match.
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
+ CMPL DX, $0x01000000
+ JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
+ MOVB $0xfc, (AX)
+ MOVL DX, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX)
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm:
+ // DX = dst pointer just past the literal, BX = length.
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8:
+ // One 8-byte move for any length up to 8. For a shorter final literal the
+ // load extends past src[len(src)-1] and the store past the literal.
+ // NOTE(review): confirm the caller guarantees this over-read/over-write is safe.
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ // Same scheme as the long literal copy before a match: first and last
+ // 32 bytes via X0-X3, the middle in 32-byte steps with aligned stores.
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
+ // Return the number of bytes written: AX - dst_base.
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm64K: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm64K: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm64K + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ
candidate_match_encodeSnappyBetterBlockAsm64K + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm64K + +candidateS_match_encodeSnappyBetterBlockAsm64K: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm64K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + +match_extend_back_loop_encodeSnappyBetterBlockAsm64K: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K + +match_extend_back_end_encodeSnappyBetterBlockAsm64K: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm64K: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE 
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm64K + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K + +match_nolit_end_encodeSnappyBetterBlockAsm64K: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm64K + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +one_byte_match_emit_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP 
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + +memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE 
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm64K + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL 
R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm64K + +emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + 
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + 
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000280, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm12B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 65560(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 65560(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm12B + MOVL 20(SP), CX + JMP 
search_loop_encodeSnappyBetterBlockAsm12B + +candidateS_match_encodeSnappyBetterBlockAsm12B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + +match_extend_back_loop_encodeSnappyBetterBlockAsm12B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B + +match_extend_back_end_encodeSnappyBetterBlockAsm12B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm12B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm12B + 
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm12B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B + +match_nolit_end_encodeSnappyBetterBlockAsm12B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +one_byte_match_emit_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, 
(AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + 
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm12B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x34, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 65560(SP)(R11*4) + MOVL R15, 65560(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ 
$0x32, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 65560(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm12B + +emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8: + MOVQ (CX), SI + MOVQ 
SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE 
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x000000a0, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 16408(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 16408(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm10B + +candidateS_match_encodeSnappyBetterBlockAsm10B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) 
+ CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + +match_extend_back_loop_encodeSnappyBetterBlockAsm10B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B + +match_extend_back_end_encodeSnappyBetterBlockAsm10B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm10B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm10B + LEAL 1(R12), R12 + DECL R8 + JNZ 
matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B + +match_nolit_end_encodeSnappyBetterBlockAsm10B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +one_byte_match_emit_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU 
-16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R12, $0x40 + JLE 
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm10B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x36, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 16408(SP)(R11*4) + MOVL R15, 16408(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 16408(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP 
search_loop_encodeSnappyBetterBlockAsm10B + +emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ 
-8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, 
AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000028, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm8B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 4120(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 4120(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm8B + +candidateS_match_encodeSnappyBetterBlockAsm8B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ 
match_extend_back_end_encodeSnappyBetterBlockAsm8B + +match_extend_back_loop_encodeSnappyBetterBlockAsm8B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B + +match_extend_back_end_encodeSnappyBetterBlockAsm8B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm8B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm8B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B + +match_nolit_end_encodeSnappyBetterBlockAsm8B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ 
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +one_byte_match_emit_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 
16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R12, $0x0c + 
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm8B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x38, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 4120(SP)(R11*4) + MOVL R15, 4120(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 4120(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm8B + +emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ 
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + 
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func emitLiteral(dst []byte, lit []byte) int +// Requires: SSE2 +TEXT ·emitLiteral(SB), NOSPLIT, $0-56 + MOVQ lit_len+32(FP), DX + MOVQ dst_base+0(FP), AX + MOVQ lit_base+24(FP), CX + TESTQ DX, DX + JZ emit_literal_end_standalone_skip + 
MOVL DX, BX + LEAL -1(DX), SI + CMPL SI, $0x3c + JLT one_byte_standalone + CMPL SI, $0x00000100 + JLT two_bytes_standalone + CMPL SI, $0x00010000 + JLT three_bytes_standalone + CMPL SI, $0x01000000 + JLT four_bytes_standalone + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP memmove_long_standalone + +four_bytes_standalone: + MOVL SI, DI + SHRL $0x10, DI + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB DI, 3(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP memmove_long_standalone + +three_bytes_standalone: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP memmove_long_standalone + +two_bytes_standalone: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_standalone + JMP memmove_long_standalone + +one_byte_standalone: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, BX + ADDQ $0x01, AX + +memmove_standalone: + // genMemMoveShort + CMPQ DX, $0x03 + JB emit_lit_memmove_standalone_memmove_move_1or2 + JE emit_lit_memmove_standalone_memmove_move_3 + CMPQ DX, $0x08 + JB emit_lit_memmove_standalone_memmove_move_4through7 + CMPQ DX, $0x10 + JBE emit_lit_memmove_standalone_memmove_move_8through16 + CMPQ DX, $0x20 + JBE emit_lit_memmove_standalone_memmove_move_17through32 + JMP emit_lit_memmove_standalone_memmove_move_33through64 + +emit_lit_memmove_standalone_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(DX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(DX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(DX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(DX*1) + JMP emit_literal_end_standalone + 
+emit_lit_memmove_standalone_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +memmove_long_standalone: + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVQ DX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_standalonelarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_standalonelarge_big_loop_back + +emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ DX, R8 + JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +emit_literal_end_standalone_skip: + XORQ BX, BX + +emit_literal_end_standalone: + MOVQ BX, ret+48(FP) + RET + +// func emitRepeat(dst []byte, offset int, length int) int +TEXT ·emitRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitRepeat +emit_repeat_again_standalone: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JLE repeat_two_standalone + CMPL SI, $0x0c + JGE cant_repeat_two_offset_standalone + CMPL CX, 
$0x00000800 + JLT repeat_two_offset_standalone + +cant_repeat_two_offset_standalone: + CMPL DX, $0x00000104 + JLT repeat_three_standalone + CMPL DX, $0x00010100 + JLT repeat_four_standalone + CMPL DX, $0x0100ffff + JLT repeat_five_standalone + LEAL -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone + +repeat_five_standalone: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_repeat_end + +repeat_four_standalone: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_repeat_end + +repeat_three_standalone: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_repeat_end + +repeat_two_standalone: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_repeat_end + +repeat_two_offset_standalone: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + +gen_emit_repeat_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopy(dst []byte, offset int, length int) int +TEXT ·emitCopy(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitCopy + CMPL CX, $0x00010000 + JL two_byte_offset_standalone + +four_bytes_loop_back_standalone: + CMPL DX, $0x40 + JLE four_bytes_remain_standalone + MOVB $0xff, (AX) + MOVL CX, 1(AX) + LEAL -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JL four_bytes_remain_standalone + + // emitRepeat +emit_repeat_again_standalone_emit_copy: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JLE repeat_two_standalone_emit_copy + CMPL SI, $0x0c + JGE cant_repeat_two_offset_standalone_emit_copy + CMPL CX, $0x00000800 + JLT 
repeat_two_offset_standalone_emit_copy + +cant_repeat_two_offset_standalone_emit_copy: + CMPL DX, $0x00000104 + JLT repeat_three_standalone_emit_copy + CMPL DX, $0x00010100 + JLT repeat_four_standalone_emit_copy + CMPL DX, $0x0100ffff + JLT repeat_five_standalone_emit_copy + LEAL -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy + +repeat_five_standalone_emit_copy: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + JMP four_bytes_loop_back_standalone + +four_bytes_remain_standalone: + TESTL DX, DX + JZ gen_emit_copy_end + MOVB $0x03, SI + LEAL -4(SI)(DX*4), DX + MOVB DL, (AX) + MOVL CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +two_byte_offset_standalone: + CMPL DX, $0x40 + JLE two_byte_offset_short_standalone + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAL -60(DX), DX + ADDQ $0x03, AX + ADDQ $0x03, BX + + // emitRepeat +emit_repeat_again_standalone_emit_copy_short: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JLE repeat_two_standalone_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_standalone_emit_copy_short + CMPL CX, $0x00000800 + JLT 
repeat_two_offset_standalone_emit_copy_short + +cant_repeat_two_offset_standalone_emit_copy_short: + CMPL DX, $0x00000104 + JLT repeat_three_standalone_emit_copy_short + CMPL DX, $0x00010100 + JLT repeat_four_standalone_emit_copy_short + CMPL DX, $0x0100ffff + JLT repeat_five_standalone_emit_copy_short + LEAL -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy_short + +repeat_five_standalone_emit_copy_short: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy_short: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy_short: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy_short: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + JMP two_byte_offset_standalone + +two_byte_offset_short_standalone: + CMPL DX, $0x0c + JGE emit_copy_three_standalone + CMPL CX, $0x00000800 + JGE emit_copy_three_standalone + MOVB $0x01, SI + LEAL -16(SI)(DX*4), DX + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +emit_copy_three_standalone: + MOVB $0x02, SI + LEAL -4(SI)(DX*4), DX + MOVB DL, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopyNoRepeat(dst []byte, offset int, length int) int +TEXT 
·emitCopyNoRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitCopy + CMPL CX, $0x00010000 + JL two_byte_offset_standalone_snappy + +four_bytes_loop_back_standalone_snappy: + CMPL DX, $0x40 + JLE four_bytes_remain_standalone_snappy + MOVB $0xff, (AX) + MOVL CX, 1(AX) + LEAL -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JL four_bytes_remain_standalone_snappy + JMP four_bytes_loop_back_standalone_snappy + +four_bytes_remain_standalone_snappy: + TESTL DX, DX + JZ gen_emit_copy_end_snappy + MOVB $0x03, SI + LEAL -4(SI)(DX*4), DX + MOVB DL, (AX) + MOVL CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end_snappy + +two_byte_offset_standalone_snappy: + CMPL DX, $0x40 + JLE two_byte_offset_short_standalone_snappy + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAL -60(DX), DX + ADDQ $0x03, AX + ADDQ $0x03, BX + JMP two_byte_offset_standalone_snappy + +two_byte_offset_short_standalone_snappy: + CMPL DX, $0x0c + JGE emit_copy_three_standalone_snappy + CMPL CX, $0x00000800 + JGE emit_copy_three_standalone_snappy + MOVB $0x01, SI + LEAL -16(SI)(DX*4), DX + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end_snappy + +emit_copy_three_standalone_snappy: + MOVB $0x02, SI + LEAL -4(SI)(DX*4), DX + MOVB DL, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end_snappy: + MOVQ BX, ret+40(FP) + RET + +// func matchLen(a []byte, b []byte) int +TEXT ·matchLen(SB), NOSPLIT, $0-56 + MOVQ a_base+0(FP), AX + MOVQ b_base+24(FP), CX + MOVQ a_len+8(FP), DX + + // matchLen + XORL SI, SI + CMPL DX, $0x08 + JL matchlen_single_standalone + +matchlen_loopback_standalone: + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX + TESTQ BX, BX + JZ matchlen_loop_standalone + BSFQ BX, BX + SARQ $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end + +matchlen_loop_standalone: + LEAL -8(DX), DX + LEAL 8(SI), 
SI + CMPL DX, $0x08 + JGE matchlen_loopback_standalone + +matchlen_single_standalone: + TESTL DX, DX + JZ gen_match_len_end + +matchlen_single_loopback_standalone: + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL + JNE gen_match_len_end + LEAL 1(SI), SI + DECL DX + JNZ matchlen_single_loopback_standalone + +gen_match_len_end: + MOVQ SI, ret+48(FP) + RET diff --git a/vendor/github.com/klauspost/compress/s2/s2.go b/vendor/github.com/klauspost/compress/s2/s2.go new file mode 100644 index 00000000..89d69e96 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/s2.go @@ -0,0 +1,139 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package s2 implements the S2 compression format. +// +// S2 is an extension of Snappy. Similar to Snappy S2 is aimed for high throughput, +// which is why it features concurrent compression for bigger payloads. +// +// Decoding is compatible with Snappy compressed content, +// but content compressed with S2 cannot be decompressed by Snappy. +// +// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2 +// +// There are actually two S2 formats: block and stream. They are related, +// but different: trying to decompress block-compressed data as a S2 stream +// will fail, and vice versa. The block format is the Decode and Encode +// functions and the stream format is the Reader and Writer types. +// +// A "better" compression option is available. This will trade some compression +// speed for a better compression ratio. +// +// The block format, the more common case, is used when the complete size (the +// number of bytes) of the original data is known upfront, at the time +// compression starts. The stream format, also known as the framing format, is +// for when that isn't always true. 
+// +// Blocks do not offer much data protection, so it is up to you to +// add data validation of decompressed blocks. +// +// Streams perform CRC validation of the decompressed data. +// Stream compression will also be performed on multiple CPU cores concurrently +// significantly improving throughput. +package s2 + +import ( + "bytes" + "hash/crc32" +) + +/* +Each encoded block begins with the varint-encoded length of the decoded data, +followed by a sequence of chunks. Chunks begin and end on byte boundaries. The +first byte of each chunk is broken into its 2 least and 6 most significant bits +called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. +Zero means a literal tag. All other values mean a copy tag. + +For literal tags: + - If m < 60, the next 1 + m bytes are literal bytes. + - Otherwise, let n be the little-endian unsigned integer denoted by the next + m - 59 bytes. The next 1 + n bytes after that are literal bytes. + +For copy tags, length bytes are copied from offset bytes ago, in the style of +Lempel-Ziv compression algorithms. In particular: + - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). + The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 + of the offset. The next byte is bits 0-7 of the offset. + - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). + The length is 1 + m. The offset is the little-endian unsigned integer + denoted by the next 2 bytes. + - For l == 3, the offset ranges in [0, 1<<32) and the length in + [1, 65). The length is 1 + m. The offset is the little-endian unsigned + integer denoted by the next 4 bytes. 
// End of the block format description. */

// Chunk tags: the value of l, the two least significant bits of the first
// byte of every chunk. Zero is a literal; the others are copies.
const (
	tagLiteral = 0x00
	tagCopy1   = 0x01
	tagCopy2   = 0x02
	tagCopy4   = 0x03
)

const (
	checksumSize     = 4
	chunkHeaderSize  = 4
	magicChunk       = "\xff\x06\x00\x00" + magicBody
	magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy
	magicBodySnappy  = "sNaPpY"
	magicBody        = "S2sTwO"

	// maxBlockSize is the maximum size of the input to encodeBlock.
	//
	// For the framing format (Writer type instead of Encode function),
	// this is the maximum uncompressed size of a block.
	maxBlockSize = 4 << 20

	// minBlockSize is the minimum size of block setting when creating a writer.
	minBlockSize = 4 << 10

	// defaultBlockSize is the default block size.
	defaultBlockSize = 1 << 20

	// maxSnappyBlockSize is the maximum snappy block size.
	maxSnappyBlockSize = 1 << 16

	// obufHeaderLen is the combined size of a checksum and a chunk header.
	obufHeaderLen = checksumSize + chunkHeaderSize
)

// Chunk type identifiers.
// NOTE(review): presumably the chunk type bytes of the stream (framing)
// format referenced by crc below; confirm against the framing specification.
const (
	chunkTypeCompressedData   = 0x00
	chunkTypeUncompressedData = 0x01
	chunkTypePadding          = 0xfe
	chunkTypeStreamIdentifier = 0xff
)

// crcTable is the CRC-32 table for the Castagnoli polynomial.
var crcTable = crc32.MakeTable(crc32.Castagnoli)

// crc implements the checksum specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
//
// It returns the CRC-32 (Castagnoli) of b rotated right by 15 bits with the
// constant 0xa282ead8 added.
func crc(b []byte) uint32 {
	c := crc32.Update(0, crcTable, b)
	// Shifts bind tighter than | and +, which share one precedence level and
	// associate left to right; the parentheses only make that explicit.
	return (c>>15 | c<<17) + 0xa282ead8
}

// literalExtraSize returns the extra size of encoding n literals, that is the
// number of tag and length bytes emitted in addition to the n literal bytes.
// n should be >= 0 and <= math.MaxUint32.
//
// It returns 0 for n == 0, since nothing is emitted for an empty literal.
func literalExtraSize(n int64) int64 {
	if n == 0 {
		return 0
	}
	// The format stores the literal length minus one (see the block format
	// description: "the next 1 + m bytes are literal bytes"), so the size
	// thresholds apply to n-1. Comparing n itself over-reports by one byte
	// when n is exactly 60, 1<<8, 1<<16 or 1<<24.
	switch m := n - 1; {
	case m < 60:
		return 1
	case m < 1<<8:
		return 2
	case m < 1<<16:
		return 3
	case m < 1<<24:
		return 4
	default:
		return 5
	}
}

// byter is implemented by types that expose their contents as a byte slice.
type byter interface {
	Bytes() []byte
}

// Compile-time check that *bytes.Buffer satisfies byter.
var _ byter = (*bytes.Buffer)(nil)