summary refs log tree commit diff stats
path: root/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s')
-rw-r--r-- vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s 15678
1 files changed, 15678 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
new file mode 100644
index 00000000..1ac65a0e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -0,0 +1,15678 @@
+// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
+
+// +build !appengine
+// +build !noasm
+// +build gc
+
+#include "textflag.h"
+
+// func encodeBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2. Returns the number of bytes written to dst, or 0 when one of the output-space checks fails.
+TEXT ·encodeBlockAsm(SB), $65560-56 // frame: 24 bytes of scalars at 0..23(SP) followed by a 65536-byte table at 24(SP)
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ $0x00000200, CX // 0x200 iterations * 0x80 bytes per iteration = 65536 bytes to clear
+ LEAQ 24(SP), DX // DX = start of the table
+ PXOR X0, X0 // X0 = 0
+
+zero_loop_encodeBlockAsm: // clears the table, 128 bytes per iteration
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm
+ MOVL $0x00000000, 12(SP) // 12(SP) = src index where the not-yet-emitted literals begin
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX // DX = len(src) - 9
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = len(src) - 8, the limit checked by the search loop
+ SHRQ $0x05, CX // CX = len(src) / 32
+ SUBL CX, DX // DX = len(src) - 9 - len(src)/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // 0(SP) = dst address limit; reaching it makes the function return 0
+ MOVL $0x00000001, CX // CX = current src index, starting at 1
+ MOVL CX, 16(SP) // 16(SP) = current repeat offset, initially 1
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeBlockAsm: // CX = current src index, DX = src base, AX = dst cursor
+ MOVL CX, SI
+ SUBL 12(SP), SI // SI = distance from the start of pending literals to CX
+ SHRL $0x06, SI
+ LEAL 4(CX)(SI*1), SI // SI = CX + 4 + distance/64: position to try next if nothing matches here
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm // next position is at or past the limit: flush the remaining literals
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes of src at CX
+ MOVL SI, 20(SP) // 20(SP) = next position
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // R9 = hash multiplier
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11 // R11 = bytes starting at CX+1
+ SHLQ $0x10, R10 // discard the top 2 bytes so that 6 bytes are hashed
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // keep the top 14 bits: R10 = table index for CX
+ SHLQ $0x10, R11
+ IMULQ R9, R11
+ SHRQ $0x32, R11 // R11 = table index for CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position stored for hash(CX)
+ MOVL 24(SP)(R11*4), R8 // R8 = candidate position stored for hash(CX+1)
+ MOVL CX, 24(SP)(R10*4) // table[hash(CX)] = CX
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4) // table[hash(CX+1)] = CX+1
+ MOVQ DI, R10
+ SHRQ $0x10, R10 // R10 = bytes starting at CX+2
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // R10 = table index for CX+2
+ MOVL CX, R9
+ SUBL 16(SP), R9 // R9 = CX - repeat offset
+ MOVL 1(DX)(R9*1), R11 // R11 = 4 bytes at CX + 1 - repeat offset
+ MOVQ DI, R9
+ SHRQ $0x08, R9 // R9 = bytes starting at CX+1
+ CMPL R9, R11
+ JNE no_repeat_found_encodeBlockAsm // the 4 bytes at CX+1 do not match at the repeat offset
+ LEAL 1(CX), DI // DI = CX + 1, start of the repeat match
+ MOVL 12(SP), R8 // R8 = start of pending literals (lower bound for backward extension)
+ MOVL DI, SI
+ SUBL 16(SP), SI // SI = position one repeat offset before the match start
+ JZ repeat_extend_back_end_encodeBlockAsm // earlier occurrence is at index 0: cannot extend backwards
+
+repeat_extend_back_loop_encodeBlockAsm: // grow the repeat match backwards: DI = match start, SI = earlier occurrence, R8 = lower bound loaded from 12(SP)
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm // match start reached the lower bound
+ MOVB -1(DX)(SI*1), BL // byte preceding the earlier occurrence
+ MOVB -1(DX)(DI*1), R9 // byte preceding the current match start
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm // preceding bytes differ: stop
+ LEAL -1(DI), DI // move the match start back one byte
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm // keep going until the earlier occurrence reaches index 0
+
+repeat_extend_back_end_encodeBlockAsm: // write the header for the literals src[12(SP):DI] that precede the repeat match
+ MOVL 12(SP), SI // SI = start of pending literals
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm // no literals before the match
+ MOVL DI, R9
+ MOVL DI, 12(SP) // pending literals now begin at the match start
+ LEAQ (DX)(SI*1), R10 // R10 = address of the first literal byte
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length - 1, the value written into the header
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm // < 60: length fits inside the tag byte
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm // < 2^8: tag + 1 length byte
+ CMPL SI, $0x00010000
+ JLT three_bytes_repeat_emit_encodeBlockAsm // < 2^16: tag + 2 length bytes
+ CMPL SI, $0x01000000
+ JLT four_bytes_repeat_emit_encodeBlockAsm // < 2^24: tag + 3 length bytes
+ MOVB $0xfc, (AX) // tag 0xfc (= 63<<2) followed by a 4-byte length
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm
+
+four_bytes_repeat_emit_encodeBlockAsm:
+ MOVL SI, R11
+ SHRL $0x10, R11 // R11 = bits 16 and up of length - 1
+ MOVB $0xf8, (AX) // tag 0xf8 (= 62<<2)
+ MOVW SI, 1(AX) // low 16 bits of length - 1
+ MOVB R11, 3(AX) // third length byte
+ ADDQ $0x04, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm
+
+three_bytes_repeat_emit_encodeBlockAsm:
+ MOVB $0xf4, (AX) // tag 0xf4 (= 61<<2)
+ MOVW SI, 1(AX) // 16-bit length - 1
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm
+
+two_bytes_repeat_emit_encodeBlockAsm:
+ MOVB $0xf0, (AX) // tag 0xf0 (= 60<<2)
+ MOVB SI, 1(AX) // 8-bit length - 1
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm // length - 1 < 64: the short copy routine is enough
+ JMP memmove_long_repeat_emit_encodeBlockAsm
+
+one_byte_repeat_emit_encodeBlockAsm:
+ SHLB $0x02, SI // tag = (length - 1) << 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveShort: copy R9 bytes from (R10) to (AX) with fixed-size, possibly overlapping moves; every path in moves at most 64 bytes' worth
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
+ MOVQ (R10), R11 // always moves a full 8 bytes, even when R9 < 8; the cursor still advances by exactly R9
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
+ MOVQ (R10), R11 // first 8 bytes
+ MOVQ -8(R10)(R9*1), R10 // last 8 bytes (may overlap the first 8)
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
+ MOVOU (R10), X0 // first 16 bytes
+ MOVOU -16(R10)(R9*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
+ MOVOU (R10), X0 // first 32 bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 bytes (may overlap)
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm:
+ MOVQ SI, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm
+
+memmove_long_repeat_emit_encodeBlockAsm:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveLong: copy R9 bytes from (R10) to (AX); head and tail are loaded up front, the middle is written with 32-byte-aligned stores
+ MOVOU (R10), X0 // first 32 bytes -> X0, X1
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 bytes -> X2, X3
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12 // R12 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = AX modulo 32
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13 // R13 = 64 - (AX modulo 32), so AX + R13 - 32 is 32-byte aligned
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 // NOTE(review): JA tests CF and ZF; DECQ leaves CF as set by the SUBQ above
+ LEAQ -32(R10)(R13*1), R11 // R11 = src address paired with the first aligned dst block
+ LEAQ -32(AX)(R13*1), R14 // R14 = first 32-byte-aligned dst address above AX
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14) // aligned stores
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4 // 32 bytes ending at offset R13
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 // loop while R13 <= length
+ MOVOU X0, (AX) // write the saved head
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1) // write the saved tail
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX // advance the dst cursor past the literals
+
+emit_literal_done_repeat_emit_encodeBlockAsm: // extend the repeat match forwards
+ ADDL $0x05, CX // CX += 5: one byte to reach the match at CX+1, plus the 4 bytes already compared there
+ MOVL CX, SI
+ SUBL 16(SP), SI // SI = CX - repeat offset
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9 // R9 = bytes of src remaining from CX
+ LEAQ (DX)(CX*1), R10 // R10 = &src[CX]
+ LEAQ (DX)(SI*1), SI // SI = &src[CX - repeat offset]
+
+ // matchLen: R12 = number of equal leading bytes at (R10) and (SI), examining at most R9 bytes
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm // fewer than 8 bytes left: compare byte by byte
+
+matchlen_loopback_repeat_extend_encodeBlockAsm:
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11 // zero exactly when the next 8 bytes are equal
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // bit index / 8 = number of equal bytes in this word
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm
+
+matchlen_loop_repeat_extend_encodeBlockAsm:
+ LEAL -8(R9), R9 // 8 fewer bytes remaining
+ LEAL 8(R12), R12 // 8 more bytes matched
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm
+
+matchlen_single_repeat_extend_encodeBlockAsm:
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm:
+ MOVB (R10)(R12*1), R11
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm
+
+repeat_extend_forward_end_encodeBlockAsm:
+ ADDL R12, CX // CX = end of the match
+ MOVL CX, SI
+ SUBL DI, SI // SI = match length (end minus the match start held in DI)
+ MOVL 16(SP), DI // DI = repeat offset
+ TESTL R8, R8
+ JZ repeat_as_copy_encodeBlockAsm // R8 (12(SP) as read before back-extension) is 0: emit a copy, presumably because no earlier op has carried the offset yet (TODO confirm)
+
+ // emitRepeat: SI = length still to encode, DI = offset; all paths end at repeat_end_emit_encodeBlockAsm
+emit_repeat_again_match_repeat_encodeBlockAsm:
+ MOVL SI, R8 // R8 = length
+ LEAL -4(SI), SI // SI = length - 4
+ CMPL R8, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm // length <= 8: 2-byte form
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm // length >= 12
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_match_repeat_encodeBlockAsm // length 9..11 and offset < 2048: 2-byte form that carries the offset
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm:
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm // length - 4 < 260: 3-byte form
+ CMPL SI, $0x00010100
+ JLT repeat_four_match_repeat_encodeBlockAsm // length - 4 < 65792: 4-byte form
+ CMPL SI, $0x0100ffff
+ JLT repeat_five_match_repeat_encodeBlockAsm // fits a single 5-byte form
+ LEAL -16842747(SI), SI // too long for one op: SI = length left after the fixed 5-byte op written below
+ MOVW $0x001d, (AX) // bytes 1d 00
+ MOVW $0xfffb, 2(AX) // bytes fb ff
+ MOVB $0xff, 4(AX) // byte ff
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_repeat_encodeBlockAsm // encode what is left
+
+repeat_five_match_repeat_encodeBlockAsm:
+ LEAL -65536(SI), SI // SI = length - 4 - 65536
+ MOVL SI, DI // DI reused as scratch; the offset is not needed on this path
+ MOVW $0x001d, (AX) // header word 0x001d
+ MOVW SI, 2(AX) // low 16 bits
+ SARL $0x10, DI
+ MOVB DI, 4(AX) // bits 16..23
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_four_match_repeat_encodeBlockAsm:
+ LEAL -256(SI), SI // SI = length - 4 - 256
+ MOVW $0x0019, (AX) // header word 0x0019
+ MOVW SI, 2(AX) // 16-bit value
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_three_match_repeat_encodeBlockAsm:
+ LEAL -4(SI), SI // SI = length - 8
+ MOVW $0x0015, (AX) // header word 0x0015
+ MOVB SI, 2(AX) // 8-bit value
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_match_repeat_encodeBlockAsm:
+ SHLL $0x02, SI
+ ORL $0x01, SI // SI = 1 | (length - 4) << 2
+ MOVW SI, (AX) // written as a 16-bit word
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_match_repeat_encodeBlockAsm:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI // SI = 1 | (length - 4) << 2
+ MOVB DI, 1(AX) // second byte = low 8 bits of the offset
+ SARL $0x08, DI
+ SHLL $0x05, DI // offset bits 8 and up moved to bit 5 and up
+ ORL DI, SI
+ MOVB SI, (AX) // first byte = tag with length and high offset bits
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_as_copy_encodeBlockAsm:
+ // emitCopy: SI = match length, DI = offset
+ CMPL DI, $0x00010000
+ JL two_byte_offset_repeat_as_copy_encodeBlockAsm // offset < 65536: forms with a shorter offset apply
+
+four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
+ CMPL SI, $0x40
+ JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm // length <= 64: a single op is enough
+ MOVB $0xff, (AX) // tag 0xff (= 3 | 63<<2) followed by the 4-byte offset
+ MOVL DI, 1(AX)
+ LEAL -64(SI), SI // 64 bytes of the match accounted for
+ ADDQ $0x05, AX
+ CMPL SI, $0x04
+ JL four_bytes_remain_repeat_as_copy_encodeBlockAsm // fewer than 4 bytes left
+
+ // emitRepeat: SI = length still to encode, DI = offset
+emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
+ MOVL SI, R8 // R8 = length
+ LEAL -4(SI), SI // SI = length - 4
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy // length <= 8: 2-byte form
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy // length >= 12
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy // offset < 2048
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy // length - 4 < 260: 3-byte form
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy // length - 4 < 65792: 4-byte form
+ CMPL SI, $0x0100ffff
+ JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy // fits a single 5-byte form
+ LEAL -16842747(SI), SI // too long for one op: SI = length left after the fixed 5-byte op written below
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
+
+repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
+ LEAL -65536(SI), SI // SI = length - 4 - 65536
+ MOVL SI, DI // DI reused as scratch
+ MOVW $0x001d, (AX) // header word 0x001d
+ MOVW SI, 2(AX) // low 16 bits
+ SARL $0x10, DI
+ MOVB DI, 4(AX) // bits 16..23
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
+ LEAL -256(SI), SI // SI = length - 4 - 256
+ MOVW $0x0019, (AX) // header word 0x0019
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
+ LEAL -4(SI), SI // SI = length - 8
+ MOVW $0x0015, (AX) // header word 0x0015
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
+ SHLL $0x02, SI
+ ORL $0x01, SI // SI = 1 | (length - 4) << 2
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI // SI = 1 | (length - 4) << 2
+ MOVB DI, 1(AX) // low 8 bits of the offset
+ SARL $0x08, DI
+ SHLL $0x05, DI // offset bits 8 and up moved to bit 5 and up
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+ JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm // never executed: directly follows an unconditional JMP and has no label
+
+four_bytes_remain_repeat_as_copy_encodeBlockAsm:
+ TESTL SI, SI
+ JZ repeat_end_emit_encodeBlockAsm // nothing left to encode
+ MOVB $0x03, BL
+ LEAL -4(BX)(SI*4), SI // low byte = 3 + (length - 1) * 4; only that byte is stored
+ MOVB SI, (AX)
+ MOVL DI, 1(AX) // 4-byte offset
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+two_byte_offset_repeat_as_copy_encodeBlockAsm: // offset < 65536; SI = match length, DI = offset
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm // length <= 64: a single copy op
+ MOVB $0xee, (AX) // tag 0xee (= 2 | 59<<2) followed by the 2-byte offset
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI // 60 bytes of the match accounted for
+ ADDQ $0x03, AX
+
+ // emitRepeat: SI = length still to encode, DI = offset
+emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ MOVL SI, R8 // R8 = length
+ LEAL -4(SI), SI // SI = length - 4
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short // length <= 8: 2-byte form
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short // length >= 12
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short // offset < 2048
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short // length - 4 < 260: 3-byte form
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short // length - 4 < 65792: 4-byte form
+ CMPL SI, $0x0100ffff
+ JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short // fits a single 5-byte form
+ LEAL -16842747(SI), SI // too long for one op: SI = length left after the fixed 5-byte op written below
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
+
+repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ LEAL -65536(SI), SI // SI = length - 4 - 65536
+ MOVL SI, DI // DI reused as scratch
+ MOVW $0x001d, (AX) // header word 0x001d
+ MOVW SI, 2(AX) // low 16 bits
+ SARL $0x10, DI
+ MOVB DI, 4(AX) // bits 16..23
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ LEAL -256(SI), SI // SI = length - 4 - 256
+ MOVW $0x0019, (AX) // header word 0x0019
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ LEAL -4(SI), SI // SI = length - 8
+ MOVW $0x0015, (AX) // header word 0x0015
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ SHLL $0x02, SI
+ ORL $0x01, SI // SI = 1 | (length - 4) << 2
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI // SI = 1 | (length - 4) << 2
+ MOVB DI, 1(AX) // low 8 bits of the offset
+ SARL $0x08, DI
+ SHLL $0x05, DI // offset bits 8 and up moved to bit 5 and up
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+ JMP two_byte_offset_repeat_as_copy_encodeBlockAsm // never executed: directly follows an unconditional JMP and has no label
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm // length >= 12: needs the 3-byte form
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm // offset >= 2048: needs the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI // low byte = 1 + (length - 4) * 4
+ MOVB DI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, DI
+ SHLL $0x05, DI // offset bits 8 and up moved to bit 5 and up
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI // low byte = 2 + (length - 1) * 4
+ MOVB SI, (AX)
+ MOVW DI, 1(AX) // 2-byte offset
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm:
+ MOVL CX, 12(SP) // everything before CX is now emitted
+ JMP search_loop_encodeBlockAsm
+
+no_repeat_found_encodeBlockAsm: // test the table candidates for CX, CX+1 and CX+2 (SI, R8 and R10 were set up by the search loop)
+ CMPL (DX)(SI*1), DI // 4 bytes at candidate SI vs the 4 bytes at CX
+ JEQ candidate_match_encodeBlockAsm
+ SHRQ $0x08, DI // DI = bytes starting at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate stored for hash(CX+2)
+ LEAL 2(CX), R9 // R9 = CX + 2
+ CMPL (DX)(R8*1), DI // 4 bytes at candidate R8 vs the 4 bytes at CX+1
+ JEQ candidate2_match_encodeBlockAsm
+ MOVL R9, 24(SP)(R10*4) // table[hash(CX+2)] = CX+2
+ SHRQ $0x08, DI // DI = bytes starting at CX+2
+ CMPL (DX)(SI*1), DI // 4 bytes at the third candidate vs the 4 bytes at CX+2
+ JEQ candidate3_match_encodeBlockAsm
+ MOVL 20(SP), CX // no match: jump ahead to the next position saved in 20(SP)
+ JMP search_loop_encodeBlockAsm
+
+candidate3_match_encodeBlockAsm:
+ ADDL $0x02, CX // the match begins at CX+2
+ JMP candidate_match_encodeBlockAsm
+
+candidate2_match_encodeBlockAsm:
+ MOVL R9, 24(SP)(R10*4) // still record CX+2 in the table
+ INCL CX // the match begins at CX+1
+ MOVL R8, SI // SI = the matching candidate
+
+candidate_match_encodeBlockAsm: // CX = match start, SI = candidate position
+ MOVL 12(SP), DI // DI = start of pending literals
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm // candidate at index 0: nothing precedes it
+
+match_extend_back_loop_encodeBlockAsm:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm // match start reached the start of pending literals
+ MOVB -1(DX)(SI*1), BL // byte preceding the candidate
+ MOVB -1(DX)(CX*1), R8 // byte preceding the match start
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm
+ LEAL -1(CX), CX // both positions move back one byte
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm // candidate reached index 0
+ JMP match_extend_back_loop_encodeBlockAsm
+
+match_extend_back_end_encodeBlockAsm:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = number of pending literal bytes
+ LEAQ 5(AX)(DI*1), DI // dst cursor + 5 + literal length
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm // below the dst limit: carry on
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+match_dst_size_check_encodeBlockAsm: // write the header for the literals src[12(SP):CX] that precede the match
+ MOVL CX, DI
+ MOVL 12(SP), R8 // R8 = start of pending literals
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm // no literals before the match
+ MOVL DI, R9
+ MOVL DI, 12(SP) // pending literals now begin at the match start
+ LEAQ (DX)(R8*1), DI // DI = address of the first literal byte
+ SUBL R8, R9 // R9 = literal length
+ LEAL -1(R9), R8 // R8 = length - 1, the value written into the header
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm // < 60: length fits inside the tag byte
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm // < 2^8: tag + 1 length byte
+ CMPL R8, $0x00010000
+ JLT three_bytes_match_emit_encodeBlockAsm // < 2^16: tag + 2 length bytes
+ CMPL R8, $0x01000000
+ JLT four_bytes_match_emit_encodeBlockAsm // < 2^24: tag + 3 length bytes
+ MOVB $0xfc, (AX) // tag 0xfc (= 63<<2) followed by a 4-byte length
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_encodeBlockAsm
+
+four_bytes_match_emit_encodeBlockAsm:
+ MOVL R8, R10
+ SHRL $0x10, R10 // R10 = bits 16 and up of length - 1
+ MOVB $0xf8, (AX) // tag 0xf8 (= 62<<2)
+ MOVW R8, 1(AX) // low 16 bits of length - 1
+ MOVB R10, 3(AX) // third length byte
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeBlockAsm
+
+three_bytes_match_emit_encodeBlockAsm:
+ MOVB $0xf4, (AX) // tag 0xf4 (= 61<<2)
+ MOVW R8, 1(AX) // 16-bit length - 1
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm
+
+two_bytes_match_emit_encodeBlockAsm:
+ MOVB $0xf0, (AX) // tag 0xf0 (= 60<<2)
+ MOVB R8, 1(AX) // 8-bit length - 1
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm // length - 1 < 64: the short copy routine is enough
+ JMP memmove_long_match_emit_encodeBlockAsm
+
+one_byte_match_emit_encodeBlockAsm:
+ SHLB $0x02, R8 // tag = (length - 1) << 2
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literals
+
+ // genMemMoveShort: copy R9 bytes from (DI) to (AX) with fixed-size, possibly overlapping moves
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
+ MOVQ (DI), R10 // always moves a full 8 bytes, even when R9 < 8; the cursor still advances by exactly R9
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
+ MOVQ (DI), R10 // first 8 bytes
+ MOVQ -8(DI)(R9*1), DI // last 8 bytes (may overlap the first 8)
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
+ MOVOU (DI), X0 // first 16 bytes
+ MOVOU -16(DI)(R9*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm
+
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
+ MOVOU (DI), X0 // first 32 bytes
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2 // last 32 bytes (may overlap)
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm:
+ MOVQ R8, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_match_emit_encodeBlockAsm
+
+memmove_long_match_emit_encodeBlockAsm:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literals
+
+ // genMemMoveLong: copy R9 bytes from (DI) to (AX); head and tail are loaded up front, the middle is written with 32-byte-aligned stores
+ MOVOU (DI), X0 // first 32 bytes -> X0, X1
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2 // last 32 bytes -> X2, X3
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10 // R10 = AX modulo 32
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (AX modulo 32), so AX + R12 - 32 is 32-byte aligned
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 // NOTE(review): JA tests CF and ZF; DECQ leaves CF as set by the SUBQ above
+ LEAQ -32(DI)(R12*1), R10 // R10 = src address paired with the first aligned dst block
+ LEAQ -32(AX)(R12*1), R13 // R13 = first 32-byte-aligned dst address above AX
+
+emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13) // aligned stores
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4 // 32 bytes ending at offset R12
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 // loop while R12 <= length
+ MOVOU X0, (AX) // write the saved head
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1) // write the saved tail
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX // advance the dst cursor past the literals
+
+emit_literal_done_match_emit_encodeBlockAsm:
+match_nolit_loop_encodeBlockAsm: // CX = match start, SI = candidate position; 4 bytes are already known to match
+ MOVL CX, DI
+ SUBL SI, DI // DI = offset between match and candidate
+ MOVL DI, 16(SP) // becomes the current repeat offset
+ ADDL $0x04, CX // skip the 4 bytes already compared
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = bytes of src remaining from CX
+ LEAQ (DX)(CX*1), R8 // R8 = &src[CX]
+ LEAQ (DX)(SI*1), SI // SI = &src[candidate + 4]
+
+ // matchLen: R10 = number of equal leading bytes at (R8) and (SI), examining at most DI bytes
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm // fewer than 8 bytes left: compare byte by byte
+
+matchlen_loopback_match_nolit_encodeBlockAsm:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9 // zero exactly when the next 8 bytes are equal
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm
+ BSFQ R9, R9 // index of the lowest differing bit
+ SARQ $0x03, R9 // bit index / 8 = number of equal bytes in this word
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm
+
+matchlen_loop_match_nolit_encodeBlockAsm:
+ LEAL -8(DI), DI // 8 fewer bytes remaining
+ LEAL 8(R10), R10 // 8 more bytes matched
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm
+
+matchlen_single_match_nolit_encodeBlockAsm:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm
+
+match_nolit_end_encodeBlockAsm:
+ ADDL R10, CX // CX = end of the match
+ MOVL 16(SP), SI // SI = offset
+ ADDL $0x04, R10 // R10 = total match length, including the first 4 bytes
+ MOVL CX, 12(SP) // everything before CX will have been emitted once the copy is written
+
+ // emitCopy: R10 = match length, SI = offset
+ CMPL SI, $0x00010000
+ JL two_byte_offset_match_nolit_encodeBlockAsm // offset < 65536: forms with a shorter offset apply
+
+four_bytes_loop_back_match_nolit_encodeBlockAsm:
+ CMPL R10, $0x40
+ JLE four_bytes_remain_match_nolit_encodeBlockAsm // length <= 64: a single op is enough
+ MOVB $0xff, (AX) // tag 0xff (= 3 | 63<<2) followed by the 4-byte offset
+ MOVL SI, 1(AX)
+ LEAL -64(R10), R10 // 64 bytes of the match accounted for
+ ADDQ $0x05, AX
+ CMPL R10, $0x04
+ JL four_bytes_remain_match_nolit_encodeBlockAsm // fewer than 4 bytes left
+
+ // emitRepeat: R10 = length still to encode, SI = offset
+emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
+ MOVL R10, DI // DI = length
+ LEAL -4(R10), R10 // R10 = length - 4
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy // length <= 8: 2-byte form
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy // length >= 12
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy // offset < 2048
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy // length - 4 < 260: 3-byte form
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy // length - 4 < 65792: 4-byte form
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy // fits a single 5-byte form
+ LEAL -16842747(R10), R10 // too long for one op: R10 = length left after the fixed 5-byte op written below
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
+
+repeat_five_match_nolit_encodeBlockAsm_emit_copy:
+ LEAL -65536(R10), R10 // R10 = length - 4 - 65536
+ MOVL R10, SI // SI reused as scratch
+ MOVW $0x001d, (AX) // header word 0x001d
+ MOVW R10, 2(AX) // low 16 bits
+ SARL $0x10, SI
+ MOVB SI, 4(AX) // bits 16..23
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_four_match_nolit_encodeBlockAsm_emit_copy:
+ LEAL -256(R10), R10 // R10 = length - 4 - 256
+ MOVW $0x0019, (AX) // header word 0x0019
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_three_match_nolit_encodeBlockAsm_emit_copy:
+ LEAL -4(R10), R10 // R10 = length - 8
+ MOVW $0x0015, (AX) // header word 0x0015
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_match_nolit_encodeBlockAsm_emit_copy:
+ SHLL $0x02, R10
+ ORL $0x01, R10 // R10 = 1 | (length - 4) << 2
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10 // R10 = 1 | (length - 4) << 2
+ MOVB SI, 1(AX) // low 8 bits of the offset
+ SARL $0x08, SI
+ SHLL $0x05, SI // offset bits 8 and up moved to bit 5 and up
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+ JMP four_bytes_loop_back_match_nolit_encodeBlockAsm // never executed: directly follows an unconditional JMP and has no label
+
+four_bytes_remain_match_nolit_encodeBlockAsm:
+ TESTL R10, R10
+ JZ match_nolit_emitcopy_end_encodeBlockAsm // nothing left to encode
+ MOVB $0x03, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = 3 + (length - 1) * 4; only that byte is stored
+ MOVB R10, (AX)
+ MOVL SI, 1(AX) // 4-byte offset
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+two_byte_offset_match_nolit_encodeBlockAsm: // offset < 65536; R10 = match length, SI = offset
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm // length <= 64: a single copy op
+ MOVB $0xee, (AX) // tag 0xee (= 2 | 59<<2) followed by the 2-byte offset
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10 // 60 bytes of the match accounted for
+ ADDQ $0x03, AX
+
+ // emitRepeat: R10 = length still to encode, SI = offset
+emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
+ MOVL R10, DI // DI = length
+ LEAL -4(R10), R10 // R10 = length - 4
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short // length <= 8: 2-byte form
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short // length >= 12
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short // offset < 2048
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short // length - 4 < 260: 3-byte form
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short // length - 4 < 65792: 4-byte form
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short // fits a single 5-byte form
+ LEAL -16842747(R10), R10 // too long for one op: R10 = length left after the fixed 5-byte op written below
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
+
+repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
+ LEAL -65536(R10), R10 // R10 = length - 4 - 65536
+ MOVL R10, SI // SI reused as scratch
+ MOVW $0x001d, (AX) // header word 0x001d
+ MOVW R10, 2(AX) // low 16 bits
+ SARL $0x10, SI
+ MOVB SI, 4(AX) // bits 16..23
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
+ LEAL -256(R10), R10 // R10 = length - 4 - 256
+ MOVW $0x0019, (AX) // header word 0x0019
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
+ LEAL -4(R10), R10 // R10 = length - 8
+ MOVW $0x0015, (AX) // header word 0x0015
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10 // R10 = 1 | (length - 4) << 2
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10 // R10 = 1 | (length - 4) << 2
+ MOVB SI, 1(AX) // low 8 bits of the offset
+ SARL $0x08, SI
+ SHLL $0x05, SI // offset bits 8 and up moved to bit 5 and up
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+ JMP two_byte_offset_match_nolit_encodeBlockAsm // never executed: directly follows an unconditional JMP and has no label
+
+two_byte_offset_short_match_nolit_encodeBlockAsm:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm // length >= 12: needs the 3-byte form
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBlockAsm // offset >= 2048: needs the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10 // low byte = 1 + (length - 4) * 4
+ MOVB SI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, SI
+ SHLL $0x05, SI // offset bits 8 and up moved to bit 5 and up
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm
+
+emit_copy_three_match_nolit_encodeBlockAsm:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = 2 + (length - 1) * 4
+ MOVB R10, (AX)
+ MOVW SI, 1(AX) // 2-byte offset
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm: // after a copy: check limits, update the table, and look for an immediate follow-on match at CX
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm // CX is at or past the search limit
+ MOVQ -2(DX)(CX*1), DI // DI = 8 bytes of src starting at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm // dst cursor is still below the limit
+ MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+ RET
+
+match_nolit_dst_ok_encodeBlockAsm:
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // R9 = hash multiplier
+ MOVQ DI, R8 // R8 = bytes starting at CX-2
+ SHRQ $0x10, DI // DI = bytes starting at CX
+ MOVQ DI, SI
+ SHLQ $0x10, R8
+ IMULQ R9, R8
+ SHRQ $0x32, R8 // R8 = table index for CX-2
+ SHLQ $0x10, SI
+ IMULQ R9, SI
+ SHRQ $0x32, SI // SI = table index for CX
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10 // R10 = &table[hash(CX)]
+ MOVL (R10), SI // SI = candidate stored for hash(CX)
+ MOVL R9, 24(SP)(R8*4) // table[hash(CX-2)] = CX-2
+ MOVL CX, (R10) // table[hash(CX)] = CX
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate vs the 4 bytes at CX
+ JEQ match_nolit_loop_encodeBlockAsm // equal: start another match immediately, with no literals in between
+ INCL CX
+ JMP search_loop_encodeBlockAsm
+
+emit_remainder_encodeBlockAsm: // emit src[12(SP):len(src)] as one final literal run
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = number of literal bytes left to emit
+ LEAQ 5(AX)(CX*1), CX // dst cursor + 5 (size of the largest literal header) + literal length
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm // below the dst limit: carry on
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeBlockAsm:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX // BX = start of pending literals
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = address of the first literal byte
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length - 1 (overwrites the src base, which is not read again)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm // < 60: length fits inside the tag byte
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm // < 2^8: tag + 1 length byte
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeBlockAsm // < 2^16: tag + 2 length bytes
+ CMPL DX, $0x01000000
+ JLT four_bytes_emit_remainder_encodeBlockAsm // < 2^24: tag + 3 length bytes
+ MOVB $0xfc, (AX) // tag 0xfc (= 63<<2) followed by a 4-byte length
+ MOVL DX, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm
+
+four_bytes_emit_remainder_encodeBlockAsm:
+ MOVL DX, BX
+ SHRL $0x10, BX // BX = bits 16 and up of length - 1
+ MOVB $0xf8, (AX) // tag 0xf8 (= 62<<2)
+ MOVW DX, 1(AX) // low 16 bits of length - 1
+ MOVB BL, 3(AX) // third length byte
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm
+
+three_bytes_emit_remainder_encodeBlockAsm:
+ MOVB $0xf4, (AX) // tag 0xf4 (= 61<<2)
+ MOVW DX, 1(AX) // 16-bit length - 1
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm
+
+two_bytes_emit_remainder_encodeBlockAsm:
+ MOVB $0xf0, (AX) // tag 0xf0 (= 60<<2)
+ MOVB DL, 1(AX) // 8-bit length - 1
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm // length - 1 < 64: the short copy routine is enough
+ JMP memmove_long_emit_remainder_encodeBlockAsm
+
+one_byte_emit_remainder_encodeBlockAsm:
+ SHLB $0x02, DL // tag = (length - 1) << 2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: copy BX bytes from (CX) to (AX) with fixed-size, possibly overlapping moves
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8:
+ MOVQ (CX), SI // always moves a full 8 bytes, even when BX < 8 — NOTE(review): at the tail of src this load may extend past the literals; confirm callers guarantee it is safe
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
+ MOVQ (CX), SI // first 8 bytes
+ MOVQ -8(CX)(BX*1), CX // last 8 bytes (may overlap the first 8)
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
+ MOVOU (CX), X0 // first 16 bytes
+ MOVOU -16(CX)(BX*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
+ MOVOU (CX), X0 // first 32 bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 bytes (may overlap)
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm:
+ MOVQ DX, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm
+
+memmove_long_emit_remainder_encodeBlockAsm:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: copy BX bytes from (CX) to (AX); head and tail are loaded up front, the middle is written with 32-byte-aligned stores
+ MOVOU (CX), X0 // first 32 bytes -> X0, X1
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 bytes -> X2, X3
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = AX modulo 32
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX modulo 32), so AX + R8 - 32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 // NOTE(review): JA tests CF and ZF; DECQ leaves CF as set by the SUBQ above
+ LEAQ -32(CX)(R8*1), SI // SI = src address paired with the first aligned dst block
+ LEAQ -32(AX)(R8*1), R9 // R9 = first 32-byte-aligned dst address above AX
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9) // aligned stores
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4 // 32 bytes ending at offset R8
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 // loop while R8 <= length
+ MOVOU X0, (AX) // write the saved head
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1) // write the saved tail
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance the dst cursor past the literals
+
+emit_literal_done_emit_remainder_encodeBlockAsm:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = dst cursor - dst base = bytes written
+ MOVQ AX, ret+48(FP) // return the number of bytes written
+ RET
+
+// func encodeBlockAsm4MB(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm4MB(SB), $65560-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000200, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBlockAsm4MB:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm4MB
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBlockAsm4MB:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm4MB
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ SHLQ $0x10, R11
+ IMULQ R9, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeBlockAsm4MB
+ LEAL 1(CX), DI
+ MOVL 12(SP), R8
+ MOVL DI, SI
+ SUBL 16(SP), SI
+ JZ repeat_extend_back_end_encodeBlockAsm4MB
+
+repeat_extend_back_loop_encodeBlockAsm4MB:
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm4MB
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm4MB
+ LEAL -1(DI), DI
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm4MB
+
+repeat_extend_back_end_encodeBlockAsm4MB: // emit src[12(SP):DI] as a literal run before the repeat
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB // no pending literals
+ MOVL DI, R9
+ MOVL DI, 12(SP) // pending literals now start at DI
+ LEAQ (DX)(SI*1), R10 // R10 = address of the first literal byte
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length - 1, the value stored in the tag
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm4MB // length-1 < 60: fits in the tag byte
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm4MB // length-1 < 256: tag + 1 length byte
+ CMPL SI, $0x00010000
+ JLT three_bytes_repeat_emit_encodeBlockAsm4MB // length-1 < 65536: tag + 2 length bytes
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX) // tag 0xf8 followed by 3 length bytes
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm4MB
+
+three_bytes_repeat_emit_encodeBlockAsm4MB:
+ MOVB $0xf4, (AX) // tag 0xf4 followed by 2 length bytes
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm4MB
+
+two_bytes_repeat_emit_encodeBlockAsm4MB:
+ MOVB $0xf0, (AX) // tag 0xf0 followed by 1 length byte
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm4MB // length <= 64: short copy
+ JMP memmove_long_repeat_emit_encodeBlockAsm4MB
+
+one_byte_repeat_emit_encodeBlockAsm4MB:
+ SHLB $0x02, SI // tag byte = (length-1) << 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literals
+
+ // genMemMoveShort: copy R9 (<= 64) bytes from (R10) to (AX), dispatching on the length class
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: // always moves a full 8 bytes, even when R9 < 8
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: // first 8 and last 8 bytes (may overlap)
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: // first 16 and last 16 bytes (may overlap)
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: // first 32 and last 32 bytes (may overlap)
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
+ MOVQ SI, AX // cursor = exact end of the literals
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB
+
+memmove_long_repeat_emit_encodeBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literals
+
+ // genMemMoveLong: copy R9 bytes from (R10) to (AX) in 32-byte steps
+ MOVOU (R10), X0 // X0/X1 = first 32 source bytes, stored last
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // X2/X3 = last 32 source bytes, stored last
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12 // R12 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = AX mod 32
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13 // R13 = 64 - (AX mod 32): offset used by the 32-byte loop
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1) // destination AX + R13 - 32 is a multiple of 32, so the aligned store is valid
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 // continue while offset <= length
+ MOVOU X0, (AX) // write the saved head and tail over the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX // cursor = exact end of the literals
+
+emit_literal_done_repeat_emit_encodeBlockAsm4MB: // extend the repeat match forwards
+ ADDL $0x05, CX // skip the byte at CX plus the 4 bytes already compared at CX+1
+ MOVL CX, SI
+ SUBL 16(SP), SI // SI = CX - repeat offset
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9 // R9 = bytes left in src from CX
+ LEAQ (DX)(CX*1), R10 // R10 = address of src[CX]
+ LEAQ (DX)(SI*1), SI // SI = address of the earlier data being matched
+
+ // matchLen: R12 = number of equal bytes at (R10) and (SI), at most R9
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm4MB
+
+matchlen_loopback_repeat_extend_encodeBlockAsm4MB: // compare 8 bytes at a time
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11 // non-zero bits mark differing bytes
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // bit index / 8 = number of equal leading bytes
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm4MB
+
+matchlen_loop_repeat_extend_encodeBlockAsm4MB: // all 8 bytes equal: advance
+ LEAL -8(R9), R9
+ LEAL 8(R12), R12
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB
+
+matchlen_single_repeat_extend_encodeBlockAsm4MB: // fewer than 8 bytes left: compare byte by byte
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm4MB
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB:
+ MOVB (R10)(R12*1), R11
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm4MB
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB
+
+repeat_extend_forward_end_encodeBlockAsm4MB:
+ ADDL R12, CX // CX = index just past the match
+ MOVL CX, SI
+ SUBL DI, SI // SI = total match length (DI = match start)
+ MOVL 16(SP), DI // DI = repeat offset
+ TESTL R8, R8 // R8 = start of pending literals as read before the literal emit
+ JZ repeat_as_copy_encodeBlockAsm4MB // it was 0: encode as a copy with an explicit offset instead
+
+ // emitRepeat: SI = length, DI = offset; the encoding is chosen by length (and offset for the short form)
+ MOVL SI, R8 // R8 = length
+ LEAL -4(SI), SI // SI = length - 4
+ CMPL R8, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm4MB // length <= 8: 2-byte form
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB // length < 12 and offset < 2048: 2-byte form carrying the offset
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm4MB
+ CMPL SI, $0x00010100
+ JLT repeat_four_match_repeat_encodeBlockAsm4MB
+ LEAL -65536(SI), SI // 5-byte form: word 0x001d + 3 bytes of (length - 4 - 65536)
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_match_repeat_encodeBlockAsm4MB: // 4-byte form: word 0x0019 + 2 bytes of (length - 4 - 256)
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_match_repeat_encodeBlockAsm4MB: // 3-byte form: word 0x0015 + 1 byte of (length - 8)
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_match_repeat_encodeBlockAsm4MB: // 2-byte form: 16-bit word ((length - 4) << 2) | 1
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_match_repeat_encodeBlockAsm4MB: // byte 0 = 1 + (length-4)*4 + (offset>>8)<<5, byte 1 = low offset byte
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_as_copy_encodeBlockAsm4MB:
+ // emitCopy: SI = length, DI = offset
+ CMPL DI, $0x00010000
+ JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB // offset < 65536: forms with a 2-byte (or shorter) offset
+
+four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB: // offset needs 4 bytes
+ CMPL SI, $0x40
+ JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
+ MOVB $0xff, (AX) // tag 0xff + 4-byte offset: copy of 64 bytes
+ MOVL DI, 1(AX)
+ LEAL -64(SI), SI // 64 bytes consumed
+ ADDQ $0x05, AX
+ CMPL SI, $0x04
+ JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB // fewer than 4 bytes left: finish with one more copy
+
+ // emitRepeat: encode the rest of the length (SI) as a repeat of the same offset
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
+ LEAL -65536(SI), SI // 5-byte repeat form
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: // 4-byte repeat form
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: // 3-byte repeat form
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: // 2-byte repeat form
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: // 2-byte repeat form carrying the offset
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+ JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB // never executed: directly follows an unconditional JMP
+
+four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
+ TESTL SI, SI
+ JZ repeat_end_emit_encodeBlockAsm4MB // nothing left to encode
+ MOVB $0x03, BL
+ LEAL -4(BX)(SI*4), SI // low byte = 3 + (length-1)*4; only the low byte is stored
+ MOVB SI, (AX)
+ MOVL DI, 1(AX) // 4-byte offset
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
+ MOVB $0xee, (AX) // tag 0xee + 2-byte offset: copy of 60 bytes
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI // 60 bytes consumed
+ ADDQ $0x03, AX
+
+ // emitRepeat: encode the rest of the length (SI) as a repeat of the same offset
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+ CMPL SI, $0x00010100
+ JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
+ LEAL -65536(SI), SI // 5-byte repeat form
+ MOVL SI, DI
+ MOVW $0x001d, (AX)
+ MOVW SI, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: // 4-byte repeat form
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: // 3-byte repeat form
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: // 2-byte repeat form
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: // 2-byte repeat form carrying the offset
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+ JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB // never executed: directly follows an unconditional JMP
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: // length <= 64
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
+ MOVB $0x01, BL // length < 12 and offset < 2048: 2-byte copy
+ LEAL -16(BX)(SI*4), SI // low byte = 1 + (length-4)*4
+ MOVB DI, 1(AX) // low offset byte
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI // high offset bits go into bits 5 and up of the tag
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm4MB
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: // 3-byte copy: tag + 2-byte offset
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI // low byte = 2 + (length-1)*4
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm4MB:
+ MOVL CX, 12(SP) // everything up to CX has been emitted
+ JMP search_loop_encodeBlockAsm4MB
+
+no_repeat_found_encodeBlockAsm4MB: // SI / R8 = table candidates for CX / CX+1, DI = 8 bytes at CX
+ CMPL (DX)(SI*1), DI // 4 bytes at candidate SI vs 4 bytes at CX
+ JEQ candidate_match_encodeBlockAsm4MB
+ SHRQ $0x08, DI // DI = bytes starting at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = table candidate for CX+2 (R10 = its table index)
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI // 4 bytes at candidate R8 vs 4 bytes at CX+1
+ JEQ candidate2_match_encodeBlockAsm4MB
+ MOVL R9, 24(SP)(R10*4) // record CX+2 in the table
+ SHRQ $0x08, DI // DI = bytes starting at CX+2
+ CMPL (DX)(SI*1), DI // 4 bytes at candidate SI vs 4 bytes at CX+2
+ JEQ candidate3_match_encodeBlockAsm4MB
+ MOVL 20(SP), CX // no match: continue at the saved next index
+ JMP search_loop_encodeBlockAsm4MB
+
+candidate3_match_encodeBlockAsm4MB:
+ ADDL $0x02, CX // match starts at CX+2
+ JMP candidate_match_encodeBlockAsm4MB
+
+candidate2_match_encodeBlockAsm4MB:
+ MOVL R9, 24(SP)(R10*4) // record CX+2 in the table
+ INCL CX // match starts at CX+1
+ MOVL R8, SI // SI = matching candidate
+
+candidate_match_encodeBlockAsm4MB: // CX = match start, SI = candidate index
+ MOVL 12(SP), DI // DI = start of pending literals
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm4MB // candidate at index 0: cannot extend backwards
+
+match_extend_back_loop_encodeBlockAsm4MB: // move both positions back while the preceding bytes are equal
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm4MB // do not go back into already emitted data
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm4MB
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm4MB
+ JMP match_extend_back_loop_encodeBlockAsm4MB
+
+match_extend_back_end_encodeBlockAsm4MB:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = number of pending literal bytes
+ LEAQ 4(AX)(DI*1), DI // DI = output cursor + pending literals + 4
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+ RET
+
+match_dst_size_check_encodeBlockAsm4MB: // emit src[12(SP):CX] as a literal run before the match
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm4MB // no pending literals
+ MOVL DI, R9
+ MOVL DI, 12(SP) // pending literals now start at CX
+ LEAQ (DX)(R8*1), DI // DI = address of the first literal byte
+ SUBL R8, R9 // R9 = literal length
+ LEAL -1(R9), R8 // R8 = length - 1, the value stored in the tag
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm4MB // length-1 < 60: fits in the tag byte
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm4MB // length-1 < 256: tag + 1 length byte
+ CMPL R8, $0x00010000
+ JLT three_bytes_match_emit_encodeBlockAsm4MB // length-1 < 65536: tag + 2 length bytes
+ MOVL R8, R10
+ SHRL $0x10, R10
+ MOVB $0xf8, (AX) // tag 0xf8 followed by 3 length bytes
+ MOVW R8, 1(AX)
+ MOVB R10, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeBlockAsm4MB
+
+three_bytes_match_emit_encodeBlockAsm4MB:
+ MOVB $0xf4, (AX) // tag 0xf4 followed by 2 length bytes
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm4MB
+
+two_bytes_match_emit_encodeBlockAsm4MB:
+ MOVB $0xf0, (AX) // tag 0xf0 followed by 1 length byte
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm4MB // length <= 64: short copy
+ JMP memmove_long_match_emit_encodeBlockAsm4MB
+
+one_byte_match_emit_encodeBlockAsm4MB:
+ SHLB $0x02, R8 // tag byte = (length-1) << 2
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm4MB:
+ LEAQ (AX)(R9*1), R8 // R8 = output cursor after the literals
+
+ // genMemMoveShort: copy R9 (<= 64) bytes from (DI) to (AX), dispatching on the length class
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: // always moves a full 8 bytes, even when R9 < 8
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: // first 8 and last 8 bytes (may overlap)
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: // first 16 and last 16 bytes (may overlap)
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: // first 32 and last 32 bytes (may overlap)
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm4MB:
+ MOVQ R8, AX // cursor = exact end of the literals
+ JMP emit_literal_done_match_emit_encodeBlockAsm4MB
+
+memmove_long_match_emit_encodeBlockAsm4MB:
+ LEAQ (AX)(R9*1), R8 // R8 = output cursor after the literals
+
+ // genMemMoveLong: copy R9 bytes from (DI) to (AX) in 32-byte steps
+ MOVOU (DI), X0 // X0/X1 = first 32 source bytes, stored last
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2 // X2/X3 = last 32 source bytes, stored last
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10 // R10 = AX mod 32
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (AX mod 32): offset used by the 32-byte loop
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1) // destination AX + R12 - 32 is a multiple of 32, so the aligned store is valid
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 // continue while offset <= length
+ MOVOU X0, (AX) // write the saved head and tail over the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX // cursor = exact end of the literals
+
+emit_literal_done_match_emit_encodeBlockAsm4MB:
+match_nolit_loop_encodeBlockAsm4MB: // CX = match start, SI = candidate index; the first 4 bytes are known equal
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP) // 16(SP) = CX - SI: offset of this match, kept as the repeat offset
+ ADDL $0x04, CX // skip the 4 bytes already compared
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = bytes left in src from CX
+ LEAQ (DX)(CX*1), R8 // R8 = address of src[CX]
+ LEAQ (DX)(SI*1), SI // SI = address of the candidate data
+
+ // matchLen: R10 = number of equal bytes at (R8) and (SI), at most DI
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm4MB
+
+matchlen_loopback_match_nolit_encodeBlockAsm4MB: // compare 8 bytes at a time
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9 // non-zero bits mark differing bytes
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm4MB
+ BSFQ R9, R9 // index of the lowest differing bit
+ SARQ $0x03, R9 // bit index / 8 = number of equal leading bytes
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm4MB
+
+matchlen_loop_match_nolit_encodeBlockAsm4MB: // all 8 bytes equal: advance
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB
+
+matchlen_single_match_nolit_encodeBlockAsm4MB: // fewer than 8 bytes left: compare byte by byte
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm4MB
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm4MB:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm4MB
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB
+
+match_nolit_end_encodeBlockAsm4MB:
+ ADDL R10, CX // CX = index just past the match
+ MOVL 16(SP), SI // SI = match offset
+ ADDL $0x04, R10 // R10 = total match length (extension + first 4 bytes)
+ MOVL CX, 12(SP) // everything up to CX counts as emitted
+
+ // emitCopy: R10 = length, SI = offset
+ CMPL SI, $0x00010000
+ JL two_byte_offset_match_nolit_encodeBlockAsm4MB // offset < 65536: forms with a 2-byte (or shorter) offset
+
+four_bytes_loop_back_match_nolit_encodeBlockAsm4MB: // offset needs 4 bytes
+ CMPL R10, $0x40
+ JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB
+ MOVB $0xff, (AX) // tag 0xff + 4-byte offset: copy of 64 bytes
+ MOVL SI, 1(AX)
+ LEAL -64(R10), R10 // 64 bytes consumed
+ ADDQ $0x05, AX
+ CMPL R10, $0x04
+ JL four_bytes_remain_match_nolit_encodeBlockAsm4MB // fewer than 4 bytes left: finish with one more copy
+
+ // emitRepeat: encode the rest of the length (R10) as a repeat of the same offset
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
+ LEAL -65536(R10), R10 // 5-byte repeat form
+ MOVL R10, SI
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: // 4-byte repeat form
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: // 3-byte repeat form
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: // 2-byte repeat form
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: // 2-byte repeat form carrying the offset
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+ JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB // never executed: directly follows an unconditional JMP
+
+four_bytes_remain_match_nolit_encodeBlockAsm4MB:
+ TESTL R10, R10
+ JZ match_nolit_emitcopy_end_encodeBlockAsm4MB // nothing left to encode
+ MOVB $0x03, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = 3 + (length-1)*4; only the low byte is stored
+ MOVB R10, (AX)
+ MOVL SI, 1(AX) // 4-byte offset
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+two_byte_offset_match_nolit_encodeBlockAsm4MB:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
+ MOVB $0xee, (AX) // tag 0xee + 2-byte offset: copy of 60 bytes
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10 // 60 bytes consumed
+ ADDQ $0x03, AX
+
+ // emitRepeat: encode the rest of the length (R10) as a repeat of the same offset
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
+ CMPL R10, $0x00010100
+ JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
+ LEAL -65536(R10), R10 // 5-byte repeat form
+ MOVL R10, SI
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: // 4-byte repeat form
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: // 3-byte repeat form
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: // 2-byte repeat form
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: // 2-byte repeat form carrying the offset
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+ JMP two_byte_offset_match_nolit_encodeBlockAsm4MB // never executed: directly follows an unconditional JMP
+
+two_byte_offset_short_match_nolit_encodeBlockAsm4MB: // length <= 64
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
+ MOVB $0x01, BL // length < 12 and offset < 2048: 2-byte copy
+ LEAL -16(BX)(R10*4), R10 // low byte = 1 + (length-4)*4
+ MOVB SI, 1(AX) // low offset byte
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10 // high offset bits go into bits 5 and up of the tag
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
+
+emit_copy_three_match_nolit_encodeBlockAsm4MB: // 3-byte copy: tag + 2-byte offset
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = 2 + (length-1)*4
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm4MB: // copy emitted; CX = index just past the match
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm4MB // at or past len(src)-8: stop searching
+ MOVQ -2(DX)(CX*1), DI // DI = 8 source bytes at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP) // output cursor reached the limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeBlockAsm4MB: // update the table for CX-2 and CX, then test for an immediate match at CX
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // R9 = hash multiplier
+ MOVQ DI, R8 // R8 = bytes starting at CX-2
+ SHRQ $0x10, DI // DI = bytes starting at CX
+ MOVQ DI, SI
+ SHLQ $0x10, R8
+ IMULQ R9, R8
+ SHRQ $0x32, R8 // R8 = table index for CX-2
+ SHLQ $0x10, SI
+ IMULQ R9, SI
+ SHRQ $0x32, SI // SI = table index for CX
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10 // R10 = address of the table slot for CX
+ MOVL (R10), SI // SI = previous index stored in that slot
+ MOVL R9, 24(SP)(R8*4) // record CX-2
+ MOVL CX, (R10) // record CX
+ CMPL (DX)(SI*1), DI // 4 bytes at the previous index vs 4 bytes at CX
+ JEQ match_nolit_loop_encodeBlockAsm4MB // equal: encode another match with no literals in between
+ INCL CX
+ JMP search_loop_encodeBlockAsm4MB
+
+emit_remainder_encodeBlockAsm4MB:
+ // Final phase: emit src[12(SP):len(src)] as one literal run and return the
+ // number of bytes written to dst. On entry AX is the output cursor, DX the
+ // src base, 12(SP) the start of the pending literals and (SP) the output limit.
+ // Returns 0 if cursor + remaining + 4 would reach the output limit.
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = remaining byte count
+ LEAQ 4(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBlockAsm4MB:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB // nothing pending
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = address of the first literal byte
+ SUBL BX, SI // SI = literal length (>= 1 here)
+ LEAL -1(SI), DX // DX = length - 1, the value stored in the tag
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm4MB
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm4MB
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeBlockAsm4MB
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX) // tag 0xf8 followed by 3 length bytes
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm4MB
+
+three_bytes_emit_remainder_encodeBlockAsm4MB:
+ MOVB $0xf4, (AX) // tag 0xf4 followed by 2 length bytes
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm4MB
+
+two_bytes_emit_remainder_encodeBlockAsm4MB:
+ MOVB $0xf0, (AX) // tag 0xf0 followed by 1 length byte
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm4MB // length <= 64: short copy
+ JMP memmove_long_emit_remainder_encodeBlockAsm4MB
+
+one_byte_emit_remainder_encodeBlockAsm4MB:
+ SHLB $0x02, DL // tag byte = (length-1) << 2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm4MB:
+ LEAQ (AX)(SI*1), DX // DX = output cursor after the literals
+ MOVL SI, BX // BX = length (1..64)
+
+ // genMemMoveShort
+ // The literals end exactly at src[len(src)], so unlike the in-loop literal
+ // copies there is no slack after them: lengths below 8 must be copied with
+ // exact-size moves instead of one 8-byte load/store, which would read past
+ // the end of src and write stray bytes after the output.
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
+ // First and last byte; for length 1 both are the same byte.
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
+ // First 4 and last 4 bytes (overlapping for lengths below 8).
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
+ // First 8 and last 8 bytes (identical for length 8).
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
+ MOVQ DX, AX // cursor = exact end of the literals
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB
+
+memmove_long_emit_remainder_encodeBlockAsm4MB:
+ LEAQ (AX)(SI*1), DX // DX = output cursor after the literals
+ MOVL SI, BX // BX = length (> 64 on this path)
+
+ // genMemMoveLong: copy BX bytes from (CX) to (AX) in 32-byte steps
+ MOVOU (CX), X0 // X0/X1 = first 32 source bytes, stored last
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // X2/X3 = last 32 source bytes, stored last
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = AX mod 32
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX mod 32): offset used by the 32-byte loop
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1) // destination AX + R8 - 32 is a multiple of 32, so the aligned store is valid
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX) // write the saved head and tail over the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // cursor = exact end of the literals
+
+emit_literal_done_emit_remainder_encodeBlockAsm4MB:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // return value = bytes written to dst
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm12B(SB), $16408-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000080, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBlockAsm12B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm12B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBlockAsm12B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm12B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x000000cf1bbcdcbb, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x18, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ SHLQ $0x18, R11
+ IMULQ R9, R11
+ SHRQ $0x34, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x18, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeBlockAsm12B
+ LEAL 1(CX), DI
+ MOVL 12(SP), R8
+ MOVL DI, SI
+ SUBL 16(SP), SI
+ JZ repeat_extend_back_end_encodeBlockAsm12B
+
+repeat_extend_back_loop_encodeBlockAsm12B:
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm12B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm12B
+ LEAL -1(DI), DI
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm12B
+
+repeat_extend_back_end_encodeBlockAsm12B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm12B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm12B
+
+two_bytes_repeat_emit_encodeBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm12B
+ JMP memmove_long_repeat_emit_encodeBlockAsm12B
+
+one_byte_repeat_emit_encodeBlockAsm12B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm12B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm12B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm12B
+
+memmove_long_repeat_emit_encodeBlockAsm12B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm12B:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9
+ LEAQ (DX)(CX*1), R10
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm12B
+
+matchlen_loopback_repeat_extend_encodeBlockAsm12B:
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm12B
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm12B
+
+matchlen_loop_repeat_extend_encodeBlockAsm12B:
+ LEAL -8(R9), R9
+ LEAL 8(R12), R12
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B
+
+matchlen_single_repeat_extend_encodeBlockAsm12B:
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm12B
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm12B:
+ MOVB (R10)(R12*1), R11
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm12B
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B
+
+repeat_extend_forward_end_encodeBlockAsm12B:
+ ADDL R12, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+ TESTL R8, R8
+ JZ repeat_as_copy_encodeBlockAsm12B
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm12B
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_match_repeat_encodeBlockAsm12B
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm12B
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_three_match_repeat_encodeBlockAsm12B:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_match_repeat_encodeBlockAsm12B:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_offset_match_repeat_encodeBlockAsm12B:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_as_copy_encodeBlockAsm12B:
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+ JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm12B
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm12B:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeBlockAsm12B
+
+no_repeat_found_encodeBlockAsm12B:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBlockAsm12B
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeBlockAsm12B
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeBlockAsm12B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBlockAsm12B
+
+candidate3_match_encodeBlockAsm12B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeBlockAsm12B
+
+candidate2_match_encodeBlockAsm12B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeBlockAsm12B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm12B
+
+match_extend_back_loop_encodeBlockAsm12B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm12B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm12B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm12B
+ JMP match_extend_back_loop_encodeBlockAsm12B
+
+match_extend_back_end_encodeBlockAsm12B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBlockAsm12B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm12B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm12B
+
+two_bytes_match_emit_encodeBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm12B
+ JMP memmove_long_match_emit_encodeBlockAsm12B
+
+one_byte_match_emit_encodeBlockAsm12B:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm12B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm12B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeBlockAsm12B
+
+memmove_long_match_emit_encodeBlockAsm12B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm12B:
+match_nolit_loop_encodeBlockAsm12B:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeBlockAsm12B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm12B
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm12B
+
+matchlen_loop_match_nolit_encodeBlockAsm12B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm12B
+
+matchlen_single_match_nolit_encodeBlockAsm12B:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm12B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm12B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B
+
+match_nolit_end_encodeBlockAsm12B:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeBlockAsm12B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+ JMP two_byte_offset_match_nolit_encodeBlockAsm12B
+
+two_byte_offset_short_match_nolit_encodeBlockAsm12B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm12B
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm12B
+
+emit_copy_three_match_nolit_encodeBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm12B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm12B
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBlockAsm12B:
+ MOVQ $0x000000cf1bbcdcbb, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x18, R8
+ IMULQ R9, R8
+ SHRQ $0x34, R8
+ SHLQ $0x18, SI
+ IMULQ R9, SI
+ SHRQ $0x34, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeBlockAsm12B
+ INCL CX
+ JMP search_loop_encodeBlockAsm12B
+
+emit_remainder_encodeBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm12B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm12B
+
+two_bytes_emit_remainder_encodeBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm12B
+ JMP memmove_long_emit_remainder_encodeBlockAsm12B
+
+one_byte_emit_remainder_encodeBlockAsm12B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm12B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm12B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm12B
+
+memmove_long_emit_remainder_encodeBlockAsm12B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm12B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBlockAsm10B(SB), $4120-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000020, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBlockAsm10B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm10B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBlockAsm10B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm10B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10
+ SHLQ $0x20, R11
+ IMULQ R9, R11
+ SHRQ $0x36, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeBlockAsm10B
+ LEAL 1(CX), DI
+ MOVL 12(SP), R8
+ MOVL DI, SI
+ SUBL 16(SP), SI
+ JZ repeat_extend_back_end_encodeBlockAsm10B
+
+repeat_extend_back_loop_encodeBlockAsm10B:
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm10B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm10B
+ LEAL -1(DI), DI
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm10B
+
+repeat_extend_back_end_encodeBlockAsm10B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm10B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm10B
+
+two_bytes_repeat_emit_encodeBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm10B
+ JMP memmove_long_repeat_emit_encodeBlockAsm10B
+
+one_byte_repeat_emit_encodeBlockAsm10B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm10B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm10B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
+
+memmove_long_repeat_emit_encodeBlockAsm10B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeBlockAsm10B:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9
+ LEAQ (DX)(CX*1), R10
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm10B
+
+matchlen_loopback_repeat_extend_encodeBlockAsm10B:
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm10B
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm10B
+
+matchlen_loop_repeat_extend_encodeBlockAsm10B:
+ LEAL -8(R9), R9
+ LEAL 8(R12), R12
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B
+
+matchlen_single_repeat_extend_encodeBlockAsm10B:
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm10B
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm10B:
+ MOVB (R10)(R12*1), R11
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm10B
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B
+
+repeat_extend_forward_end_encodeBlockAsm10B:
+ ADDL R12, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+ TESTL R8, R8
+ JZ repeat_as_copy_encodeBlockAsm10B
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm10B
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_match_repeat_encodeBlockAsm10B
+
+cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm10B
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_three_match_repeat_encodeBlockAsm10B:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_match_repeat_encodeBlockAsm10B:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_offset_match_repeat_encodeBlockAsm10B:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_as_copy_encodeBlockAsm10B:
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL SI, R8
+ LEAL -4(SI), SI
+ CMPL R8, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+ CMPL DI, $0x00000800
+ JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(SI*4), SI
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+ JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm10B
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm10B:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeBlockAsm10B
+
+no_repeat_found_encodeBlockAsm10B:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBlockAsm10B
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeBlockAsm10B
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeBlockAsm10B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBlockAsm10B
+
+candidate3_match_encodeBlockAsm10B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeBlockAsm10B
+
+candidate2_match_encodeBlockAsm10B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeBlockAsm10B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm10B
+
+match_extend_back_loop_encodeBlockAsm10B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm10B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm10B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm10B
+ JMP match_extend_back_loop_encodeBlockAsm10B
+
+match_extend_back_end_encodeBlockAsm10B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBlockAsm10B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm10B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm10B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm10B
+
+two_bytes_match_emit_encodeBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm10B
+ JMP memmove_long_match_emit_encodeBlockAsm10B
+
+one_byte_match_emit_encodeBlockAsm10B:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm10B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm10B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeBlockAsm10B
+
+memmove_long_match_emit_encodeBlockAsm10B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeBlockAsm10B:
+match_nolit_loop_encodeBlockAsm10B:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeBlockAsm10B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm10B
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm10B
+
+matchlen_loop_match_nolit_encodeBlockAsm10B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm10B
+
+matchlen_single_match_nolit_encodeBlockAsm10B:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm10B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm10B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B
+
+match_nolit_end_encodeBlockAsm10B:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeBlockAsm10B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R10, DI
+ LEAL -4(R10), R10
+ CMPL DI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
+ CMPL DI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
+ CMPL SI, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
+ XORQ DI, DI
+ LEAL 1(DI)(R10*4), R10
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+ JMP two_byte_offset_match_nolit_encodeBlockAsm10B
+
+two_byte_offset_short_match_nolit_encodeBlockAsm10B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm10B
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBlockAsm10B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm10B
+
+emit_copy_three_match_nolit_encodeBlockAsm10B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm10B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm10B
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBlockAsm10B:
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x20, R8
+ IMULQ R9, R8
+ SHRQ $0x36, R8
+ SHLQ $0x20, SI
+ IMULQ R9, SI
+ SHRQ $0x36, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeBlockAsm10B
+ INCL CX
+ JMP search_loop_encodeBlockAsm10B
+
+emit_remainder_encodeBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm10B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm10B
+
+two_bytes_emit_remainder_encodeBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm10B
+ JMP memmove_long_emit_remainder_encodeBlockAsm10B
+
+one_byte_emit_remainder_encodeBlockAsm10B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm10B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm10B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm10B
+
+memmove_long_emit_remainder_encodeBlockAsm10B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBlockAsm10B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2
+//
+// Encodes src into dst and returns the number of bytes written to dst, or 0
+// when the output would reach the dst limit computed in the prologue.
+//
+// Stack frame layout:
+//   0(SP)   8 bytes     dst limit pointer; reaching it aborts with a 0 result
+//   8(SP)   4 bytes     src_len-8, upper bound for match search positions
+//   12(SP)  4 bytes     src position where the not-yet-emitted literals start
+//   16(SP)  4 bytes     offset of the most recent match (repeat offset)
+//   20(SP)  4 bytes     position to resume searching at when nothing matches
+//   24(SP)  1024 bytes  256-entry table of uint32 src positions, indexed by hash
+//
+// Long-lived registers: AX = dst write cursor, CX = current src position,
+// DX = src base pointer (reused as scratch once emit_remainder is reached).
+TEXT ·encodeBlockAsm8B(SB), $1048-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000008, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+ // Clear the hash table: 8 iterations x 128 bytes = 1024 bytes at 24(SP).
+zero_loop_encodeBlockAsm8B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBlockAsm8B
+ MOVL $0x00000000, 12(SP) // pending literals start at position 0
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // search limit = src_len-8
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // dst limit = dst + src_len - 9 - src_len/32
+ MOVL $0x00000001, CX // first search position
+ MOVL CX, 16(SP) // initial repeat offset = 1
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBlockAsm8B:
+ // Fallback next position = CX + 4 + (CX - literal start)/16, so the stride
+ // grows the longer no match is found. Stop searching once it reaches the
+ // search limit.
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x04, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBlockAsm8B
+ MOVQ (DX)(CX*1), DI // DI = 8 source bytes starting at CX
+ MOVL SI, 20(SP)
+
+ // 8-bit hash of 4 bytes: ((v << 32) * 0x9e3779b1) >> 56.
+ // R10 = hash of the bytes at CX, R11 = hash of the bytes at CX+1.
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x38, R10
+ SHLQ $0x20, R11
+ IMULQ R9, R11
+ SHRQ $0x38, R11
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for CX
+ MOVL 24(SP)(R11*4), R8 // R8 = candidate position for CX+1
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+
+ // R10 = hash of the bytes at CX+2; its table slot is consulted later.
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x38, R10
+
+ // Repeat check: compare the 4 bytes at CX+1 with the 4 bytes one repeat
+ // offset before them.
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeBlockAsm8B
+ LEAL 1(CX), DI // DI = start of the repeat match
+ MOVL 12(SP), R8 // R8 = literal start; tested again below to pick copy vs repeat
+ MOVL DI, SI
+ SUBL 16(SP), SI // SI = position the repeat match refers back to
+ JZ repeat_extend_back_end_encodeBlockAsm8B
+
+ // Grow the match backwards while the preceding bytes agree, without going
+ // below the literal start (R8) or below position 0 of the source.
+repeat_extend_back_loop_encodeBlockAsm8B:
+ CMPL DI, R8
+ JLE repeat_extend_back_end_encodeBlockAsm8B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeBlockAsm8B
+ LEAL -1(DI), DI
+ DECL SI
+ JNZ repeat_extend_back_loop_encodeBlockAsm8B
+
+ // Emit the pending literals [12(SP), DI). R9 = length, R10 = source pointer.
+ // Header: (len-1)<<2 in one byte when len-1 < 60; 0xf0 plus one length byte
+ // when len-1 < 256; otherwise 0xf4 plus a 16-bit length.
+repeat_extend_back_end_encodeBlockAsm8B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeBlockAsm8B
+
+two_bytes_repeat_emit_encodeBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeBlockAsm8B
+ JMP memmove_long_repeat_emit_encodeBlockAsm8B
+
+one_byte_repeat_emit_encodeBlockAsm8B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeBlockAsm8B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 (1..64) bytes with overlapping fixed-size moves.
+ // Lengths <= 8 use one 8-byte move; the load stays inside src because the
+ // search loop only runs while CX+4 < src_len-8.
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_repeat_emit_encodeBlockAsm8B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
+
+memmove_long_repeat_emit_encodeBlockAsm8B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveLong: the first and last 32 bytes are loaded up front and stored
+ // last; the loops in between copy 32 bytes per step using aligned stores
+ // (R13 starts at 64 - (AX & 31), which makes AX+R13-32 32-byte aligned).
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13
+ DECQ R12
+ JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R13*1), X4
+ MOVOU -16(R10)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R9, R13
+ JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+ // Extend the repeat match forwards. CX+5 skips the 4 bytes at CX+1 that the
+ // repeat check already compared.
+emit_literal_done_repeat_emit_encodeBlockAsm8B:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R9
+ SUBL CX, R9 // R9 = bytes left in src after CX
+ LEAQ (DX)(CX*1), R10
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen: R12 = number of equal bytes at (R10) and (SI), at most R9.
+ XORL R12, R12
+ CMPL R9, $0x08
+ JL matchlen_single_repeat_extend_encodeBlockAsm8B
+
+matchlen_loopback_repeat_extend_encodeBlockAsm8B:
+ MOVQ (R10)(R12*1), R11
+ XORQ (SI)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_repeat_extend_encodeBlockAsm8B
+ BSFQ R11, R11 // index of the lowest differing bit ...
+ SARQ $0x03, R11 // ... divided by 8 = count of equal leading bytes
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm8B
+
+matchlen_loop_repeat_extend_encodeBlockAsm8B:
+ LEAL -8(R9), R9
+ LEAL 8(R12), R12
+ CMPL R9, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B
+
+matchlen_single_repeat_extend_encodeBlockAsm8B:
+ TESTL R9, R9
+ JZ repeat_extend_forward_end_encodeBlockAsm8B
+
+matchlen_single_loopback_repeat_extend_encodeBlockAsm8B:
+ MOVB (R10)(R12*1), R11
+ CMPB (SI)(R12*1), R11
+ JNE repeat_extend_forward_end_encodeBlockAsm8B
+ LEAL 1(R12), R12
+ DECL R9
+ JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B
+
+repeat_extend_forward_end_encodeBlockAsm8B:
+ ADDL R12, CX
+ MOVL CX, SI
+ SUBL DI, SI // SI = total match length (DI = match start)
+ MOVL 16(SP), DI // DI = match offset
+ TESTL R8, R8 // literal start was 0: emit a copy instead of a repeat
+ JZ repeat_as_copy_encodeBlockAsm8B
+
+ // emitRepeat: SI = length. Lengths <= 8 use the 2-byte form, length-4 < 260
+ // the 3-byte form, everything else the 4-byte form.
+ MOVL SI, DI
+ LEAL -4(SI), SI
+ CMPL DI, $0x08
+ JLE repeat_two_match_repeat_encodeBlockAsm8B
+ CMPL SI, $0x00000104
+ JLT repeat_three_match_repeat_encodeBlockAsm8B
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_three_match_repeat_encodeBlockAsm8B:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_two_match_repeat_encodeBlockAsm8B:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_as_copy_encodeBlockAsm8B:
+ // emitCopy: SI = length, DI = offset.
+two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
+
+ // Longer than 64 bytes: emit a 60-byte copy (tag 0xee = (60-1)<<2 | 2 plus
+ // the 16-bit offset) and encode the rest as a repeat.
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL SI, DI
+ LEAL -4(SI), SI
+ CMPL DI, $0x08
+ JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
+ CMPL SI, $0x00000104
+ JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
+ LEAL -256(SI), SI
+ MOVW $0x0019, (AX)
+ MOVW SI, 2(AX)
+ ADDQ $0x04, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+ LEAL -4(SI), SI
+ MOVW $0x0015, (AX)
+ MOVB SI, 2(AX)
+ ADDQ $0x03, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+ SHLL $0x02, SI
+ ORL $0x01, SI
+ MOVW SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+ // Length < 12: 2-byte copy. Only the low byte of SI is stored, so BL = 1
+ // yields tag = (length-4)<<2 | 1 with bits 8 and up of the offset merged
+ // into its top bits; the second byte is the low byte of the offset.
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeBlockAsm8B
+
+ // 3-byte copy: tag = (length-1)<<2 | 2, followed by the 16-bit offset.
+emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeBlockAsm8B:
+ MOVL CX, 12(SP) // everything up to CX has been emitted
+ JMP search_loop_encodeBlockAsm8B
+
+ // No repeat: try the table candidates for CX, then CX+1, then CX+2.
+ // DI is shifted right one byte at a time so its low 4 bytes always hold the
+ // source bytes at the position being tested.
+no_repeat_found_encodeBlockAsm8B:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBlockAsm8B
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for CX+2
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeBlockAsm8B
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeBlockAsm8B
+ MOVL 20(SP), CX // nothing matched: continue at the fallback position
+ JMP search_loop_encodeBlockAsm8B
+
+candidate3_match_encodeBlockAsm8B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeBlockAsm8B
+
+candidate2_match_encodeBlockAsm8B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+ // Match found: CX = match position, SI = candidate position. Grow it
+ // backwards without going below the literal start or source position 0.
+candidate_match_encodeBlockAsm8B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBlockAsm8B
+
+match_extend_back_loop_encodeBlockAsm8B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBlockAsm8B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBlockAsm8B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBlockAsm8B
+ JMP match_extend_back_loop_encodeBlockAsm8B
+
+ // Give up with a 0 result unless dst cursor + 3 + pending literal length
+ // is still below the dst limit.
+match_extend_back_end_encodeBlockAsm8B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+ // Emit the pending literals [12(SP), CX). R9 = length, DI = source pointer.
+ // Same header forms as in the repeat path.
+match_dst_size_check_encodeBlockAsm8B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeBlockAsm8B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeBlockAsm8B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBlockAsm8B
+
+two_bytes_match_emit_encodeBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeBlockAsm8B
+ JMP memmove_long_match_emit_encodeBlockAsm8B
+
+one_byte_match_emit_encodeBlockAsm8B:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBlockAsm8B:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 (1..64) bytes with overlapping fixed-size moves.
+ // The 8-byte load for lengths <= 8 stays inside src: the literals start
+ // before the match position, which is at most CX+2 with CX+4 < src_len-8.
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm8B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeBlockAsm8B
+
+memmove_long_match_emit_encodeBlockAsm8B:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal bytes
+
+ // genMemMoveLong: head and tail are loaded first and stored last; the loops
+ // copy the middle 32 bytes per step using aligned stores.
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+ // A match at CX against position SI with no literals pending. Record the
+ // offset as the new repeat offset, skip the 4 bytes known to be equal and
+ // measure how far the match continues.
+emit_literal_done_match_emit_encodeBlockAsm8B:
+match_nolit_loop_encodeBlockAsm8B:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = bytes left in src after CX
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen: R10 = number of equal bytes at (R8) and (SI), at most DI.
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeBlockAsm8B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeBlockAsm8B
+ BSFQ R9, R9 // index of the lowest differing bit ...
+ SARQ $0x03, R9 // ... divided by 8 = count of equal leading bytes
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm8B
+
+matchlen_loop_match_nolit_encodeBlockAsm8B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBlockAsm8B
+
+matchlen_single_match_nolit_encodeBlockAsm8B:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeBlockAsm8B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeBlockAsm8B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B
+
+match_nolit_end_encodeBlockAsm8B:
+ ADDL R10, CX // CX = end of the match
+ MOVL 16(SP), SI // SI = match offset
+ ADDL $0x04, R10 // R10 = total match length
+ MOVL CX, 12(SP)
+
+ // emitCopy: R10 = length, SI = offset.
+two_byte_offset_match_nolit_encodeBlockAsm8B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B
+
+ // Longer than 64 bytes: emit a 60-byte copy (tag 0xee plus the 16-bit
+ // offset) and encode the rest as a repeat.
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R10, SI
+ LEAL -4(R10), R10
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
+ CMPL R10, $0x00000104
+ JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+ // Length < 12: 2-byte copy, tag = (length-4)<<2 | 1 with bits 8 and up of
+ // the offset merged into its top bits, then the low byte of the offset.
+two_byte_offset_short_match_nolit_encodeBlockAsm8B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+ // 3-byte copy: tag = (length-1)<<2 | 2, followed by the 16-bit offset.
+emit_copy_three_match_nolit_encodeBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeBlockAsm8B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBlockAsm8B
+ MOVQ -2(DX)(CX*1), DI // DI = 8 source bytes starting at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+ // Index positions CX-2 and CX in the table, then test whether the previous
+ // table entry for CX matches immediately (no literals in between).
+match_nolit_dst_ok_encodeBlockAsm8B:
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x20, R8
+ IMULQ R9, R8
+ SHRQ $0x38, R8 // R8 = hash of the bytes at CX-2
+ SHLQ $0x20, SI
+ IMULQ R9, SI
+ SHRQ $0x38, SI // SI = hash of the bytes at CX
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI // SI = candidate position for CX
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeBlockAsm8B
+ INCL CX
+ JMP search_loop_encodeBlockAsm8B
+
+ // Emit everything from the literal start to the end of src as one literal.
+ // Abort with a 0 result unless dst cursor + 3 + remaining length is below
+ // the dst limit.
+emit_remainder_encodeBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = pointer to the first remaining byte
+ SUBL BX, SI // SI = remaining length (>= 1 here)
+ LEAL -1(SI), DX // DX (src base) is no longer needed from here on
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBlockAsm8B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBlockAsm8B
+
+two_bytes_emit_remainder_encodeBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBlockAsm8B
+ JMP memmove_long_emit_remainder_encodeBlockAsm8B
+
+one_byte_emit_remainder_encodeBlockAsm8B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBlockAsm8B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX
+
+ // genMemMoveShort, exact-size variant. The remainder ends exactly at the
+ // end of src, so lengths below 8 must not use an 8-byte load: that would
+ // read past the end of the source buffer. Every path below touches only
+ // bytes [0, BX) of the source and destination.
+ CMPQ BX, $0x03
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
+ JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
+
+ // 1 or 2 bytes: first and last byte (the same byte when BX == 1).
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
+ MOVB (CX), SI
+ MOVB -1(CX)(BX*1), CL
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+ // Exactly 3 bytes: one 16-bit move plus one byte.
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+ // 4..7 bytes: two overlapping 32-bit moves.
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm8B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBlockAsm8B
+
+memmove_long_emit_remainder_encodeBlockAsm8B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX
+
+ // genMemMoveLong: head and tail are loaded first and stored last; the loops
+ // copy the middle 32 bytes per step using aligned stores.
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+ // Result = number of bytes written = dst cursor - dst base.
+emit_literal_done_emit_remainder_encodeBlockAsm8B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm(SB), $327704-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000a00, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBetterBlockAsm:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x07, SI
+ CMPL SI, $0x63
+ JLE check_maxskip_ok_encodeBetterBlockAsm
+ LEAL 100(CX), SI
+ JMP check_maxskip_cont_encodeBetterBlockAsm
+
+check_maxskip_ok_encodeBetterBlockAsm:
+ LEAL 1(CX)(SI*1), SI
+
+check_maxskip_cont_encodeBetterBlockAsm:
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 262168(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 262168(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeBetterBlockAsm
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm
+
+candidateS_match_encodeBetterBlockAsm:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeBetterBlockAsm:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm
+
+match_extend_back_loop_encodeBetterBlockAsm:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm
+ JMP match_extend_back_loop_encodeBetterBlockAsm
+
+match_extend_back_end_encodeBetterBlockAsm:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 5(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBetterBlockAsm:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm
+
+matchlen_single_match_nolit_encodeBetterBlockAsm:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm
+
+match_nolit_end_encodeBetterBlockAsm:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm
+ CMPL R12, $0x01
+ JG match_length_ok_encodeBetterBlockAsm
+ CMPL R8, $0x0000ffff
+ JLE match_length_ok_encodeBetterBlockAsm
+ MOVL 20(SP), CX
+ INCL CX
+ JMP search_loop_encodeBetterBlockAsm
+
+match_length_ok_encodeBetterBlockAsm:
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_encodeBetterBlockAsm
+ CMPL SI, $0x01000000
+ JLT four_bytes_match_emit_encodeBetterBlockAsm
+ MOVB $0xfc, (AX)
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+four_bytes_match_emit_encodeBetterBlockAsm:
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+three_bytes_match_emit_encodeBetterBlockAsm:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+two_bytes_match_emit_encodeBetterBlockAsm:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeBetterBlockAsm
+ JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+one_byte_match_emit_encodeBetterBlockAsm:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm
+
+memmove_long_match_emit_encodeBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+ CMPL R8, $0x00010000
+ JL two_byte_offset_match_nolit_encodeBetterBlockAsm
+
+four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
+ CMPL R12, $0x40
+ JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm
+ MOVB $0xff, (AX)
+ MOVL R8, 1(AX)
+ LEAL -64(R12), R12
+ ADDQ $0x05, AX
+ CMPL R12, $0x04
+ JL four_bytes_remain_match_nolit_encodeBetterBlockAsm
+
+ // emitRepeat
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
+ CMPL R12, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
+ LEAL -16842747(R12), R12
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+ JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm
+
+four_bytes_remain_match_nolit_encodeBetterBlockAsm:
+ TESTL R12, R12
+ JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
+ MOVB $0x03, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+two_byte_offset_match_nolit_encodeBetterBlockAsm:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+
+ // emitRepeat
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ CMPL R12, $0x0100ffff
+ JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
+ LEAL -16842747(R12), R12
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+match_is_repeat_encodeBetterBlockAsm:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm
+ CMPL SI, $0x01000000
+ JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm
+ MOVB $0xfc, (AX)
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+four_bytes_match_emit_repeat_encodeBetterBlockAsm:
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+three_bytes_match_emit_repeat_encodeBetterBlockAsm:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitRepeat
+emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm
+ CMPL R12, $0x0100ffff
+ JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm
+ LEAL -16842747(R12), R12
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
+
+repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm:
+ MOVQ $0x00cf1bbcdcbfa563, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x32, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 262168(SP)(R11*4)
+ MOVL R15, 262168(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 262168(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeBetterBlockAsm
+
+emit_remainder_encodeBetterBlockAsm:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 5(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeBetterBlockAsm
+ CMPL DX, $0x01000000
+ JLT four_bytes_emit_remainder_encodeBetterBlockAsm
+ MOVB $0xfc, (AX)
+ MOVL DX, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+four_bytes_emit_remainder_encodeBetterBlockAsm:
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX)
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+three_bytes_emit_remainder_encodeBetterBlockAsm:
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+two_bytes_emit_remainder_encodeBetterBlockAsm:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBetterBlockAsm
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+one_byte_emit_remainder_encodeBetterBlockAsm:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4:
+ MOVL (CX), SI
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
+
+memmove_long_emit_remainder_encodeBetterBlockAsm:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000a00, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm4MB:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm4MB
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBetterBlockAsm4MB:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x07, SI
+ CMPL SI, $0x63
+ JLE check_maxskip_ok_encodeBetterBlockAsm4MB
+ LEAL 100(CX), SI
+ JMP check_maxskip_cont_encodeBetterBlockAsm4MB
+
+check_maxskip_ok_encodeBetterBlockAsm4MB:
+ LEAL 1(CX)(SI*1), SI
+
+check_maxskip_cont_encodeBetterBlockAsm4MB:
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm4MB
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 262168(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 262168(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm4MB
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeBetterBlockAsm4MB
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm4MB
+
+candidateS_match_encodeBetterBlockAsm4MB:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm4MB
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeBetterBlockAsm4MB:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm4MB
+
+match_extend_back_loop_encodeBetterBlockAsm4MB:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm4MB
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm4MB
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm4MB
+ JMP match_extend_back_loop_encodeBetterBlockAsm4MB
+
+match_extend_back_end_encodeBetterBlockAsm4MB:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 4(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBetterBlockAsm4MB:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB
+
+matchlen_single_match_nolit_encodeBetterBlockAsm4MB:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm4MB
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB
+
+match_nolit_end_encodeBetterBlockAsm4MB:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm4MB
+ CMPL R12, $0x01
+ JG match_length_ok_encodeBetterBlockAsm4MB
+ CMPL R8, $0x0000ffff
+ JLE match_length_ok_encodeBetterBlockAsm4MB
+ MOVL 20(SP), CX
+ INCL CX
+ JMP search_loop_encodeBetterBlockAsm4MB
+
+match_length_ok_encodeBetterBlockAsm4MB:
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm4MB
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm4MB
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_encodeBetterBlockAsm4MB
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+three_bytes_match_emit_encodeBetterBlockAsm4MB:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+two_bytes_match_emit_encodeBetterBlockAsm4MB:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeBetterBlockAsm4MB
+ JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+one_byte_match_emit_encodeBetterBlockAsm4MB:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
+
+memmove_long_match_emit_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+ CMPL R8, $0x00010000
+ JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
+
+four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
+ CMPL R12, $0x40
+ JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
+ MOVB $0xff, (AX)
+ MOVL R8, 1(AX)
+ LEAL -64(R12), R12
+ ADDQ $0x05, AX
+ CMPL R12, $0x04
+ JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+ JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB
+
+four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
+ TESTL R12, R12
+ JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+ MOVB $0x03, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+match_is_repeat_encodeBetterBlockAsm4MB:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
+ MOVL SI, R11
+ SHRL $0x10, R11
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
+ CMPL R12, $0x00010100
+ JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
+ LEAL -65536(R12), R12
+ MOVL R12, R8
+ MOVW $0x001d, (AX)
+ MOVW R12, 2(AX)
+ SARL $0x10, R8
+ MOVB R8, 4(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm4MB
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm4MB:
+ MOVQ $0x00cf1bbcdcbfa563, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x32, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 262168(SP)(R11*4)
+ MOVL R15, 262168(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 262168(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeBetterBlockAsm4MB
+
+emit_remainder_encodeBetterBlockAsm4MB:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 4(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm4MB
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm4MB:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX)
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBetterBlockAsm4MB
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+one_byte_emit_remainder_encodeBetterBlockAsm4MB:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4:
+ MOVL (CX), SI
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
+
+memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst as literal, copy and repeat ops; returns the number of bytes written to dst, or 0 when an output-limit check fails.
+TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 // frame: scalars at 0..23(SP), then 81920 bytes of hash tables starting at 24(SP)
+ MOVQ dst_base+0(FP), AX // AX = output cursor, starts at dst
+ MOVQ $0x00000280, CX // 0x280 iterations x 128 bytes = 81920 bytes to clear
+ LEAQ 24(SP), DX // DX = start of the table area
+ PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm12B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm12B
+ MOVL $0x00000000, 12(SP) // 12(SP) = src index where the pending (not yet emitted) literal run starts
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = len(src)-8, the search limit tested in the loops below
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = len(src) - 6 - len(src)/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst + DX, the output limit; paths that reach it return 0
+ MOVL $0x00000001, CX // CX = current src position, starting at 1
+ MOVL $0x00000000, 16(SP) // 16(SP) = offset of the previous match, initially 0
+ MOVQ src_base+24(FP), DX // DX = src base
+
+search_loop_encodeBetterBlockAsm12B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI
+ LEAL 1(CX)(SI*1), SI // SI = CX + 1 + (CX - 12(SP))>>6: the step grows with the length of the pending literal run
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm12B
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes at src[CX]
+ MOVL SI, 20(SP) // 20(SP) = position to try next if nothing matches here
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
+ MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10 // drop the top 2 bytes so only the low 6 bytes are hashed
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // R10 = 14-bit index into the table at 24(SP)
+ SHLQ $0x20, R11 // keep only the low 4 bytes
+ IMULQ SI, R11
+ SHRQ $0x34, R11 // R11 = 12-bit index into the table at 65560(SP)
+ MOVL 24(SP)(R10*4), SI // SI = candidate from the 6-byte-hash table
+ MOVL 65560(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
+ MOVL CX, 24(SP)(R10*4) // record the current position in both tables
+ MOVL CX, 65560(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // do 4 bytes at candidate SI equal the 4 bytes at CX?
+ JEQ candidate_match_encodeBetterBlockAsm12B
+ CMPL (DX)(R8*1), DI // same test for candidate R8
+ JEQ candidateS_match_encodeBetterBlockAsm12B
+ MOVL 20(SP), CX // no match: advance to the saved next position
+ JMP search_loop_encodeBetterBlockAsm12B
+
+candidateS_match_encodeBetterBlockAsm12B:
+ SHRQ $0x08, DI // DI = bytes starting at src[CX+1]
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // 6-byte hash of the bytes at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = 6-byte-hash candidate for CX+1
+ INCL CX
+ MOVL CX, 24(SP)(R10*4) // record CX+1 in the 6-byte-hash table
+ CMPL (DX)(SI*1), DI // does that candidate match at CX+1?
+ JEQ candidate_match_encodeBetterBlockAsm12B
+ DECL CX // no: go back to CX and use the 4-byte-hash candidate
+ MOVL R8, SI
+
+candidate_match_encodeBetterBlockAsm12B:
+ MOVL 12(SP), DI // DI = start of pending literals; backward extension stops there
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm12B
+
+match_extend_back_loop_encodeBetterBlockAsm12B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm12B
+ MOVB -1(DX)(SI*1), BL // compare the bytes just before candidate and current position
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm12B
+ LEAL -1(CX), CX // equal: move both positions back by one
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm12B
+ JMP match_extend_back_loop_encodeBetterBlockAsm12B
+
+match_extend_back_end_encodeBetterBlockAsm12B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI // DI = AX + 3 + pending literal length
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+ RET
+
+match_dst_size_check_encodeBetterBlockAsm12B:
+ MOVL CX, DI // DI = src index where the match starts
+ ADDL $0x04, CX // skip the 4 bytes already compared equal
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = bytes left in src from CX
+ LEAQ (DX)(CX*1), R9 // R9 = &src[CX]
+ LEAQ (DX)(SI*1), R10 // R10 = &src[SI]
+
+ // matchLen: R12 = number of further bytes that agree at R9 and R10, at most R8
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11 // nonzero bits mark differing positions in this 8-byte word
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // divided by 8 = count of equal bytes in this word
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm12B
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B
+
+matchlen_single_match_nolit_encodeBetterBlockAsm12B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B:
+ MOVB (R9)(R12*1), R11 // fewer than 8 bytes left: compare one byte at a time
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm12B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B
+
+match_nolit_end_encodeBetterBlockAsm12B:
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset (CX - SI)
+
+ // Check if repeat: the offset equals the previous match offset kept in 16(SP)
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm12B
+ MOVL R8, 16(SP) // remember this offset for the next match
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = source of the pending literals
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length-1, the value stored in the header
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm12B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm12B
+ MOVB $0xf4, (AX) // three-byte header: 0xf4 then 16-bit (length-1)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm12B
+
+two_bytes_match_emit_encodeBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // two-byte header: 0xf0 then 8-bit (length-1)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeBetterBlockAsm12B
+ JMP memmove_long_match_emit_encodeBetterBlockAsm12B
+
+one_byte_match_emit_encodeBetterBlockAsm12B:
+ SHLB $0x02, SI // one-byte header: (length-1)<<2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+ // genMemMoveShort: choose a copy routine by length R9
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
+ MOVL (R10), R11 // always moves 4 bytes, even when R9 is smaller; AX is reset to SI afterwards
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
+ MOVL (R10), R11 // first 4 and last 4 bytes; the two moves may overlap
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (R10), R11 // first 8 and last 8 bytes
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (R10), X0 // first 16 and last 16 bytes
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (R10), X0 // first 32 and last 32 bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
+ MOVQ SI, AX // advance the output cursor past the literals
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
+
+memmove_long_match_emit_encodeBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+ // genMemMoveLong: keep the first and last 32 bytes in X0-X3, copy the middle in 32-byte steps with aligned stores, then store X0-X3
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX & 31), so AX+R14-32 is 32-byte aligned
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF unchanged, so JA sees CF from the SUBQ above and ZF from DECQ - confirm intended
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4 // copy 32 bytes ending at offset R14
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // store the saved first and last 32 bytes with unaligned moves
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm12B:
+ ADDL R12, CX // CX = src position just past the match
+ ADDL $0x04, R12 // R12 = full match length, including the first 4 bytes
+ MOVL CX, 12(SP) // the next literal run starts after the match
+
+ // emitCopy: encode (offset R8, length R12) at AX
+two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
+ MOVB $0xee, (AX) // tag 0xee plus 16-bit offset; R12 is reduced by 60 for this op
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+
+ // emitRepeat: encode the remaining R12 bytes as a repeat
+ MOVL R12, SI
+ LEAL -4(R12), R12 // R12 = length-4; SI keeps the length
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
+ LEAL -256(R12), R12 // 4-byte form: word 0x0019 then 16-bit (length-260)
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+ LEAL -4(R12), R12 // 3-byte form: word 0x0015 then 8-bit (length-8)
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+ SHLL $0x02, R12 // 2-byte form: 16-bit word ((length-4)<<2)|1
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12 // R12 = ((length-4)<<2)|1
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SARL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8 and up go into bits 5 and up of the first byte
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B // unreachable: directly follows an unconditional JMP
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
+ CMPL R12, $0x0c // length >= 12 or offset >= 2048 takes the 3-byte copy
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12 // low byte = 1 + (length-4)*4; only the low byte is stored below
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = 2 + (length-1)*4; only the low byte is stored below
+ MOVB R12, (AX)
+ MOVW R8, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+match_is_repeat_encodeBetterBlockAsm12B:
+ MOVL 12(SP), SI // same offset as the previous match: emit pending literals, then a repeat instead of a copy
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = source of the pending literals
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length-1, the value stored in the header
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
+ MOVB $0xf4, (AX) // three-byte header: 0xf4 then 16-bit (length-1)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // two-byte header: 0xf0 then 8-bit (length-1)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm12B
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
+ SHLB $0x02, SI // one-byte header: (length-1)<<2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+ // genMemMoveShort: choose a copy routine by length R9
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
+ MOVL (R10), R11 // always moves 4 bytes, even when R9 is smaller; AX is reset to SI afterwards
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
+ MOVL (R10), R11 // first 4 and last 4 bytes; the two moves may overlap
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (R10), R11 // first 8 and last 8 bytes
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (R10), X0 // first 16 and last 16 bytes
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (R10), X0 // first 32 and last 32 bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
+ MOVQ SI, AX // advance the output cursor past the literals
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = output cursor after the literal bytes
+
+ // genMemMoveLong: keep the first and last 32 bytes in X0-X3, copy the middle in 32-byte steps with aligned stores, then store X0-X3
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX & 31), so AX+R14-32 is 32-byte aligned
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF unchanged, so JA sees CF from the SUBQ above and ZF from DECQ - confirm intended
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4 // copy 32 bytes ending at offset R14
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // store the saved first and last 32 bytes with unaligned moves
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
+ ADDL R12, CX // CX = src position just past the match
+ ADDL $0x04, R12 // R12 = full match length, including the first 4 bytes
+ MOVL CX, 12(SP) // the next literal run starts after the match
+
+ // emitRepeat: encode R12 bytes as a repeat
+ MOVL R12, SI
+ LEAL -4(R12), R12 // R12 = length-4; SI keeps the length
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
+ LEAL -256(R12), R12 // 4-byte form: word 0x0019 then 16-bit (length-260)
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
+ LEAL -4(R12), R12 // 3-byte form: word 0x0015 then 8-bit (length-8)
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
+ SHLL $0x02, R12 // 2-byte form: 16-bit word ((length-4)<<2)|1
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12 // R12 = ((length-4)<<2)|1
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SARL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8 and up go into bits 5 and up of the first byte
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
+ CMPL CX, 8(SP) // at or past the search limit: emit the rest as literals
+ JGE emit_remainder_encodeBetterBlockAsm12B
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // output cursor reached the limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm12B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI // 6-byte hash multiplier
+ MOVQ $0x9e3779b1, R8 // 4-byte hash multiplier
+ INCL DI // DI = match start + 1
+ MOVQ (DX)(DI*1), R9 // R9 = 8 bytes at src[DI]
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // bytes at DI+1
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // bytes at DI+2
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9 // R9 = 8 bytes at src[CX-2]
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x32, R10 // 6-byte hash at DI
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x32, R13 // 6-byte hash at DI+1
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x34, R11 // 4-byte hash at DI+1
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x34, R12 // 4-byte hash at DI+2
+ MOVL DI, 24(SP)(R10*4) // index DI and DI+1 in the 6-byte table, DI+1 and DI+2 in the 4-byte table
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 65560(SP)(R11*4)
+ MOVL R15, 65560(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // bytes at CX-1
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x32, R10 // 6-byte hash at CX-2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x34, R11 // 4-byte hash at CX-1
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x32, R13 // 6-byte hash at CX-1
+ MOVL R9, 24(SP)(R10*4) // index CX-2 and CX-1 in the 6-byte table, CX-1 in the 4-byte table
+ MOVL DI, 65560(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeBetterBlockAsm12B
+
+emit_remainder_encodeBetterBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX // CX = AX + 3 + remaining literal length
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // would reach the output limit: return 0
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX // nothing pending when 12(SP) already equals len(src)
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = source of the remaining literals
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length-1; the src base in DX is overwritten here
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm12B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B
+ MOVB $0xf4, (AX) // three-byte header: 0xf4 then 16-bit (length-1)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
+
+two_bytes_emit_remainder_encodeBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // two-byte header: 0xf0 then 8-bit (length-1)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBetterBlockAsm12B
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
+
+one_byte_emit_remainder_encodeBetterBlockAsm12B:
+ SHLB $0x02, DL // one-byte header: (length-1)<<2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm12B:
+ LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: choose a copy routine by length BX
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4:
+ MOVL (CX), SI // always moves 4 bytes, even when BX is smaller; AX is reset to DX afterwards
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
+ MOVL (CX), SI // first 4 and last 4 bytes; the two moves may overlap
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (CX), SI // first 8 and last 8 bytes
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (CX), X0 // first 16 and last 16 bytes
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (CX), X0 // first 32 and last 32 bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
+ MOVQ DX, AX // advance the output cursor past the literals
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
+
+memmove_long_emit_remainder_encodeBetterBlockAsm12B:
+ LEAQ (AX)(SI*1), DX // DX = output cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: keep the first and last 32 bytes in X0-X3, copy the middle in 32-byte steps with aligned stores, then store X0-X3
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX & 31), so AX+R8-32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF unchanged, so JA sees CF from the SUBQ above and ZF from DECQ - confirm intended
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4 // copy 32 bytes ending at offset R8
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // store the saved first and last 32 bytes with unaligned moves
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // return value = output cursor - dst base = bytes written
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2. Encodes src into dst as literal and copy/repeat records; returns the number of bytes written, or 0 when the output would reach the dst limit computed at entry.
+TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 // frame: 24 bytes of state, a 16384-byte table at 24(SP) (12-bit index) and a 4096-byte table at 16408(SP) (10-bit index)
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor for the whole function
+ MOVQ $0x000000a0, CX // 160 iterations x 128 bytes = 20480 bytes: both tables
+ LEAQ 24(SP), DX // DX = start of the table area
+ PXOR X0, X0 // X0 = 0, used as the fill value
+
+zero_loop_encodeBetterBlockAsm10B: // clear 128 bytes of table per iteration
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm10B
+ MOVL $0x00000000, 12(SP) // 12(SP) = src offset of the first byte not yet emitted ("nextEmit")
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX // DX = len(src) - 6
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = len(src) - 8: search positions must stay below this
+ SHRQ $0x05, CX // CX = len(src) / 32
+ SUBL CX, DX // DX = len(src) - 6 - len(src)/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst pointer limit; reaching it makes the function return 0
+ MOVL $0x00000001, CX // CX = current src offset s, starting at 1
+ MOVL $0x00000000, 16(SP) // 16(SP) = offset of the previous match (for repeat detection), initially 0
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeBetterBlockAsm10B: // probe both tables at s; on a miss advance s by a growing step
+ MOVL CX, SI
+ SUBL 12(SP), SI // SI = s - nextEmit
+ SHRL $0x05, SI // step grows by one for every 32 bytes without a match
+ LEAL 1(CX)(SI*1), SI // SI = next s = s + 1 + ((s - nextEmit) >> 5)
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm10B // next s would pass the search limit: flush the tail as literals
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes at src[s]
+ MOVL SI, 20(SP) // 20(SP) = saved next s
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
+ MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10 // discard the top 2 bytes: hash covers the first 6 bytes at s
+ IMULQ R9, R10
+ SHRQ $0x34, R10 // keep the top 12 bits -> index into the table at 24(SP)
+ SHLQ $0x20, R11 // discard the top 4 bytes: hash covers the first 4 bytes at s
+ IMULQ SI, R11
+ SHRQ $0x36, R11 // keep the top 10 bits -> index into the table at 16408(SP)
+ MOVL 24(SP)(R10*4), SI // SI = candidate from the 6-byte-hash table
+ MOVL 16408(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
+ MOVL CX, 24(SP)(R10*4) // record s in both tables
+ MOVL CX, 16408(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // 4 bytes at the first candidate vs. 4 bytes at s
+ JEQ candidate_match_encodeBetterBlockAsm10B
+ CMPL (DX)(R8*1), DI // 4 bytes at the second candidate vs. 4 bytes at s
+ JEQ candidateS_match_encodeBetterBlockAsm10B
+ MOVL 20(SP), CX // no match: s = saved next s
+ JMP search_loop_encodeBetterBlockAsm10B
+
+candidateS_match_encodeBetterBlockAsm10B: // 4-byte-table hit: first check whether s+1 has a 6-byte-table match
+ SHRQ $0x08, DI // DI = bytes starting at src[s+1]
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10 // 6-byte hash of src[s+1:]
+ MOVL 24(SP)(R10*4), SI // SI = 6-byte-table candidate for s+1
+ INCL CX // tentatively move to s+1
+ MOVL CX, 24(SP)(R10*4) // record s+1 in the 6-byte-hash table
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm10B // s+1 matches its candidate: use that pair
+ DECL CX // otherwise go back to s
+ MOVL R8, SI // and use the 4-byte-table candidate
+
+candidate_match_encodeBetterBlockAsm10B: // CX = s, SI = candidate; extend the match backwards
+ MOVL 12(SP), DI // DI = nextEmit, the lower bound for s
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm10B // candidate at offset 0 cannot move back
+
+match_extend_back_loop_encodeBetterBlockAsm10B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm10B // s reached nextEmit
+ MOVB -1(DX)(SI*1), BL // byte before the candidate
+ MOVB -1(DX)(CX*1), R8 // byte before s
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm10B
+ LEAL -1(CX), CX // both positions step back one byte
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm10B // candidate reached offset 0
+ JMP match_extend_back_loop_encodeBetterBlockAsm10B
+
+match_extend_back_end_encodeBetterBlockAsm10B: // bail out if the pending literals would not fit in dst
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = number of pending literal bytes
+ LEAQ 3(AX)(DI*1), DI // dst cursor + literal bytes + 3
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+ RET
+
+match_dst_size_check_encodeBetterBlockAsm10B:
+ MOVL CX, DI // DI = match start in src (kept until the table update after the match)
+ ADDL $0x04, CX // skip the 4 bytes already compared equal
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = bytes left in src after s
+ LEAQ (DX)(CX*1), R9 // R9 = &src[s]
+ LEAQ (DX)(SI*1), R10 // R10 = &src[candidate]
+
+ // matchLen: R12 = number of equal bytes at R9 and R10, bounded by R8
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm10B // fewer than 8 bytes left: compare bytewise
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: // compare 8 bytes at a time
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11 // set bits mark differing positions
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // divided by 8 = count of equal leading bytes
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm10B
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm10B: // all 8 bytes equal: advance
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
+
+matchlen_single_match_nolit_encodeBetterBlockAsm10B: // tail of fewer than 8 bytes
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm10B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B
+
+match_nolit_end_encodeBetterBlockAsm10B: // R12 = matched bytes beyond the first 4
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset = s - candidate
+
+ // Check if repeat: same offset as the previous match stored at 16(SP)
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm10B
+ MOVL R8, 16(SP) // remember the new offset
+ MOVL 12(SP), SI // SI = nextEmit
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B // no literals pending before the match
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = source of the pending literals
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = literal length - 1
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm10B // length-1 < 60: 1-byte header
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm10B // length-1 < 256: 2-byte header
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit (length-1)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm10B
+
+two_bytes_match_emit_encodeBetterBlockAsm10B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit (length-1)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeBetterBlockAsm10B // length <= 64: short copy
+ JMP memmove_long_match_emit_encodeBetterBlockAsm10B
+
+one_byte_match_emit_encodeBetterBlockAsm10B:
+ SHLB $0x02, SI // 1-byte header: (length-1) << 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveShort: copy R9 (<= 64) bytes from R10 to AX using overlapping head/tail moves
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: // length <= 4: always moves 4 bytes; the cursor still advances by R9 only
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: // first 4 + last 4 bytes (may overlap)
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: // first 8 + last 8 bytes (may overlap)
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: // first 16 + last 16 bytes (may overlap)
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: // first 32 + last 32 bytes (may overlap)
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
+ MOVQ SI, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
+
+memmove_long_match_emit_encodeBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveLong: copy R9 (>= 65 on this path) bytes from R10 to AX
+ MOVOU (R10), X0 // preload the first 32 and last 32 source bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = dst & 31
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (dst & 31), so AX+R14-32 is 32-byte aligned
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): length >= 65 gives R13 >= 1 here and the SUBQ above leaves CF clear, so this looks always taken — confirm
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back // NOTE(review): loops on CF or ZF set after DECQ — verify the intended condition against the generator
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: // 32 bytes per iteration, unaligned loads, aligned stores
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // continue while the next 32-byte chunk ends within the length
+ MOVOU X0, (AX) // finally store the preloaded head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX // advance the dst cursor past the literals
+
+emit_literal_done_match_emit_encodeBetterBlockAsm10B:
+ ADDL R12, CX // s = end of the match
+ ADDL $0x04, R12 // R12 = total match length (including the first 4 bytes)
+ MOVL CX, 12(SP) // nextEmit = s
+
+ // emitCopy: R12 = length, R8 = offset
+two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B // length <= 64: single copy record
+ MOVB $0xee, (AX) // 0xee = (59<<2)|2: same layout as the 3-byte copy below with length 60
+ MOVW R8, 1(AX) // 16-bit offset
+ LEAL -60(R12), R12 // 60 bytes covered; the rest is emitted as a repeat
+ ADDQ $0x03, AX
+
+ // emitRepeat: R12 = remaining length, R8 = offset
+ MOVL R12, SI // SI = length
+ LEAL -4(R12), R12 // R12 = length - 4
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short // length <= 8
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short // length >= 12
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short // length 9..11 and offset < 2048
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short // length-4 < 260: 3-byte form
+ LEAL -256(R12), R12 // R12 = length - 260
+ MOVW $0x0019, (AX) // 4-byte form: bytes 0x19, 0x00, then 16-bit (length - 260)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+ LEAL -4(R12), R12 // R12 = length - 8
+ MOVW $0x0015, (AX) // 3-byte form: bytes 0x15, 0x00, then 8-bit (length - 8)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+ SHLL $0x02, R12
+ ORL $0x01, R12 // ((length-4) << 2) | 1
+ MOVW R12, (AX) // 2-byte form, written as one 16-bit store
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12 // ((length-4) << 2) | 1
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12 // offset bits 8 and up go into bits 5 and up of the first byte
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B // unreachable: directly follows an unconditional JMP
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B // length >= 12: 3-byte copy
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B // offset >= 2048: 3-byte copy
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12 // low byte = 1 + ((length-4) << 2); only the low byte is stored below
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12 // offset bits 8..10 go into bits 5..7 of the first byte
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = 2 + ((length-1) << 2); only the low byte is stored below
+ MOVB R12, (AX)
+ MOVW R8, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+match_is_repeat_encodeBetterBlockAsm10B: // same offset as the previous match: emit pending literals, then a repeat record
+ MOVL 12(SP), SI // SI = nextEmit
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B // no literals pending
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = source of the pending literals
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = literal length - 1
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit (length-1)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit (length-1)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm10B // length <= 64: short copy
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
+ SHLB $0x02, SI // 1-byte header: (length-1) << 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveShort: copy R9 (<= 64) bytes from R10 to AX using overlapping head/tail moves
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: // length <= 4: always moves 4 bytes
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: // first 4 + last 4 bytes
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: // first 8 + last 8 bytes
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: // first 16 + last 16 bytes
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: // first 32 + last 32 bytes
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
+ MOVQ SI, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveLong: copy R9 (>= 65 on this path) bytes from R10 to AX
+ MOVOU (R10), X0 // preload the first 32 and last 32 source bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = dst & 31
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (dst & 31), so AX+R14-32 is 32-byte aligned
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): looks always taken for length >= 65 — confirm
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back // NOTE(review): loops on CF or ZF set after DECQ — verify the intended condition
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: // 32 bytes per iteration, unaligned loads, aligned stores
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the preloaded head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX // advance the dst cursor past the literals
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
+ ADDL R12, CX // s = end of the match
+ ADDL $0x04, R12 // R12 = total match length (including the first 4 bytes)
+ MOVL CX, 12(SP) // nextEmit = s
+
+ // emitRepeat: R12 = length, R8 = offset (equal to the previous match offset)
+ MOVL R12, SI // SI = length
+ LEAL -4(R12), R12 // R12 = length - 4
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B // length <= 8
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B // length >= 12
+ CMPL R8, $0x00000800
+ JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B // length 9..11 and offset < 2048
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B // length-4 < 260: 3-byte form
+ LEAL -256(R12), R12 // R12 = length - 260
+ MOVW $0x0019, (AX) // 4-byte form: bytes 0x19, 0x00, then 16-bit (length - 260)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
+ LEAL -4(R12), R12 // R12 = length - 8
+ MOVW $0x0015, (AX) // 3-byte form: bytes 0x15, 0x00, then 8-bit (length - 8)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
+ SHLL $0x02, R12
+ ORL $0x01, R12 // ((length-4) << 2) | 1
+ MOVW R12, (AX) // 2-byte form, written as one 16-bit store
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12 // ((length-4) << 2) | 1
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12 // offset bits 8 and up go into bits 5 and up of the first byte
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm10B: // match emitted: check limits, then index positions around the match
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm10B // s reached the search limit
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+ RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm10B: // DI = match start, CX = s (match end)
+ MOVQ $0x0000cf1bbcdcbf9b, SI // multiplier for the 6-byte hash
+ MOVQ $0x9e3779b1, R8 // multiplier for the 4-byte hash
+ INCL DI // DI = match start + 1
+ MOVQ (DX)(DI*1), R9 // R9 = 8 bytes at src[start+1]
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // R11 = bytes from src[start+2]
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // R12 = bytes from src[start+3]
+ LEAL 1(DI), R14 // R14 = start + 2
+ LEAL 2(DI), R15 // R15 = start + 3
+ MOVQ -2(DX)(CX*1), R9 // R9 = 8 bytes at src[s-2]
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x34, R10 // R10 = 6-byte hash at start+1
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x34, R13 // R13 = 6-byte hash at start+2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x36, R11 // R11 = 4-byte hash at start+2
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x36, R12 // R12 = 4-byte hash at start+3
+ MOVL DI, 24(SP)(R10*4) // 6-byte table <- start+1
+ MOVL R14, 24(SP)(R13*4) // 6-byte table <- start+2
+ MOVL R14, 16408(SP)(R11*4) // 4-byte table <- start+2
+ MOVL R15, 16408(SP)(R12*4) // 4-byte table <- start+3
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // R11 = bytes from src[s-1]
+ MOVQ R11, R13
+ LEAL -2(CX), R9 // R9 = s - 2
+ LEAL -1(CX), DI // DI = s - 1
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x34, R10 // R10 = 6-byte hash at s-2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x36, R11 // R11 = 4-byte hash at s-1
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x34, R13 // R13 = 6-byte hash at s-1
+ MOVL R9, 24(SP)(R10*4) // 6-byte table <- s-2
+ MOVL DI, 16408(SP)(R11*4) // 4-byte table <- s-1
+ MOVL DI, 24(SP)(R13*4) // 6-byte table <- s-1
+ JMP search_loop_encodeBetterBlockAsm10B
+
+emit_remainder_encodeBetterBlockAsm10B: // emit everything from nextEmit to the end of src as literals
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = number of remaining literal bytes
+ LEAQ 3(AX)(CX*1), CX // dst cursor + literal bytes + 3
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX // BX = nextEmit
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = source of the remaining literals
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = literal length - 1 (the src base in DX is not used after this point)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm10B // length-1 < 60: 1-byte header
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B // length-1 < 256: 2-byte header
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit (length-1)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
+
+two_bytes_emit_remainder_encodeBetterBlockAsm10B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit (length-1)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBetterBlockAsm10B // length <= 64: short copy
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
+
+one_byte_emit_remainder_encodeBetterBlockAsm10B:
+ SHLB $0x02, DL // 1-byte header: (length-1) << 2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm10B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: copy BX (<= 64) bytes from CX to AX using overlapping head/tail moves
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: // length <= 4: always moves 4 bytes
+ MOVL (CX), SI
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: // first 4 + last 4 bytes
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: // first 8 + last 8 bytes
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: // first 16 + last 16 bytes
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: // first 32 + last 32 bytes
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
+ MOVQ DX, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
+
+memmove_long_emit_remainder_encodeBetterBlockAsm10B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: copy BX (>= 65 on this path) bytes from CX to AX
+ MOVOU (CX), X0 // preload the first 32 and last 32 source bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = dst & 31
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (dst & 31), so AX+R8-32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): looks always taken for length >= 65 — confirm
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back // NOTE(review): loops on CF or ZF set after DECQ — verify the intended condition
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: // 32 bytes per iteration, unaligned loads, aligned stores
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the preloaded head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance the dst cursor past the literals
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = bytes written = dst cursor - dst base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000028, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm8B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeBetterBlockAsm8B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -6(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeBetterBlockAsm8B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x04, SI
+ LEAL 1(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm8B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x38, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 4120(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 4120(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm8B
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeBetterBlockAsm8B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm8B
+
+candidateS_match_encodeBetterBlockAsm8B:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeBetterBlockAsm8B
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeBetterBlockAsm8B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeBetterBlockAsm8B
+
+match_extend_back_loop_encodeBetterBlockAsm8B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeBetterBlockAsm8B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeBetterBlockAsm8B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeBetterBlockAsm8B
+ JMP match_extend_back_loop_encodeBetterBlockAsm8B
+
+match_extend_back_end_encodeBetterBlockAsm8B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeBetterBlockAsm8B:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeBetterBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm8B
+
+matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B
+
+matchlen_single_match_nolit_encodeBetterBlockAsm8B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeBetterBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeBetterBlockAsm8B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B
+
+match_nolit_end_encodeBetterBlockAsm8B:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ CMPL 16(SP), R8
+ JEQ match_is_repeat_encodeBetterBlockAsm8B
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeBetterBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeBetterBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeBetterBlockAsm8B
+
+two_bytes_match_emit_encodeBetterBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeBetterBlockAsm8B
+ JMP memmove_long_match_emit_encodeBetterBlockAsm8B
+
+one_byte_match_emit_encodeBetterBlockAsm8B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeBetterBlockAsm8B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x04
+ JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
+ CMPQ R9, $0x08
+ JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
+ MOVL (R10), R11
+ MOVL R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
+ MOVL (R10), R11
+ MOVL -4(R10)(R9*1), R10
+ MOVL R11, (AX)
+ MOVL R10, -4(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B
+
+memmove_long_match_emit_encodeBetterBlockAsm8B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm8B:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+ JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+match_is_repeat_encodeBetterBlockAsm8B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
+ MOVL DI, R8
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R9
+ SUBL SI, R8
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_repeat_encodeBetterBlockAsm8B
+ JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm8B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveShort
+ CMPQ R8, $0x04
+ JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
+ CMPQ R8, $0x08
+ JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
+ MOVL (R9), R10
+ MOVL R10, (AX)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveLong
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
+ JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitRepeat
+ MOVL R12, SI
+ LEAL -4(R12), R12
+ CMPL SI, $0x08
+ JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
+ CMPL R12, $0x00000104
+ JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
+ LEAL -256(R12), R12
+ MOVW $0x0019, (AX)
+ MOVW R12, 2(AX)
+ ADDQ $0x04, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
+ LEAL -4(R12), R12
+ MOVW $0x0015, (AX)
+ MOVB R12, 2(AX)
+ ADDQ $0x03, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
+ SHLL $0x02, R12
+ ORL $0x01, R12
+ MOVW R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
+ XORQ SI, SI
+ LEAL 1(SI)(R12*4), R12
+ MOVB R8, 1(AX)
+ SARL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeBetterBlockAsm8B
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm8B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x36, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x38, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x38, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 4120(SP)(R11*4)
+ MOVL R15, 4120(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x38, R11
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x36, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 4120(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeBetterBlockAsm8B
+
+emit_remainder_encodeBetterBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeBetterBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeBetterBlockAsm8B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
+
+two_bytes_emit_remainder_encodeBetterBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeBetterBlockAsm8B
+ JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
+
+one_byte_emit_remainder_encodeBetterBlockAsm8B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeBetterBlockAsm8B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x04
+ JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4
+ CMPQ BX, $0x08
+ JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4:
+ MOVL (CX), SI
+ MOVL SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
+ MOVL (CX), SI
+ MOVL -4(CX)(BX*1), CX
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
+
+memmove_long_emit_remainder_encodeBetterBlockAsm8B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm(SB), $65560-56 // encodes src into dst as literal and copy elements; ret = bytes written to dst, or 0 when the output limit computed below would be exceeded
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor (kept for the whole function)
+ MOVQ $0x00000200, CX // 512 iterations * 128 bytes = 64 KiB to clear
+ LEAQ 24(SP), DX // DX = start of the hash table, which lives at 24(SP)..65560(SP)
+ PXOR X0, X0 // X0 = all zero
+
+zero_loop_encodeSnappyBlockAsm: // zero the table: 16384 entries of 4 bytes each
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm
+ MOVL $0x00000000, 12(SP) // 12(SP) = index of first src byte not yet emitted ("nextEmit") = 0
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX // DX = len(src) - 9
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = search limit = len(src) - 8 (the loop does 8-byte loads at s)
+ SHRQ $0x05, CX // CX = len(src) / 32
+ SUBL CX, DX // DX = len(src) - 9 - len(src)/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // 0(SP) = dst limit pointer; reaching it makes the function return 0
+ MOVL $0x00000001, CX // CX = current src position s, starting at 1
+ MOVL CX, 16(SP) // 16(SP) = current repeat offset, initially 1
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBlockAsm: // main loop: hash bytes at s, s+1, s+2 and probe the table
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI
+ LEAL 4(CX)(SI*1), SI // SI = s + 4 + ((s - nextEmit) >> 6): skip grows with distance since last emit
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm // next position would pass the search limit: flush the tail
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes of src at s
+ MOVL SI, 20(SP) // 20(SP) = next position to try if nothing matches here
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // hash multiplier
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11 // R11 = bytes starting at s+1
+ SHLQ $0x10, R10 // drop the top 2 bytes so only 6 bytes feed the hash
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // keep the top 14 bits: R10 = table index for s
+ SHLQ $0x10, R11
+ IMULQ R9, R11
+ SHRQ $0x32, R11 // R11 = table index for s+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for s
+ MOVL 24(SP)(R11*4), R8 // R8 = candidate position for s+1
+ MOVL CX, 24(SP)(R10*4) // table[hash(s)] = s
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4) // table[hash(s+1)] = s+1
+ MOVQ DI, R10
+ SHRQ $0x10, R10 // bytes starting at s+2
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // R10 = table index for s+2 (used after the repeat check)
+ MOVL CX, R9
+ SUBL 16(SP), R9 // R9 = s - repeat offset
+ MOVL 1(DX)(R9*1), R11 // R11 = 4 bytes at src[s + 1 - repeat]
+ MOVQ DI, R9
+ SHRQ $0x08, R9 // R9 = bytes at s+1
+ CMPL R9, R11 // do 4 bytes at s+1 match the previous offset?
+ JNE no_repeat_found_encodeSnappyBlockAsm
+ LEAL 1(CX), DI // DI = s+1 = start of the repeat match
+ MOVL 12(SP), SI // SI = nextEmit (lower bound for backwards extension)
+ MOVL DI, R8
+ SUBL 16(SP), R8 // R8 = position of the matched earlier data
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm // match source is at index 0: cannot extend backwards
+
+repeat_extend_back_loop_encodeSnappyBlockAsm: // grow the match backwards while the preceding bytes agree
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm // do not go below nextEmit
+ MOVB -1(DX)(R8*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm
+ LEAL -1(DI), DI
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm
+
+repeat_extend_back_end_encodeSnappyBlockAsm: // emit src[nextEmit:DI] as a literal before the repeat copy
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm // nothing pending: no literal needed
+ MOVL DI, R8
+ MOVL DI, 12(SP) // nextEmit = DI
+ LEAQ (DX)(SI*1), R9 // R9 = pointer to the literal bytes in src
+ SUBL SI, R8 // R8 = literal length
+ LEAL -1(R8), SI // SI = length - 1, the value stored in the literal header
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm // length-1 < 60: fits in the tag byte
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm
+ CMPL SI, $0x00010000
+ JLT three_bytes_repeat_emit_encodeSnappyBlockAsm
+ CMPL SI, $0x01000000
+ JLT four_bytes_repeat_emit_encodeSnappyBlockAsm
+ MOVB $0xfc, (AX) // tag 0xfc followed by a 4-byte length-1
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+four_bytes_repeat_emit_encodeSnappyBlockAsm: // tag 0xf8 followed by a 3-byte length-1
+ MOVL SI, R10
+ SHRL $0x10, R10
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R10, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+three_bytes_repeat_emit_encodeSnappyBlockAsm: // tag 0xf4 followed by a 2-byte length-1
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm: // tag 0xf0 followed by a 1-byte length-1
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm // at most 64 bytes: use the short copy
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
+
+one_byte_repeat_emit_encodeSnappyBlockAsm: // single tag byte = (length-1) << 2
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm: // short copy of R8 (<= 64) bytes from (R9) to (AX)
+ LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal
+
+ // genMemMoveShort: pick a fixed-size move sequence by length; overlapping head/tail moves cover the range
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: // always moves a full 8 bytes even when R8 < 8; the cursor still advances by R8 only
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
+ MOVQ SI, AX // advance dst cursor past the literal
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm: // long copy of R8 (> 64) bytes from (R9) to (AX)
+ LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal
+
+ // genMemMoveLong: first and last 32 bytes are loaded up front and stored last; the middle is copied in 32-byte chunks
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11 // R11 = number of 32-byte chunks
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (AX & 31), so -32(AX)(R12) is 32-byte aligned for the MOVOA stores
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF as set by SUBQ (no borrow), so this looks taken whenever R11 != 0 — confirm against the generator
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: // unaligned loads, aligned stores, 32 bytes per iteration
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // continue while offset <= length
+ MOVOU X0, (AX) // finally store the saved head and tail with unaligned moves
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX // advance dst cursor past the literal
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm: // extend the repeat match forward
+ ADDL $0x05, CX // s += 5: the match began at s+1 and 4 bytes are already verified
+ MOVL CX, SI
+ SUBL 16(SP), SI // SI = s - repeat offset
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = bytes left in src from s
+ LEAQ (DX)(CX*1), R9 // R9 = &src[s]
+ LEAQ (DX)(SI*1), SI // SI = &src[s - repeat]
+
+ // matchLen: R11 = number of equal bytes at (R9) and (SI), at most R8
+ XORL R11, R11
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: // compare 8 bytes at a time
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm
+ BSFQ R10, R10 // index of the lowest differing bit
+ SARQ $0x03, R10 // / 8 = number of equal leading bytes
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm:
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm: // fewer than 8 bytes left: compare byte by byte
+ MOVB (R9)(R11*1), R10
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm
+ LEAL 1(R11), R11
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm
+
+repeat_extend_forward_end_encodeSnappyBlockAsm:
+ ADDL R11, CX // s = end of the match
+ MOVL CX, SI
+ SUBL DI, SI // SI = match length (DI = match start after backwards extension)
+ MOVL 16(SP), DI // DI = offset to encode
+
+ // emitCopy: encode (offset DI, length SI)
+ CMPL DI, $0x00010000
+ JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm // offset fits in 16 bits
+
+four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: // 32-bit offset form, 5 bytes per element
+ CMPL SI, $0x40
+ JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
+ MOVB $0xff, (AX) // tag 0xff + 4-byte offset covers 64 bytes of the match
+ MOVL DI, 1(AX)
+ LEAL -64(SI), SI
+ ADDQ $0x05, AX
+ CMPL SI, $0x04
+ JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
+ JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
+
+four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
+ TESTL SI, SI
+ JZ repeat_end_emit_encodeSnappyBlockAsm // nothing left to encode
+ MOVB $0x03, BL
+ LEAL -4(BX)(SI*4), SI // low byte = 3 + ((length-1) << 2); only the low byte is stored
+ MOVB SI, (AX)
+ MOVL DI, 1(AX)
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm
+
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: // 16-bit offset form
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
+ MOVB $0xee, (AX) // tag 0xee + 2-byte offset covers 60 bytes of the match
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm // length >= 12 needs the 3-byte form
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI // low byte = 1 + ((length-4) << 2)
+ MOVB DI, 1(AX) // second byte = low 8 bits of the offset
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI // high offset bits go into bits 5..7 of the tag
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: // 3-byte form: tag + 16-bit offset
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI // low byte = 2 + ((length-1) << 2)
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm:
+ MOVL CX, 12(SP) // nextEmit = s
+ JMP search_loop_encodeSnappyBlockAsm
+
+no_repeat_found_encodeSnappyBlockAsm: // try the table candidates for s, s+1 and s+2 in turn
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate for s vs 4 bytes at s
+ JEQ candidate_match_encodeSnappyBlockAsm
+ SHRQ $0x08, DI // DI = bytes at s+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position for s+2
+ LEAL 2(CX), R9 // R9 = s+2
+ CMPL (DX)(R8*1), DI // 4 bytes at the candidate for s+1 vs 4 bytes at s+1
+ JEQ candidate2_match_encodeSnappyBlockAsm
+ MOVL R9, 24(SP)(R10*4) // table[hash(s+2)] = s+2
+ SHRQ $0x08, DI // DI = bytes at s+2
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate for s+2 vs 4 bytes at s+2
+ JEQ candidate3_match_encodeSnappyBlockAsm
+ MOVL 20(SP), CX // no match: s = next position computed at loop top
+ JMP search_loop_encodeSnappyBlockAsm
+
+candidate3_match_encodeSnappyBlockAsm: // match found at s+2
+ ADDL $0x02, CX
+ JMP candidate_match_encodeSnappyBlockAsm
+
+candidate2_match_encodeSnappyBlockAsm: // match found at s+1
+ MOVL R9, 24(SP)(R10*4) // still record table[hash(s+2)] = s+2
+ INCL CX
+ MOVL R8, SI // SI = the matching candidate
+
+candidate_match_encodeSnappyBlockAsm: // here CX = match position s, SI = earlier position holding the same 4 bytes
+ MOVL 12(SP), DI // DI = nextEmit
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm // candidate at index 0: cannot extend backwards
+
+match_extend_back_loop_encodeSnappyBlockAsm: // grow the match backwards while the preceding bytes agree
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm // do not go below nextEmit
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBlockAsm
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm
+ JMP match_extend_back_loop_encodeSnappyBlockAsm
+
+match_extend_back_end_encodeSnappyBlockAsm: // check output space for the pending literal
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = pending literal length
+ LEAQ 5(AX)(DI*1), DI // dst cursor + literal + 5 bytes of headroom
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // over the dst limit: return 0 (presumably the caller then stores the block uncompressed — verify against caller)
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm: // emit src[nextEmit:s] as a literal
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm // nothing pending
+ MOVL DI, R9
+ MOVL DI, 12(SP) // nextEmit = s
+ LEAQ (DX)(R8*1), DI // DI = pointer to the literal bytes in src
+ SUBL R8, R9 // R9 = literal length
+ LEAL -1(R9), R8 // R8 = length - 1
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm
+ CMPL R8, $0x00010000
+ JLT three_bytes_match_emit_encodeSnappyBlockAsm
+ CMPL R8, $0x01000000
+ JLT four_bytes_match_emit_encodeSnappyBlockAsm
+ MOVB $0xfc, (AX) // tag 0xfc followed by a 4-byte length-1
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm
+
+four_bytes_match_emit_encodeSnappyBlockAsm: // tag 0xf8 followed by a 3-byte length-1
+ MOVL R8, R10
+ SHRL $0x10, R10
+ MOVB $0xf8, (AX)
+ MOVW R8, 1(AX)
+ MOVB R10, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm
+
+three_bytes_match_emit_encodeSnappyBlockAsm: // tag 0xf4 followed by a 2-byte length-1
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm
+
+two_bytes_match_emit_encodeSnappyBlockAsm: // tag 0xf0 followed by a 1-byte length-1
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm // at most 64 bytes: use the short copy
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm
+
+one_byte_match_emit_encodeSnappyBlockAsm: // single tag byte = (length-1) << 2
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm: // short copy of R9 (<= 64) bytes from (DI) to (AX)
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal
+
+ // genMemMoveShort: pick a fixed-size move sequence by length; overlapping head/tail moves cover the range
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: // always moves a full 8 bytes even when R9 < 8; the cursor still advances by R9 only
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm:
+ MOVQ R8, AX // advance dst cursor past the literal
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
+
+memmove_long_match_emit_encodeSnappyBlockAsm: // long copy of R9 (> 64) bytes from (DI) to (AX)
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal
+
+ // genMemMoveLong: first and last 32 bytes are loaded up front and stored last; the middle is copied in 32-byte chunks
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11 // R11 = number of 32-byte chunks
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (AX & 31), so -32(AX)(R12) is 32-byte aligned for the MOVOA stores
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: // unaligned loads, aligned stores, 32 bytes per iteration
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the saved head and tail with unaligned moves
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX // advance dst cursor past the literal
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm:
+match_nolit_loop_encodeSnappyBlockAsm: // encode a match at s (CX) against candidate (SI) with no pending literal
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP) // repeat offset = s - candidate
+ ADDL $0x04, CX // the first 4 bytes are already known to match
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = bytes left in src from s
+ LEAQ (DX)(CX*1), R8 // R8 = &src[s]
+ LEAQ (DX)(SI*1), SI // SI = &src[candidate]
+
+ // matchLen: R10 = number of equal bytes at (R8) and (SI), at most DI
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm: // compare 8 bytes at a time
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm
+ BSFQ R9, R9 // index of the lowest differing bit
+ SARQ $0x03, R9 // / 8 = number of equal leading bytes
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: // fewer than 8 bytes left: compare byte by byte
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm
+
+match_nolit_end_encodeSnappyBlockAsm:
+ ADDL R10, CX // s = end of the match
+ MOVL 16(SP), SI // SI = offset to encode
+ ADDL $0x04, R10 // R10 = total match length including the first 4 bytes
+ MOVL CX, 12(SP) // nextEmit = s
+
+ // emitCopy: encode (offset SI, length R10)
+ CMPL SI, $0x00010000
+ JL two_byte_offset_match_nolit_encodeSnappyBlockAsm // offset fits in 16 bits
+
+four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: // 32-bit offset form, 5 bytes per element
+ CMPL R10, $0x40
+ JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
+ MOVB $0xff, (AX) // tag 0xff + 4-byte offset covers 64 bytes of the match
+ MOVL SI, 1(AX)
+ LEAL -64(R10), R10
+ ADDQ $0x05, AX
+ CMPL R10, $0x04
+ JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm
+ JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
+
+four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
+ TESTL R10, R10
+ JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm // nothing left to encode
+ MOVB $0x03, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = 3 + ((length-1) << 2); only the low byte is stored
+ MOVB R10, (AX)
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
+
+two_byte_offset_match_nolit_encodeSnappyBlockAsm: // 16-bit offset form
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
+ MOVB $0xee, (AX) // tag 0xee + 2-byte offset covers 60 bytes of the match
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm // length >= 12 needs the 3-byte form
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10 // low byte = 1 + ((length-4) << 2)
+ MOVB SI, 1(AX) // second byte = low 8 bits of the offset
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10 // high offset bits go into bits 5..7 of the tag
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm: // 3-byte form: tag + 16-bit offset
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = 2 + ((length-1) << 2)
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm // s reached the search limit: flush the tail
+ MOVQ -2(DX)(CX*1), DI // DI = 8 bytes of src at s-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // dst cursor reached the limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm: // index s-2 and s, then check for an immediate match at s
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // hash multiplier
+ MOVQ DI, R8 // R8 = bytes at s-2
+ SHRQ $0x10, DI // DI = bytes at s
+ MOVQ DI, SI
+ SHLQ $0x10, R8
+ IMULQ R9, R8
+ SHRQ $0x32, R8 // R8 = table index for s-2
+ SHLQ $0x10, SI
+ IMULQ R9, SI
+ SHRQ $0x32, SI // SI = table index for s
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10 // R10 = &table[hash(s)]
+ MOVL (R10), SI // SI = candidate for s (read before the slot is overwritten)
+ MOVL R9, 24(SP)(R8*4) // table[hash(s-2)] = s-2
+ MOVL CX, (R10) // table[hash(s)] = s
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate vs 4 bytes at s
+ JEQ match_nolit_loop_encodeSnappyBlockAsm // another match right away, with no literal in between
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm
+
+emit_remainder_encodeSnappyBlockAsm: // emit src[nextEmit:] as a final literal and return
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = remaining literal length
+ LEAQ 5(AX)(CX*1), CX // dst cursor + literal + 5 bytes of headroom
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // over the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP) // nextEmit = len(src)
+ LEAQ (DX)(BX*1), CX // CX = pointer to the literal bytes in src
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length - 1 (src base in DX is no longer needed)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeSnappyBlockAsm
+ CMPL DX, $0x01000000
+ JLT four_bytes_emit_remainder_encodeSnappyBlockAsm
+ MOVB $0xfc, (AX) // tag 0xfc followed by a 4-byte length-1
+ MOVL DX, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+four_bytes_emit_remainder_encodeSnappyBlockAsm: // tag 0xf8 followed by a 3-byte length-1
+ MOVL DX, BX
+ SHRL $0x10, BX
+ MOVB $0xf8, (AX)
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+three_bytes_emit_remainder_encodeSnappyBlockAsm: // tag 0xf4 followed by a 2-byte length-1
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm: // tag 0xf0 followed by a 1-byte length-1
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm // at most 64 bytes: use the short copy
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
+
+one_byte_emit_remainder_encodeSnappyBlockAsm: // single tag byte = (length-1) << 2
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm: // short copy of SI (<= 64) bytes from (CX) to (AX)
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal
+ MOVL SI, BX // BX = length
+
+ // genMemMoveShort: pick a fixed-size move sequence by length; overlapping head/tail moves cover the range
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: // always moves a full 8 bytes even when BX < 8; the cursor still advances by BX only
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
+ MOVQ DX, AX // advance dst cursor past the literal
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm: // long copy of SI (> 64) bytes from (CX) to (AX)
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal
+ MOVL SI, BX // BX = length
+
+ // genMemMoveLong: first and last 32 bytes are loaded up front and stored last; the middle is copied in 32-byte chunks
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = number of 32-byte chunks
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX & 31), so -32(AX)(R8) is 32-byte aligned for the MOVOA stores
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: // unaligned loads, aligned stores, 32 bytes per iteration
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the saved head and tail with unaligned moves
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance dst cursor past the literal
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = dst cursor - dst base = number of bytes written
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000200, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm64K:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm64K
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm64K:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x06, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm64K
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ SHLQ $0x10, R11
+ IMULQ R9, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeSnappyBlockAsm64K
+ LEAL 1(CX), DI
+ MOVL 12(SP), SI
+ MOVL DI, R8
+ SUBL 16(SP), R8
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
+
+repeat_extend_back_loop_encodeSnappyBlockAsm64K:
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm64K // stop once the match start reaches the emitted position
+ MOVB -1(DX)(R8*1), BL // byte just before the copy source
+ MOVB -1(DX)(DI*1), R9 // byte just before the match start
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
+ LEAL -1(DI), DI // extend the match one byte backwards (LEAL keeps flags intact)
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K // stop when the copy source reaches position 0
+
+repeat_extend_back_end_encodeSnappyBlockAsm64K:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K // no pending literal bytes before the match
+ MOVL DI, R8
+ MOVL DI, 12(SP) // emitted position advances to the match start
+ LEAQ (DX)(SI*1), R9 // R9 = pointer to the pending literal bytes
+ SUBL SI, R8 // R8 = literal length
+ LEAL -1(R8), SI // SI = length - 1, the value encoded in the tag
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K // length-1 < 60 fits in the tag byte
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K // length-1 < 256 fits in one extra byte
+ MOVB $0xf4, (AX) // tag 0xf4, followed by a 16-bit length-1
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
+ MOVB $0xf0, (AX) // tag 0xf0, followed by an 8-bit length-1
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm64K // at most 64 bytes: short copy
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
+
+one_byte_repeat_emit_encodeSnappyBlockAsm64K:
+ SHLB $0x02, SI // tag = (length-1) << 2, low two bits zero
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(R8*1), SI // SI = dst pointer after the literal bytes
+
+ // genMemMoveShort
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
+ MOVQ (R9), R10 // always moves a full 8 bytes, even when R8 < 8
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
+ MOVQ (R9), R10 // first 8 bytes
+ MOVQ -8(R9)(R8*1), R9 // last 8 bytes (may overlap the first 8)
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
+ MOVOU (R9), X0 // first 16 bytes
+ MOVOU -16(R9)(R8*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
+ MOVOU (R9), X0 // first 32 bytes in X0,X1
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2 // last 32 bytes in X2,X3 (may overlap)
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
+ MOVQ SI, AX // AX = dst pointer after the literal bytes
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(R8*1), SI // SI = dst pointer after the literal bytes
+
+ // genMemMoveLong
+ MOVOU (R9), X0 // head: first 32 bytes, stored after the loop
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2 // tail: last 32 bytes, stored after the loop
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10 // R10 = dst misalignment within 32 bytes
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - misalignment, so AX+R12-32 is 32-byte aligned
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF untouched, so this tests ZF from DECQ plus CF from the SUBQ above — confirm against the generator
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R12*1), X4 // unaligned loads from src
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1) // aligned stores to dst
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 // continue while R12 <= length
+ MOVOU X0, (AX) // write the saved head and tail with unaligned stores
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX // AX = dst pointer after the literal bytes
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
+ ADDL $0x05, CX // CX = (CX+1) + 4: just past the 4 bytes already compared
+ MOVL CX, SI
+ SUBL 16(SP), SI // SI = CX - repeat offset
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = src bytes remaining from CX
+ LEAQ (DX)(CX*1), R9 // R9 = pointer to src[CX]
+ LEAQ (DX)(SI*1), SI // SI = pointer to src[CX - offset]
+
+ // matchLen
+ XORL R11, R11 // R11 = number of matching bytes found so far
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K // fewer than 8 bytes left: compare byte by byte
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10 // non-zero bits mark differing bytes
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K // all 8 bytes equal
+ BSFQ R10, R10 // index of the lowest differing bit
+ SARQ $0x03, R10 // bit index / 8 = count of equal leading bytes
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm64K:
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K // nothing left to compare
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K:
+ MOVB (R9)(R11*1), R10
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
+ LEAL 1(R11), R11 // LEAL keeps flags intact for the DECL/JNZ pair
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K
+
+repeat_extend_forward_end_encodeSnappyBlockAsm64K:
+ ADDL R11, CX // CX = end of the match
+ MOVL CX, SI
+ SUBL DI, SI // SI = match length (DI = match start after backward extension)
+ MOVL 16(SP), DI // DI = repeat offset
+
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K // 64 or fewer bytes left: final op
+ MOVB $0xee, (AX) // tag 0xee + 16-bit offset; consumes 60 bytes of the match
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K // length >= 12 needs the 3-byte form
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI // low byte = 1 + (length-4)*4; only the low byte is stored below
+ MOVB DI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, DI
+ SHLL $0x05, DI // offset bits 8 and up move into tag bits 5 and up
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm64K
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI // low byte = 2 + (length-1)*4; only the low byte is stored below
+ MOVB SI, (AX)
+ MOVW DI, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm64K:
+ MOVL CX, 12(SP) // everything up to the end of the match is now emitted
+ JMP search_loop_encodeSnappyBlockAsm64K
+
+no_repeat_found_encodeSnappyBlockAsm64K:
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate for CX vs 4 bytes at CX
+ JEQ candidate_match_encodeSnappyBlockAsm64K
+ SHRQ $0x08, DI // DI now holds the bytes starting at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate position stored for hash(CX+2)
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI // 4 bytes at the candidate for CX+1 vs 4 bytes at CX+1
+ JEQ candidate2_match_encodeSnappyBlockAsm64K
+ MOVL R9, 24(SP)(R10*4) // table[hash(CX+2)] = CX+2
+ SHRQ $0x08, DI // DI now holds the bytes starting at CX+2
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate for CX+2 vs 4 bytes at CX+2
+ JEQ candidate3_match_encodeSnappyBlockAsm64K
+ MOVL 20(SP), CX // no match: jump ahead to the saved next search position
+ JMP search_loop_encodeSnappyBlockAsm64K
+
+candidate3_match_encodeSnappyBlockAsm64K:
+ ADDL $0x02, CX // match starts at CX+2; SI already holds its candidate
+ JMP candidate_match_encodeSnappyBlockAsm64K
+
+candidate2_match_encodeSnappyBlockAsm64K:
+ MOVL R9, 24(SP)(R10*4) // table[hash(CX+2)] = CX+2
+ INCL CX // match starts at CX+1
+ MOVL R8, SI // SI = candidate for CX+1
+
+candidate_match_encodeSnappyBlockAsm64K:
+ MOVL 12(SP), DI // DI = emitted position, lower bound for extending backwards
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm64K // candidate at position 0: nothing precedes it
+
+match_extend_back_loop_encodeSnappyBlockAsm64K:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm64K // stop once CX reaches the emitted position
+ MOVB -1(DX)(SI*1), BL // byte just before the candidate
+ MOVB -1(DX)(CX*1), R8 // byte just before the current position
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBlockAsm64K
+ LEAL -1(CX), CX // extend the match one byte backwards
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm64K // candidate reached position 0
+ JMP match_extend_back_loop_encodeSnappyBlockAsm64K
+
+match_extend_back_end_encodeSnappyBlockAsm64K:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = pending literal length
+ LEAQ 3(AX)(DI*1), DI // DI = AX + literal length + 3
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm64K // below the dst limit: continue
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit in (SP): return 0
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm64K:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K // no pending literal bytes before the match
+ MOVL DI, R9
+ MOVL DI, 12(SP) // emitted position advances to the match start
+ LEAQ (DX)(R8*1), DI // DI = pointer to the pending literal bytes
+ SUBL R8, R9 // R9 = literal length
+ LEAL -1(R9), R8 // R8 = length - 1, the value encoded in the tag
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm64K // length-1 < 60 fits in the tag byte
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm64K // length-1 < 256 fits in one extra byte
+ MOVB $0xf4, (AX) // tag 0xf4, followed by a 16-bit length-1
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
+
+two_bytes_match_emit_encodeSnappyBlockAsm64K:
+ MOVB $0xf0, (AX) // tag 0xf0, followed by an 8-bit length-1
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm64K // at most 64 bytes: short copy
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
+
+one_byte_match_emit_encodeSnappyBlockAsm64K:
+ SHLB $0x02, R8 // tag = (length-1) << 2, low two bits zero
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(R9*1), R8 // R8 = dst pointer after the literal bytes
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
+ MOVQ (DI), R10 // always moves a full 8 bytes, even when R9 < 8
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
+ MOVQ (DI), R10 // first 8 bytes
+ MOVQ -8(DI)(R9*1), DI // last 8 bytes (may overlap the first 8)
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
+ MOVOU (DI), X0 // first 16 bytes
+ MOVOU -16(DI)(R9*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
+ MOVOU (DI), X0 // first 32 bytes in X0,X1
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2 // last 32 bytes in X2,X3 (may overlap)
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
+ MOVQ R8, AX // AX = dst pointer after the literal bytes
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
+
+memmove_long_match_emit_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(R9*1), R8 // R8 = dst pointer after the literal bytes
+
+ // genMemMoveLong
+ MOVOU (DI), X0 // head: first 32 bytes, stored after the loop
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2 // tail: last 32 bytes, stored after the loop
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10 // R10 = dst misalignment within 32 bytes
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - misalignment, so AX+R12-32 is 32-byte aligned
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF untouched, so this tests ZF from DECQ plus CF from the SUBQ above — confirm against the generator
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4 // unaligned loads from src
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1) // aligned stores to dst
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 // continue while R12 <= length
+ MOVOU X0, (AX) // write the saved head and tail with unaligned stores
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX // AX = dst pointer after the literal bytes
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
+match_nolit_loop_encodeSnappyBlockAsm64K:
+ MOVL CX, DI
+ SUBL SI, DI // DI = CX - candidate = match offset
+ MOVL DI, 16(SP) // becomes the repeat offset checked by the search loop
+ ADDL $0x04, CX // skip the 4 bytes already known to match
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = src bytes remaining from CX
+ LEAQ (DX)(CX*1), R8 // R8 = pointer to src[CX]
+ LEAQ (DX)(SI*1), SI // SI = pointer to src[candidate + 4]
+
+ // matchLen
+ XORL R10, R10 // R10 = number of matching bytes found so far
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K // fewer than 8 bytes left: compare byte by byte
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9 // non-zero bits mark differing bytes
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K // all 8 bytes equal
+ BSFQ R9, R9 // index of the lowest differing bit
+ SARQ $0x03, R9 // bit index / 8 = count of equal leading bytes
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm64K
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm64K:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm64K // nothing left to compare
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm64K
+ LEAL 1(R10), R10 // LEAL keeps flags intact for the DECL/JNZ pair
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K
+
+match_nolit_end_encodeSnappyBlockAsm64K:
+ ADDL R10, CX // CX = end of the match
+ MOVL 16(SP), SI // SI = match offset
+ ADDL $0x04, R10 // R10 = total match length including the first 4 bytes
+ MOVL CX, 12(SP) // everything up to the end of the match counts as emitted
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K // 64 or fewer bytes left: final op
+ MOVB $0xee, (AX) // tag 0xee + 16-bit offset; consumes 60 bytes of the match
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K // length >= 12 needs the 3-byte form
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10 // low byte = 1 + (length-4)*4; only the low byte is stored below
+ MOVB SI, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, SI
+ SHLL $0x05, SI // offset bits 8 and up move into tag bits 5 and up
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = 2 + (length-1)*4; only the low byte is stored below
+ MOVB R10, (AX)
+ MOVW SI, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm64K // reached the search limit in 8(SP)
+ MOVQ -2(DX)(CX*1), DI // DI = 8 src bytes at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm64K // below the dst limit: continue
+ MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm64K:
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // hash multiplier
+ MOVQ DI, R8 // R8 = bytes at CX-2
+ SHRQ $0x10, DI // DI = bytes at CX
+ MOVQ DI, SI
+ SHLQ $0x10, R8
+ IMULQ R9, R8
+ SHRQ $0x32, R8 // R8 = table index for CX-2
+ SHLQ $0x10, SI
+ IMULQ R9, SI
+ SHRQ $0x32, SI // SI = table index for CX
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10 // R10 = address of the table entry for hash(CX)
+ MOVL (R10), SI // SI = candidate position stored for hash(CX)
+ MOVL R9, 24(SP)(R8*4) // table[hash(CX-2)] = CX-2
+ MOVL CX, (R10) // table[hash(CX)] = CX
+ CMPL (DX)(SI*1), DI // 4 bytes at the candidate vs 4 bytes at CX
+ JEQ match_nolit_loop_encodeSnappyBlockAsm64K // immediate match: emit another copy with no literals
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm64K
+
+emit_remainder_encodeSnappyBlockAsm64K:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = src bytes not yet emitted
+ LEAQ 3(AX)(CX*1), CX // CX = AX + remaining length + 3
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm64K // below the dst limit: continue
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit in (SP): return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm64K:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K // everything already emitted
+ MOVL CX, SI
+ MOVL CX, 12(SP) // emitted position = len(src)
+ LEAQ (DX)(BX*1), CX // CX = pointer to the remaining literal bytes
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length - 1 (the src base pointer in DX is no longer needed)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K // length-1 < 60 fits in the tag byte
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K // length-1 < 256 fits in one extra byte
+ MOVB $0xf4, (AX) // tag 0xf4, followed by a 16-bit length-1
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
+ MOVB $0xf0, (AX) // tag 0xf0, followed by an 8-bit length-1
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm64K // at most 64 bytes: short copy
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
+
+one_byte_emit_remainder_encodeSnappyBlockAsm64K:
+ SHLB $0x02, DL // tag = (length-1) << 2, low two bits zero
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(SI*1), DX // DX = dst pointer after the literal bytes
+ MOVL SI, BX // BX = literal length, zero-extended to 64 bits
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8:
+ MOVQ (CX), SI // always moves a full 8 bytes, even when BX < 8
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
+ MOVQ (CX), SI // first 8 bytes
+ MOVQ -8(CX)(BX*1), CX // last 8 bytes (may overlap the first 8)
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
+ MOVOU (CX), X0 // first 16 bytes
+ MOVOU -16(CX)(BX*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
+ MOVOU (CX), X0 // first 32 bytes in X0,X1
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 bytes in X2,X3 (may overlap)
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
+ MOVQ DX, AX // AX = dst pointer after the literal bytes
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
+ LEAQ (AX)(SI*1), DX // DX = dst pointer after the literal bytes
+ MOVL SI, BX // BX = literal length, zero-extended to 64 bits
+
+ // genMemMoveLong
+ MOVOU (CX), X0 // head: first 32 bytes, stored after the loop
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // tail: last 32 bytes, stored after the loop
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = dst misalignment within 32 bytes
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - misalignment, so AX+R8-32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF untouched, so this tests ZF from DECQ plus CF from the SUBQ above — confirm against the generator
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4 // unaligned loads from src
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1) // aligned stores to dst
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 // continue while R8 <= length
+ MOVOU X0, (AX) // write the saved head and tail with unaligned stores
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // AX = dst pointer after the literal bytes
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = number of bytes written to dst
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000080, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm12B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm12B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm12B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm12B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x000000cf1bbcdcbb, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x18, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ SHLQ $0x18, R11
+ IMULQ R9, R11
+ SHRQ $0x34, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x18, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeSnappyBlockAsm12B
+ LEAL 1(CX), DI
+ MOVL 12(SP), SI
+ MOVL DI, R8
+ SUBL 16(SP), R8
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
+
+repeat_extend_back_loop_encodeSnappyBlockAsm12B:
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm12B
+ MOVB -1(DX)(R8*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
+ LEAL -1(DI), DI
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
+
+repeat_extend_back_end_encodeSnappyBlockAsm12B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
+ MOVL DI, R8
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R9
+ SUBL SI, R8
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm12B
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
+
+one_byte_repeat_emit_encodeSnappyBlockAsm12B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveShort
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveLong
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R11, R11
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
+ BSFQ R10, R10
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm12B:
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B:
+ MOVB (R9)(R11*1), R10
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
+ LEAL 1(R11), R11
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B
+
+repeat_extend_forward_end_encodeSnappyBlockAsm12B:
+ ADDL R11, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm12B
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm12B:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeSnappyBlockAsm12B
+
+no_repeat_found_encodeSnappyBlockAsm12B:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBlockAsm12B
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeSnappyBlockAsm12B
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeSnappyBlockAsm12B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBlockAsm12B
+
+candidate3_match_encodeSnappyBlockAsm12B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeSnappyBlockAsm12B
+
+candidate2_match_encodeSnappyBlockAsm12B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm12B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm12B
+
+match_extend_back_loop_encodeSnappyBlockAsm12B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm12B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBlockAsm12B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm12B
+ JMP match_extend_back_loop_encodeSnappyBlockAsm12B
+
+match_extend_back_end_encodeSnappyBlockAsm12B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm12B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm12B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
+
+two_bytes_match_emit_encodeSnappyBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm12B
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
+
+one_byte_match_emit_encodeSnappyBlockAsm12B:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
+
+memmove_long_match_emit_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
+match_nolit_loop_encodeSnappyBlockAsm12B:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm12B
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm12B:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm12B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm12B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B
+
+match_nolit_end_encodeSnappyBlockAsm12B:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm12B
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm12B:
+ MOVQ $0x000000cf1bbcdcbb, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x18, R8
+ IMULQ R9, R8
+ SHRQ $0x34, R8
+ SHLQ $0x18, SI
+ IMULQ R9, SI
+ SHRQ $0x34, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeSnappyBlockAsm12B
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm12B
+
+emit_remainder_encodeSnappyBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm12B
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
+
+one_byte_emit_remainder_encodeSnappyBlockAsm12B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2. Writes an encoding of src to dst and returns the byte count written, or 0 if the output limit kept at (SP) would be passed.
+TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 // frame: scalar locals at 0..23(SP) + 4096-byte table of 32-bit src indexes at 24(SP)
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor for the whole function
+ MOVQ $0x00000020, CX // 0x20 iterations x 0x80 bytes = 4096 bytes to clear
+ LEAQ 24(SP), DX // DX = start of the table
+ PXOR X0, X0 // X0 = 16 zero bytes
+
+zero_loop_encodeSnappyBlockAsm10B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm10B
+ MOVL $0x00000000, 12(SP) // 12(SP) = src index where the not-yet-emitted literals start
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = src_len - 8: the search stops at this index
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = src_len - 9 - src_len/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst_base + that amount: output limit tested before emitting
+ MOVL $0x00000001, CX // CX = current src index, starts at 1
+ MOVL CX, 16(SP) // 16(SP) = current repeat offset, starts at 1
+ MOVQ src_base+24(FP), DX // DX = src base pointer (until the remainder emit reuses DX)
+
+search_loop_encodeSnappyBlockAsm10B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI // skip = (CX - 12(SP)) >> 5, grows with distance since the last emit
+ LEAL 4(CX)(SI*1), SI // SI = CX + 4 + skip = index to try next if nothing matches here
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm10B // next index would reach the search limit: finish up
+ MOVQ (DX)(CX*1), DI // DI = 8 source bytes at CX
+ MOVL SI, 20(SP) // 20(SP) = saved next index
+ MOVQ $0x9e3779b1, R9 // hash multiplier
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x20, R10 // keep only the low 4 bytes
+ IMULQ R9, R10
+ SHRQ $0x36, R10 // top 10 bits -> table index for position CX
+ SHLQ $0x20, R11
+ IMULQ R9, R11
+ SHRQ $0x36, R11 // table index for position CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate index for CX
+ MOVL 24(SP)(R11*4), R8 // R8 = candidate index for CX+1
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4) // table now records CX and CX+1
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10 // R10 = table index for position CX+2 (looked up later)
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11 // R11 = 4 bytes at CX + 1 - repeat offset
+ MOVQ DI, R9
+ SHRQ $0x08, R9 // low 32 bits of R9 = 4 bytes at CX+1
+ CMPL R9, R11
+ JNE no_repeat_found_encodeSnappyBlockAsm10B
+ LEAL 1(CX), DI // DI = start of the repeat match (CX+1)
+ MOVL 12(SP), SI
+ MOVL DI, R8
+ SUBL 16(SP), R8 // R8 = index of the earlier copy
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm10B
+
+repeat_extend_back_loop_encodeSnappyBlockAsm10B: // grow the match backwards while the preceding bytes agree
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm10B // never go back past the pending-literal start
+ MOVB -1(DX)(R8*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
+ LEAL -1(DI), DI
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B // stop once the earlier copy reaches index 0
+
+repeat_extend_back_end_encodeSnappyBlockAsm10B: // emit src[12(SP):DI] as a literal before the repeat
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B // no pending literals
+ MOVL DI, R8
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R9 // R9 = pointer to the pending literal bytes
+ SUBL SI, R8 // R8 = literal length
+ LEAL -1(R8), SI // SI = length - 1, the value stored in the header
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B // length-1 < 60 fits in the tag byte itself
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 then 16-bit length-1
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 then 8-bit length-1
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm10B // length <= 64: short copy path
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
+
+one_byte_repeat_emit_encodeSnappyBlockAsm10B:
+ SHLB $0x02, SI // 1-byte header: (length-1) << 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveShort: copy R8 (at most 64) bytes from R9 to AX using overlapping head/tail moves
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: // always moves a full 8 bytes, even when R8 < 8
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: // first 8 + last 8 bytes (may overlap)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: // first 16 + last 16 bytes (may overlap)
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: // first 32 + last 32 bytes (may overlap)
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
+ MOVQ SI, AX // advance dst cursor past the literal
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveLong: copy R8 bytes from R9 to AX in 32-byte steps with aligned stores
+ MOVOU (R9), X0 // X0..X3 = first 32 and last 32 source bytes, stored last
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10 // R10 = AX mod 32
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (AX mod 32), so AX+R12-32 is 32-byte aligned
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): DECQ does not write CF, so JA sees CF from the SUBQ above plus ZF from DECQ - confirm intent in the generator
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: // 32 bytes per pass from R10 to the aligned address in R13
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back // NOTE(review): CF here comes from the ADDQ above, ZF from DECQ - confirm loop condition
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // continue while R12 <= length
+ MOVOU X0, (AX) // finally write the saved unaligned head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX // advance dst cursor past the literal
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
+ ADDL $0x05, CX // skip to CX+5: the 4 bytes at CX+1 were already compared equal
+ MOVL CX, SI
+ SUBL 16(SP), SI // SI = matching earlier index
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = bytes remaining in src
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen: R11 = number of equal bytes at R9 and SI, at most R8
+ XORL R11, R11
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: // compare 8 bytes at a time
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10 // set bits mark differing bytes
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B // all 8 bytes equal
+ BSFQ R10, R10 // index of the lowest differing bit
+ SARQ $0x03, R10 // -> number of equal leading bytes
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: // fewer than 8 bytes left: compare byte by byte
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B:
+ MOVB (R9)(R11*1), R10
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
+ LEAL 1(R11), R11
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B
+
+repeat_extend_forward_end_encodeSnappyBlockAsm10B:
+ ADDL R11, CX // CX = end of the match
+ MOVL CX, SI
+ SUBL DI, SI // SI = match length (DI is the back-extended start)
+ MOVL 16(SP), DI // DI = offset
+
+ // emitCopy: SI = length, DI = offset
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
+ MOVB $0xee, (AX) // length > 64: emit tag 0xee + 16-bit offset and take 60 off the length
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B // length >= 12 uses the 3-byte form
+ CMPL DI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B // so does offset >= 2048
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI // low byte = ((length-4) << 2) | 1
+ MOVB DI, 1(AX) // second byte = offset bits 0..7
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI // offset bits 8..10 go into tag bits 5..7
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm10B
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI // low byte = ((length-1) << 2) | 2
+ MOVB SI, (AX)
+ MOVW DI, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm10B:
+ MOVL CX, 12(SP) // pending literals now start after the match
+ JMP search_loop_encodeSnappyBlockAsm10B
+
+no_repeat_found_encodeSnappyBlockAsm10B: // test the table candidates for CX, CX+1 and CX+2 in turn
+ CMPL (DX)(SI*1), DI // 4 bytes at the CX candidate vs 4 bytes at CX
+ JEQ candidate_match_encodeSnappyBlockAsm10B
+ SHRQ $0x08, DI // low 32 bits of DI = bytes at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate index for CX+2
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeSnappyBlockAsm10B
+ MOVL R9, 24(SP)(R10*4) // record CX+2 in the table
+ SHRQ $0x08, DI // low 32 bits of DI = bytes at CX+2
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeSnappyBlockAsm10B
+ MOVL 20(SP), CX // nothing matched: jump ahead to the saved next index
+ JMP search_loop_encodeSnappyBlockAsm10B
+
+candidate3_match_encodeSnappyBlockAsm10B:
+ ADDL $0x02, CX // match begins at CX+2; SI already holds its candidate
+ JMP candidate_match_encodeSnappyBlockAsm10B
+
+candidate2_match_encodeSnappyBlockAsm10B:
+ MOVL R9, 24(SP)(R10*4) // still record CX+2 in the table
+ INCL CX // match begins at CX+1
+ MOVL R8, SI // SI = its candidate
+
+candidate_match_encodeSnappyBlockAsm10B: // here CX = match start, SI = earlier index with the same 4 bytes
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm10B // a candidate at index 0 cannot extend backwards
+
+match_extend_back_loop_encodeSnappyBlockAsm10B: // grow the match backwards while the preceding bytes agree
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm10B // never go back past the pending-literal start
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBlockAsm10B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm10B
+ JMP match_extend_back_loop_encodeSnappyBlockAsm10B
+
+match_extend_back_end_encodeSnappyBlockAsm10B:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = pending literal length
+ LEAQ 3(AX)(DI*1), DI // dst cursor after those literals plus 3 bytes
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP) // would pass the output limit: return 0
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm10B: // emit src[12(SP):CX] as a literal before the match
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B // no pending literals
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI // DI = pointer to the pending literal bytes
+ SUBL R8, R9 // R9 = literal length
+ LEAL -1(R9), R8 // R8 = length - 1, the value stored in the header
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm10B // length-1 < 60 fits in the tag byte itself
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 then 16-bit length-1
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
+
+two_bytes_match_emit_encodeSnappyBlockAsm10B:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 then 8-bit length-1
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm10B // length <= 64: short copy path
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
+
+one_byte_match_emit_encodeSnappyBlockAsm10B:
+ SHLB $0x02, R8 // 1-byte header: (length-1) << 2
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 (at most 64) bytes from DI to AX using overlapping head/tail moves
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: // always moves a full 8 bytes, even when R9 < 8
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: // first 8 + last 8 bytes (may overlap)
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: // first 16 + last 16 bytes (may overlap)
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: // first 32 + last 32 bytes (may overlap)
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
+ MOVQ R8, AX // advance dst cursor past the literal
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
+
+memmove_long_match_emit_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal bytes
+
+ // genMemMoveLong: copy R9 bytes from DI to AX in 32-byte steps with aligned stores
+ MOVOU (DI), X0 // X0..X3 = first 32 and last 32 source bytes, stored last
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11 // R11 = length / 32
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10 // R10 = AX mod 32
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12 // R12 = 64 - (AX mod 32), so AX+R12-32 is 32-byte aligned
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): DECQ does not write CF, so JA sees CF from the SUBQ above plus ZF from DECQ - confirm intent in the generator
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: // 32 bytes per pass from R10 to the aligned address in R13
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back // NOTE(review): CF here comes from the ADDQ above, ZF from DECQ - confirm loop condition
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // continue while R12 <= length
+ MOVOU X0, (AX) // finally write the saved unaligned head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX // advance dst cursor past the literal
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
+match_nolit_loop_encodeSnappyBlockAsm10B: // also entered directly when a new match starts right where the last one ended
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP) // new repeat offset = CX - candidate index
+ ADDL $0x04, CX // the first 4 bytes were already compared equal
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI // DI = bytes remaining in src
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen: R10 = number of equal bytes at R8 and SI, at most DI
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: // compare 8 bytes at a time
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9 // set bits mark differing bytes
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B // all 8 bytes equal
+ BSFQ R9, R9 // index of the lowest differing bit
+ SARQ $0x03, R9 // -> number of equal leading bytes
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm10B
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm10B: // fewer than 8 bytes left: compare byte by byte
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm10B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B
+
+match_nolit_end_encodeSnappyBlockAsm10B:
+ ADDL R10, CX // CX = end of the match
+ MOVL 16(SP), SI // SI = offset
+ ADDL $0x04, R10 // R10 = total match length including the first 4 bytes
+ MOVL CX, 12(SP) // pending literals now start after the match
+
+ // emitCopy: R10 = length, SI = offset
+two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
+ MOVB $0xee, (AX) // length > 64: emit tag 0xee + 16-bit offset and take 60 off the length
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B // length >= 12 uses the 3-byte form
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B // so does offset >= 2048
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10 // low byte = ((length-4) << 2) | 1
+ MOVB SI, 1(AX) // second byte = offset bits 0..7
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10 // offset bits 8..10 go into tag bits 5..7
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10 // low byte = ((length-1) << 2) | 2
+ MOVB R10, (AX)
+ MOVW SI, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm10B // reached the search limit: finish up
+ MOVQ -2(DX)(CX*1), DI // DI = 8 source bytes starting at CX-2
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP) // dst cursor reached the output limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm10B: // index CX-2 and CX in the table, then test for an immediate match at CX
+ MOVQ $0x9e3779b1, R9 // hash multiplier
+ MOVQ DI, R8
+ SHRQ $0x10, DI // DI = bytes starting at CX
+ MOVQ DI, SI
+ SHLQ $0x20, R8
+ IMULQ R9, R8
+ SHRQ $0x36, R8 // R8 = table index for CX-2
+ SHLQ $0x20, SI
+ IMULQ R9, SI
+ SHRQ $0x36, SI // SI = table index for CX
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI // SI = candidate index for CX
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeSnappyBlockAsm10B // 4 bytes match: start another match with no literals in between
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm10B
+
+emit_remainder_encodeSnappyBlockAsm10B: // emit src[12(SP):src_len] as one final literal
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = remaining literal length
+ LEAQ 3(AX)(CX*1), CX // dst cursor after those literals plus 3 bytes
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP) // would pass the output limit: return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = pointer to the remaining literal bytes
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length - 1 (the src base in DX is no longer needed)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B // length-1 < 60 fits in the tag byte itself
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B
+ MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 then 16-bit length-1
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
+ MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 then 8-bit length-1
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm10B // length <= 64: short copy path
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
+
+one_byte_emit_remainder_encodeSnappyBlockAsm10B:
+ SHLB $0x02, DL // 1-byte header: (length-1) << 2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: copy BX (at most 64) bytes from CX to AX using overlapping head/tail moves
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: // NOTE(review): loads 8 bytes at CX even when fewer than 8 remain before src_len - confirm src always has readable slack here
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: // first 8 + last 8 bytes (may overlap)
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: // first 16 + last 16 bytes (may overlap)
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: // first 32 + last 32 bytes (may overlap)
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
+ MOVQ DX, AX // advance dst cursor past the literal
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: copy BX bytes from CX to AX in 32-byte steps with aligned stores
+ MOVOU (CX), X0 // X0..X3 = first 32 and last 32 source bytes, stored last
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = AX mod 32
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX+R8-32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): DECQ does not write CF, so JA sees CF from the SUBQ above plus ZF from DECQ - confirm intent in the generator
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: // 32 bytes per pass from SI to the aligned address in R9
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back // NOTE(review): CF here comes from the ADDQ above, ZF from DECQ - confirm loop condition
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // continue while R8 <= length
+ MOVOU X0, (AX) // finally write the saved unaligned head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance dst cursor past the literal
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // bytes written = dst cursor - dst_base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000008, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBlockAsm8B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBlockAsm8B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL CX, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBlockAsm8B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x04, SI
+ LEAL 4(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm8B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHRQ $0x08, R11
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x38, R10
+ SHLQ $0x20, R11
+ IMULQ R9, R11
+ SHRQ $0x38, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 24(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ LEAL 1(CX), R10
+ MOVL R10, 24(SP)(R11*4)
+ MOVQ DI, R10
+ SHRQ $0x10, R10
+ SHLQ $0x20, R10
+ IMULQ R9, R10
+ SHRQ $0x38, R10
+ MOVL CX, R9
+ SUBL 16(SP), R9
+ MOVL 1(DX)(R9*1), R11
+ MOVQ DI, R9
+ SHRQ $0x08, R9
+ CMPL R9, R11
+ JNE no_repeat_found_encodeSnappyBlockAsm8B
+ LEAL 1(CX), DI
+ MOVL 12(SP), SI
+ MOVL DI, R8
+ SUBL 16(SP), R8
+ JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
+
+repeat_extend_back_loop_encodeSnappyBlockAsm8B:
+ CMPL DI, SI
+ JLE repeat_extend_back_end_encodeSnappyBlockAsm8B
+ MOVB -1(DX)(R8*1), BL
+ MOVB -1(DX)(DI*1), R9
+ CMPB BL, R9
+ JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
+ LEAL -1(DI), DI
+ DECL R8
+ JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
+
+repeat_extend_back_end_encodeSnappyBlockAsm8B:
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
+ MOVL DI, R8
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R9
+ SUBL SI, R8
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c
+ JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
+
+two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_repeat_emit_encodeSnappyBlockAsm8B
+ JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
+
+one_byte_repeat_emit_encodeSnappyBlockAsm8B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveShort
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
+ MOVQ SI, AX
+ JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
+
+memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(R8*1), SI
+
+ // genMemMoveLong
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
+ JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ SI, AX
+
+emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
+ ADDL $0x05, CX
+ MOVL CX, SI
+ SUBL 16(SP), SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R11, R11
+ CMPL R8, $0x08
+ JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B
+
+matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
+ MOVQ (R9)(R11*1), R10
+ XORQ (SI)(R11*1), R10
+ TESTQ R10, R10
+ JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
+ BSFQ R10, R10
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
+
+matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
+ JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
+
+matchlen_single_repeat_extend_encodeSnappyBlockAsm8B:
+ TESTL R8, R8
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
+
+matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B:
+ MOVB (R9)(R11*1), R10
+ CMPB (SI)(R11*1), R10
+ JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
+ LEAL 1(R11), R11
+ DECL R8
+ JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B
+
+repeat_extend_forward_end_encodeSnappyBlockAsm8B:
+ ADDL R11, CX
+ MOVL CX, SI
+ SUBL DI, SI
+ MOVL 16(SP), DI
+
+ // emitCopy
+two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
+ CMPL SI, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
+ MOVB $0xee, (AX)
+ MOVW DI, 1(AX)
+ LEAL -60(SI), SI
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
+
+two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
+ CMPL SI, $0x0c
+ JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(SI*4), SI
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_encodeSnappyBlockAsm8B
+
+emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(SI*4), SI
+ MOVB SI, (AX)
+ MOVW DI, 1(AX)
+ ADDQ $0x03, AX
+
+repeat_end_emit_encodeSnappyBlockAsm8B:
+ MOVL CX, 12(SP)
+ JMP search_loop_encodeSnappyBlockAsm8B
+
+no_repeat_found_encodeSnappyBlockAsm8B:
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBlockAsm8B
+ SHRQ $0x08, DI
+ MOVL 24(SP)(R10*4), SI
+ LEAL 2(CX), R9
+ CMPL (DX)(R8*1), DI
+ JEQ candidate2_match_encodeSnappyBlockAsm8B
+ MOVL R9, 24(SP)(R10*4)
+ SHRQ $0x08, DI
+ CMPL (DX)(SI*1), DI
+ JEQ candidate3_match_encodeSnappyBlockAsm8B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBlockAsm8B
+
+candidate3_match_encodeSnappyBlockAsm8B:
+ ADDL $0x02, CX
+ JMP candidate_match_encodeSnappyBlockAsm8B
+
+candidate2_match_encodeSnappyBlockAsm8B:
+ MOVL R9, 24(SP)(R10*4)
+ INCL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBlockAsm8B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm8B
+
+match_extend_back_loop_encodeSnappyBlockAsm8B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBlockAsm8B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBlockAsm8B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBlockAsm8B
+ JMP match_extend_back_loop_encodeSnappyBlockAsm8B
+
+match_extend_back_end_encodeSnappyBlockAsm8B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBlockAsm8B:
+ MOVL CX, DI
+ MOVL 12(SP), R8
+ CMPL R8, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(R8*1), DI
+ SUBL R8, R9
+ LEAL -1(R9), R8
+ CMPL R8, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBlockAsm8B
+ CMPL R8, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
+
+two_bytes_match_emit_encodeSnappyBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB R8, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R8, $0x40
+ JL memmove_match_emit_encodeSnappyBlockAsm8B
+ JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
+
+one_byte_match_emit_encodeSnappyBlockAsm8B:
+ SHLB $0x02, R8
+ MOVB R8, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
+ MOVQ (DI), R10
+ MOVQ R10, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
+ MOVQ (DI), R10
+ MOVQ -8(DI)(R9*1), DI
+ MOVQ R10, (AX)
+ MOVQ DI, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
+ MOVOU (DI), X0
+ MOVOU -16(DI)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
+ MOVQ R8, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
+
+memmove_long_match_emit_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(R9*1), R8
+
+ // genMemMoveLong
+ MOVOU (DI), X0
+ MOVOU 16(DI), X1
+ MOVOU -32(DI)(R9*1), X2
+ MOVOU -16(DI)(R9*1), X3
+ MOVQ R9, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(DI)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
+ ADDQ $0x20, R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(DI)(R12*1), X4
+ MOVOU -16(DI)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R9, R12
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R8, AX
+
+emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
+match_nolit_loop_encodeSnappyBlockAsm8B:
+ MOVL CX, DI
+ SUBL SI, DI
+ MOVL DI, 16(SP)
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(SI*1), SI
+
+ // matchLen
+ XORL R10, R10
+ CMPL DI, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
+ MOVQ (R8)(R10*1), R9
+ XORQ (SI)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
+ BSFQ R9, R9
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm8B
+
+matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
+
+matchlen_single_match_nolit_encodeSnappyBlockAsm8B:
+ TESTL DI, DI
+ JZ match_nolit_end_encodeSnappyBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B:
+ MOVB (R8)(R10*1), R9
+ CMPB (SI)(R10*1), R9
+ JNE match_nolit_end_encodeSnappyBlockAsm8B
+ LEAL 1(R10), R10
+ DECL DI
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B
+
+match_nolit_end_encodeSnappyBlockAsm8B:
+ ADDL R10, CX
+ MOVL 16(SP), SI
+ ADDL $0x04, R10
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
+ MOVB $0xee, (AX)
+ MOVW SI, 1(AX)
+ LEAL -60(R10), R10
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
+ CMPL R10, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R10*4), R10
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
+
+emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R10*4), R10
+ MOVB R10, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBlockAsm8B
+ MOVQ -2(DX)(CX*1), DI
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBlockAsm8B:
+ MOVQ $0x9e3779b1, R9
+ MOVQ DI, R8
+ SHRQ $0x10, DI
+ MOVQ DI, SI
+ SHLQ $0x20, R8
+ IMULQ R9, R8
+ SHRQ $0x38, R8
+ SHLQ $0x20, SI
+ IMULQ R9, SI
+ SHRQ $0x38, SI
+ LEAL -2(CX), R9
+ LEAQ 24(SP)(SI*4), R10
+ MOVL (R10), SI
+ MOVL R9, 24(SP)(R8*4)
+ MOVL CX, (R10)
+ CMPL (DX)(SI*1), DI
+ JEQ match_nolit_loop_encodeSnappyBlockAsm8B
+ INCL CX
+ JMP search_loop_encodeSnappyBlockAsm8B
+
+emit_remainder_encodeSnappyBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
+
+two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBlockAsm8B
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
+
+one_byte_emit_remainder_encodeSnappyBlockAsm8B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
+
+memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 // Encodes src into dst as literal/copy elements; returns bytes written, or 0 when the dst limit at (SP) would be reached.
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor for the whole function
+ MOVQ $0x00000a00, CX // 0xa00 iterations * 128 bytes = 327680 bytes to clear
+ LEAQ 24(SP), DX // tables start at 24(SP); 0..23(SP) hold the scalar locals
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm: // clear both hash tables, 128 bytes per iteration
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm
+ MOVL $0x00000000, 12(SP) // 12(SP) = start of the not-yet-emitted literal run ("nextEmit") = 0
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = src_len - 8: search positions at or beyond this go to emit_remainder
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = src_len - 9 - src_len/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst limit pointer; reaching it makes the function return 0
+ MOVL $0x00000001, CX // CX = current src position, starting at 1
+ MOVL $0x00000000, 16(SP) // 16(SP) = last match offset; written here and at match_length_ok, not read in this function
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBetterBlockAsm: // look for a match at CX, stepping faster the longer no match is found
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x07, SI // SI = (CX - nextEmit) >> 7 = extra skip
+ CMPL SI, $0x63
+ JLE check_maxskip_ok_encodeSnappyBetterBlockAsm // extra skip <= 99: use it as-is
+ LEAL 100(CX), SI // otherwise cap the step at 100
+ JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
+
+check_maxskip_ok_encodeSnappyBetterBlockAsm:
+ LEAL 1(CX)(SI*1), SI // SI = CX + 1 + extra skip
+
+check_maxskip_cont_encodeSnappyBetterBlockAsm:
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm // next position >= src_len - 8: stop searching
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes of src at CX
+ MOVL SI, 20(SP) // 20(SP) = next search position
+ MOVQ $0x00cf1bbcdcbfa563, R9 // multiplier for the 7-byte hash
+ MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x08, R10 // drop the top byte: hash covers the low 7 bytes
+ IMULQ R9, R10
+ SHRQ $0x30, R10 // R10 = 16-bit index into the table at 24(SP)
+ SHLQ $0x20, R11 // keep the low 4 bytes only
+ IMULQ SI, R11
+ SHRQ $0x32, R11 // R11 = 14-bit index into the table at 262168(SP)
+ MOVL 24(SP)(R10*4), SI // SI = candidate from the 7-byte-hash table
+ MOVL 262168(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
+ MOVL CX, 24(SP)(R10*4) // record the current position in both tables
+ MOVL CX, 262168(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // 4 bytes at the 7-byte-hash candidate vs 4 bytes at CX
+ JEQ candidate_match_encodeSnappyBetterBlockAsm
+ CMPL (DX)(R8*1), DI // 4 bytes at the 4-byte-hash candidate vs 4 bytes at CX
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm
+ MOVL 20(SP), CX // no match: advance to the next search position
+ JMP search_loop_encodeSnappyBetterBlockAsm
+
+candidateS_match_encodeSnappyBetterBlockAsm: // 4-byte-hash candidate matched; first try the 7-byte-hash table at CX+1
+ SHRQ $0x08, DI // DI = src bytes starting at CX+1
+ MOVQ DI, R10
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10 // R10 = 7-byte-hash index for position CX+1
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4) // record CX+1 in the 7-byte-hash table
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm // candidate at CX+1 matches: use it
+ DECL CX // otherwise restore CX and use the 4-byte-hash candidate
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm: // CX = match position, SI = candidate position
+ MOVL 12(SP), DI // DI = nextEmit, lower bound for extending backwards
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm // candidate at 0 cannot be extended backwards
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm: // extend the match backwards while the preceding bytes are equal
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm // reached the pending-literal start
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm // candidate reached position 0
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
+
+match_extend_back_end_encodeSnappyBetterBlockAsm: // bail out if the pending literals plus 5 bytes would reach the dst limit
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = pending literal length
+ LEAQ 5(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // return 0
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm:
+ MOVL CX, DI // DI = match start; stays live until match_nolit_dst_ok
+ ADDL $0x04, CX // the first 4 bytes are already known to match
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = bytes of src remaining after CX
+ LEAQ (DX)(CX*1), R9 // R9 = &src[CX]
+ LEAQ (DX)(SI*1), R10 // R10 = &src[SI]
+
+ // matchLen
+ XORL R12, R12 // R12 = count of additional matching bytes
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: // compare 8 bytes at a time
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11 // non-zero bits mark differing bytes
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // convert the bit index to a byte index
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: // all 8 bytes equal: advance
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm: // fewer than 8 bytes left: compare byte by byte
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm
+
+match_nolit_end_encodeSnappyBetterBlockAsm:
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset (distance from candidate to match)
+
+ // Check if repeat
+ CMPL R12, $0x01
+ JG match_length_ok_encodeSnappyBetterBlockAsm // more than 1 extra byte matched: accept
+ CMPL R8, $0x0000ffff
+ JLE match_length_ok_encodeSnappyBetterBlockAsm // offset fits in 16 bits: accept
+ MOVL 20(SP), CX // short match at an offset above 65535: reject it
+ INCL CX // resume searching one past the saved next position
+ JMP search_loop_encodeSnappyBetterBlockAsm
+
+match_length_ok_encodeSnappyBetterBlockAsm: // emit pending literals src[nextEmit:DI], if any
+ MOVL R8, 16(SP) // remember the offset
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm // no pending literals
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = literal source pointer
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = length - 1, the value encoded in the literal header
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm // length-1 < 60: 1-byte header
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm // < 256: 2-byte header
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm // < 65536: 3-byte header
+ CMPL SI, $0x01000000
+ JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm // < 1<<24: 4-byte header
+ MOVB $0xfc, (AX) // 5-byte header: tag 0xfc followed by a 4-byte length-1
+ MOVL SI, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+four_bytes_match_emit_encodeSnappyBetterBlockAsm: // tag 0xf8 followed by a 3-byte length-1
+ MOVL SI, R11
+ SHRL $0x10, R11 // R11 = bits 16..23 of length-1
+ MOVB $0xf8, (AX)
+ MOVW SI, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+three_bytes_match_emit_encodeSnappyBetterBlockAsm: // tag 0xf4 followed by a 2-byte length-1
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm: // tag 0xf0 followed by a 1-byte length-1
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm // length-1 < 64: short copy routine is enough
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm: // tag = (length-1) << 2
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm: // copy R9 literal bytes from R10 to AX (short form)
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: // always moves a full 8 bytes; AX is then set to SI, so only R9 bytes count
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: // first 8 and last 8 bytes (may overlap)
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: // first 16 and last 16 bytes (may overlap)
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: // first 32 and last 32 bytes (may overlap)
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
+ MOVQ SI, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm: // copy R9 literal bytes from R10 to AX (long form)
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveLong
+ MOVOU (R10), X0 // X0..X3 = first and last 32 bytes, stored after the loop
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = AX mod 32
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX mod 32), so AX+R14-32 is 32-byte aligned for the MOVOA stores
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 // branch on the decremented block count
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: // 32 bytes per pass: unaligned loads, aligned stores
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: // 32 bytes per pass at offset R14-32
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 // continue while length >= R14
+ MOVOU X0, (AX) // finally store the saved head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX // advance the dst cursor past the literals
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: // literals done; emit the copy for the match
+ ADDL R12, CX // CX = end of the match in src
+ ADDL $0x04, R12 // R12 = total match length (4 + extra bytes)
+ MOVL CX, 12(SP) // nextEmit = end of the match
+
+ // emitCopy
+ CMPL R8, $0x00010000
+ JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm // offset < 65536: 2-byte-offset forms
+
+four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: // 4-byte-offset copies, 64 bytes per element while length > 64
+ CMPL R12, $0x40
+ JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
+ MOVB $0xff, (AX) // tag 0xff followed by the 4-byte offset; accounts for 64 bytes
+ MOVL R8, 1(AX)
+ LEAL -64(R12), R12
+ ADDQ $0x05, AX
+ CMPL R12, $0x04
+ JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
+ JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
+
+four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: // final 4-byte-offset copy for the remaining length
+ TESTL R12, R12
+ JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm // nothing left
+ MOVB $0x03, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = 3 + (length-1)*4; only that byte is stored
+ MOVB R12, (AX)
+ MOVL R8, 1(AX)
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: // 2-byte-offset copies, 60 bytes per element while length > 64
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
+ MOVB $0xee, (AX) // tag 0xee followed by the 2-byte offset; accounts for 60 bytes
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: // length <= 64: pick the 2-byte or 3-byte element
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm // length >= 12 needs the 3-byte form
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12 // low byte = 1 + (length-4)*4
+ MOVB R8, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8..10 go into tag bits 5..7
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: // 3-byte element: tag plus 2-byte offset
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = 2 + (length-1)*4
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm // match ended at or beyond src_len - 8: finish up
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // dst cursor reached the limit: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm: // index positions near the start and end of the match in both tables
+ MOVQ $0x00cf1bbcdcbfa563, SI // 7-byte hash multiplier
+ MOVQ $0x9e3779b1, R8 // 4-byte hash multiplier
+ INCL DI // DI = match start + 1
+ MOVQ (DX)(DI*1), R9 // R9 = 8 bytes of src at DI
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // R11 = bytes starting at DI+1
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // R12 = bytes starting at DI+2
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9 // R9 = 8 bytes of src at CX-2
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10 // 7-byte hash of the bytes at DI
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13 // 7-byte hash of the bytes at DI+1
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11 // 4-byte hash of the bytes at DI+1
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x32, R12 // 4-byte hash of the bytes at DI+2
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 262168(SP)(R11*4)
+ MOVL R15, 262168(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // R11 = bytes starting at CX-1
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10 // 7-byte hash of the bytes at CX-2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11 // 4-byte hash of the bytes at CX-1
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13 // 7-byte hash of the bytes at CX-1
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 262168(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm
+
+emit_remainder_encodeSnappyBetterBlockAsm: // emit src[nextEmit:src_len] as a final literal, if it fits
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = remaining literal length
+ LEAQ 5(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm
+ MOVQ $0x00000000, ret+48(FP) // remaining literals plus 5 bytes would reach the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = literal source pointer
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = length - 1 (the src base in DX is no longer needed)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm // length-1 < 60: 1-byte header
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm // < 256: 2-byte header
+ CMPL DX, $0x00010000
+ JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm // < 65536: 3-byte header
+ CMPL DX, $0x01000000
+ JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm // < 1<<24: 4-byte header
+ MOVB $0xfc, (AX) // 5-byte header: tag 0xfc followed by a 4-byte length-1
+ MOVL DX, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: // tag 0xf8 followed by a 3-byte length-1
+ MOVL DX, BX
+ SHRL $0x10, BX // BL = bits 16..23 of length-1
+ MOVB $0xf8, (AX)
+ MOVW DX, 1(AX)
+ MOVB BL, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: // tag 0xf4 followed by a 2-byte length-1
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: // tag 0xf0 followed by a 1-byte length-1
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm // length-1 < 64: short copy routine is enough
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm: // tag = (length-1) << 2
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm: // copy SI literal bytes from CX to AX (short form)
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: // always moves a full 8 bytes; AX is then set to DX, so only BX bytes count
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: // first 8 and last 8 bytes (may overlap)
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: // first 16 and last 16 bytes (may overlap)
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: // first 32 and last 32 bytes (may overlap)
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
+ MOVQ DX, AX // advance the dst cursor past the literals
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: // copy SI literal bytes from CX to AX (long form)
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong
+ MOVOU (CX), X0 // X0..X3 = first and last 32 bytes, stored after the loop
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = AX mod 32
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX+R8-32 is 32-byte aligned for the MOVOA stores
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 // branch on the decremented block count
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: // 32 bytes per pass: unaligned loads, aligned stores
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: // 32 bytes per pass at offset R8-32
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 // continue while length >= R8
+ MOVOU X0, (AX) // finally store the saved head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance the dst cursor past the literals
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: // return the number of bytes written to dst
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = dst cursor - dst base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x00000a00, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm64K:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm64K
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBetterBlockAsm64K:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x07, SI
+ LEAL 1(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm64K
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x32, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 262168(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 262168(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm64K
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm64K
+
+candidateS_match_encodeSnappyBetterBlockAsm64K:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x08, R10
+ IMULQ R9, R10
+ SHRQ $0x30, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm64K
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm64K:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
+
+match_extend_back_end_encodeSnappyBetterBlockAsm64K:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm64K
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm64K:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
+
+match_nolit_end_encodeSnappyBetterBlockAsm64K:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm64K
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm64K:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm64K
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
+ MOVQ $0x00cf1bbcdcbfa563, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x32, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 262168(SP)(R11*4)
+ MOVL R15, 262168(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x08, R10
+ IMULQ SI, R10
+ SHRQ $0x30, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x32, R11
+ SHLQ $0x08, R13
+ IMULQ SI, R13
+ SHRQ $0x30, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 262168(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm64K
+
+emit_remainder_encodeSnappyBetterBlockAsm64K:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
+// Requires: SSE2. Compresses src into dst using two hash tables kept in the stack frame; returns the number of bytes written to dst, or 0 when a size check against the dst limit fails.
+TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 // frame = 24 scratch bytes + 65536-byte table at 24(SP) + 16384-byte table at 65560(SP)
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor for the whole function
+ MOVQ $0x00000280, CX // 0x280 iterations * 128 bytes = 81920 bytes, i.e. both tables
+ LEAQ 24(SP), DX // DX = start of the table area
+ PXOR X0, X0 // X0 = 16 zero bytes
+
+zero_loop_encodeSnappyBetterBlockAsm12B:
+ MOVOU X0, (DX) // clear 128 bytes of table space per iteration
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm12B
+ MOVL $0x00000000, 12(SP) // 12(SP) = nextEmit: src index of the first byte not yet emitted
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP) = search limit = len(src) - 8
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = len(src) - 9 - len(src)/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP) = dst limit pointer; output reaching it makes the function return 0
+ MOVL $0x00000001, CX // CX = s, the current src index; scanning starts at 1
+ MOVL $0x00000000, 16(SP) // 16(SP) = last match offset, initially 0
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBetterBlockAsm12B:
+ MOVL CX, SI
+ SUBL 12(SP), SI // SI = s - nextEmit
+ SHRL $0x06, SI // extra skip of 1 for every 64 bytes scanned without a match
+ LEAL 1(CX)(SI*1), SI // SI = next position to try = s + 1 + skip
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm12B // next position reaches the search limit: emit the tail as a literal
+ MOVQ (DX)(CX*1), DI // DI = 8 src bytes starting at s
+ MOVL SI, 20(SP) // 20(SP) = saved next position
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier used for the 6-byte hash
+ MOVQ $0x9e3779b1, SI // multiplier used for the 4-byte hash
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10 // discard the top 2 bytes so only the low 6 bytes are hashed
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // keep the top 14 bits: index into the table at 24(SP)
+ SHLQ $0x20, R11 // discard the top 4 bytes so only the low 4 bytes are hashed
+ IMULQ SI, R11
+ SHRQ $0x34, R11 // keep the top 12 bits: index into the table at 65560(SP)
+ MOVL 24(SP)(R10*4), SI // SI = candidate position from the 6-byte-hash table
+ MOVL 65560(SP)(R11*4), R8 // R8 = candidate position from the 4-byte-hash table
+ MOVL CX, 24(SP)(R10*4) // record s in both tables
+ MOVL CX, 65560(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // compare 4 bytes at the first candidate with 4 bytes at s
+ JEQ candidate_match_encodeSnappyBetterBlockAsm12B
+ CMPL (DX)(R8*1), DI // compare 4 bytes at the second candidate with 4 bytes at s
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
+ MOVL 20(SP), CX // no match: s = saved next position
+ JMP search_loop_encodeSnappyBetterBlockAsm12B
+
+candidateS_match_encodeSnappyBetterBlockAsm12B:
+ SHRQ $0x08, DI // DI = src bytes starting at s+1
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x32, R10 // 6-byte hash of the bytes at s+1
+ MOVL 24(SP)(R10*4), SI // SI = candidate for s+1 from the table at 24(SP)
+ INCL CX // tentatively move to s+1
+ MOVL CX, 24(SP)(R10*4) // record s+1 in the table
+ CMPL (DX)(SI*1), DI // does the s+1 candidate match 4 bytes?
+ JEQ candidate_match_encodeSnappyBetterBlockAsm12B
+ DECL CX // no: go back to s
+ MOVL R8, SI // and use the 4-byte-hash candidate
+
+candidate_match_encodeSnappyBetterBlockAsm12B:
+ MOVL 12(SP), DI // DI = nextEmit, lower bound for extending backwards
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B // candidate at index 0 cannot be extended backwards
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B // stop once s reaches nextEmit
+ MOVB -1(DX)(SI*1), BL // byte before the candidate
+ MOVB -1(DX)(CX*1), R8 // byte before s
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
+ LEAL -1(CX), CX // both bytes equal: move s and the candidate back by one
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B // candidate reached index 0
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
+
+match_extend_back_end_encodeSnappyBetterBlockAsm12B:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = number of pending literal bytes (s - nextEmit)
+ LEAQ 3(AX)(DI*1), DI // dst cursor + literal bytes + 3
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm12B // still below the dst limit
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm12B:
+ MOVL CX, DI // DI = match start (base)
+ ADDL $0x04, CX // skip the 4 bytes already compared
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = src bytes remaining after s+4
+ LEAQ (DX)(CX*1), R9 // R9 = &src[s+4]
+ LEAQ (DX)(SI*1), R10 // R10 = &src[candidate+4]
+
+ // matchLen: R12 = count of further equal bytes at R9 and R10, limited by R8
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B // fewer than 8 bytes left: compare byte by byte
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11 // non-zero bits mark differing bytes in this 8-byte group
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B // all 8 bytes equal
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // bit index / 8 = number of equal leading bytes
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
+ LEAL -8(R8), R8 // 8 fewer bytes remaining
+ LEAL 8(R12), R12 // 8 more bytes matched
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm12B // nothing left to compare
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
+
+match_nolit_end_encodeSnappyBetterBlockAsm12B:
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset = (s+4) - (candidate+4)
+
+ // Check if repeat (the code below only stores the offset in 16(SP); no repeat comparison follows), then emit pending literals src[nextEmit:base]
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI // SI = nextEmit
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B // nextEmit == base: no literal bytes to emit
+ MOVL DI, R9
+ MOVL DI, 12(SP) // nextEmit = base
+ LEAQ (DX)(SI*1), R10 // R10 = pointer to the first literal byte
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = literal length - 1, the value stored in the tag
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B // length-1 < 60 fits in the tag byte itself
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B // length-1 < 256 fits in one extra byte
+ MOVB $0xf4, (AX) // tag 0xf4 (61<<2), followed by length-1 as a 16-bit value
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // tag 0xf0 (60<<2), followed by length-1 as one byte
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm12B // length <= 64: short copy
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
+ SHLB $0x02, SI // tag = (length-1) << 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveShort: copy R9 (at most 64) bytes from R10 to AX, dispatching on length
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
+ MOVQ (R10), R11 // always moves a full 8 bytes, even when R9 < 8; the cursor is later set from SI
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (R10), R11 // first 8 bytes
+ MOVQ -8(R10)(R9*1), R10 // last 8 bytes (may overlap the first 8)
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (R10), X0 // first 16 bytes
+ MOVOU -16(R10)(R9*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (R10), X0 // first 32 bytes in X0/X1
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 bytes in X2/X3 (may overlap)
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
+ MOVQ SI, AX // advance the dst cursor past the literal
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literal bytes
+
+ // genMemMoveLong: copy R9 bytes from R10 to AX; head and tail are loaded first and stored last with unaligned moves, the middle uses aligned stores
+ MOVOU (R10), X0 // first 32 source bytes
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2 // last 32 source bytes
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = dst address modulo 32
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (dst mod 32), so AX + R14 - 32 is 32-byte aligned
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // NOTE(review): DEC leaves CF unchanged, so this tests CF from SUBQ together with ZF from DECQ; confirm against the generator before editing
+ LEAQ -32(R10)(R14*1), R11 // R11 = source address matching the aligned dst address
+ LEAQ -32(AX)(R14*1), R15 // R15 = aligned dst address
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15) // aligned 16-byte stores
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back // NOTE(review): depends on CF from ADDQ and ZF from DECQ; confirm before editing
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4 // 32 source bytes at offset R14-32
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1) // aligned stores at the same offset in dst
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // continue while length >= R14
+ MOVOU X0, (AX) // finally store the saved head
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1) // and the saved tail
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX // advance the dst cursor past the literal
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
+ ADDL R12, CX // s = end of the match
+ ADDL $0x04, R12 // R12 = total match length including the first 4 bytes
+ MOVL CX, 12(SP) // nextEmit = s
+
+ // emitCopy: encode a copy of R12 bytes at offset R8
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B // 64 bytes or fewer remain: finish with one element
+ MOVB $0xee, (AX) // tag 0xee = (59<<2)|2, followed by the 16-bit offset
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12 // 60 bytes of the match accounted for
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B // length >= 12 uses the 3-byte form
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B // offset >= 2048 uses the 3-byte form
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12 // low byte = ((length-4)<<2) | 1; only the low byte is stored below
+ MOVB R8, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8 and up moved to bit 5 and up
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = ((length-1)<<2) | 2; only the low byte is stored below
+ MOVB R12, (AX)
+ MOVW R8, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm12B // s reached the search limit
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B // dst cursor still below the limit
+ MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI // 6-byte hash multiplier
+ MOVQ $0x9e3779b1, R8 // 4-byte hash multiplier
+ INCL DI // DI = base + 1
+ MOVQ (DX)(DI*1), R9 // 8 src bytes at base+1
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // bytes at base+2
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // bytes at base+3
+ LEAL 1(DI), R14 // R14 = base + 2
+ LEAL 2(DI), R15 // R15 = base + 3
+ MOVQ -2(DX)(CX*1), R9 // 8 src bytes at s-2
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x32, R10 // 6-byte hash at base+1
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x32, R13 // 6-byte hash at base+2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x34, R11 // 4-byte hash at base+2
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x34, R12 // 4-byte hash at base+3
+ MOVL DI, 24(SP)(R10*4) // index positions inside the match into both tables
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 65560(SP)(R11*4)
+ MOVL R15, 65560(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // bytes at s-1
+ MOVQ R11, R13
+ LEAL -2(CX), R9 // R9 = s - 2
+ LEAL -1(CX), DI // DI = s - 1
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x32, R10 // 6-byte hash at s-2
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x34, R11 // 4-byte hash at s-1
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x32, R13 // 6-byte hash at s-1
+ MOVL R9, 24(SP)(R10*4) // index the positions just before s
+ MOVL DI, 65560(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm12B
+
+emit_remainder_encodeSnappyBetterBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = bytes left to emit = len(src) - nextEmit
+ LEAQ 3(AX)(CX*1), CX // dst cursor + remaining bytes + 3
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX // BX = nextEmit
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP) // nextEmit = len(src)
+ LEAQ (DX)(BX*1), CX // CX = pointer to the first remaining byte
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = literal length - 1 (the src base in DX is overwritten here)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B // length-1 < 60 fits in the tag byte
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B // length-1 < 256 fits in one extra byte
+ MOVB $0xf4, (AX) // tag 0xf4 (61<<2), followed by length-1 as a 16-bit value
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ MOVB $0xf0, (AX) // tag 0xf0 (60<<2), followed by length-1 as one byte
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B // length <= 64: short copy
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ SHLB $0x02, DL // tag = (length-1) << 2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: copy BX (at most 64) bytes from CX to AX, dispatching on length
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8:
+ MOVQ (CX), SI // always loads a full 8 bytes even when BX < 8; NOTE(review): assumes those bytes are readable at the end of src, confirm with the caller
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
+ MOVQ (CX), SI // first 8 bytes
+ MOVQ -8(CX)(BX*1), CX // last 8 bytes (may overlap the first 8)
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
+ MOVOU (CX), X0 // first 16 bytes
+ MOVOU -16(CX)(BX*1), X1 // last 16 bytes (may overlap)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
+ MOVOU (CX), X0 // first 32 bytes in X0/X1
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 bytes in X2/X3 (may overlap)
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ MOVQ DX, AX // advance the dst cursor past the literal
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: copy BX bytes from CX to AX; head and tail are loaded first and stored last with unaligned moves, the middle uses aligned stores
+ MOVOU (CX), X0 // first 32 source bytes
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2 // last 32 source bytes
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI // SI = dst address modulo 32
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (dst mod 32), so AX + R8 - 32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // NOTE(review): DEC leaves CF unchanged, so this tests CF from SUBQ together with ZF from DECQ; confirm against the generator before editing
+ LEAQ -32(CX)(R8*1), SI // SI = source address matching the aligned dst address
+ LEAQ -32(AX)(R8*1), R9 // R9 = aligned dst address
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9) // aligned 16-byte stores
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back // NOTE(review): depends on CF from ADDQ and ZF from DECQ; confirm before editing
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4 // 32 source bytes at offset R8-32
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1) // aligned stores at the same offset in dst
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // continue while length >= R8
+ MOVOU X0, (AX) // finally store the saved head
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1) // and the saved tail
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX // advance the dst cursor past the literal
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // bytes written = dst cursor - dst base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
+ MOVQ dst_base+0(FP), AX
+ MOVQ $0x000000a0, CX
+ LEAQ 24(SP), DX
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm10B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm10B
+ MOVL $0x00000000, 12(SP)
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP)
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP)
+ MOVL $0x00000001, CX
+ MOVL $0x00000000, 16(SP)
+ MOVQ src_base+24(FP), DX
+
+search_loop_encodeSnappyBetterBlockAsm10B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x05, SI
+ LEAL 1(CX)(SI*1), SI
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm10B
+ MOVQ (DX)(CX*1), DI
+ MOVL SI, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R9
+ MOVQ $0x9e3779b1, SI
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ SHLQ $0x20, R11
+ IMULQ SI, R11
+ SHRQ $0x36, R11
+ MOVL 24(SP)(R10*4), SI
+ MOVL 16408(SP)(R11*4), R8
+ MOVL CX, 24(SP)(R10*4)
+ MOVL CX, 16408(SP)(R11*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm10B
+ CMPL (DX)(R8*1), DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm10B
+
+candidateS_match_encodeSnappyBetterBlockAsm10B:
+ SHRQ $0x08, DI
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x34, R10
+ MOVL 24(SP)(R10*4), SI
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm10B
+ DECL CX
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm10B:
+ MOVL 12(SP), DI
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
+ LEAL -1(CX), CX
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
+
+match_extend_back_end_encodeSnappyBetterBlockAsm10B:
+ MOVL CX, DI
+ SUBL 12(SP), DI
+ LEAQ 3(AX)(DI*1), DI
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm10B:
+ MOVL CX, DI
+ ADDL $0x04, CX
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(SI*1), R10
+
+ // matchLen
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
+ BSFQ R11, R11
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
+
+match_nolit_end_encodeSnappyBetterBlockAsm10B:
+ MOVL CX, R8
+ SUBL SI, R8
+
+ // Check if repeat
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10
+ SUBL SI, R9
+ LEAL -1(R9), SI
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm10B
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
+ SHLB $0x02, SI
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveShort
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
+ MOVQ (R10), R11
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
+ MOVQ SI, AX
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
+ LEAQ (AX)(R9*1), SI
+
+ // genMemMoveLong
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1)
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
+ ADDL R12, CX
+ ADDL $0x04, R12
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
+ MOVB $0xee, (AX)
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12
+ MOVB R8, 1(AX)
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12
+ MOVB R12, (AX)
+ MOVW R8, 1(AX)
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm10B
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI
+ MOVQ $0x9e3779b1, R8
+ INCL DI
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ SHRQ $0x10, R12
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x34, R10
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x34, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x36, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x36, R12
+ MOVL DI, 24(SP)(R10*4)
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 16408(SP)(R11*4)
+ MOVL R15, 16408(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x34, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x36, R11
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x34, R13
+ MOVL R9, 24(SP)(R10*4)
+ MOVL DI, 16408(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm10B
+
+emit_remainder_encodeSnappyBetterBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B
+ MOVQ $0x00000000, ret+48(FP)
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX
+ SUBL BX, SI
+ LEAL -1(SI), DX
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
+ MOVB $0xf4, (AX)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ MOVB $0xf0, (AX)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ SHLB $0x02, DL
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveShort
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8:
+ MOVQ (CX), SI
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ MOVQ DX, AX
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ LEAQ (AX)(SI*1), DX
+ MOVL SI, BX
+
+ // genMemMoveLong
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1)
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
+// Requires: SSE2
+TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 // Encodes src into dst; returns bytes written, or 0 when an output-limit check fails. Frame = 24 bytes of state + 5120 bytes of tables.
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ $0x00000028, CX // 40 iterations x 128 bytes = 5120 bytes to clear
+ LEAQ 24(SP), DX // tables start at 24(SP)
+ PXOR X0, X0
+
+zero_loop_encodeSnappyBetterBlockAsm8B:
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_encodeSnappyBetterBlockAsm8B
+ MOVL $0x00000000, 12(SP) // 12(SP): src offset of the first byte not yet emitted
+ MOVQ src_len+32(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), SI
+ MOVL SI, 8(SP) // 8(SP): search limit = len(src) - 8
+ SHRQ $0x05, CX
+ SUBL CX, DX // DX = len(src) - 9 - len(src)/32
+ LEAQ (AX)(DX*1), DX
+ MOVQ DX, (SP) // (SP): dst limit pointer; reaching it makes the function return 0
+ MOVL $0x00000001, CX // CX = current src position, starting at 1
+ MOVL $0x00000000, 16(SP) // 16(SP): last match offset, initialised to 0
+ MOVQ src_base+24(FP), DX // DX = src base pointer
+
+search_loop_encodeSnappyBetterBlockAsm8B:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ SHRL $0x04, SI
+ LEAL 1(CX)(SI*1), SI // SI = next position = CX + 1 + (CX - 12(SP)) >> 4
+ CMPL SI, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm8B // next position at or past the search limit: flush remaining literals
+ MOVQ (DX)(CX*1), DI // DI = 8 bytes of src at CX
+ MOVL SI, 20(SP) // 20(SP): next position, reloaded when no candidate matches
+ MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
+ MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
+ MOVQ DI, R10
+ MOVQ DI, R11
+ SHLQ $0x10, R10 // drop the top 2 bytes: hash the low 6 bytes
+ IMULQ R9, R10
+ SHRQ $0x36, R10 // keep the top 10 bits: index into the table at 24(SP)
+ SHLQ $0x20, R11 // drop the top 4 bytes: hash the low 4 bytes
+ IMULQ SI, R11
+ SHRQ $0x38, R11 // keep the top 8 bits: index into the table at 4120(SP)
+ MOVL 24(SP)(R10*4), SI // SI = candidate from the 6-byte-hash table
+ MOVL 4120(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
+ MOVL CX, 24(SP)(R10*4) // both tables now point at the current position
+ MOVL CX, 4120(SP)(R11*4)
+ CMPL (DX)(SI*1), DI // compare 4 bytes at the first candidate with the low 4 bytes of DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm8B
+ CMPL (DX)(R8*1), DI // same check for the second candidate
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
+ MOVL 20(SP), CX // no match: advance to the stored next position
+ JMP search_loop_encodeSnappyBetterBlockAsm8B
+
+candidateS_match_encodeSnappyBetterBlockAsm8B:
+ SHRQ $0x08, DI // DI = bytes starting at CX+1
+ MOVQ DI, R10
+ SHLQ $0x10, R10
+ IMULQ R9, R10
+ SHRQ $0x36, R10 // 6-byte hash of the bytes at CX+1
+ MOVL 24(SP)(R10*4), SI // SI = 6-byte-hash candidate for CX+1
+ INCL CX
+ MOVL CX, 24(SP)(R10*4)
+ CMPL (DX)(SI*1), DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm8B // candidate for CX+1 matches: use it
+ DECL CX // otherwise go back to CX and use the 4-byte-hash candidate
+ MOVL R8, SI
+
+candidate_match_encodeSnappyBetterBlockAsm8B:
+ MOVL 12(SP), DI // DI = lower bound for backward extension
+ TESTL SI, SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B // candidate at offset 0 cannot be extended backwards
+
+match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
+ CMPL CX, DI
+ JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B
+ MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(CX*1), R8
+ CMPB BL, R8 // compare the bytes just before candidate and current position
+ JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
+ LEAL -1(CX), CX // LEAL keeps the flags untouched
+ DECL SI
+ JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B // candidate reached offset 0
+ JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
+
+match_extend_back_end_encodeSnappyBetterBlockAsm8B:
+ MOVL CX, DI
+ SUBL 12(SP), DI // DI = number of pending literal bytes
+ LEAQ 3(AX)(DI*1), DI // dst cursor + literal bytes + 3
+ CMPQ DI, (SP)
+ JL match_dst_size_check_encodeSnappyBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
+ RET
+
+match_dst_size_check_encodeSnappyBetterBlockAsm8B:
+ MOVL CX, DI // DI = match start position
+ ADDL $0x04, CX // skip 4 bytes at both positions before measuring the match
+ ADDL $0x04, SI
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8 // R8 = bytes left in src after CX
+ LEAQ (DX)(CX*1), R9 // R9 = &src[CX]
+ LEAQ (DX)(SI*1), R10 // R10 = &src[SI]
+
+ // matchLen: R12 = number of equal bytes at R9 and R10, limited to R8
+ XORL R12, R12
+ CMPL R8, $0x08
+ JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B
+
+matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
+ MOVQ (R9)(R12*1), R11
+ XORQ (R10)(R12*1), R11 // non-zero bits mark differing bytes
+ TESTQ R11, R11
+ JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
+ BSFQ R11, R11 // index of the lowest differing bit
+ SARQ $0x03, R11 // bit index -> byte index
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
+
+matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
+ LEAL -8(R8), R8
+ LEAL 8(R12), R12
+ CMPL R8, $0x08
+ JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
+
+matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B:
+ TESTL R8, R8
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
+
+matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
+ MOVB (R9)(R12*1), R11
+ CMPB (R10)(R12*1), R11
+ JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
+ LEAL 1(R12), R12
+ DECL R8
+ JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
+
+match_nolit_end_encodeSnappyBetterBlockAsm8B:
+ MOVL CX, R8
+ SUBL SI, R8 // R8 = match offset (distance between position and candidate)
+
+ // Check if repeat (no comparison is made in this variant; the offset is only stored in 16(SP))
+ MOVL R8, 16(SP)
+ MOVL 12(SP), SI
+ CMPL SI, DI
+ JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B // no pending literals before the match
+ MOVL DI, R9
+ MOVL DI, 12(SP)
+ LEAQ (DX)(SI*1), R10 // R10 = first pending literal byte
+ SUBL SI, R9 // R9 = literal length
+ LEAL -1(R9), SI // SI = literal length - 1
+ CMPL SI, $0x3c
+ JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16 bits of (length - 1)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
+
+two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by one byte of (length - 1)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_encodeSnappyBetterBlockAsm8B // length <= 64 still fits the short copy
+ JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
+
+one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
+ SHLB $0x02, SI // 1-byte header: (length - 1) << 2
+ MOVB SI, (AX)
+ ADDQ $0x01, AX
+
+memmove_match_emit_encodeSnappyBetterBlockAsm8B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveShort: copy R9 (<= 64) bytes from R10 to AX with overlapping loads/stores
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
+ MOVQ (R10), R11 // always moves a full 8 bytes, even when R9 < 8
+ MOVQ R11, (AX)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (R10), R11
+ MOVQ -8(R10)(R9*1), R10
+ MOVQ R11, (AX)
+ MOVQ R10, -8(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (R10), X0
+ MOVOU -16(R10)(R9*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
+ MOVQ SI, AX // advance the dst cursor past the copied literals
+ JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
+
+memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
+ LEAQ (AX)(R9*1), SI // SI = dst cursor after the literals
+
+ // genMemMoveLong: copy R9 bytes from R10 to AX; first/last 32 bytes are loaded up front and stored last
+ MOVOU (R10), X0
+ MOVOU 16(R10), X1
+ MOVOU -32(R10)(R9*1), X2
+ MOVOU -16(R10)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11
+ MOVQ $0x00000040, R14
+ SUBQ R11, R14 // R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte aligned
+ DECQ R13
+ JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(R10)(R14*1), R11
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R15)
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R11
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(R10)(R14*1), X4
+ MOVOU -16(R10)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1) // aligned 32-byte store of the middle section
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the saved head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ SI, AX
+
+emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
+ ADDL R12, CX // CX = position just after the match
+ ADDL $0x04, R12 // R12 = full match length (4 skipped bytes + measured bytes)
+ MOVL CX, 12(SP) // everything up to CX is now emitted
+
+ // emitCopy: R12 = length, R8 = offset, AX = dst cursor
+two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
+ CMPL R12, $0x40
+ JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
+ MOVB $0xee, (AX) // length > 64: emit tag 0xee + 16-bit offset, consuming 60 bytes of length
+ MOVW R8, 1(AX)
+ LEAL -60(R12), R12
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
+ CMPL R12, $0x0c
+ JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B // NOTE(review): no offset < 2048 check here, unlike emitCopy; presumably guaranteed by the caller's input size - confirm
+ MOVB $0x01, BL
+ LEAL -16(BX)(R12*4), R12 // low byte = ((length - 4) << 2) | 1; only the low byte is stored below
+ MOVB R8, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8 // offset bits above 8 go into the tag byte from bit 5 up
+ ORL R8, R12
+ MOVB R12, (AX)
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
+
+emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
+ MOVB $0x02, BL
+ LEAL -4(BX)(R12*4), R12 // low byte = ((length - 1) << 2) | 2; only the low byte is stored below
+ MOVB R12, (AX)
+ MOVW R8, 1(AX) // 16-bit offset
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_encodeSnappyBetterBlockAsm8B // at or past the search limit
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
+ RET
+
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
+ MOVQ $0x0000cf1bbcdcbf9b, SI // 6-byte hash multiplier
+ MOVQ $0x9e3779b1, R8 // 4-byte hash multiplier
+ INCL DI // DI = match start + 1
+ MOVQ (DX)(DI*1), R9
+ MOVQ R9, R10
+ MOVQ R9, R11
+ MOVQ R9, R12
+ SHRQ $0x08, R11 // bytes at DI+1
+ MOVQ R11, R13
+ SHRQ $0x10, R12 // bytes at DI+2
+ LEAL 1(DI), R14
+ LEAL 2(DI), R15
+ MOVQ -2(DX)(CX*1), R9 // R9 = 8 bytes at CX-2, hashed further below
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x36, R13
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x38, R11
+ SHLQ $0x20, R12
+ IMULQ R8, R12
+ SHRQ $0x38, R12
+ MOVL DI, 24(SP)(R10*4) // record positions near the match start in both tables
+ MOVL R14, 24(SP)(R13*4)
+ MOVL R14, 4120(SP)(R11*4)
+ MOVL R15, 4120(SP)(R12*4)
+ MOVQ R9, R10
+ MOVQ R9, R11
+ SHRQ $0x08, R11 // bytes at CX-1
+ MOVQ R11, R13
+ LEAL -2(CX), R9
+ LEAL -1(CX), DI
+ SHLQ $0x10, R10
+ IMULQ SI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x20, R11
+ IMULQ R8, R11
+ SHRQ $0x38, R11
+ SHLQ $0x10, R13
+ IMULQ SI, R13
+ SHRQ $0x36, R13
+ MOVL R9, 24(SP)(R10*4) // record positions CX-2 and CX-1 near the match end
+ MOVL DI, 4120(SP)(R11*4)
+ MOVL DI, 24(SP)(R13*4)
+ JMP search_loop_encodeSnappyBetterBlockAsm8B
+
+emit_remainder_encodeSnappyBetterBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ SUBL 12(SP), CX // CX = number of bytes not yet emitted
+ LEAQ 3(AX)(CX*1), CX
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B
+ MOVQ $0x00000000, ret+48(FP) // remaining literals would reach the dst limit: return 0
+ RET
+
+emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
+ MOVQ src_len+32(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B // nothing left to emit
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // CX = first remaining literal byte
+ SUBL BX, SI // SI = literal length
+ LEAL -1(SI), DX // DX = literal length - 1 (src base in DX is no longer needed)
+ CMPL DX, $0x3c
+ JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
+ CMPL DX, $0x00000100
+ JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16 bits of (length - 1)
+ MOVW DX, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by one byte of (length - 1)
+ MOVB DL, 1(AX)
+ ADDQ $0x02, AX
+ CMPL DX, $0x40
+ JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
+ JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ SHLB $0x02, DL // 1-byte header: (length - 1) << 2
+ MOVB DL, (AX)
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveShort: copy BX (<= 64) bytes from CX to AX with overlapping loads/stores
+ CMPQ BX, $0x08
+ JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8
+ CMPQ BX, $0x10
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
+ CMPQ BX, $0x20
+ JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
+ JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8:
+ MOVQ (CX), SI // always moves a full 8 bytes, even when BX < 8
+ MOVQ SI, (AX)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(BX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(BX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(BX*1)
+ JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ MOVQ DX, AX // advance the dst cursor past the copied literals
+ JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
+
+memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ LEAQ (AX)(SI*1), DX // DX = dst cursor after the literals
+ MOVL SI, BX // BX = literal length
+
+ // genMemMoveLong: copy BX bytes from CX to AX; first/last 32 bytes are loaded up front and stored last
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(BX*1), X2
+ MOVOU -16(CX)(BX*1), X3
+ MOVQ BX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1) // aligned 32-byte store of the middle section
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ BX, R8
+ JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the saved head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(BX*1)
+ MOVOU X3, -16(AX)(BX*1)
+ MOVQ DX, AX
+
+emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // bytes written = dst cursor - dst base
+ MOVQ AX, ret+48(FP)
+ RET
+
+// func emitLiteral(dst []byte, lit []byte) int
+// Requires: SSE2
+TEXT ·emitLiteral(SB), NOSPLIT, $0-56 // Writes a length header plus the bytes of lit to dst; returns header size + len(lit), or 0 for an empty lit.
+ MOVQ lit_len+32(FP), DX // DX = len(lit)
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ lit_base+24(FP), CX // CX = source pointer
+ TESTQ DX, DX
+ JZ emit_literal_end_standalone_skip // empty literal: return 0 without writing
+ MOVL DX, BX // BX accumulates the return value, starting at len(lit)
+ LEAL -1(DX), SI // SI = len(lit) - 1, the value stored in the header
+ CMPL SI, $0x3c
+ JLT one_byte_standalone
+ CMPL SI, $0x00000100
+ JLT two_bytes_standalone
+ CMPL SI, $0x00010000
+ JLT three_bytes_standalone
+ CMPL SI, $0x01000000
+ JLT four_bytes_standalone
+ MOVB $0xfc, (AX) // 5-byte header: 0xfc followed by 32 bits of (length - 1)
+ MOVL SI, 1(AX)
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP memmove_long_standalone
+
+four_bytes_standalone:
+ MOVL SI, DI
+ SHRL $0x10, DI // DI = bits 16 and up of (length - 1)
+ MOVB $0xf8, (AX) // 4-byte header: 0xf8 followed by 24 bits of (length - 1)
+ MOVW SI, 1(AX)
+ MOVB DI, 3(AX)
+ ADDQ $0x04, BX
+ ADDQ $0x04, AX
+ JMP memmove_long_standalone
+
+three_bytes_standalone:
+ MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16 bits of (length - 1)
+ MOVW SI, 1(AX)
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+ JMP memmove_long_standalone
+
+two_bytes_standalone:
+ MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by one byte of (length - 1)
+ MOVB SI, 1(AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_standalone // length <= 64 still fits the short copy
+ JMP memmove_long_standalone
+
+one_byte_standalone:
+ SHLB $0x02, SI // 1-byte header: (length - 1) << 2
+ MOVB SI, (AX)
+ ADDQ $0x01, BX
+ ADDQ $0x01, AX
+
+memmove_standalone:
+ // genMemMoveShort: copy DX (1..64) bytes from CX to AX, dispatching on size
+ CMPQ DX, $0x03
+ JB emit_lit_memmove_standalone_memmove_move_1or2
+ JE emit_lit_memmove_standalone_memmove_move_3 // flags still come from the CMPQ above
+ CMPQ DX, $0x08
+ JB emit_lit_memmove_standalone_memmove_move_4through7
+ CMPQ DX, $0x10
+ JBE emit_lit_memmove_standalone_memmove_move_8through16
+ CMPQ DX, $0x20
+ JBE emit_lit_memmove_standalone_memmove_move_17through32
+ JMP emit_lit_memmove_standalone_memmove_move_33through64
+
+emit_lit_memmove_standalone_memmove_move_1or2:
+ MOVB (CX), SI // first byte
+ MOVB -1(CX)(DX*1), CL // last byte (same byte when DX == 1)
+ MOVB SI, (AX)
+ MOVB CL, -1(AX)(DX*1)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_3:
+ MOVW (CX), SI
+ MOVB 2(CX), CL
+ MOVW SI, (AX)
+ MOVB CL, 2(AX)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_4through7:
+ MOVL (CX), SI // first 4 bytes
+ MOVL -4(CX)(DX*1), CX // last 4 bytes, possibly overlapping the first
+ MOVL SI, (AX)
+ MOVL CX, -4(AX)(DX*1)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_8through16:
+ MOVQ (CX), SI
+ MOVQ -8(CX)(DX*1), CX
+ MOVQ SI, (AX)
+ MOVQ CX, -8(AX)(DX*1)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_17through32:
+ MOVOU (CX), X0
+ MOVOU -16(CX)(DX*1), X1
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(DX*1)
+ JMP emit_literal_end_standalone
+
+emit_lit_memmove_standalone_memmove_move_33through64:
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(DX*1), X2
+ MOVOU -16(CX)(DX*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(DX*1)
+ MOVOU X3, -16(AX)(DX*1)
+ JMP emit_literal_end_standalone
+ JMP emit_literal_end_standalone // unreachable: directly follows an unconditional JMP with no label in between
+
+memmove_long_standalone:
+ // genMemMoveLong: copy DX bytes from CX to AX; first/last 32 bytes are loaded up front and stored last
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU -32(CX)(DX*1), X2
+ MOVOU -16(CX)(DX*1), X3
+ MOVQ DX, DI
+ SHRQ $0x05, DI // DI = length / 32
+ MOVQ AX, SI
+ ANDL $0x0000001f, SI
+ MOVQ $0x00000040, R8
+ SUBQ SI, R8 // R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned
+ DECQ DI
+ JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
+ LEAQ -32(CX)(R8*1), SI
+ LEAQ -32(AX)(R8*1), R9
+
+emit_lit_memmove_long_standalonelarge_big_loop_back:
+ MOVOU (SI), X4
+ MOVOU 16(SI), X5
+ MOVOA X4, (R9)
+ MOVOA X5, 16(R9)
+ ADDQ $0x20, R9
+ ADDQ $0x20, SI
+ ADDQ $0x20, R8
+ DECQ DI
+ JNA emit_lit_memmove_long_standalonelarge_big_loop_back
+
+emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
+ MOVOU -32(CX)(R8*1), X4
+ MOVOU -16(CX)(R8*1), X5
+ MOVOA X4, -32(AX)(R8*1) // aligned 32-byte store of the middle section
+ MOVOA X5, -16(AX)(R8*1)
+ ADDQ $0x20, R8
+ CMPQ DX, R8
+ JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
+ MOVOU X0, (AX) // finally store the saved head and tail
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(DX*1)
+ MOVOU X3, -16(AX)(DX*1)
+ JMP emit_literal_end_standalone
+ JMP emit_literal_end_standalone // unreachable: directly follows an unconditional JMP with no label in between
+
+emit_literal_end_standalone_skip:
+ XORQ BX, BX // empty input: result is 0
+
+emit_literal_end_standalone:
+ MOVQ BX, ret+48(FP) // return header size + literal length
+ RET
+
+// func emitRepeat(dst []byte, offset int, length int) int
+TEXT ·emitRepeat(SB), NOSPLIT, $0-48 // Writes one or more repeat codes for length to dst, choosing the encoding by size; returns the number of bytes written.
+ XORQ BX, BX // BX = bytes written so far
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ offset+24(FP), CX // CX = offset
+ MOVQ length+32(FP), DX // DX = length
+
+ // emitRepeat: SI keeps the incoming length, DX becomes length - 4
+emit_repeat_again_standalone:
+ MOVL DX, SI
+ LEAL -4(DX), DX
+ CMPL SI, $0x08
+ JLE repeat_two_standalone // length <= 8: 2-byte form
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_standalone
+ CMPL CX, $0x00000800
+ JLT repeat_two_offset_standalone // length 9..11 and offset < 2048: 2-byte form carrying the offset
+
+cant_repeat_two_offset_standalone:
+ CMPL DX, $0x00000104
+ JLT repeat_three_standalone
+ CMPL DX, $0x00010100
+ JLT repeat_four_standalone
+ CMPL DX, $0x0100ffff
+ JLT repeat_five_standalone
+ LEAL -16842747(DX), DX // too long for one code: write the fixed 5 bytes 1d 00 fb ff ff, subtract 16842747, go again
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ ADDQ $0x05, BX
+ JMP emit_repeat_again_standalone
+
+repeat_five_standalone:
+ LEAL -65536(DX), DX // 5-byte form: 0x001d (16 bit) + 24 bits of (length - 4 - 65536)
+ MOVL DX, CX
+ MOVW $0x001d, (AX)
+ MOVW DX, 2(AX)
+ SARL $0x10, CX
+ MOVB CL, 4(AX)
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_repeat_end
+
+repeat_four_standalone:
+ LEAL -256(DX), DX // 4-byte form: 0x0019 (16 bit) + 16 bits of (length - 4 - 256)
+ MOVW $0x0019, (AX)
+ MOVW DX, 2(AX)
+ ADDQ $0x04, BX
+ ADDQ $0x04, AX
+ JMP gen_emit_repeat_end
+
+repeat_three_standalone:
+ LEAL -4(DX), DX // 3-byte form: 0x0015 (16 bit) + one byte of (length - 8)
+ MOVW $0x0015, (AX)
+ MOVB DL, 2(AX)
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+ JMP gen_emit_repeat_end
+
+repeat_two_standalone:
+ SHLL $0x02, DX // 2-byte form: 16-bit value ((length - 4) << 2) | 1
+ ORL $0x01, DX
+ MOVW DX, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_repeat_end
+
+repeat_two_offset_standalone:
+ XORQ SI, SI
+ LEAL 1(SI)(DX*4), DX // DX = ((length - 4) << 2) | 1
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SARL $0x08, CX
+ SHLL $0x05, CX // offset bits above 8 go into the first byte from bit 5 up
+ ORL CX, DX
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+
+gen_emit_repeat_end:
+ MOVQ BX, ret+40(FP) // return the number of bytes written
+ RET
+
+// func emitCopy(dst []byte, offset int, length int) int
+TEXT ·emitCopy(SB), NOSPLIT, $0-48 // Writes a copy code for (offset, length) to dst, followed by repeat codes when length exceeds 64; returns the number of bytes written.
+ XORQ BX, BX // BX = bytes written so far
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ offset+24(FP), CX // CX = offset
+ MOVQ length+32(FP), DX // DX = length
+
+ // emitCopy: offsets below 65536 use the 2-byte-offset forms
+ CMPL CX, $0x00010000
+ JL two_byte_offset_standalone
+
+four_bytes_loop_back_standalone:
+ CMPL DX, $0x40
+ JLE four_bytes_remain_standalone
+ MOVB $0xff, (AX) // length > 64: tag 0xff + 32-bit offset, consuming 64 bytes of length
+ MOVL CX, 1(AX)
+ LEAL -64(DX), DX
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ CMPL DX, $0x04
+ JL four_bytes_remain_standalone // fewer than 4 bytes left: finish with another copy code
+
+ // emitRepeat: encode the remaining length as repeat codes (same layout as the emitRepeat function)
+emit_repeat_again_standalone_emit_copy:
+ MOVL DX, SI
+ LEAL -4(DX), DX
+ CMPL SI, $0x08
+ JLE repeat_two_standalone_emit_copy
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_standalone_emit_copy
+ CMPL CX, $0x00000800
+ JLT repeat_two_offset_standalone_emit_copy
+
+cant_repeat_two_offset_standalone_emit_copy:
+ CMPL DX, $0x00000104
+ JLT repeat_three_standalone_emit_copy
+ CMPL DX, $0x00010100
+ JLT repeat_four_standalone_emit_copy
+ CMPL DX, $0x0100ffff
+ JLT repeat_five_standalone_emit_copy
+ LEAL -16842747(DX), DX // too long for one code: write the fixed 5 bytes 1d 00 fb ff ff, subtract 16842747, go again
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ ADDQ $0x05, BX
+ JMP emit_repeat_again_standalone_emit_copy
+
+repeat_five_standalone_emit_copy:
+ LEAL -65536(DX), DX // 5-byte form: 0x001d (16 bit) + 24 bits of (length - 4 - 65536)
+ MOVL DX, CX
+ MOVW $0x001d, (AX)
+ MOVW DX, 2(AX)
+ SARL $0x10, CX
+ MOVB CL, 4(AX)
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_copy_end
+
+repeat_four_standalone_emit_copy:
+ LEAL -256(DX), DX // 4-byte form: 0x0019 (16 bit) + 16 bits of (length - 4 - 256)
+ MOVW $0x0019, (AX)
+ MOVW DX, 2(AX)
+ ADDQ $0x04, BX
+ ADDQ $0x04, AX
+ JMP gen_emit_copy_end
+
+repeat_three_standalone_emit_copy:
+ LEAL -4(DX), DX // 3-byte form: 0x0015 (16 bit) + one byte of (length - 8)
+ MOVW $0x0015, (AX)
+ MOVB DL, 2(AX)
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+ JMP gen_emit_copy_end
+
+repeat_two_standalone_emit_copy:
+ SHLL $0x02, DX // 2-byte form: 16-bit value ((length - 4) << 2) | 1
+ ORL $0x01, DX
+ MOVW DX, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+
+repeat_two_offset_standalone_emit_copy:
+ XORQ SI, SI
+ LEAL 1(SI)(DX*4), DX // DX = ((length - 4) << 2) | 1
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SARL $0x08, CX
+ SHLL $0x05, CX // offset bits above 8 go into the first byte from bit 5 up
+ ORL CX, DX
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+ JMP four_bytes_loop_back_standalone // unreachable: directly follows an unconditional JMP with no label in between
+
+four_bytes_remain_standalone:
+ TESTL DX, DX
+ JZ gen_emit_copy_end // nothing left to encode
+ MOVB $0x03, SI
+ LEAL -4(SI)(DX*4), DX // low byte = ((length - 1) << 2) | 3; only DL is stored
+ MOVB DL, (AX)
+ MOVL CX, 1(AX) // 32-bit offset
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_copy_end
+
+two_byte_offset_standalone:
+ CMPL DX, $0x40
+ JLE two_byte_offset_short_standalone
+ MOVB $0xee, (AX) // length > 64: tag 0xee + 16-bit offset, consuming 60 bytes of length
+ MOVW CX, 1(AX)
+ LEAL -60(DX), DX
+ ADDQ $0x03, AX
+ ADDQ $0x03, BX
+
+ // emitRepeat: encode the remaining length as repeat codes (same layout as the emitRepeat function)
+emit_repeat_again_standalone_emit_copy_short:
+ MOVL DX, SI
+ LEAL -4(DX), DX
+ CMPL SI, $0x08
+ JLE repeat_two_standalone_emit_copy_short
+ CMPL SI, $0x0c
+ JGE cant_repeat_two_offset_standalone_emit_copy_short
+ CMPL CX, $0x00000800
+ JLT repeat_two_offset_standalone_emit_copy_short
+
+cant_repeat_two_offset_standalone_emit_copy_short:
+ CMPL DX, $0x00000104
+ JLT repeat_three_standalone_emit_copy_short
+ CMPL DX, $0x00010100
+ JLT repeat_four_standalone_emit_copy_short
+ CMPL DX, $0x0100ffff
+ JLT repeat_five_standalone_emit_copy_short
+ LEAL -16842747(DX), DX // too long for one code: write the fixed 5 bytes 1d 00 fb ff ff, subtract 16842747, go again
+ MOVW $0x001d, (AX)
+ MOVW $0xfffb, 2(AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ ADDQ $0x05, BX
+ JMP emit_repeat_again_standalone_emit_copy_short
+
+repeat_five_standalone_emit_copy_short:
+ LEAL -65536(DX), DX // 5-byte form: 0x001d (16 bit) + 24 bits of (length - 4 - 65536)
+ MOVL DX, CX
+ MOVW $0x001d, (AX)
+ MOVW DX, 2(AX)
+ SARL $0x10, CX
+ MOVB CL, 4(AX)
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_copy_end
+
+repeat_four_standalone_emit_copy_short:
+ LEAL -256(DX), DX // 4-byte form: 0x0019 (16 bit) + 16 bits of (length - 4 - 256)
+ MOVW $0x0019, (AX)
+ MOVW DX, 2(AX)
+ ADDQ $0x04, BX
+ ADDQ $0x04, AX
+ JMP gen_emit_copy_end
+
+repeat_three_standalone_emit_copy_short:
+ LEAL -4(DX), DX // 3-byte form: 0x0015 (16 bit) + one byte of (length - 8)
+ MOVW $0x0015, (AX)
+ MOVB DL, 2(AX)
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+ JMP gen_emit_copy_end
+
+repeat_two_standalone_emit_copy_short:
+ SHLL $0x02, DX // 2-byte form: 16-bit value ((length - 4) << 2) | 1
+ ORL $0x01, DX
+ MOVW DX, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+
+repeat_two_offset_standalone_emit_copy_short:
+ XORQ SI, SI
+ LEAL 1(SI)(DX*4), DX // DX = ((length - 4) << 2) | 1
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SARL $0x08, CX
+ SHLL $0x05, CX // offset bits above 8 go into the first byte from bit 5 up
+ ORL CX, DX
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+ JMP two_byte_offset_standalone // unreachable: directly follows an unconditional JMP with no label in between
+
+two_byte_offset_short_standalone:
+ CMPL DX, $0x0c
+ JGE emit_copy_three_standalone // length >= 12 needs the 3-byte form
+ CMPL CX, $0x00000800
+ JGE emit_copy_three_standalone // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, SI
+ LEAL -16(SI)(DX*4), DX // low byte = ((length - 4) << 2) | 1; only DL is stored
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, CX
+ SHLL $0x05, CX // offset bits above 8 go into the tag byte from bit 5 up
+ ORL CX, DX
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end
+
+emit_copy_three_standalone:
+ MOVB $0x02, SI
+ LEAL -4(SI)(DX*4), DX // low byte = ((length - 1) << 2) | 2; only DL is stored
+ MOVB DL, (AX)
+ MOVW CX, 1(AX) // 16-bit offset
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+
+gen_emit_copy_end:
+ MOVQ BX, ret+40(FP) // return the number of bytes written
+ RET
+
+// func emitCopyNoRepeat(dst []byte, offset int, length int) int
+TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 // Writes copy codes for (offset, length) to dst, looping over long lengths without using repeat codes; returns the number of bytes written.
+ XORQ BX, BX // BX = bytes written so far
+ MOVQ dst_base+0(FP), AX // AX = dst write cursor
+ MOVQ offset+24(FP), CX // CX = offset
+ MOVQ length+32(FP), DX // DX = length
+
+ // emitCopy: offsets below 65536 use the 2-byte-offset forms
+ CMPL CX, $0x00010000
+ JL two_byte_offset_standalone_snappy
+
+four_bytes_loop_back_standalone_snappy:
+ CMPL DX, $0x40
+ JLE four_bytes_remain_standalone_snappy
+ MOVB $0xff, (AX) // length > 64: tag 0xff + 32-bit offset, consuming 64 bytes of length
+ MOVL CX, 1(AX)
+ LEAL -64(DX), DX
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ CMPL DX, $0x04
+ JL four_bytes_remain_standalone_snappy
+ JMP four_bytes_loop_back_standalone_snappy
+
+four_bytes_remain_standalone_snappy:
+ TESTL DX, DX
+ JZ gen_emit_copy_end_snappy // nothing left to encode
+ MOVB $0x03, SI
+ LEAL -4(SI)(DX*4), DX // low byte = ((length - 1) << 2) | 3; only DL is stored
+ MOVB DL, (AX)
+ MOVL CX, 1(AX) // 32-bit offset
+ ADDQ $0x05, BX
+ ADDQ $0x05, AX
+ JMP gen_emit_copy_end_snappy
+
+two_byte_offset_standalone_snappy:
+ CMPL DX, $0x40
+ JLE two_byte_offset_short_standalone_snappy
+ MOVB $0xee, (AX) // length > 64: tag 0xee + 16-bit offset, consuming 60 bytes of length
+ MOVW CX, 1(AX)
+ LEAL -60(DX), DX
+ ADDQ $0x03, AX
+ ADDQ $0x03, BX
+ JMP two_byte_offset_standalone_snappy
+
+two_byte_offset_short_standalone_snappy:
+ CMPL DX, $0x0c
+ JGE emit_copy_three_standalone_snappy // length >= 12 needs the 3-byte form
+ CMPL CX, $0x00000800
+ JGE emit_copy_three_standalone_snappy // offset >= 2048 needs the 3-byte form
+ MOVB $0x01, SI
+ LEAL -16(SI)(DX*4), DX // low byte = ((length - 4) << 2) | 1; only DL is stored
+ MOVB CL, 1(AX) // low 8 bits of the offset
+ SHRL $0x08, CX
+ SHLL $0x05, CX // offset bits above 8 go into the tag byte from bit 5 up
+ ORL CX, DX
+ MOVB DL, (AX)
+ ADDQ $0x02, BX
+ ADDQ $0x02, AX
+ JMP gen_emit_copy_end_snappy
+
+emit_copy_three_standalone_snappy:
+ MOVB $0x02, SI
+ LEAL -4(SI)(DX*4), DX // low byte = ((length - 1) << 2) | 2; only DL is stored
+ MOVB DL, (AX)
+ MOVW CX, 1(AX) // 16-bit offset
+ ADDQ $0x03, BX
+ ADDQ $0x03, AX
+
+gen_emit_copy_end_snappy:
+ MOVQ BX, ret+40(FP) // return the number of bytes written
+ RET
+
+// func matchLen(a []byte, b []byte) int
+TEXT ·matchLen(SB), NOSPLIT, $0-56 // Returns how many leading bytes of a and b are equal, examining at most len(a) bytes; len(b) is never read.
+ MOVQ a_base+0(FP), AX // AX = &a[0]
+ MOVQ b_base+24(FP), CX // CX = &b[0]
+ MOVQ a_len+8(FP), DX // DX = bytes left to compare
+
+ // matchLen: SI = number of matching bytes found so far
+ XORL SI, SI
+ CMPL DX, $0x08
+ JL matchlen_single_standalone // fewer than 8 bytes: compare byte by byte
+
+matchlen_loopback_standalone:
+ MOVQ (AX)(SI*1), BX
+ XORQ (CX)(SI*1), BX // non-zero bits mark differing bytes
+ TESTQ BX, BX
+ JZ matchlen_loop_standalone
+ BSFQ BX, BX // index of the lowest differing bit
+ SARQ $0x03, BX // bit index -> byte index
+ LEAL (SI)(BX*1), SI
+ JMP gen_match_len_end
+
+matchlen_loop_standalone:
+ LEAL -8(DX), DX // all 8 bytes matched: advance
+ LEAL 8(SI), SI
+ CMPL DX, $0x08
+ JGE matchlen_loopback_standalone
+
+matchlen_single_standalone:
+ TESTL DX, DX
+ JZ gen_match_len_end
+
+matchlen_single_loopback_standalone:
+ MOVB (AX)(SI*1), BL
+ CMPB (CX)(SI*1), BL
+ JNE gen_match_len_end
+ LEAL 1(SI), SI // LEAL keeps the flags untouched
+ DECL DX
+ JNZ matchlen_single_loopback_standalone
+
+gen_match_len_end:
+ MOVQ SI, ret+48(FP) // return the match length
+ RET