diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s')
-rw-r--r-- | vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s | 15678 |
1 files changed, 15678 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s new file mode 100644 index 00000000..1ac65a0e --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -0,0 +1,15678 @@ +// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. + +// +build !appengine +// +build !noasm +// +build gc + +#include "textflag.h" + +// func encodeBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm + +repeat_extend_back_loop_encodeBlockAsm: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm + +repeat_extend_back_end_encodeBlockAsm: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_repeat_emit_encodeBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeBlockAsm + +four_bytes_repeat_emit_encodeBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBlockAsm + +three_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm + +two_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm + JMP memmove_long_repeat_emit_encodeBlockAsm + +one_byte_repeat_emit_encodeBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +memmove_long_repeat_emit_encodeBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm + +matchlen_loopback_repeat_extend_encodeBlockAsm: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_loop_repeat_extend_encodeBlockAsm: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm + +matchlen_single_repeat_extend_encodeBlockAsm: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm + +matchlen_single_loopback_repeat_extend_encodeBlockAsm: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm + +repeat_extend_forward_end_encodeBlockAsm: + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm + + // emitRepeat +emit_repeat_again_match_repeat_encodeBlockAsm: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm + +cant_repeat_two_offset_match_repeat_encodeBlockAsm: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm + CMPL SI, $0x00010100 + JLT repeat_four_match_repeat_encodeBlockAsm + CMPL SI, $0x0100ffff + JLT repeat_five_match_repeat_encodeBlockAsm + LEAL -16842747(SI), SI + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_repeat_encodeBlockAsm + +repeat_five_match_repeat_encodeBlockAsm: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_match_repeat_encodeBlockAsm: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_match_repeat_encodeBlockAsm: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_match_repeat_encodeBlockAsm: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_match_repeat_encodeBlockAsm: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_as_copy_encodeBlockAsm: + // emitCopy + CMPL DI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeBlockAsm: + CMPL SI, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm + MOVB $0xff, (AX) + MOVL DI, 1(AX) + LEAL -64(SI), SI + ADDQ $0x05, AX + CMPL SI, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL SI, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy + LEAL -16842747(SI), SI + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm + +four_bytes_remain_repeat_as_copy_encodeBlockAsm: + TESTL SI, SI + JZ repeat_end_emit_encodeBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +two_byte_offset_repeat_as_copy_encodeBlockAsm: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL SI, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short + LEAL -16842747(SI), SI + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +emit_copy_three_repeat_as_copy_encodeBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm + +no_repeat_found_encodeBlockAsm: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm + +candidate3_match_encodeBlockAsm: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm + +candidate2_match_encodeBlockAsm: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm + +match_extend_back_loop_encodeBlockAsm: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm + JMP match_extend_back_loop_encodeBlockAsm + +match_extend_back_end_encodeBlockAsm: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeBlockAsm + MOVB $0xfc, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBlockAsm + +four_bytes_match_emit_encodeBlockAsm: + MOVL R8, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW R8, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBlockAsm + +three_bytes_match_emit_encodeBlockAsm: + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm + +two_bytes_match_emit_encodeBlockAsm: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm + JMP memmove_long_match_emit_encodeBlockAsm + +one_byte_match_emit_encodeBlockAsm: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm + +memmove_long_match_emit_encodeBlockAsm: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm: +match_nolit_loop_encodeBlockAsm: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm + +matchlen_loopback_match_nolit_encodeBlockAsm: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm + +matchlen_loop_match_nolit_encodeBlockAsm: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm + +matchlen_single_match_nolit_encodeBlockAsm: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm + +matchlen_single_loopback_match_nolit_encodeBlockAsm: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm + +match_nolit_end_encodeBlockAsm: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm + +four_bytes_loop_back_match_nolit_encodeBlockAsm: + CMPL R10, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(R10), R10 + ADDQ $0x05, AX + CMPL R10, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm + + // emitRepeat +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy + CMPL R10, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy + LEAL -16842747(R10), R10 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBlockAsm_emit_copy: + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm + +four_bytes_remain_match_nolit_encodeBlockAsm: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_offset_match_nolit_encodeBlockAsm: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short + CMPL R10, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short + LEAL -16842747(R10), R10 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + JMP two_byte_offset_match_nolit_encodeBlockAsm + +two_byte_offset_short_match_nolit_encodeBlockAsm: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy_three_match_nolit_encodeBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm + INCL CX + JMP search_loop_encodeBlockAsm + +emit_remainder_encodeBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeBlockAsm + +four_bytes_emit_remainder_encodeBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBlockAsm + +three_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm + +two_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm + JMP memmove_long_emit_remainder_encodeBlockAsm + +one_byte_emit_remainder_encodeBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +memmove_long_emit_remainder_encodeBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm4MB(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm4MB(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm4MB: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm4MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm4MB: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm4MB + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm4MB + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm4MB + +repeat_extend_back_loop_encodeBlockAsm4MB: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm4MB + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm4MB + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm4MB + +repeat_extend_back_end_encodeBlockAsm4MB: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm4MB + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm4MB + CMPL SI, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm4MB + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +three_bytes_repeat_emit_encodeBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +two_bytes_repeat_emit_encodeBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm4MB + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +one_byte_repeat_emit_encodeBlockAsm4MB: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm4MB: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB + +memmove_long_repeat_emit_encodeBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm4MB: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm4MB + +matchlen_loopback_repeat_extend_encodeBlockAsm4MB: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_loop_repeat_extend_encodeBlockAsm4MB: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB + +matchlen_single_repeat_extend_encodeBlockAsm4MB: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm4MB + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB + +repeat_extend_forward_end_encodeBlockAsm4MB: + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm4MB + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB + +cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm4MB + CMPL SI, $0x00010100 + JLT repeat_four_match_repeat_encodeBlockAsm4MB + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_match_repeat_encodeBlockAsm4MB: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_match_repeat_encodeBlockAsm4MB: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_match_repeat_encodeBlockAsm4MB: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_match_repeat_encodeBlockAsm4MB: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_as_copy_encodeBlockAsm4MB: + // emitCopy + CMPL DI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB + +four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB: + CMPL SI, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL DI, 1(AX) + LEAL -64(SI), SI + ADDQ $0x05, AX + CMPL SI, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB + +four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: + TESTL SI, SI + JZ repeat_end_emit_encodeBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm4MB: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm4MB + +no_repeat_found_encodeBlockAsm4MB: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm4MB + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm4MB + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm4MB + +candidate3_match_encodeBlockAsm4MB: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm4MB + +candidate2_match_encodeBlockAsm4MB: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm4MB: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm4MB + +match_extend_back_loop_encodeBlockAsm4MB: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm4MB + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm4MB + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm4MB + JMP match_extend_back_loop_encodeBlockAsm4MB + +match_extend_back_end_encodeBlockAsm4MB: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 4(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm4MB: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm4MB + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm4MB + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm4MB + MOVL R8, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW R8, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +three_bytes_match_emit_encodeBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +two_bytes_match_emit_encodeBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm4MB + JMP memmove_long_match_emit_encodeBlockAsm4MB + +one_byte_match_emit_encodeBlockAsm4MB: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm4MB: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm4MB: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm4MB + +memmove_long_match_emit_encodeBlockAsm4MB: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm4MB: +match_nolit_loop_encodeBlockAsm4MB: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm4MB + +matchlen_loopback_match_nolit_encodeBlockAsm4MB: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm4MB + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm4MB + +matchlen_loop_match_nolit_encodeBlockAsm4MB: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB + +matchlen_single_match_nolit_encodeBlockAsm4MB: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm4MB + +matchlen_single_loopback_match_nolit_encodeBlockAsm4MB: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm4MB + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB + +match_nolit_end_encodeBlockAsm4MB: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm4MB + +four_bytes_loop_back_match_nolit_encodeBlockAsm4MB: + CMPL R10, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(R10), R10 + ADDQ $0x05, AX + CMPL R10, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm4MB + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBlockAsm4MB: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +two_byte_offset_match_nolit_encodeBlockAsm4MB: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + JMP two_byte_offset_match_nolit_encodeBlockAsm4MB + +two_byte_offset_short_match_nolit_encodeBlockAsm4MB: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm4MB + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm4MB + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +emit_copy_three_match_nolit_encodeBlockAsm4MB: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm4MB: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm4MB + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm4MB: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm4MB + INCL CX + JMP search_loop_encodeBlockAsm4MB + +emit_remainder_encodeBlockAsm4MB: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm4MB: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +three_bytes_emit_remainder_encodeBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +two_bytes_emit_remainder_encodeBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +one_byte_emit_remainder_encodeBlockAsm4MB: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm4MB: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB + +memmove_long_emit_remainder_encodeBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm4MB: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm12B(SB), $16408-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000080, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm12B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x18, R11 + IMULQ R9, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm12B + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm12B + +repeat_extend_back_loop_encodeBlockAsm12B: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm12B + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm12B + +repeat_extend_back_end_encodeBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm12B + +two_bytes_repeat_emit_encodeBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm12B + JMP memmove_long_repeat_emit_encodeBlockAsm12B + +one_byte_repeat_emit_encodeBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +memmove_long_repeat_emit_encodeBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm12B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm12B + +matchlen_loopback_repeat_extend_encodeBlockAsm12B: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm12B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_loop_repeat_extend_encodeBlockAsm12B: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B + +matchlen_single_repeat_extend_encodeBlockAsm12B: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm12B + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B + +repeat_extend_forward_end_encodeBlockAsm12B: + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm12B + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm12B + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm12B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm12B + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_match_repeat_encodeBlockAsm12B: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_match_repeat_encodeBlockAsm12B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_match_repeat_encodeBlockAsm12B: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_as_copy_encodeBlockAsm12B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBlockAsm12B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm12B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm12B + +no_repeat_found_encodeBlockAsm12B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm12B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm12B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm12B + +candidate3_match_encodeBlockAsm12B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm12B + +candidate2_match_encodeBlockAsm12B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm12B + +match_extend_back_loop_encodeBlockAsm12B: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm12B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm12B + JMP match_extend_back_loop_encodeBlockAsm12B + +match_extend_back_end_encodeBlockAsm12B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm12B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm12B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm12B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm12B + +two_bytes_match_emit_encodeBlockAsm12B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm12B + JMP memmove_long_match_emit_encodeBlockAsm12B + +one_byte_match_emit_encodeBlockAsm12B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm12B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm12B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +memmove_long_match_emit_encodeBlockAsm12B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm12B: +match_nolit_loop_encodeBlockAsm12B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm12B + +matchlen_loopback_match_nolit_encodeBlockAsm12B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm12B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm12B + +matchlen_loop_match_nolit_encodeBlockAsm12B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm12B + +matchlen_single_match_nolit_encodeBlockAsm12B: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeBlockAsm12B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm12B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B + +match_nolit_end_encodeBlockAsm12B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm12B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP two_byte_offset_match_nolit_encodeBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBlockAsm12B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm12B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +emit_copy_three_match_nolit_encodeBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm12B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x18, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + SHLQ $0x18, SI + IMULQ R9, SI + SHRQ $0x34, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm12B + INCL CX + JMP search_loop_encodeBlockAsm12B + +emit_remainder_encodeBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm12B + +two_bytes_emit_remainder_encodeBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm12B + JMP memmove_long_emit_remainder_encodeBlockAsm12B + +one_byte_emit_remainder_encodeBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +memmove_long_emit_remainder_encodeBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm10B(SB), $4120-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000020, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm10B + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm10B + +repeat_extend_back_loop_encodeBlockAsm10B: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm10B + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm10B + +repeat_extend_back_end_encodeBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm10B + +two_bytes_repeat_emit_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm10B + JMP memmove_long_repeat_emit_encodeBlockAsm10B + +one_byte_repeat_emit_encodeBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm10B + +memmove_long_repeat_emit_encodeBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm10B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm10B + +matchlen_loopback_repeat_extend_encodeBlockAsm10B: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm10B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_loop_repeat_extend_encodeBlockAsm10B: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B + +matchlen_single_repeat_extend_encodeBlockAsm10B: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm10B + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B + +repeat_extend_forward_end_encodeBlockAsm10B: + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm10B + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm10B + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm10B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm10B + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_match_repeat_encodeBlockAsm10B: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_match_repeat_encodeBlockAsm10B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_match_repeat_encodeBlockAsm10B: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_as_copy_encodeBlockAsm10B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBlockAsm10B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +emit_copy_three_repeat_as_copy_encodeBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm10B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm10B + +no_repeat_found_encodeBlockAsm10B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm10B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm10B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm10B + +candidate3_match_encodeBlockAsm10B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm10B + +candidate2_match_encodeBlockAsm10B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm10B + +match_extend_back_loop_encodeBlockAsm10B: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm10B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm10B + JMP match_extend_back_loop_encodeBlockAsm10B + +match_extend_back_end_encodeBlockAsm10B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm10B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm10B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm10B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm10B + +two_bytes_match_emit_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm10B + JMP memmove_long_match_emit_encodeBlockAsm10B + +one_byte_match_emit_encodeBlockAsm10B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm10B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm10B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm10B + +memmove_long_match_emit_encodeBlockAsm10B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm10B: +match_nolit_loop_encodeBlockAsm10B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm10B + +matchlen_loopback_match_nolit_encodeBlockAsm10B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm10B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm10B + +matchlen_loop_match_nolit_encodeBlockAsm10B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm10B + +matchlen_single_match_nolit_encodeBlockAsm10B: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeBlockAsm10B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm10B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B + +match_nolit_end_encodeBlockAsm10B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm10B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP two_byte_offset_match_nolit_encodeBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBlockAsm10B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm10B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +emit_copy_three_match_nolit_encodeBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm10B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm10B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x36, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm10B + INCL CX + JMP search_loop_encodeBlockAsm10B + +emit_remainder_encodeBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +two_bytes_emit_remainder_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm10B + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +one_byte_emit_remainder_encodeBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm10B + +memmove_long_emit_remainder_encodeBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm8B(SB), $1048-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000008, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBlockAsm8B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm8B + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm8B + +repeat_extend_back_loop_encodeBlockAsm8B: + CMPL DI, R8 + JLE repeat_extend_back_end_encodeBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBlockAsm8B + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm8B + +repeat_extend_back_end_encodeBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +two_bytes_repeat_emit_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBlockAsm8B + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +one_byte_repeat_emit_encodeBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm8B + +memmove_long_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBlockAsm8B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R12, R12 + CMPL R9, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm8B + +matchlen_loopback_repeat_extend_encodeBlockAsm8B: + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_repeat_extend_encodeBlockAsm8B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_loop_repeat_extend_encodeBlockAsm8B: + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B + +matchlen_single_repeat_extend_encodeBlockAsm8B: + TESTL R9, R9 + JZ repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm8B + LEAL 1(R12), R12 + DECL R9 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B + +repeat_extend_forward_end_encodeBlockAsm8B: + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm8B + + // emitRepeat + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm8B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm8B + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_match_repeat_encodeBlockAsm8B: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_match_repeat_encodeBlockAsm8B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_as_copy_encodeBlockAsm8B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBlockAsm8B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + + // emitRepeat + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm8B + +no_repeat_found_encodeBlockAsm8B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm8B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm8B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm8B + +candidate3_match_encodeBlockAsm8B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm8B + +candidate2_match_encodeBlockAsm8B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm8B + +match_extend_back_loop_encodeBlockAsm8B: + CMPL CX, DI + JLE match_extend_back_end_encodeBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBlockAsm8B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBlockAsm8B + JMP match_extend_back_loop_encodeBlockAsm8B + +match_extend_back_end_encodeBlockAsm8B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm8B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm8B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm8B + +two_bytes_match_emit_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeBlockAsm8B + JMP memmove_long_match_emit_encodeBlockAsm8B + +one_byte_match_emit_encodeBlockAsm8B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm8B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeBlockAsm8B + +memmove_long_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeBlockAsm8B: +match_nolit_loop_encodeBlockAsm8B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm8B + +matchlen_loopback_match_nolit_encodeBlockAsm8B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBlockAsm8B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm8B + +matchlen_loop_match_nolit_encodeBlockAsm8B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm8B + +matchlen_single_match_nolit_encodeBlockAsm8B: + TESTL DI, DI + JZ match_nolit_end_encodeBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeBlockAsm8B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm8B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B + +match_nolit_end_encodeBlockAsm8B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm8B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, SI + LEAL -4(R10), R10 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBlockAsm8B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +emit_copy_three_match_nolit_encodeBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm8B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm8B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x38, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x38, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm8B + INCL CX + JMP search_loop_encodeBlockAsm8B + +emit_remainder_encodeBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +two_bytes_emit_remainder_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm8B + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +one_byte_emit_remainder_encodeBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm8B + +memmove_long_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JLE check_maxskip_ok_encodeBetterBlockAsm + LEAL 100(CX), SI + JMP check_maxskip_cont_encodeBetterBlockAsm + +check_maxskip_ok_encodeBetterBlockAsm: + LEAL 1(CX)(SI*1), SI + +check_maxskip_cont_encodeBetterBlockAsm: + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm + +candidateS_match_encodeBetterBlockAsm: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm + +match_extend_back_loop_encodeBetterBlockAsm: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm + JMP match_extend_back_loop_encodeBetterBlockAsm + +match_extend_back_end_encodeBetterBlockAsm: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm + +matchlen_loopback_match_nolit_encodeBetterBlockAsm: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_loop_match_nolit_encodeBetterBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm + +matchlen_single_match_nolit_encodeBetterBlockAsm: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm + +match_nolit_end_encodeBetterBlockAsm: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm + CMPL R12, $0x01 + JG match_length_ok_encodeBetterBlockAsm + CMPL R8, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm + +match_length_ok_encodeBetterBlockAsm: + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_encodeBetterBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_match_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +four_bytes_match_emit_encodeBetterBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +three_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +two_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm + +one_byte_match_emit_encodeBetterBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm + +memmove_long_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy + CMPL R8, $0x00010000 + JL two_byte_offset_match_nolit_encodeBetterBlockAsm + +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: + CMPL R12, $0x40 + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL R8, 1(AX) + LEAL -64(R12), R12 + ADDQ $0x05, AX + CMPL R12, $0x04 + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R12, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy + LEAL -16842747(R12), R12 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm + +four_bytes_remain_match_nolit_encodeBetterBlockAsm: + TESTL R12, R12 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +two_byte_offset_match_nolit_encodeBetterBlockAsm: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R12, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short + LEAL -16842747(R12), R12 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy_three_match_nolit_encodeBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +match_is_repeat_encodeBetterBlockAsm: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +four_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +three_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +two_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +one_byte_match_emit_repeat_encodeBetterBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm + +memmove_long_match_emit_repeat_encodeBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat +emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm + CMPL R12, $0x0100ffff + JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm + LEAL -16842747(R12), R12 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm + +repeat_five_match_nolit_repeat_encodeBetterBlockAsm: + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_repeat_encodeBetterBlockAsm: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm + +emit_remainder_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +four_bytes_emit_remainder_encodeBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +three_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +two_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +one_byte_emit_remainder_encodeBetterBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm + +memmove_long_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm4MB: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm4MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm4MB: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JLE check_maxskip_ok_encodeBetterBlockAsm4MB + LEAL 100(CX), SI + JMP check_maxskip_cont_encodeBetterBlockAsm4MB + +check_maxskip_ok_encodeBetterBlockAsm4MB: + LEAL 1(CX)(SI*1), SI + +check_maxskip_cont_encodeBetterBlockAsm4MB: + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm4MB + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm4MB + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm4MB + +candidateS_match_encodeBetterBlockAsm4MB: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm4MB + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm4MB: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm4MB + +match_extend_back_loop_encodeBetterBlockAsm4MB: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm4MB + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm4MB + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm4MB + JMP match_extend_back_loop_encodeBetterBlockAsm4MB + +match_extend_back_end_encodeBetterBlockAsm4MB: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 4(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm4MB: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB + +matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB + +matchlen_single_match_nolit_encodeBetterBlockAsm4MB: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm4MB + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB + +match_nolit_end_encodeBetterBlockAsm4MB: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm4MB + CMPL R12, $0x01 + JG match_length_ok_encodeBetterBlockAsm4MB + CMPL R8, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm4MB + +match_length_ok_encodeBetterBlockAsm4MB: + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm4MB + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm4MB + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_encodeBetterBlockAsm4MB + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +three_bytes_match_emit_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +two_bytes_match_emit_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm4MB + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB + +one_byte_match_emit_encodeBetterBlockAsm4MB: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB + +memmove_long_match_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm4MB: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy + CMPL R8, $0x00010000 + JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB + +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB: + CMPL R12, $0x40 + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB + MOVB $0xff, (AX) + MOVL R8, 1(AX) + LEAL -64(R12), R12 + ADDQ $0x05, AX + CMPL R12, $0x04 + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: + TESTL R12, R12 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +match_is_repeat_encodeBetterBlockAsm4MB: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB + +three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB + +two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB + +one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB + +memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm4MB + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm4MB: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm4MB + +emit_remainder_encodeBetterBlockAsm4MB: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm4MB + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm4MB: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +three_bytes_emit_remainder_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +two_bytes_emit_remainder_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB + +one_byte_emit_remainder_encodeBetterBlockAsm4MB: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB + +memmove_long_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000280, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm12B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 65560(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 65560(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm12B + +candidateS_match_encodeBetterBlockAsm12B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm12B + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm12B + +match_extend_back_loop_encodeBetterBlockAsm12B: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm12B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm12B + JMP match_extend_back_loop_encodeBetterBlockAsm12B + +match_extend_back_end_encodeBetterBlockAsm12B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm12B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm12B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm12B + +matchlen_loop_match_nolit_encodeBetterBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B + +matchlen_single_match_nolit_encodeBetterBlockAsm12B: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm12B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B + +match_nolit_end_encodeBetterBlockAsm12B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm12B + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +two_bytes_match_emit_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +one_byte_match_emit_encodeBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B + +memmove_long_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm12B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm12B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeBetterBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +match_is_repeat_encodeBetterBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B + +one_byte_match_emit_repeat_encodeBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm12B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x34, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 65560(SP)(R11*4) + MOVL R15, 65560(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 65560(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm12B + +emit_remainder_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +two_bytes_emit_remainder_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +one_byte_emit_remainder_encodeBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + +memmove_long_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x000000a0, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 16408(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 16408(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm10B + +candidateS_match_encodeBetterBlockAsm10B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm10B + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm10B + +match_extend_back_loop_encodeBetterBlockAsm10B: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm10B + JMP match_extend_back_loop_encodeBetterBlockAsm10B + +match_extend_back_end_encodeBetterBlockAsm10B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm10B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm10B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm10B + +matchlen_loop_match_nolit_encodeBetterBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B + +matchlen_single_match_nolit_encodeBetterBlockAsm10B: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm10B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B + +match_nolit_end_encodeBetterBlockAsm10B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm10B + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +two_bytes_match_emit_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +one_byte_match_emit_encodeBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B + +memmove_long_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm10B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm10B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeBetterBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +match_is_repeat_encodeBetterBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B + +one_byte_match_emit_repeat_encodeBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x36, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 16408(SP)(R11*4) + MOVL R15, 16408(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 16408(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm10B + +emit_remainder_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +two_bytes_emit_remainder_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +one_byte_emit_remainder_encodeBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + +memmove_long_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000028, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm8B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 4120(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 4120(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm8B + +candidateS_match_encodeBetterBlockAsm8B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm8B + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm8B + +match_extend_back_loop_encodeBetterBlockAsm8B: + CMPL CX, DI + JLE match_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeBetterBlockAsm8B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm8B + JMP match_extend_back_loop_encodeBetterBlockAsm8B + +match_extend_back_end_encodeBetterBlockAsm8B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm8B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm8B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_loop_match_nolit_encodeBetterBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B + +matchlen_single_match_nolit_encodeBetterBlockAsm8B: + TESTL R8, R8 + JZ match_nolit_end_encodeBetterBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm8B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B + +match_nolit_end_encodeBetterBlockAsm8B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm8B + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +two_bytes_match_emit_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +one_byte_match_emit_encodeBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B + +memmove_long_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm8B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm8B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeBetterBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +match_is_repeat_encodeBetterBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +one_byte_match_emit_repeat_encodeBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R11 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm8B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x38, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 4120(SP)(R11*4) + MOVL R15, 4120(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 4120(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeBetterBlockAsm8B + +emit_remainder_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +two_bytes_emit_remainder_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +one_byte_emit_remainder_encodeBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + +memmove_long_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBlockAsm(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm + +repeat_extend_back_loop_encodeSnappyBlockAsm: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm + +repeat_extend_back_end_encodeSnappyBlockAsm: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_repeat_emit_encodeSnappyBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_repeat_emit_encodeSnappyBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +four_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVL SI, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +three_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +two_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +one_byte_repeat_emit_encodeSnappyBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm + +memmove_long_repeat_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm + +matchlen_single_repeat_extend_encodeSnappyBlockAsm: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm + +repeat_extend_forward_end_encodeSnappyBlockAsm: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy + CMPL DI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: + CMPL SI, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0xff, (AX) + MOVL DI, 1(AX) + LEAL -64(SI), SI + ADDQ $0x05, AX + CMPL SI, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm + +four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: + TESTL SI, SI + JZ repeat_end_emit_encodeSnappyBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeSnappyBlockAsm + +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm + +no_repeat_found_encodeSnappyBlockAsm: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm + +candidate3_match_encodeSnappyBlockAsm: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm + +candidate2_match_encodeSnappyBlockAsm: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm + +match_extend_back_loop_encodeSnappyBlockAsm: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm + JMP match_extend_back_loop_encodeSnappyBlockAsm + +match_extend_back_end_encodeSnappyBlockAsm: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeSnappyBlockAsm + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeSnappyBlockAsm + MOVB $0xfc, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +four_bytes_match_emit_encodeSnappyBlockAsm: + MOVL R8, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW R8, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +three_bytes_match_emit_encodeSnappyBlockAsm: + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +two_bytes_match_emit_encodeSnappyBlockAsm: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +one_byte_match_emit_encodeSnappyBlockAsm: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm + +memmove_long_match_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm: +match_nolit_loop_encodeSnappyBlockAsm: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm + +matchlen_loop_match_nolit_encodeSnappyBlockAsm: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm + +matchlen_single_match_nolit_encodeSnappyBlockAsm: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm + +match_nolit_end_encodeSnappyBlockAsm: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_match_nolit_encodeSnappyBlockAsm + +four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: + CMPL R10, $0x40 + JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(R10), R10 + ADDQ $0x05, AX + CMPL R10, $0x04 + JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm + +four_bytes_remain_match_nolit_encodeSnappyBlockAsm: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm + +two_byte_offset_match_nolit_encodeSnappyBlockAsm: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm + +emit_copy_three_match_nolit_encodeSnappyBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm + INCL CX + JMP search_loop_encodeSnappyBlockAsm + +emit_remainder_encodeSnappyBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeSnappyBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +four_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +three_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +two_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +one_byte_emit_remainder_encodeSnappyBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm + +memmove_long_emit_remainder_encodeSnappyBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm64K: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm64K: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm64K + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm64K + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm64K + +repeat_extend_back_loop_encodeSnappyBlockAsm64K: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm64K + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm64K + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K + +repeat_extend_back_end_encodeSnappyBlockAsm64K: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K + +two_bytes_repeat_emit_encodeSnappyBlockAsm64K: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm64K + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K + +one_byte_repeat_emit_encodeSnappyBlockAsm64K: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K + +memmove_long_repeat_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K + +matchlen_single_repeat_extend_encodeSnappyBlockAsm64K: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K + +repeat_extend_forward_end_encodeSnappyBlockAsm64K: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm64K + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm64K: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm64K + +no_repeat_found_encodeSnappyBlockAsm64K: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm64K + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm64K + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm64K + +candidate3_match_encodeSnappyBlockAsm64K: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm64K + +candidate2_match_encodeSnappyBlockAsm64K: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm64K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm64K + +match_extend_back_loop_encodeSnappyBlockAsm64K: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm64K + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm64K + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm64K + JMP match_extend_back_loop_encodeSnappyBlockAsm64K + +match_extend_back_end_encodeSnappyBlockAsm64K: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm64K: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm64K + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm64K + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm64K + +two_bytes_match_emit_encodeSnappyBlockAsm64K: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm64K + JMP memmove_long_match_emit_encodeSnappyBlockAsm64K + +one_byte_match_emit_encodeSnappyBlockAsm64K: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K + +memmove_long_match_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm64K: +match_nolit_loop_encodeSnappyBlockAsm64K: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_loop_match_nolit_encodeSnappyBlockAsm64K: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K + +matchlen_single_match_nolit_encodeSnappyBlockAsm64K: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm64K + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K + +match_nolit_end_encodeSnappyBlockAsm64K: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K + +emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm64K + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm64K: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm64K + INCL CX + JMP search_loop_encodeSnappyBlockAsm64K + +emit_remainder_encodeSnappyBlockAsm64K: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm64K: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K + +two_bytes_emit_remainder_encodeSnappyBlockAsm64K: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm64K + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K + +one_byte_emit_remainder_encodeSnappyBlockAsm64K: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K + +memmove_long_emit_remainder_encodeSnappyBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000080, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm12B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x18, R11 + IMULQ R9, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm12B + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm12B + +repeat_extend_back_loop_encodeSnappyBlockAsm12B: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm12B + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm12B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B + +repeat_extend_back_end_encodeSnappyBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + +two_bytes_repeat_emit_encodeSnappyBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm12B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + +one_byte_repeat_emit_encodeSnappyBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B + +memmove_long_repeat_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B + +matchlen_single_repeat_extend_encodeSnappyBlockAsm12B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B + +repeat_extend_forward_end_encodeSnappyBlockAsm12B: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm12B: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm12B + +no_repeat_found_encodeSnappyBlockAsm12B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm12B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm12B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm12B + +candidate3_match_encodeSnappyBlockAsm12B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm12B + +candidate2_match_encodeSnappyBlockAsm12B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm12B + +match_extend_back_loop_encodeSnappyBlockAsm12B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm12B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm12B + JMP match_extend_back_loop_encodeSnappyBlockAsm12B + +match_extend_back_end_encodeSnappyBlockAsm12B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm12B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm12B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm12B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + +two_bytes_match_emit_encodeSnappyBlockAsm12B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + +one_byte_match_emit_encodeSnappyBlockAsm12B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B + +memmove_long_match_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm12B: +match_nolit_loop_encodeSnappyBlockAsm12B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B + +matchlen_single_match_nolit_encodeSnappyBlockAsm12B: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm12B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B + +match_nolit_end_encodeSnappyBlockAsm12B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm12B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x18, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + SHLQ $0x18, SI + IMULQ R9, SI + SHRQ $0x34, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm12B + INCL CX + JMP search_loop_encodeSnappyBlockAsm12B + +emit_remainder_encodeSnappyBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + +two_bytes_emit_remainder_encodeSnappyBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + +one_byte_emit_remainder_encodeSnappyBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + +memmove_long_emit_remainder_encodeSnappyBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000020, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm10B + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm10B + +repeat_extend_back_loop_encodeSnappyBlockAsm10B: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm10B + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm10B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B + +repeat_extend_back_end_encodeSnappyBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + +two_bytes_repeat_emit_encodeSnappyBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm10B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + +one_byte_repeat_emit_encodeSnappyBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B + +memmove_long_repeat_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B + +matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B + +repeat_extend_forward_end_encodeSnappyBlockAsm10B: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm10B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm10B: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm10B + +no_repeat_found_encodeSnappyBlockAsm10B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm10B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm10B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm10B + +candidate3_match_encodeSnappyBlockAsm10B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm10B + +candidate2_match_encodeSnappyBlockAsm10B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm10B + +match_extend_back_loop_encodeSnappyBlockAsm10B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm10B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBlockAsm10B + +match_extend_back_end_encodeSnappyBlockAsm10B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm10B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm10B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm10B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + +two_bytes_match_emit_encodeSnappyBlockAsm10B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + +one_byte_match_emit_encodeSnappyBlockAsm10B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B + +memmove_long_match_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm10B: +match_nolit_loop_encodeSnappyBlockAsm10B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B + +matchlen_single_match_nolit_encodeSnappyBlockAsm10B: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm10B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B + +match_nolit_end_encodeSnappyBlockAsm10B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm10B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm10B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x36, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm10B + INCL CX + JMP search_loop_encodeSnappyBlockAsm10B + +emit_remainder_encodeSnappyBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + +two_bytes_emit_remainder_encodeSnappyBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + +one_byte_emit_remainder_encodeSnappyBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + +memmove_long_emit_remainder_encodeSnappyBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000008, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm8B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm8B + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm8B + +repeat_extend_back_loop_encodeSnappyBlockAsm8B: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm8B + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm8B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B + +repeat_extend_back_end_encodeSnappyBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + +two_bytes_repeat_emit_encodeSnappyBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm8B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + +one_byte_repeat_emit_encodeSnappyBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B + +memmove_long_repeat_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B + +matchlen_single_repeat_extend_encodeSnappyBlockAsm8B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B + +repeat_extend_forward_end_encodeSnappyBlockAsm8B: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm8B + +no_repeat_found_encodeSnappyBlockAsm8B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm8B + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm8B + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm8B + +candidate3_match_encodeSnappyBlockAsm8B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm8B + +candidate2_match_encodeSnappyBlockAsm8B: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm8B + +match_extend_back_loop_encodeSnappyBlockAsm8B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm8B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBlockAsm8B + +match_extend_back_end_encodeSnappyBlockAsm8B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm8B: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm8B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm8B + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + +two_bytes_match_emit_encodeSnappyBlockAsm8B: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + +one_byte_match_emit_encodeSnappyBlockAsm8B: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B + +memmove_long_match_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm8B: +match_nolit_loop_encodeSnappyBlockAsm8B: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B + +matchlen_single_match_nolit_encodeSnappyBlockAsm8B: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm8B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B + +match_nolit_end_encodeSnappyBlockAsm8B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm8B + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm8B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x38, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x38, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm8B + INCL CX + JMP search_loop_encodeSnappyBlockAsm8B + +emit_remainder_encodeSnappyBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B + +two_bytes_emit_remainder_encodeSnappyBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B + +one_byte_emit_remainder_encodeSnappyBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + +memmove_long_emit_remainder_encodeSnappyBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JLE check_maxskip_ok_encodeSnappyBetterBlockAsm + LEAL 100(CX), SI + JMP check_maxskip_cont_encodeSnappyBetterBlockAsm + +check_maxskip_ok_encodeSnappyBetterBlockAsm: + LEAL 1(CX)(SI*1), SI + +check_maxskip_cont_encodeSnappyBetterBlockAsm: + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm + +candidateS_match_encodeSnappyBetterBlockAsm: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm + +match_extend_back_loop_encodeSnappyBetterBlockAsm: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm + +match_extend_back_end_encodeSnappyBetterBlockAsm: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm + +match_nolit_end_encodeSnappyBetterBlockAsm: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL R12, $0x01 + JG match_length_ok_encodeSnappyBetterBlockAsm + CMPL R8, $0x0000ffff + JLE match_length_ok_encodeSnappyBetterBlockAsm + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeSnappyBetterBlockAsm + +match_length_ok_encodeSnappyBetterBlockAsm: + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +four_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +three_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +two_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +one_byte_match_emit_encodeSnappyBetterBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm + +memmove_long_match_emit_encodeSnappyBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy + CMPL R8, $0x00010000 + JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm + +four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R12, $0x40 + JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0xff, (AX) + MOVL R8, 1(AX) + LEAL -64(R12), R12 + ADDQ $0x05, AX + CMPL R12, $0x04 + JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm + +four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: + TESTL R12, R12 + JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm + +emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm64K: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm64K: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm64K + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm64K + +candidateS_match_encodeSnappyBetterBlockAsm64K: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm64K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + +match_extend_back_loop_encodeSnappyBetterBlockAsm64K: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K + +match_extend_back_end_encodeSnappyBetterBlockAsm64K: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm64K: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm64K + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K + +match_nolit_end_encodeSnappyBetterBlockAsm64K: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm64K + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +one_byte_match_emit_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + +memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm64K + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm64K + +emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000280, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm12B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 65560(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 65560(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm12B + +candidateS_match_encodeSnappyBetterBlockAsm12B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + +match_extend_back_loop_encodeSnappyBetterBlockAsm12B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B + +match_extend_back_end_encodeSnappyBetterBlockAsm12B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm12B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm12B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B + +match_nolit_end_encodeSnappyBetterBlockAsm12B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +one_byte_match_emit_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm12B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x34, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 65560(SP)(R11*4) + MOVL R15, 65560(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 65560(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm12B + +emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x000000a0, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 16408(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 16408(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm10B + +candidateS_match_encodeSnappyBetterBlockAsm10B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + +match_extend_back_loop_encodeSnappyBetterBlockAsm10B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B + +match_extend_back_end_encodeSnappyBetterBlockAsm10B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm10B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm10B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B + +match_nolit_end_encodeSnappyBetterBlockAsm10B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +one_byte_match_emit_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm10B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x36, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 16408(SP)(R11*4) + MOVL R15, 16408(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 16408(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm10B + +emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000028, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm8B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 4120(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 4120(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm8B + +candidateS_match_encodeSnappyBetterBlockAsm8B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B + +match_extend_back_loop_encodeSnappyBetterBlockAsm8B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B + +match_extend_back_end_encodeSnappyBetterBlockAsm8B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm8B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm8B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B + +match_nolit_end_encodeSnappyBetterBlockAsm8B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +one_byte_match_emit_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm8B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x38, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 4120(SP)(R11*4) + MOVL R15, 4120(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 4120(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm8B + +emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func emitLiteral(dst []byte, lit []byte) int +// Requires: SSE2 +TEXT ·emitLiteral(SB), NOSPLIT, $0-56 + MOVQ lit_len+32(FP), DX + MOVQ dst_base+0(FP), AX + MOVQ lit_base+24(FP), CX + TESTQ DX, DX + JZ emit_literal_end_standalone_skip + MOVL DX, BX + LEAL -1(DX), SI + CMPL SI, $0x3c + JLT one_byte_standalone + CMPL SI, $0x00000100 + JLT two_bytes_standalone + CMPL SI, $0x00010000 + JLT three_bytes_standalone + CMPL SI, $0x01000000 + JLT four_bytes_standalone + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP memmove_long_standalone + +four_bytes_standalone: + MOVL SI, DI + SHRL $0x10, DI + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB DI, 3(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP memmove_long_standalone + +three_bytes_standalone: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP memmove_long_standalone + +two_bytes_standalone: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_standalone + JMP memmove_long_standalone + +one_byte_standalone: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, BX + ADDQ $0x01, AX + +memmove_standalone: + // genMemMoveShort + CMPQ DX, $0x03 + JB emit_lit_memmove_standalone_memmove_move_1or2 + JE emit_lit_memmove_standalone_memmove_move_3 + CMPQ DX, $0x08 + JB emit_lit_memmove_standalone_memmove_move_4through7 + CMPQ DX, $0x10 + JBE emit_lit_memmove_standalone_memmove_move_8through16 + CMPQ DX, $0x20 + JBE emit_lit_memmove_standalone_memmove_move_17through32 + JMP emit_lit_memmove_standalone_memmove_move_33through64 + +emit_lit_memmove_standalone_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(DX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(DX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(DX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +memmove_long_standalone: + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVQ DX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_standalonelarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_standalonelarge_big_loop_back + +emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ DX, R8 + JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +emit_literal_end_standalone_skip: + XORQ BX, BX + +emit_literal_end_standalone: + MOVQ BX, ret+48(FP) + RET + +// func emitRepeat(dst []byte, offset int, length int) int +TEXT ·emitRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitRepeat +emit_repeat_again_standalone: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JLE repeat_two_standalone + CMPL SI, $0x0c + JGE cant_repeat_two_offset_standalone + CMPL CX, $0x00000800 + JLT repeat_two_offset_standalone + +cant_repeat_two_offset_standalone: + CMPL DX, $0x00000104 + JLT repeat_three_standalone + CMPL DX, $0x00010100 + JLT repeat_four_standalone + CMPL DX, $0x0100ffff + JLT repeat_five_standalone + LEAL -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone + +repeat_five_standalone: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_repeat_end + +repeat_four_standalone: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_repeat_end + +repeat_three_standalone: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_repeat_end + +repeat_two_standalone: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_repeat_end + +repeat_two_offset_standalone: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + +gen_emit_repeat_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopy(dst []byte, offset int, length int) int +TEXT ·emitCopy(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitCopy + CMPL CX, $0x00010000 + JL two_byte_offset_standalone + +four_bytes_loop_back_standalone: + CMPL DX, $0x40 + JLE four_bytes_remain_standalone + MOVB $0xff, (AX) + MOVL CX, 1(AX) + LEAL -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JL four_bytes_remain_standalone + + // emitRepeat +emit_repeat_again_standalone_emit_copy: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JLE repeat_two_standalone_emit_copy + CMPL SI, $0x0c + JGE cant_repeat_two_offset_standalone_emit_copy + CMPL CX, $0x00000800 + JLT repeat_two_offset_standalone_emit_copy + +cant_repeat_two_offset_standalone_emit_copy: + CMPL DX, $0x00000104 + JLT repeat_three_standalone_emit_copy + CMPL DX, $0x00010100 + JLT repeat_four_standalone_emit_copy + CMPL DX, $0x0100ffff + JLT repeat_five_standalone_emit_copy + LEAL -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy + +repeat_five_standalone_emit_copy: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + JMP four_bytes_loop_back_standalone + +four_bytes_remain_standalone: + TESTL DX, DX + JZ gen_emit_copy_end + MOVB $0x03, SI + LEAL -4(SI)(DX*4), DX + MOVB DL, (AX) + MOVL CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +two_byte_offset_standalone: + CMPL DX, $0x40 + JLE two_byte_offset_short_standalone + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAL -60(DX), DX + ADDQ $0x03, AX + ADDQ $0x03, BX + + // emitRepeat +emit_repeat_again_standalone_emit_copy_short: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JLE repeat_two_standalone_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_standalone_emit_copy_short + CMPL CX, $0x00000800 + JLT repeat_two_offset_standalone_emit_copy_short + +cant_repeat_two_offset_standalone_emit_copy_short: + CMPL DX, $0x00000104 + JLT repeat_three_standalone_emit_copy_short + CMPL DX, $0x00010100 + JLT repeat_four_standalone_emit_copy_short + CMPL DX, $0x0100ffff + JLT repeat_five_standalone_emit_copy_short + LEAL -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy_short + +repeat_five_standalone_emit_copy_short: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy_short: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy_short: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy_short: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + JMP two_byte_offset_standalone + +two_byte_offset_short_standalone: + CMPL DX, $0x0c + JGE emit_copy_three_standalone + CMPL CX, $0x00000800 + JGE emit_copy_three_standalone + MOVB $0x01, SI + LEAL -16(SI)(DX*4), DX + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +emit_copy_three_standalone: + MOVB $0x02, SI + LEAL -4(SI)(DX*4), DX + MOVB DL, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopyNoRepeat(dst []byte, offset int, length int) int +TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitCopy + CMPL CX, $0x00010000 + JL two_byte_offset_standalone_snappy + +four_bytes_loop_back_standalone_snappy: + CMPL DX, $0x40 + JLE four_bytes_remain_standalone_snappy + MOVB $0xff, (AX) + MOVL CX, 1(AX) + LEAL -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JL four_bytes_remain_standalone_snappy + JMP four_bytes_loop_back_standalone_snappy + +four_bytes_remain_standalone_snappy: + TESTL DX, DX + JZ gen_emit_copy_end_snappy + MOVB $0x03, SI + LEAL -4(SI)(DX*4), DX + MOVB DL, (AX) + MOVL CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end_snappy + +two_byte_offset_standalone_snappy: + CMPL DX, $0x40 + JLE two_byte_offset_short_standalone_snappy + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAL -60(DX), DX + ADDQ $0x03, AX + ADDQ $0x03, BX + JMP two_byte_offset_standalone_snappy + +two_byte_offset_short_standalone_snappy: + CMPL DX, $0x0c + JGE emit_copy_three_standalone_snappy + CMPL CX, $0x00000800 + JGE emit_copy_three_standalone_snappy + MOVB $0x01, SI + LEAL -16(SI)(DX*4), DX + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end_snappy + +emit_copy_three_standalone_snappy: + MOVB $0x02, SI + LEAL -4(SI)(DX*4), DX + MOVB DL, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end_snappy: + MOVQ BX, ret+40(FP) + RET + +// func matchLen(a []byte, b []byte) int +TEXT ·matchLen(SB), NOSPLIT, $0-56 + MOVQ a_base+0(FP), AX + MOVQ b_base+24(FP), CX + MOVQ a_len+8(FP), DX + + // matchLen + XORL SI, SI + CMPL DX, $0x08 + JL matchlen_single_standalone + +matchlen_loopback_standalone: + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX + TESTQ BX, BX + JZ matchlen_loop_standalone + BSFQ BX, BX + SARQ $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end + +matchlen_loop_standalone: + LEAL -8(DX), DX + LEAL 8(SI), SI + CMPL DX, $0x08 + JGE matchlen_loopback_standalone + +matchlen_single_standalone: + TESTL DX, DX + JZ gen_match_len_end + +matchlen_single_loopback_standalone: + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL + JNE gen_match_len_end + LEAL 1(SI), SI + DECL DX + JNZ matchlen_single_loopback_standalone + +gen_match_len_end: + MOVQ SI, ret+48(FP) + RET |