// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.

// +build !appengine
// +build !noasm
// +build gc

#include "textflag.h"

// func encodeBlockAsm(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written to dst, or 0
// when one of the output-space checks in the body fails.
//
// Register roles in the main loop:
//   AX = dst write cursor, DX = src base pointer, CX = current source position.
//
// Stack frame (65560 bytes):
//   0(SP)   8 bytes: output limit pointer, dst_base + len(src) - 9 - len(src)>>5
//   8(SP)   4 bytes: len(src) - 8, the position at which searching stops
//   12(SP)  4 bytes: start of the literal run not yet written to dst
//   16(SP)  4 bytes: repeat offset, initially 1
//   20(SP)  4 bytes: next search position saved by the search loop
//   24(SP)  65536 bytes: hash table of 32-bit source positions
TEXT ·encodeBlockAsm(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	// Clear the hash table: 0x200 iterations of 0x80 bytes = 65536 bytes
	// starting at 24(SP). CX counts iterations, DX walks the table.
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm
	// 12(SP) = 0: no literals have been emitted yet.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = len(src) - 8; 0(SP) = dst_base + (len(src) - 9 - len(src)>>5).
	// NOTE(review): the 32-bit SUBL wraps for len(src) < 9; presumably callers
	// never pass inputs that short — confirm against the Go caller.
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start at source position 1 with a repeat offset of 1; DX = src base.
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm:
	// CX = current position s. SI = s + 4 + ((s - 12(SP)) >> 6) is the next
	// position to try; the step grows with the distance from the start of the
	// pending literal run held in 12(SP).
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(CX)(SI*1), SI
	// Leave the loop once that position reaches the search limit in 8(SP).
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBlockAsm
	// DI = the 8 source bytes at s; 20(SP) keeps the next position.
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// Hash = ((x << 16) * 0xcf1bbcdcbf9b) >> 50: the left shift discards the
	// top two bytes of x and the right shift keeps a 14-bit table index.
	// R10 = hash of the bytes at s, R11 = hash of the bytes at s+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11
	// Fetch the old table entries (SI for s, R8 for s+1), then store s and s+1.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash of the bytes at s+2, consumed on the no-repeat path.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	// Repeat check: do the 4 bytes at s+1 equal the 4 bytes at s+1-16(SP)?
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm
	// Repeat found: DI = s+1, R8 = 12(SP), SI = DI - 16(SP).
	// If SI is already 0 there is nothing before it to compare.
	LEAL  1(CX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm

repeat_extend_back_loop_encodeBlockAsm:
	// Walk DI and SI backwards while DI > R8, the bytes just before them are
	// equal, and SI has not reached 0.
	CMPL DI, R8
	JLE  repeat_extend_back_end_encodeBlockAsm
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeBlockAsm
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm

repeat_extend_back_end_encodeBlockAsm:
	// Emit src[12(SP):DI] as a literal run; skip when the run is empty.
	// AX is the dst write cursor and DX the src base throughout.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm
	// R9 = run length, R10 = address of the first literal byte, SI = length-1.
	// 12(SP) is advanced to DI.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Pick the header form from length-1:
	// < 60 one byte, < 1<<8 two, < 1<<16 three, < 1<<24 four, else five bytes.
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeBlockAsm
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeBlockAsm
	CMPL SI, $0x00010000
	JLT  three_bytes_repeat_emit_encodeBlockAsm
	CMPL SI, $0x01000000
	JLT  four_bytes_repeat_emit_encodeBlockAsm
	// Five bytes: tag 0xfc followed by length-1 as a 32-bit value.
	MOVB $0xfc, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm

four_bytes_repeat_emit_encodeBlockAsm:
	// Four bytes: tag 0xf8 followed by length-1 as a 24-bit value
	// (low 16 bits via MOVW, bits 16-23 via R11).
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm

three_bytes_repeat_emit_encodeBlockAsm:
	// Three bytes: tag 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm

two_bytes_repeat_emit_encodeBlockAsm:
	// Two bytes: tag 0xf0 followed by length-1 as one byte.
	// Runs with length-1 < 64 use the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeBlockAsm
	JMP  memmove_long_repeat_emit_encodeBlockAsm

one_byte_repeat_emit_encodeBlockAsm:
	// One byte: tag = (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm:
	// SI = dst cursor after the copy (AX + length).
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise up to 64 bytes.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
	// Always moves a full 8 bytes, whatever the run length; the cursor is
	// still advanced by the true length afterwards.
	// NOTE(review): this can touch up to 7 bytes beyond the run on both sides;
	// presumably the limits in 0(SP)/8(SP) provide the slack — confirm.
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
	// First 8 and last 8 bytes; the two stores overlap for lengths below 16.
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
	// First 16 and last 16 bytes; the two stores overlap for lengths below 32.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
	// First 32 and last 32 bytes; all loads happen before the first store.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm

memmove_long_repeat_emit_encodeBlockAsm:
	// SI = dst cursor after the copy (AX + length).
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0-X3 = first 32 and last 32 bytes of the run, loaded before any store
	// and written back last with unaligned stores.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R12 = length / 32. R13 = 64 - (AX & 31): with this offset the stores to
	// -32(AX)(R13*1) below hit 32-byte aligned addresses, hence MOVOA.
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// NOTE(review): DECQ leaves CF untouched, so JA tests CF from the SUBQ
	// above together with ZF from DECQ — confirm this is the intended test.
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
	// 32 bytes per iteration from R11 to R14; R13 advances in step.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
	// 32 bytes per iteration at offset R13-32; repeats while length >= R13.
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
	// Write the saved head and tail over both ends, then advance the cursor.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeBlockAsm:
	// The repeat check compared the 4 bytes at s+1, so continue at s+5.
	// R9 = bytes left in src, R10 = &src[s], SI = &src[s - 16(SP)].
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R12 counts matching bytes; 8 bytes per step while at least 8 remain.
	XORL R12, R12
	CMPL R9, $0x08
	JL   matchlen_single_repeat_extend_encodeBlockAsm

matchlen_loopback_repeat_extend_encodeBlockAsm:
	// XOR the two 8-byte words; on a difference, the index of the lowest set
	// bit divided by 8 is the number of leading bytes that still match.
	MOVQ  (R10)(R12*1), R11
	XORQ  (SI)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_repeat_extend_encodeBlockAsm
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   repeat_extend_forward_end_encodeBlockAsm

matchlen_loop_repeat_extend_encodeBlockAsm:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm

matchlen_single_repeat_extend_encodeBlockAsm:
	// Fewer than 8 bytes left: compare one byte at a time.
	TESTL R9, R9
	JZ    repeat_extend_forward_end_encodeBlockAsm

matchlen_single_loopback_repeat_extend_encodeBlockAsm:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm
	LEAL 1(R12), R12
	DECL R9
	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm

repeat_extend_forward_end_encodeBlockAsm:
	// s += matched bytes; SI = s - DI is the repeat length (DI = its start);
	// DI is then reloaded with the offset from 16(SP).
	// R8 still holds the 12(SP) value read before the literals were emitted;
	// when it is 0 the repeat is encoded as a copy instead.
	ADDL  R12, CX
	MOVL  CX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm

	// emitRepeat
emit_repeat_again_match_repeat_encodeBlockAsm:
	// R8 = length, SI = length-4, DI = offset.
	// length <= 8: two-byte form. length < 12 and offset < 2048: two-byte
	// form carrying the offset. Otherwise pick by the size of length-4.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_match_repeat_encodeBlockAsm
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_match_repeat_encodeBlockAsm

cant_repeat_two_offset_match_repeat_encodeBlockAsm:
	CMPL SI, $0x00000104
	JLT  repeat_three_match_repeat_encodeBlockAsm
	CMPL SI, $0x00010100
	JLT  repeat_four_match_repeat_encodeBlockAsm
	CMPL SI, $0x0100ffff
	JLT  repeat_five_match_repeat_encodeBlockAsm
	// Too long for one element: write the fixed bytes 1d 00 fb ff ff,
	// subtract 16842747 from SI and encode what is left.
	LEAL -16842747(SI), SI
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_repeat_encodeBlockAsm

repeat_five_match_repeat_encodeBlockAsm:
	// Five bytes: 1d 00, then length-4-65536 as a 24-bit value.
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_match_repeat_encodeBlockAsm:
	// Four bytes: 19 00, then length-4-256 as a 16-bit value.
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_match_repeat_encodeBlockAsm:
	// Three bytes: 15 00, then length-4-4 as one byte.
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_match_repeat_encodeBlockAsm:
	// Two bytes: the 16-bit value ((length-4) << 2) | 1.
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_match_repeat_encodeBlockAsm:
	// Two bytes: tag = ((length-4) << 2) | 1 | ((offset >> 8) << 5),
	// followed by the low byte of the offset.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_as_copy_encodeBlockAsm:
	// emitCopy
	// SI = length, DI = offset, AX = dst write cursor.
	// Offsets below 65536 take the two-byte-offset path.
	CMPL DI, $0x00010000
	JL   two_byte_offset_repeat_as_copy_encodeBlockAsm

four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
	// length > 64: write tag 0xff (= ((64-1) << 2) | 3) plus the 32-bit offset
	// for the first 64 bytes; a remainder of 4 or more is encoded as a repeat.
	CMPL SI, $0x40
	JLE  four_bytes_remain_repeat_as_copy_encodeBlockAsm
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(SI), SI
	ADDQ $0x05, AX
	CMPL SI, $0x04
	JL   four_bytes_remain_repeat_as_copy_encodeBlockAsm

	// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
	// R8 = length, SI = length-4, DI = offset.
	// length <= 8: two-byte form. length < 12 and offset < 2048: two-byte
	// form carrying the offset. Otherwise pick by the size of length-4.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL SI, $0x00010100
	JLT  repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL SI, $0x0100ffff
	JLT  repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
	// Too long for one element: write the fixed bytes 1d 00 fb ff ff,
	// subtract 16842747 from SI and encode what is left.
	LEAL -16842747(SI), SI
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
	// Five bytes: 1d 00, then length-4-65536 as a 24-bit value.
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
	// Four bytes: 19 00, then length-4-256 as a 16-bit value.
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
	// Three bytes: 15 00, then length-4-4 as one byte.
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
	// Two bytes: the 16-bit value ((length-4) << 2) | 1.
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
	// Two bytes: tag = ((length-4) << 2) | 1 | ((offset >> 8) << 5),
	// followed by the low byte of the offset.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm
	// Unreachable: follows an unconditional JMP and carries no label.
	JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm

four_bytes_remain_repeat_as_copy_encodeBlockAsm:
	// Nothing left: done. Otherwise tag = ((length-1) << 2) | 3 plus the
	// 32-bit offset. Only the low byte of SI is stored, so the upper bits of
	// BX left over from earlier code do not matter.
	TESTL SI, SI
	JZ    repeat_end_emit_encodeBlockAsm
	MOVB  $0x03, BL
	LEAL  -4(BX)(SI*4), SI
	MOVB  SI, (AX)
	MOVL  DI, 1(AX)
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_encodeBlockAsm

two_byte_offset_repeat_as_copy_encodeBlockAsm:
	// length > 64: write tag 0xee (= ((60-1) << 2) | 2) plus the 16-bit offset
	// for the first 60 bytes, then encode the remainder as a repeat.
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// R8 = length, SI = length-4, DI = offset.
	// length <= 8: two-byte form. length < 12 and offset < 2048: two-byte
	// form carrying the offset. Otherwise pick by the size of length-4.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x00010100
	JLT  repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x0100ffff
	JLT  repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
	// Too long for one element: write the fixed bytes 1d 00 fb ff ff,
	// subtract 16842747 from SI and encode what is left.
	LEAL -16842747(SI), SI
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// Five bytes: 1d 00, then length-4-65536 as a 24-bit value.
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// Four bytes: 19 00, then length-4-256 as a 16-bit value.
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// Three bytes: 15 00, then length-4-4 as one byte.
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// Two bytes: the 16-bit value ((length-4) << 2) | 1.
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// Two bytes: tag = ((length-4) << 2) | 1 | ((offset >> 8) << 5),
	// followed by the low byte of the offset.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm
	// Unreachable: follows an unconditional JMP and carries no label.
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm

two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
	// length >= 12 or offset >= 2048 needs the three-byte form below.
	// Otherwise two bytes: tag = ((length-4) << 2) | 1 | ((offset >> 8) << 5),
	// followed by the low byte of the offset. Only the low byte of SI is
	// stored, so the upper bits of BX do not matter.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

emit_copy_three_repeat_as_copy_encodeBlockAsm:
	// Three bytes: tag = ((length-1) << 2) | 2, then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm:
	// Everything up to s is now encoded: 12(SP) = s, then search again.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm

no_repeat_found_encodeBlockAsm:
	// On entry: CX = s, DI = the 8 bytes at s, SI = table candidate for s,
	// R8 = table candidate for s+1, R10 = hash of the bytes at s+2.
	// Candidate 1: compare the 4 bytes at src[SI] with the 4 bytes at s.
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm
	// Candidate 2: shift DI to the bytes at s+1, fetch the table entry for
	// s+2 into SI, and compare the 4 bytes at src[R8].
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm
	// Candidate 3: record s+2 in the table, then compare the 4 bytes at
	// src[SI] with the bytes at s+2.
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm
	// No match: continue at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm

candidate3_match_encodeBlockAsm:
	// Match is at s+2; SI already holds its candidate.
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm

candidate2_match_encodeBlockAsm:
	// Match is at s+1 with candidate R8; the table entry for s+2 is still
	// updated before moving on.
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeBlockAsm:
	// CX = match position, SI = candidate position, DI = 12(SP).
	// A candidate at position 0 cannot be extended backwards.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm

match_extend_back_loop_encodeBlockAsm:
	// Walk CX and SI backwards while CX > DI, the bytes just before them are
	// equal, and SI has not reached 0.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBlockAsm
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBlockAsm
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm
	JMP  match_extend_back_loop_encodeBlockAsm

match_extend_back_end_encodeBlockAsm:
	// Output-space check: AX + (s - 12(SP)) + 5 must be below the limit in
	// 0(SP); otherwise return 0.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 5(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm:
	// Emit src[12(SP):CX] as a literal run; skip when the run is empty.
	// AX is the dst write cursor and DX the src base throughout.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm
	// R9 = run length, DI = address of the first literal byte, R8 = length-1.
	// 12(SP) is advanced to CX.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// Pick the header form from length-1:
	// < 60 one byte, < 1<<8 two, < 1<<16 three, < 1<<24 four, else five bytes.
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeBlockAsm
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeBlockAsm
	CMPL R8, $0x00010000
	JLT  three_bytes_match_emit_encodeBlockAsm
	CMPL R8, $0x01000000
	JLT  four_bytes_match_emit_encodeBlockAsm
	// Five bytes: tag 0xfc followed by length-1 as a 32-bit value.
	MOVB $0xfc, (AX)
	MOVL R8, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeBlockAsm

four_bytes_match_emit_encodeBlockAsm:
	// Four bytes: tag 0xf8 followed by length-1 as a 24-bit value
	// (low 16 bits via MOVW, bits 16-23 via R10).
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW R8, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBlockAsm

three_bytes_match_emit_encodeBlockAsm:
	// Three bytes: tag 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm

two_bytes_match_emit_encodeBlockAsm:
	// Two bytes: tag 0xf0 followed by length-1 as one byte.
	// Runs with length-1 < 64 use the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeBlockAsm
	JMP  memmove_long_match_emit_encodeBlockAsm

one_byte_match_emit_encodeBlockAsm:
	// One byte: tag = (length-1) << 2.
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm:
	// R8 = dst cursor after the copy (AX + length).
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise up to 64 bytes.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
	// Always moves a full 8 bytes, whatever the run length; the cursor is
	// still advanced by the true length afterwards.
	// NOTE(review): this can touch up to 7 bytes beyond the run on both sides;
	// presumably the limits in 0(SP)/8(SP) provide the slack — confirm.
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
	// First 8 and last 8 bytes; the two stores overlap for lengths below 16.
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
	// First 16 and last 16 bytes; the two stores overlap for lengths below 32.
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
	// First 32 and last 32 bytes; all loads happen before the first store.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm

memmove_long_match_emit_encodeBlockAsm:
	// R8 = dst cursor after the copy (AX + length).
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// X0-X3 = first 32 and last 32 bytes of the run, loaded before any store
	// and written back last with unaligned stores.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	// R11 = length / 32. R12 = 64 - (AX & 31): with this offset the stores to
	// -32(AX)(R12*1) below hit 32-byte aligned addresses, hence MOVOA.
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// NOTE(review): DECQ leaves CF untouched, so JA tests CF from the SUBQ
	// above together with ZF from DECQ — confirm this is the intended test.
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
	// 32 bytes per iteration from R10 to R13; R12 advances in step.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
	// 32 bytes per iteration at offset R12-32; repeats while length >= R12.
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	// Write the saved head and tail over both ends, then advance the cursor.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeBlockAsm:
match_nolit_loop_encodeBlockAsm:
	// CX = match position s, SI = candidate position.
	// 16(SP) = s - candidate becomes the new repeat offset.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	// Step 4 bytes forward on both sides, then set up the comparison:
	// DI = bytes left in src after s, R8 = &src[s], SI = &src[candidate].
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 counts matching bytes; 8 bytes per step while at least 8 remain.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeBlockAsm

matchlen_loopback_match_nolit_encodeBlockAsm:
	// XOR the two 8-byte words; on a difference, the index of the lowest set
	// bit divided by 8 is the number of leading bytes that still match.
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeBlockAsm
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeBlockAsm

matchlen_loop_match_nolit_encodeBlockAsm:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBlockAsm

matchlen_single_match_nolit_encodeBlockAsm:
	// Fewer than 8 bytes left: compare one byte at a time.
	TESTL DI, DI
	JZ    match_nolit_end_encodeBlockAsm

matchlen_single_loopback_match_nolit_encodeBlockAsm:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm

match_nolit_end_encodeBlockAsm:
	// s += matched bytes; SI = offset from 16(SP); R10 = matched bytes + 4 is
	// the full match length; 12(SP) = s since no literals are pending.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	// R10 = length, SI = offset, AX = dst write cursor.
	// Offsets below 65536 take the two-byte-offset path.
	CMPL SI, $0x00010000
	JL   two_byte_offset_match_nolit_encodeBlockAsm

four_bytes_loop_back_match_nolit_encodeBlockAsm:
	// length > 64: write tag 0xff (= ((64-1) << 2) | 3) plus the 32-bit offset
	// for the first 64 bytes; a remainder of 4 or more is encoded as a repeat.
	CMPL R10, $0x40
	JLE  four_bytes_remain_match_nolit_encodeBlockAsm
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(R10), R10
	ADDQ $0x05, AX
	CMPL R10, $0x04
	JL   four_bytes_remain_match_nolit_encodeBlockAsm

	// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
	// DI = length, R10 = length-4, SI = offset.
	// length <= 8: two-byte form. length < 12 and offset < 2048: two-byte
	// form carrying the offset. Otherwise pick by the size of length-4.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm_emit_copy
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm_emit_copy
	CMPL R10, $0x00010100
	JLT  repeat_four_match_nolit_encodeBlockAsm_emit_copy
	CMPL R10, $0x0100ffff
	JLT  repeat_five_match_nolit_encodeBlockAsm_emit_copy
	// Too long for one element: write the fixed bytes 1d 00 fb ff ff,
	// subtract 16842747 from R10 and encode what is left.
	LEAL -16842747(R10), R10
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy

repeat_five_match_nolit_encodeBlockAsm_emit_copy:
	// Five bytes: 1d 00, then length-4-65536 as a 24-bit value.
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy:
	// Four bytes: 19 00, then length-4-256 as a 16-bit value.
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy:
	// Three bytes: 15 00, then length-4-4 as one byte.
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy:
	// Two bytes: the 16-bit value ((length-4) << 2) | 1.
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
	// Two bytes: tag = ((length-4) << 2) | 1 | ((offset >> 8) << 5),
	// followed by the low byte of the offset.
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm
	// Unreachable: follows an unconditional JMP and carries no label.
	JMP four_bytes_loop_back_match_nolit_encodeBlockAsm

four_bytes_remain_match_nolit_encodeBlockAsm:
	// Nothing left: done. Otherwise tag = ((length-1) << 2) | 3 plus the
	// 32-bit offset. Only the low byte of R10 is stored, so the upper bits of
	// BX left over from earlier code do not matter.
	TESTL R10, R10
	JZ    match_nolit_emitcopy_end_encodeBlockAsm
	MOVB  $0x03, BL
	LEAL  -4(BX)(R10*4), R10
	MOVB  R10, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBlockAsm

two_byte_offset_match_nolit_encodeBlockAsm:
	// length > 64: write tag 0xee (= ((60-1) << 2) | 2) plus the 16-bit offset
	// for the first 60 bytes, then encode the remainder as a repeat.
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
	// DI = length, R10 = length-4, SI = offset.
	// length <= 8: two-byte form. length < 12 and offset < 2048: two-byte
	// form carrying the offset. Otherwise pick by the size of length-4.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL R10, $0x00010100
	JLT  repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL R10, $0x0100ffff
	JLT  repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
	// Too long for one element: write the fixed bytes 1d 00 fb ff ff,
	// subtract 16842747 from R10 and encode what is left.
	LEAL -16842747(R10), R10
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short

repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
	// Five bytes: 1d 00, then length-4-65536 as a 24-bit value.
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
	// Four bytes: 19 00, then length-4-256 as a 16-bit value.
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
	// Three bytes: 15 00, then length-4-4 as one byte.
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
	// Two bytes: the 16-bit value ((length-4) << 2) | 1.
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
	// Two bytes: tag = ((length-4) << 2) | 1 | ((offset >> 8) << 5),
	// followed by the low byte of the offset.
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm
	// Unreachable: follows an unconditional JMP and carries no label.
	JMP two_byte_offset_match_nolit_encodeBlockAsm

two_byte_offset_short_match_nolit_encodeBlockAsm:
	// length >= 12 or offset >= 2048 needs the three-byte form below.
	// Otherwise two bytes: tag = ((length-4) << 2) | 1 | ((offset >> 8) << 5),
	// followed by the low byte of the offset. Only the low byte of R10 is
	// stored, so the upper bits of BX do not matter.
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBlockAsm
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBlockAsm
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

emit_copy_three_match_nolit_encodeBlockAsm:
	// Three bytes: tag = ((length-1) << 2) | 2, then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm:
	// CX = position s just past the match. Stop searching once s reaches the
	// limit in 8(SP).
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBlockAsm
	// DI = the 8 source bytes at s-2. Return 0 if the dst cursor AX has
	// reached the output limit in 0(SP).
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm:
	// Hash = ((x << 16) * 0xcf1bbcdcbf9b) >> 50, a 14-bit table index.
	// R8 = hash of the bytes at s-2; after the 16-bit shift DI holds the
	// bytes at s and SI = their hash.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	// SI = previous table entry for s (read before either store); then record
	// s-2 and s in the table.
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	// If the 4 bytes at the candidate equal the 4 bytes at s, encode another
	// match right away with no literals in between; otherwise search from s+1.
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm
	INCL  CX
	JMP   search_loop_encodeBlockAsm

emit_remainder_encodeBlockAsm:
	// Output-space check for the final literal run:
	// AX + (len(src) - 12(SP)) + 5 must be below the limit in 0(SP);
	// otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm:
	// Emit src[12(SP):len(src)] as the last literal run; skip when empty.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm
	// SI = run length, CX = address of the first literal byte, DX = length-1.
	// DX stops being the src base from here on.
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Pick the header form from length-1:
	// < 60 one byte, < 1<<8 two, < 1<<16 three, < 1<<24 four, else five bytes.
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBlockAsm
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBlockAsm
	CMPL DX, $0x00010000
	JLT  three_bytes_emit_remainder_encodeBlockAsm
	CMPL DX, $0x01000000
	JLT  four_bytes_emit_remainder_encodeBlockAsm
	// Five bytes: tag 0xfc followed by length-1 as a 32-bit value.
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm

four_bytes_emit_remainder_encodeBlockAsm:
	// Four bytes: tag 0xf8 followed by length-1 as a 24-bit value
	// (low 16 bits via MOVW, bits 16-23 via BL).
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm

three_bytes_emit_remainder_encodeBlockAsm:
	// Three bytes: tag 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm

two_bytes_emit_remainder_encodeBlockAsm:
	// Two bytes: tag 0xf0 followed by length-1 as one byte.
	// Runs with length-1 < 64 use the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBlockAsm
	JMP  memmove_long_emit_remainder_encodeBlockAsm

one_byte_emit_remainder_encodeBlockAsm:
	// One byte: tag = (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm:
	// DX = dst cursor after the copy (AX + length); BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise up to 64 bytes.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8:
	// Always moves a full 8 bytes, whatever the run length; the cursor is
	// still advanced by the true length afterwards.
	// NOTE(review): for runs shorter than 8 bytes this load extends past
	// src[len(src)-1] — confirm the caller guarantees readable bytes there.
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
	// First 8 and last 8 bytes; the two stores overlap for lengths below 16.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
	// First 16 and last 16 bytes; the two stores overlap for lengths below 32.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
	// First 32 and last 32 bytes; all loads happen before the first store.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm

memmove_long_emit_remainder_encodeBlockAsm:
	// DX = dst cursor after the copy (AX + length); BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// X0-X3 = first 32 and last 32 bytes of the run, loaded before any store
	// and written back last with unaligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = length / 32. R8 = 64 - (AX & 31): with this offset the stores to
	// -32(AX)(R8*1) below hit 32-byte aligned addresses, hence MOVOA.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): DECQ leaves CF untouched, so JA tests CF from the SUBQ
	// above together with ZF from DECQ — confirm this is the intended test.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
	// 32 bytes per iteration from SI to R9; R8 advances in step.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
	// 32 bytes per iteration at offset R8-32; repeats while length >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	// Write the saved head and tail over both ends, then advance the cursor.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm:
	// Return the number of bytes written: dst cursor minus dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm4MB(dst []byte, src []byte) int
// Requires: SSE2
//
// Block encoder for the s2 package (see the generator command in the file
// header). Writes the encoded form of src to dst and stores the number of
// bytes written in ret. Stores 0 in ret instead when the output would reach
// the limit kept at (SP), i.e. dst_base + len(src) - 9 - len(src)/32.
//
// Frame layout (all offsets relative to SP):
//    0(SP)  8 bytes  dst limit pointer (see above)
//    8(SP)  4 bytes  search limit, len(src) - 8
//   12(SP)  4 bytes  src index of the first byte not yet emitted ("nextEmit")
//   16(SP)  4 bytes  offset of the most recent match ("repeat"), starts at 1
//   20(SP)  4 bytes  src index to try next when the current one has no match
//   24(SP)  64 KiB   hash table: 16384 uint32 src indexes, zeroed on entry
//
// Long-lived registers: AX = dst write pointer, CX = current src index ("s"),
// DX = src base pointer (until emit_remainder, where CX/DX are reused).
//
// NOTE: this file is generated. The safe short copy used in emit_remainder
// below differs from the generator output this file was produced from; the
// same change has to be made in gen.go or it is lost on regeneration.
TEXT ·encodeBlockAsm4MB(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the hash table: 0x200 iterations of 0x80 bytes = 65536 bytes.
zero_loop_encodeBlockAsm4MB:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm4MB

	// nextEmit = 0; search limit = len(src)-8;
	// dst limit = dst + len(src) - 9 - len(src)/32; s = 1; repeat = 1.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm4MB:
	// Next position to try = s + 4 + (s - nextEmit)/64; stop searching once
	// it reaches the search limit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBlockAsm4MB

	// DI = 8 bytes at src[s]. Each hash keeps the low 6 bytes of its input
	// (SHLQ 16), multiplies by the constant in R9 and keeps the top 14 bits.
	// R10 = hash(src[s:]), R11 = hash(src[s+1:]).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11

	// SI, R8 = previous table entries (candidates); store s and s+1 in their place.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)

	// R10 = hash(src[s+2:]), used by the third candidate probe below.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10

	// Repeat check: compare the 4 bytes at src[s+1-repeat] with the 4 bytes at src[s+1].
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm4MB

	// Repeat found at s+1. DI = start of the match, R8 = nextEmit,
	// SI = position the match copies from.
	LEAL  1(CX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm4MB

	// Extend the match backwards while the preceding bytes agree, without
	// going below nextEmit (DI) or below index 0 (SI).
repeat_extend_back_loop_encodeBlockAsm4MB:
	CMPL DI, R8
	JLE  repeat_extend_back_end_encodeBlockAsm4MB
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeBlockAsm4MB
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm4MB

repeat_extend_back_end_encodeBlockAsm4MB:
	// Emit src[nextEmit:DI] as a literal (skipped when empty).
	// R9 = literal length, R10 = literal source, SI = length-1; nextEmit = DI.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00010000
	JLT  three_bytes_repeat_emit_encodeBlockAsm4MB

	// Tag 0xf8 followed by length-1 as 3 little-endian bytes.
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

three_bytes_repeat_emit_encodeBlockAsm4MB:
	// Tag 0xf4 followed by length-1 as 2 little-endian bytes.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

two_bytes_repeat_emit_encodeBlockAsm4MB:
	// Tag 0xf0 followed by length-1 as 1 byte; lengths up to 64 use the short copy.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeBlockAsm4MB
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

one_byte_repeat_emit_encodeBlockAsm4MB:
	// Tag byte is (length-1)<<2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm4MB:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copies R9 (<= 64) bytes with overlapping head/tail loads. Lengths <= 8
	// move a full 8 bytes; at this call site 8 bytes were already loaded from
	// src[s], which lies at or after the literal start, so the load is in bounds.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm4MB

memmove_long_repeat_emit_encodeBlockAsm4MB:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0-X3 hold the first and last 32 bytes of the source; they are stored
	// last and cover whatever the 32-byte loop in the middle does not.
	// R13 = 64 - (dst & 31), so dst+R13-32 is 32-byte aligned and the loop
	// can use aligned stores (MOVOA). R12 = length/32 - 1.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while length >= R13.
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeBlockAsm4MB:
	// The 4 bytes at s+1 are known to match: s += 5, then extend forward.
	// R9 = bytes left in src, R10 = src+s, SI = src+s-repeat.
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R12 = number of equal bytes; 8 at a time, then byte by byte.
	XORL R12, R12
	CMPL R9, $0x08
	JL   matchlen_single_repeat_extend_encodeBlockAsm4MB

matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
	// A non-zero XOR means a mismatch; the lowest set bit / 8 is the index
	// of the first differing byte.
	MOVQ  (R10)(R12*1), R11
	XORQ  (SI)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_repeat_extend_encodeBlockAsm4MB
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_loop_repeat_extend_encodeBlockAsm4MB:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm4MB

matchlen_single_repeat_extend_encodeBlockAsm4MB:
	TESTL R9, R9
	JZ    repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm4MB
	LEAL 1(R12), R12
	DECL R9
	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB

repeat_extend_forward_end_encodeBlockAsm4MB:
	// s += matched; SI = match length (s - match start); DI = repeat offset.
	// R8 still holds the nextEmit value read before the literal was emitted:
	// when it is 0 the match is written as a copy instead of a repeat.
	ADDL  R12, CX
	MOVL  CX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	// R8 = length, SI = length-4. Picks the 2, 3, 4 or 5 byte form by length
	// (and, for the 2-byte-with-offset form, by offset < 2048).
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_match_repeat_encodeBlockAsm4MB
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_match_repeat_encodeBlockAsm4MB

cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	CMPL SI, $0x00000104
	JLT  repeat_three_match_repeat_encodeBlockAsm4MB
	CMPL SI, $0x00010100
	JLT  repeat_four_match_repeat_encodeBlockAsm4MB
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_match_repeat_encodeBlockAsm4MB:
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_match_repeat_encodeBlockAsm4MB:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_match_repeat_encodeBlockAsm4MB:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_as_copy_encodeBlockAsm4MB:
	// emitCopy
	// DI = offset, SI = length. Offsets below 65536 use the 2-byte-offset forms.
	CMPL DI, $0x00010000
	JL   two_byte_offset_repeat_as_copy_encodeBlockAsm4MB

four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB:
	// Length > 64: write tag 0xff + 4-byte offset for the first 64 bytes,
	// then emit the rest as a repeat (if >= 4) or as one more copy.
	CMPL SI, $0x40
	JLE  four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(SI), SI
	ADDQ $0x05, AX
	CMPL SI, $0x04
	JL   four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00010100
	JLT  repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB
	// Unreachable: directly follows an unconditional JMP.
	JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB

four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
	// Remaining length (if any): tag (length-1)<<2|3 followed by the 4-byte
	// offset. Only the low byte of SI is stored, so the upper bits of BX do
	// not matter.
	TESTL SI, SI
	JZ    repeat_end_emit_encodeBlockAsm4MB
	MOVB  $0x03, BL
	LEAL  -4(BX)(SI*4), SI
	MOVB  SI, (AX)
	MOVL  DI, 1(AX)
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_encodeBlockAsm4MB

two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
	// Length > 64: write tag 0xee + 2-byte offset for the first 60 bytes,
	// then emit the rest as a repeat.
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00010100
	JLT  repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB
	// Unreachable: directly follows an unconditional JMP.
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB

two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
	// Length < 12 and offset < 2048: 2-byte form, tag
	// (offset>>8)<<5 | (length-4)<<2 | 1 followed by the low offset byte.
	// Otherwise fall to the 3-byte form.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
	// 3-byte form: tag (length-1)<<2|2 followed by the 2-byte offset.
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm4MB:
	// nextEmit = s; continue searching.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm4MB

no_repeat_found_encodeBlockAsm4MB:
	// Probe the candidates: table hit for s (SI), for s+1 (R8), then for s+2.
	// DI is shifted by one byte between probes so its low 32 bits are the
	// 4 bytes at s, s+1, s+2 respectively.
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm4MB
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm4MB
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm4MB

	// No match: s = the next position saved at 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm4MB

candidate3_match_encodeBlockAsm4MB:
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm4MB

candidate2_match_encodeBlockAsm4MB:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeBlockAsm4MB:
	// CX = s, SI = candidate. Extend the match backwards while the preceding
	// bytes agree, without going below nextEmit (CX) or index 0 (SI).
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm4MB

match_extend_back_loop_encodeBlockAsm4MB:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBlockAsm4MB
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBlockAsm4MB
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm4MB
	JMP  match_extend_back_loop_encodeBlockAsm4MB

match_extend_back_end_encodeBlockAsm4MB:
	// Give up (return 0) if dst + pending literal length + 4 reaches the dst limit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 4(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm4MB:
	// Emit src[nextEmit:s] as a literal (skipped when empty).
	// R9 = literal length, DI = literal source, R8 = length-1; nextEmit = s.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00010000
	JLT  three_bytes_match_emit_encodeBlockAsm4MB

	// Tag 0xf8 followed by length-1 as 3 little-endian bytes.
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW R8, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

three_bytes_match_emit_encodeBlockAsm4MB:
	// Tag 0xf4 followed by length-1 as 2 little-endian bytes.
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

two_bytes_match_emit_encodeBlockAsm4MB:
	// Tag 0xf0 followed by length-1 as 1 byte; lengths up to 64 use the short copy.
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeBlockAsm4MB
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

one_byte_match_emit_encodeBlockAsm4MB:
	// Tag byte is (length-1)<<2.
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm4MB:
	// R8 = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// Same scheme as the repeat path: lengths <= 8 move a full 8 bytes, which
	// stays in bounds here because the literal is followed by the match at s.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm4MB:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm4MB

memmove_long_match_emit_encodeBlockAsm4MB:
	// R8 = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// X0-X3 = first and last 32 source bytes (stored last);
	// R12 = 64 - (dst & 31) so the loop stores are 32-byte aligned.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeBlockAsm4MB:
match_nolit_loop_encodeBlockAsm4MB:
	// repeat = s - candidate. The first 4 bytes are known to match, so skip
	// them on both sides and measure the rest of the match.
	// DI = bytes left in src, R8 = src+s, SI = src+candidate.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes; 8 at a time, then byte by byte.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeBlockAsm4MB

matchlen_loopback_match_nolit_encodeBlockAsm4MB:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeBlockAsm4MB
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeBlockAsm4MB

matchlen_loop_match_nolit_encodeBlockAsm4MB:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBlockAsm4MB

matchlen_single_match_nolit_encodeBlockAsm4MB:
	TESTL DI, DI
	JZ    match_nolit_end_encodeBlockAsm4MB

matchlen_single_loopback_match_nolit_encodeBlockAsm4MB:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm4MB
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm4MB

match_nolit_end_encodeBlockAsm4MB:
	// s += matched; SI = offset; R10 = total match length (matched + 4);
	// nextEmit = s.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	// SI = offset, R10 = length. Offsets below 65536 use the 2-byte-offset forms.
	CMPL SI, $0x00010000
	JL   two_byte_offset_match_nolit_encodeBlockAsm4MB

four_bytes_loop_back_match_nolit_encodeBlockAsm4MB:
	// Length > 64: write tag 0xff + 4-byte offset for the first 64 bytes,
	// then emit the rest as a repeat (if >= 4) or as one more copy.
	CMPL R10, $0x40
	JLE  four_bytes_remain_match_nolit_encodeBlockAsm4MB
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(R10), R10
	ADDQ $0x05, AX
	CMPL R10, $0x04
	JL   four_bytes_remain_match_nolit_encodeBlockAsm4MB

	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL R10, $0x00010100
	JLT  repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
	// Unreachable: directly follows an unconditional JMP.
	JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB

four_bytes_remain_match_nolit_encodeBlockAsm4MB:
	// Remaining length (if any): tag (length-1)<<2|3 followed by the 4-byte offset.
	TESTL R10, R10
	JZ    match_nolit_emitcopy_end_encodeBlockAsm4MB
	MOVB  $0x03, BL
	LEAL  -4(BX)(R10*4), R10
	MOVB  R10, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBlockAsm4MB

two_byte_offset_match_nolit_encodeBlockAsm4MB:
	// Length > 64: write tag 0xee + 2-byte offset for the first 60 bytes,
	// then emit the rest as a repeat.
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm4MB
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL R10, $0x00010100
	JLT  repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
	// Unreachable: directly follows an unconditional JMP.
	JMP two_byte_offset_match_nolit_encodeBlockAsm4MB

two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
	// Length < 12 and offset < 2048: 2-byte form, tag
	// (offset>>8)<<5 | (length-4)<<2 | 1 followed by the low offset byte.
	// Otherwise fall to the 3-byte form.
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBlockAsm4MB
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

emit_copy_three_match_nolit_encodeBlockAsm4MB:
	// 3-byte form: tag (length-1)<<2|2 followed by the 2-byte offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm4MB:
	// Stop at the search limit; otherwise load 8 bytes at s-2 and return 0
	// if dst has reached its limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBlockAsm4MB
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm4MB:
	// R8 = hash(src[s-2:]), SI = hash(src[s:]); DI = bytes at s.
	// table[R8] = s-2; candidate = table[SI]; table[SI] = s.
	// If the candidate's 4 bytes equal those at s, encode another match
	// right away (no literal in between); otherwise s++ and search again.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm4MB
	INCL  CX
	JMP   search_loop_encodeBlockAsm4MB

emit_remainder_encodeBlockAsm4MB:
	// Return 0 if dst + (len(src) - nextEmit) + 4 reaches the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 4(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm4MB:
	// Emit src[nextEmit:] as the final literal (skipped when empty).
	// From here on CX = literal source, SI = literal length, DX = length-1.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm4MB
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00010000
	JLT  three_bytes_emit_remainder_encodeBlockAsm4MB

	// Tag 0xf8 followed by length-1 as 3 little-endian bytes.
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

three_bytes_emit_remainder_encodeBlockAsm4MB:
	// Tag 0xf4 followed by length-1 as 2 little-endian bytes.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

two_bytes_emit_remainder_encodeBlockAsm4MB:
	// Tag 0xf0 followed by length-1 as 1 byte; lengths up to 64 use the short copy.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBlockAsm4MB
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

one_byte_emit_remainder_encodeBlockAsm4MB:
	// Tag byte is (length-1)<<2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm4MB:
	// DX = dst pointer after the literal bytes, BX = length (1..64).
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// The literal here runs to the very end of src, so unlike the in-loop
	// copies there is no guarantee of 8 readable bytes. Lengths below 8 are
	// therefore copied with exact-width loads/stores instead of one 8-byte
	// MOVQ, which would read past src+len(src). Length 8 is handled by the
	// 8..16 path (both of its moves then cover the same 8 bytes).
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
	JEQ  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
	// First and last byte (the same byte when length is 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
	// Two overlapping 4-byte moves: first 4 and last 4 bytes.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm4MB

memmove_long_emit_remainder_encodeBlockAsm4MB:
	// DX = dst pointer after the literal bytes, BX = length (> 64 on this path).
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// X0-X3 = first and last 32 source bytes (stored last);
	// R8 = 64 - (dst & 31) so the loop stores are 32-byte aligned.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm4MB:
	// ret = number of bytes written = dst write pointer - dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm12B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written to dst, or 0
// if the output would reach the limit computed on entry and kept at (SP).
//
// Stack frame (16408 bytes):
//   0(SP)   8 bytes      dst limit: dst_base + (len(src) - 9 - len(src)>>5)
//   8(SP)   4 bytes      search limit: len(src) - 8
//   12(SP)  4 bytes      src index of the first byte not yet emitted
//   16(SP)  4 bytes      offset of the most recent match (starts at 1)
//   20(SP)  4 bytes      src index to resume at when nothing matches
//   24(SP)  16384 bytes  hash table: 4096 32-bit src indexes, zeroed on entry
//
// Registers held for most of the function:
//   AX  dst write pointer
//   CX  current src index
//   DX  src base pointer (reused as scratch while emitting the remainder)
//
// NOTE(review): dst_len is never read here; presumably the caller guarantees
// that dst is large enough - confirm against the Go wrapper.
TEXT ·encodeBlockAsm12B(SB), $16408-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000080, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the hash table: 0x80 iterations of 0x80 bytes each.
zero_loop_encodeBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm12B
	// Initialise the stack slots described in the function header and start
	// searching at src index 1.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm12B:
	// SI = CX + 4 + (CX - 12(SP))>>5 is the position to try next if nothing
	// matches here; leave the search once it reaches the limit at 8(SP).
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBlockAsm12B
	// DI = 8 bytes of src at CX. Each hash uses a 5-byte window: shift left
	// by 24, multiply by the constant in R9, keep the top 12 bits.
	// R10 = hash at CX, R11 = hash at CX+1.
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	SHLQ  $0x18, R11
	IMULQ R9, R11
	SHRQ  $0x34, R11
	// SI/R8 = previous table entries for CX and CX+1; the table is then
	// updated with CX and CX+1.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash at CX+2, used only on the no-repeat path.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes located
	// 16(SP) positions earlier.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm12B
	// Repeat found at DI = CX+1; R8 = 12(SP). Walk DI and its counterpart SI
	// backwards while the preceding bytes match, DI stays above R8 and SI
	// has not reached zero.
	LEAL  1(CX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm12B

repeat_extend_back_loop_encodeBlockAsm12B:
	CMPL DI, R8
	JLE  repeat_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeBlockAsm12B
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm12B

repeat_extend_back_end_encodeBlockAsm12B:
	// Emit src[12(SP):DI] as a literal run unless it is empty.
	// R9 = length, R10 = source pointer, SI = length-1. The header takes
	// 1 byte (SI < 60), 2 bytes (SI < 256) or 3 bytes.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeBlockAsm12B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

two_bytes_repeat_emit_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// Lengths up to 64 use the short copy, longer ones the long copy.
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeBlockAsm12B
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

one_byte_repeat_emit_encodeBlockAsm12B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm12B:
	// SI = dst pointer after the copy; it becomes AX once the bytes are moved.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copies R9 (at most 64) bytes with overlapping head/tail moves. The
	// first case always moves a full 8 bytes, even when R9 is smaller.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm12B:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm12B

memmove_long_repeat_emit_encodeBlockAsm12B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and stored
	// last with unaligned moves. In between, 32-byte chunks are copied with
	// aligned stores starting at offset R13 = 64 - (AX & 31).
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeBlockAsm12B:
	// Step past the 4 bytes already compared (new CX = old CX + 5) and
	// extend the repeat forward: R10 = src+CX, SI = src+CX-16(SP),
	// R9 = bytes left in src.
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R12 = number of equal bytes. Compares 8 bytes per step while at least
	// 8 remain; on a difference BSFQ>>3 gives the count of equal bytes in
	// that word. The last 0-7 bytes are compared one at a time.
	XORL R12, R12
	CMPL R9, $0x08
	JL   matchlen_single_repeat_extend_encodeBlockAsm12B

matchlen_loopback_repeat_extend_encodeBlockAsm12B:
	MOVQ  (R10)(R12*1), R11
	XORQ  (SI)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_repeat_extend_encodeBlockAsm12B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   repeat_extend_forward_end_encodeBlockAsm12B

matchlen_loop_repeat_extend_encodeBlockAsm12B:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm12B

matchlen_single_repeat_extend_encodeBlockAsm12B:
	TESTL R9, R9
	JZ    repeat_extend_forward_end_encodeBlockAsm12B

matchlen_single_loopback_repeat_extend_encodeBlockAsm12B:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm12B
	LEAL 1(R12), R12
	DECL R9
	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm12B

repeat_extend_forward_end_encodeBlockAsm12B:
	// CX = end of the repeat, SI = its total length (CX - DI), DI = offset.
	// R8 still holds the value 12(SP) had when the repeat was found; when
	// that was 0 the match is written with emitCopy instead of emitRepeat.
	ADDL  R12, CX
	MOVL  CX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm12B

	// emitRepeat
	// R8 = length, SI = length-4. Encodings chosen below:
	//   length <= 8                    2 bytes
	//   length < 12 and offset < 2048  2 bytes, carrying the offset
	//   length-4 < 260                 3 bytes
	//   otherwise                      4 bytes
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_match_repeat_encodeBlockAsm12B
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_match_repeat_encodeBlockAsm12B

cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
	CMPL SI, $0x00000104
	JLT  repeat_three_match_repeat_encodeBlockAsm12B
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_three_match_repeat_encodeBlockAsm12B:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_match_repeat_encodeBlockAsm12B:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_match_repeat_encodeBlockAsm12B:
	// Byte 0 = (offset>>8)<<5 | (length-4)<<2 | 1, byte 1 = low offset byte.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_as_copy_encodeBlockAsm12B:
	// emitCopy
two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
	// SI = length, DI = offset. For length > 64: write 3 bytes (0xee plus
	// the 16-bit offset), subtract 60 from the length and encode what is
	// left with emitRepeat.
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
	// Same encoding choices as the emitRepeat sequence above.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B
	// NOTE: the next instruction is unreachable; it follows an unconditional
	// JMP and no label precedes it.
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
	// length <= 64: 2-byte form when length < 12 and offset < 2048,
	// otherwise the 3-byte form. Only BL is loaded with the tag bits; the
	// rest of BX does not matter because just the low byte of SI is stored.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm12B:
	// Everything up to CX has been emitted; resume the search at CX.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm12B

no_repeat_found_encodeBlockAsm12B:
	// Try the three table candidates in turn, comparing 4 bytes each:
	// SI against the bytes at CX, R8 against CX+1, then the CX+2 entry
	// (loaded into SI) against CX+2. The table slot for CX+2 is set to CX+2
	// on every path that gets past the first comparison.
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm12B
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm12B
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm12B
	// No match: continue at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm12B

candidate3_match_encodeBlockAsm12B:
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm12B

candidate2_match_encodeBlockAsm12B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeBlockAsm12B:
	// CX = match position, SI = candidate position. Extend both backwards
	// while the preceding bytes are equal, CX stays above 12(SP) (DI) and
	// SI has not reached zero.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm12B

match_extend_back_loop_encodeBlockAsm12B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBlockAsm12B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm12B
	JMP  match_extend_back_loop_encodeBlockAsm12B

match_extend_back_end_encodeBlockAsm12B:
	// Return 0 unless AX + 3 + (CX - 12(SP)) is below the dst limit at (SP).
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm12B:
	// Emit src[12(SP):CX] as a literal run unless it is empty.
	// R9 = length, DI = source pointer, R8 = length-1. The header takes
	// 1 byte (R8 < 60), 2 bytes (R8 < 256) or 3 bytes.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeBlockAsm12B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm12B

two_bytes_match_emit_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeBlockAsm12B
	JMP  memmove_long_match_emit_encodeBlockAsm12B

one_byte_match_emit_encodeBlockAsm12B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm12B:
	// R8 = dst pointer after the copy; it becomes AX once the bytes are moved.
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// Copies R9 (at most 64) bytes with overlapping head/tail moves. The
	// first case always moves a full 8 bytes, even when R9 is smaller.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm12B:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm12B

memmove_long_match_emit_encodeBlockAsm12B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and stored
	// last with unaligned moves. In between, 32-byte chunks are copied with
	// aligned stores starting at offset R12 = 64 - (AX & 31).
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeBlockAsm12B:
match_nolit_loop_encodeBlockAsm12B:
	// Match at CX against candidate SI with no pending literals. Record the
	// offset CX - SI in 16(SP), skip the 4 bytes already known to be equal
	// and measure how much further the match extends (DI = bytes left).
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes. Compares 8 bytes per step while at least
	// 8 remain; on a difference BSFQ>>3 gives the count of equal bytes in
	// that word. The last 0-7 bytes are compared one at a time.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeBlockAsm12B

matchlen_loopback_match_nolit_encodeBlockAsm12B:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeBlockAsm12B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeBlockAsm12B

matchlen_loop_match_nolit_encodeBlockAsm12B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBlockAsm12B

matchlen_single_match_nolit_encodeBlockAsm12B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeBlockAsm12B

matchlen_single_loopback_match_nolit_encodeBlockAsm12B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm12B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm12B

match_nolit_end_encodeBlockAsm12B:
	// CX = end of the match, SI = offset, R10 = total match length
	// (extension + 4). 12(SP) is advanced to CX.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeBlockAsm12B:
	// For length > 64: write 3 bytes (0xee plus the 16-bit offset), subtract
	// 60 from the length and encode what is left with emitRepeat.
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm12B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	// DI = length, R10 = length-4. Encodings chosen below:
	//   length <= 8                    2 bytes
	//   length < 12 and offset < 2048  2 bytes, carrying the offset
	//   length-4 < 260                 3 bytes
	//   otherwise                      4 bytes
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B
	// NOTE: the next instruction is unreachable; it follows an unconditional
	// JMP and no label precedes it.
	JMP two_byte_offset_match_nolit_encodeBlockAsm12B

two_byte_offset_short_match_nolit_encodeBlockAsm12B:
	// length <= 64: 2-byte form when length < 12 and offset < 2048,
	// otherwise the 3-byte form. Only BL is loaded with the tag bits; the
	// rest of BX does not matter because just the low byte of R10 is stored.
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBlockAsm12B
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBlockAsm12B
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

emit_copy_three_match_nolit_encodeBlockAsm12B:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm12B:
	// Stop searching once CX reaches the limit at 8(SP); return 0 if AX has
	// reached the dst limit at (SP). DI = 8 bytes of src starting at CX-2.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBlockAsm12B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm12B:
	// R8 = hash at CX-2, SI = hash at CX (DI is shifted to the bytes at CX).
	// Store CX-2 and CX in the table; SI becomes the previous entry for CX.
	// If its 4 bytes equal those at CX, emit another match right away.
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x18, R8
	IMULQ R9, R8
	SHRQ  $0x34, R8
	SHLQ  $0x18, SI
	IMULQ R9, SI
	SHRQ  $0x34, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm12B
	INCL  CX
	JMP   search_loop_encodeBlockAsm12B

emit_remainder_encodeBlockAsm12B:
	// Return 0 unless AX + 3 + (len(src) - 12(SP)) is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm12B:
	// Emit src[12(SP):len(src)] as a final literal run unless it is empty.
	// SI = length, CX = source pointer, DX = length-1 (the src base pointer
	// in DX is overwritten here).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBlockAsm12B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

two_bytes_emit_remainder_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

one_byte_emit_remainder_encodeBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm12B:
	// DX = dst pointer after the copy, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copies BX (at most 64) bytes with overlapping head/tail moves. The
	// first case always moves a full 8 bytes, even when BX is smaller.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm12B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm12B

memmove_long_emit_remainder_encodeBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and stored
	// last with unaligned moves. In between, 32-byte chunks are copied with
	// aligned stores starting at offset R8 = 64 - (AX & 31).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm12B:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm10B(dst []byte, src []byte) int
// Requires: SSE2
TEXT ·encodeBlockAsm10B(SB), $4120-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000020, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm10B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm10B:
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBlockAsm10B
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x36, R11
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm10B
	LEAL  1(CX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm10B

repeat_extend_back_loop_encodeBlockAsm10B:
	CMPL DI, R8
	JLE  repeat_extend_back_end_encodeBlockAsm10B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeBlockAsm10B
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm10B

repeat_extend_back_end_encodeBlockAsm10B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeBlockAsm10B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm10B

two_bytes_repeat_emit_encodeBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeBlockAsm10B
	JMP  memmove_long_repeat_emit_encodeBlockAsm10B

one_byte_repeat_emit_encodeBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm10B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm10B

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm10B:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm10B

memmove_long_repeat_emit_encodeBlockAsm10B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeBlockAsm10B:
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R12, R12
	CMPL R9, $0x08
	JL   matchlen_single_repeat_extend_encodeBlockAsm10B

matchlen_loopback_repeat_extend_encodeBlockAsm10B:
	MOVQ  (R10)(R12*1), R11
	XORQ  (SI)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_repeat_extend_encodeBlockAsm10B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   repeat_extend_forward_end_encodeBlockAsm10B

matchlen_loop_repeat_extend_encodeBlockAsm10B:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm10B

matchlen_single_repeat_extend_encodeBlockAsm10B:
	TESTL R9, R9
	JZ    repeat_extend_forward_end_encodeBlockAsm10B

matchlen_single_loopback_repeat_extend_encodeBlockAsm10B:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm10B
	LEAL 1(R12), R12
	DECL R9
	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm10B

repeat_extend_forward_end_encodeBlockAsm10B:
	ADDL  R12, CX
	MOVL  CX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm10B

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_match_repeat_encodeBlockAsm10B
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_match_repeat_encodeBlockAsm10B

cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
	CMPL SI, $0x00000104
	JLT  repeat_three_match_repeat_encodeBlockAsm10B
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_three_match_repeat_encodeBlockAsm10B:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_match_repeat_encodeBlockAsm10B:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_offset_match_repeat_encodeBlockAsm10B:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_as_copy_encodeBlockAsm10B:
	// emitCopy
two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm10B:
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm10B

no_repeat_found_encodeBlockAsm10B:
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm10B
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm10B
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm10B
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm10B

candidate3_match_encodeBlockAsm10B:
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm10B

candidate2_match_encodeBlockAsm10B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeBlockAsm10B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm10B

match_extend_back_loop_encodeBlockAsm10B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBlockAsm10B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBlockAsm10B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm10B
	JMP  match_extend_back_loop_encodeBlockAsm10B

match_extend_back_end_encodeBlockAsm10B:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm10B:
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeBlockAsm10B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm10B

two_bytes_match_emit_encodeBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeBlockAsm10B
	JMP  memmove_long_match_emit_encodeBlockAsm10B

one_byte_match_emit_encodeBlockAsm10B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm10B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B

emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B

emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm10B

emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm10B:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm10B

memmove_long_match_emit_encodeBlockAsm10B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeBlockAsm10B:
match_nolit_loop_encodeBlockAsm10B:
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeBlockAsm10B

matchlen_loopback_match_nolit_encodeBlockAsm10B:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeBlockAsm10B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeBlockAsm10B

matchlen_loop_match_nolit_encodeBlockAsm10B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBlockAsm10B

matchlen_single_match_nolit_encodeBlockAsm10B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeBlockAsm10B

matchlen_single_loopback_match_nolit_encodeBlockAsm10B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm10B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm10B

match_nolit_end_encodeBlockAsm10B:
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeBlockAsm10B:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm10B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
	JMP two_byte_offset_match_nolit_encodeBlockAsm10B

two_byte_offset_short_match_nolit_encodeBlockAsm10B:
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBlockAsm10B
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBlockAsm10B
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

emit_copy_three_match_nolit_encodeBlockAsm10B:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm10B:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBlockAsm10B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm10B:
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x36, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x36, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm10B
	INCL  CX
	JMP   search_loop_encodeBlockAsm10B

emit_remainder_encodeBlockAsm10B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm10B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBlockAsm10B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm10B

two_bytes_emit_remainder_encodeBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeBlockAsm10B

one_byte_emit_remainder_encodeBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm10B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm10B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm10B

memmove_long_emit_remainder_encodeBlockAsm10B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm10B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm8B(dst []byte, src []byte) int
// Requires: SSE2
//
// Block encoder variant that uses a 256-entry hash table (8-bit hash).
// Returns the number of bytes written to dst, or 0 when the output would
// reach the destination limit computed below.
//
// Stack frame layout (1048 bytes):
//   0(SP)  qword  destination limit pointer: dst_base + len(src) - 9 - len(src)>>5
//   8(SP)  dword  source limit: len(src) - 8
//   12(SP) dword  nextEmit: source offset of the first byte not yet emitted
//   16(SP) dword  current repeat offset
//   20(SP) dword  next search position to use when nothing matches
//   24(SP) ...    hash table, 256 dword entries (1024 bytes) holding source offsets
//
// Register roles in the main loop:
//   AX = current write pointer into dst
//   CX = current source offset
//   DX = src base pointer (reused as a scratch register in emit_remainder)
TEXT ·encodeBlockAsm8B(SB), $1048-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000008, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the hash table: 8 iterations of 128 bytes = 1024 bytes.
zero_loop_encodeBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm8B
	// nextEmit = 0; compute source and destination limits; start at offset 1
	// with repeat offset 1.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

	// Main search loop. The next candidate position is
	// CX + 4 + (CX - nextEmit)>>4, so the stride grows while no match is found.
search_loop_encodeBlockAsm8B:
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBlockAsm8B
	// DI = 8 source bytes at CX. Hash the 4 bytes at CX (R10) and at CX+1 (R11):
	// keep the low 4 bytes (shift left 32), multiply by 0x9e3779b1, take the top 8 bits.
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x38, R11
	// Fetch the previous table entries (SI for CX, R8 for CX+1), then record
	// CX and CX+1 in their slots.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash of the 4 bytes at CX+2, used later in no_repeat_found.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
	// CX+1-repeat.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm8B
	// Repeat found at CX+1. DI = match start, R8 = nextEmit, SI = DI - repeat.
	LEAL  1(CX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm8B

	// Extend the repeat backwards while DI > nextEmit, the preceding bytes are
	// equal, and SI has not reached 0.
repeat_extend_back_loop_encodeBlockAsm8B:
	CMPL DI, R8
	JLE  repeat_extend_back_end_encodeBlockAsm8B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeBlockAsm8B
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm8B

	// Emit the pending literals src[nextEmit:DI] (skipped when empty).
	// R10 = literal source pointer, R9 = literal length, SI = length-1.
repeat_extend_back_end_encodeBlockAsm8B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeBlockAsm8B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeBlockAsm8B
	// length-1 >= 256: tag byte 0xf4 followed by a 16-bit length-1.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm8B

	// 60 <= length-1 < 256: tag byte 0xf0 followed by an 8-bit length-1.
two_bytes_repeat_emit_encodeBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeBlockAsm8B
	JMP  memmove_long_repeat_emit_encodeBlockAsm8B

	// length-1 < 60: single tag byte (length-1)<<2.
one_byte_repeat_emit_encodeBlockAsm8B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

	// Short literal copy (this path is reached with length <= 64).
	// SI = dst pointer after the copy.
memmove_repeat_emit_encodeBlockAsm8B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64

	// length <= 8: a full 8-byte load and store is always performed.
	// NOTE(review): this touches up to 7 bytes beyond the literal in both src
	// and dst; confirm the callers guarantee that slack.
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B

	// 9..16 bytes: two 8-byte moves, the second anchored at the end (may overlap).
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B

	// 17..32 bytes: two 16-byte moves, the second anchored at the end.
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm8B

	// 33..64 bytes: first 32 and last 32 bytes as four 16-byte moves.
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm8B:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm8B

	// Long literal copy. SI = dst pointer after the copy.
memmove_long_repeat_emit_encodeBlockAsm8B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are preloaded into X0-X3 and stored last with
	// unaligned stores. The middle is copied in 32-byte steps using aligned
	// stores: R13 = 64 - (AX & 31), so AX + R13 - 32 is 32-byte aligned.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back

	// Copy 32 bytes per iteration while R13 <= length.
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

	// Extend the repeat forward. CX += 5 skips the byte at CX plus the 4 bytes
	// already compared. R9 = bytes remaining in src, R10 = src+CX,
	// SI = src+CX-repeat.
emit_literal_done_repeat_emit_encodeBlockAsm8B:
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R12 = number of equal bytes. Compare 8 bytes at a time while at least 8
	// bytes remain, then finish byte by byte.
	XORL R12, R12
	CMPL R9, $0x08
	JL   matchlen_single_repeat_extend_encodeBlockAsm8B

matchlen_loopback_repeat_extend_encodeBlockAsm8B:
	MOVQ  (R10)(R12*1), R11
	XORQ  (SI)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_repeat_extend_encodeBlockAsm8B
	// Lowest set bit of the XOR, divided by 8, is the index of the first
	// differing byte.
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   repeat_extend_forward_end_encodeBlockAsm8B

matchlen_loop_repeat_extend_encodeBlockAsm8B:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm8B

matchlen_single_repeat_extend_encodeBlockAsm8B:
	TESTL R9, R9
	JZ    repeat_extend_forward_end_encodeBlockAsm8B

matchlen_single_loopback_repeat_extend_encodeBlockAsm8B:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm8B
	LEAL 1(R12), R12
	DECL R9
	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm8B

	// CX = end of match; SI = CX - DI (DI is the back-extended start) = length;
	// DI = repeat offset. R8 still holds the nextEmit value loaded before the
	// literals were emitted; when it is 0 the match is emitted as a copy
	// instead of a repeat.
repeat_extend_forward_end_encodeBlockAsm8B:
	ADDL  R12, CX
	MOVL  CX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm8B

	// emitRepeat
	// DI = length, SI = length-4.
	MOVL SI, DI
	LEAL -4(SI), SI
	CMPL DI, $0x08
	JLE  repeat_two_match_repeat_encodeBlockAsm8B
	// Both outcomes of this compare reach the label on the next line.
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm8B

cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
	CMPL SI, $0x00000104
	JLT  repeat_three_match_repeat_encodeBlockAsm8B
	// 4-byte form: word 0x0019 followed by 16-bit (length-4-256).
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

	// 3-byte form: word 0x0015 followed by 8-bit (length-8).
repeat_three_match_repeat_encodeBlockAsm8B:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

	// 2-byte form: 16-bit value ((length-4)<<2)|1.
repeat_two_match_repeat_encodeBlockAsm8B:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B
	// NOTE(review): the instructions below follow an unconditional JMP and have
	// no label, so they are unreachable as written.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_as_copy_encodeBlockAsm8B:
	// emitCopy
	// SI = length, DI = offset.
two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
	// length > 64: write tag byte 0xee and the 16-bit offset, subtract 60 from
	// the length, and emit the remainder with the repeat encoding below.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
	// DI = remaining length, SI = remaining length-4.
	MOVL SI, DI
	LEAL -4(SI), SI
	CMPL DI, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
	// Both outcomes of this compare reach the label on the next line.
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B
	// NOTE(review): the instructions below follow an unconditional JMP and have
	// no label, so they are unreachable as written.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B

	// length <= 64. For length < 12 a 2-byte copy is written: the low byte of
	// 1 - 16 + length*4 combined with (offset>>8)<<5, then the low offset byte.
	// Only the low byte of the LEAL result is stored, so only BL matters.
	// NOTE(review): no offset range check is made on this path; presumably
	// offsets always fit in 11 bits for this variant - confirm.
two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm8B
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

	// 3-byte copy: tag byte = low byte of 2 - 4 + length*4, then 16-bit offset.
emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

	// Everything up to CX has been emitted; resume searching.
repeat_end_emit_encodeBlockAsm8B:
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm8B

	// No repeat. Test the table candidates: SI against the bytes at CX, R8
	// against the bytes at CX+1, then the entry for the hash of CX+2 against
	// the bytes at CX+2. The CX+2 slot is updated to CX+2 on the way.
no_repeat_found_encodeBlockAsm8B:
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm8B
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm8B
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm8B
	// Nothing matched: continue at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm8B

	// Match at CX+2; SI already holds the candidate.
candidate3_match_encodeBlockAsm8B:
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm8B

	// Match at CX+1 with candidate R8.
candidate2_match_encodeBlockAsm8B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

	// CX = match position, SI = candidate offset. Extend the match backwards
	// while CX > nextEmit, the preceding bytes are equal, and SI is non-zero.
candidate_match_encodeBlockAsm8B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm8B

match_extend_back_loop_encodeBlockAsm8B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBlockAsm8B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBlockAsm8B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm8B
	JMP  match_extend_back_loop_encodeBlockAsm8B

	// Destination check: return 0 unless AX + 3 + (CX - nextEmit) is below the
	// limit stored at (SP).
match_extend_back_end_encodeBlockAsm8B:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Emit the pending literals src[nextEmit:CX] (skipped when empty).
	// DI = literal source pointer, R9 = literal length, R8 = length-1.
	// Same tag selection and copy strategy as the repeat path above.
match_dst_size_check_encodeBlockAsm8B:
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeBlockAsm8B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm8B

two_bytes_match_emit_encodeBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeBlockAsm8B
	JMP  memmove_long_match_emit_encodeBlockAsm8B

one_byte_match_emit_encodeBlockAsm8B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

	// Short literal copy; R8 = dst pointer after the copy.
memmove_match_emit_encodeBlockAsm8B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64

	// length <= 8: always moves a full 8 bytes.
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm8B

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm8B:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm8B

	// Long literal copy; R8 = dst pointer after the copy.
memmove_long_match_emit_encodeBlockAsm8B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// Head and tail 32 bytes are preloaded in X0-X3 and stored last; the middle
	// is copied in 32-byte steps with aligned stores.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

	// Match with no pending literals. CX = match position, SI = candidate.
	// The offset CX - SI is stored as the new repeat offset; the first 4 bytes
	// are already known to match, so the length count starts after them.
emit_literal_done_match_emit_encodeBlockAsm8B:
match_nolit_loop_encodeBlockAsm8B:
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes; DI = bytes remaining in src.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeBlockAsm8B

matchlen_loopback_match_nolit_encodeBlockAsm8B:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeBlockAsm8B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeBlockAsm8B

matchlen_loop_match_nolit_encodeBlockAsm8B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBlockAsm8B

matchlen_single_match_nolit_encodeBlockAsm8B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeBlockAsm8B

matchlen_single_loopback_match_nolit_encodeBlockAsm8B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm8B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm8B

	// CX = end of match, SI = offset, R10 = total match length (extension + 4);
	// nextEmit = CX.
match_nolit_end_encodeBlockAsm8B:
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	// R10 = length, SI = offset. Same encoding as repeat_as_copy above.
two_byte_offset_match_nolit_encodeBlockAsm8B:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm8B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	// SI = remaining length, R10 = remaining length-4.
	MOVL R10, SI
	LEAL -4(R10), R10
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
	// Both outcomes of this compare reach the label on the next line.
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
	// NOTE(review): the instructions below follow an unconditional JMP and have
	// no label, so they are unreachable as written.
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
	JMP two_byte_offset_match_nolit_encodeBlockAsm8B

	// length <= 64: 2-byte copy when length < 12, otherwise 3-byte copy.
two_byte_offset_short_match_nolit_encodeBlockAsm8B:
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBlockAsm8B
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

emit_copy_three_match_nolit_encodeBlockAsm8B:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

	// After the copy: finish if CX reached the source limit, load the 8 bytes
	// at CX-2, and return 0 if AX reached the destination limit.
match_nolit_emitcopy_end_encodeBlockAsm8B:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBlockAsm8B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Hash the 4 bytes at CX-2 (R8) and at CX (SI), record both positions in
	// the table, and check whether the previous entry for the CX hash matches
	// the 4 bytes at CX. If so, go straight to another match with no literals;
	// otherwise advance one byte and search again.
match_nolit_dst_ok_encodeBlockAsm8B:
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x38, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x38, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm8B
	INCL  CX
	JMP   search_loop_encodeBlockAsm8B

	// Emit src[nextEmit:] as literals. Returns 0 unless
	// AX + 3 + (len(src) - nextEmit) is below the destination limit.
emit_remainder_encodeBlockAsm8B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// CX = literal source pointer, SI = literal length, DX = length-1.
	// DX no longer holds the src base from here on.
emit_remainder_ok_encodeBlockAsm8B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBlockAsm8B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm8B

two_bytes_emit_remainder_encodeBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeBlockAsm8B

one_byte_emit_remainder_encodeBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

	// Short literal copy; DX = dst pointer after the copy, BX = length.
memmove_emit_remainder_encodeBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64

	// length <= 8: always moves a full 8 bytes.
	// NOTE(review): when fewer than 8 bytes remain this 8-byte load starts at
	// the final literal and extends past src_base+src_len; confirm that reading
	// those bytes is safe for all callers.
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm8B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm8B

	// Long literal copy; DX = dst pointer after the copy, BX = length.
memmove_long_emit_remainder_encodeBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail 32 bytes are preloaded in X0-X3 and stored last; the middle
	// is copied in 32-byte steps with aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

	// Return the number of bytes written: AX - dst_base.
emit_literal_done_emit_remainder_encodeBlockAsm8B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm(dst []byte, src []byte) int
// Requires: SSE2
//
// encodeBetterBlockAsm compresses src into dst and returns the number of
// bytes written to dst. It returns 0 when the output would reach the internal
// destination limit dst_base + (len(src) - 6 - len(src)/32).
//
// No check against len(dst) is made here; the caller is presumably expected
// to supply a large enough dst and a src long enough for the 8-byte loads in
// the search loop (verify against the Go wrapper).
//
// Frame layout:
//   0(SP)       destination limit pointer (8 bytes)
//   8(SP)       search limit: len(src)-8
//   12(SP)      nextEmit: src offset of the first byte not yet emitted
//   16(SP)      offset of the previously emitted match (0 until the first one)
//   20(SP)      next search position to use when the current one fails
//   24(SP)      long table:  65536 x uint32 positions, keyed on 7 source bytes
//   262168(SP)  short table: 16384 x uint32 positions, keyed on 4 source bytes
//
// Long-lived registers: AX = dst write pointer, DX = src base pointer,
// CX = current src position (s). BX/BL is used as scratch.
//
// NOTE: this file is generated (see the header); a change made here should
// also be applied to the generator so it is not lost on regeneration.
TEXT ·encodeBetterBlockAsm(SB), $327704-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear both tables: 0xa00 iterations x 128 bytes = 327680 bytes.
zero_loop_encodeBetterBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm

	// nextEmit = 0
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX

	// 8(SP) = len(src)-8, (SP) = dst + len(src) - 6 - len(src)>>5
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)

	// s = 1, previous match offset = 0, DX = src base
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm:
	// SI = next position = s + 1 + min((s-nextEmit)>>7, 99)
	MOVL CX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JLE  check_maxskip_ok_encodeBetterBlockAsm
	LEAL 100(CX), SI
	JMP  check_maxskip_cont_encodeBetterBlockAsm

check_maxskip_ok_encodeBetterBlockAsm:
	LEAL 1(CX)(SI*1), SI

check_maxskip_cont_encodeBetterBlockAsm:
	// Stop searching once the next position reaches the search limit.
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm

	// DI = 8 source bytes at s; remember the next position in 20(SP).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11

	// R10 = long-table index:  ((cv << 8)  * R9) >> 48  (16 bits)
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10

	// R11 = short-table index: ((cv << 32) * SI) >> 50  (14 bits)
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11

	// SI = long candidate, R8 = short candidate; store s in both slots.
	MOVL  24(SP)(R10*4), SI
	MOVL  262168(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 262168(SP)(R11*4)

	// Compare the 4 bytes at each candidate with the low 4 bytes of cv.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm

candidateS_match_encodeBetterBlockAsm:
	// Short candidate matched. Before using it, look up the long table for
	// position s+1 (cv shifted by one byte) and prefer that match if it hits.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm

	// No hit at s+1: go back to s and use the short candidate.
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm:
	// Extend the match backwards while the preceding bytes are equal, without
	// going below nextEmit (DI) or below candidate position 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm

match_extend_back_loop_encodeBetterBlockAsm:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBetterBlockAsm
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm
	JMP  match_extend_back_loop_encodeBetterBlockAsm

match_extend_back_end_encodeBetterBlockAsm:
	// Give up (return 0) if dst + pending literals + 5 reaches the dst limit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 5(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm:
	// DI = match start (base). Skip the 4 bytes already known to match and
	// measure how much further the match runs; R8 = bytes left in src.
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen: R12 = number of equal bytes at R9 / R10, at most R8.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm

matchlen_loopback_match_nolit_encodeBetterBlockAsm:
	// Compare 8 bytes at a time; on a difference the lowest set bit of the
	// XOR, divided by 8, is the index of the first differing byte.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm

matchlen_loop_match_nolit_encodeBetterBlockAsm:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm

matchlen_single_match_nolit_encodeBetterBlockAsm:
	// Fewer than 8 bytes left: finish byte by byte.
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm

match_nolit_end_encodeBetterBlockAsm:
	// R8 = match offset (s - candidate).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm

	// Reject a match that extends by at most 1 byte beyond the initial 4 when
	// its offset is above 65535; resume searching at the saved position + 1.
	CMPL R12, $0x01
	JG   match_length_ok_encodeBetterBlockAsm
	CMPL R8, $0x0000ffff
	JLE  match_length_ok_encodeBetterBlockAsm
	MOVL 20(SP), CX
	INCL CX
	JMP  search_loop_encodeBetterBlockAsm

match_length_ok_encodeBetterBlockAsm:
	// Remember the offset, then emit the literals src[nextEmit:base] (if any).
	// R9 = literal count, R10 = literal source, SI = count-1 for the tag.
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI

	// Pick the literal header size from count-1.
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeBetterBlockAsm
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeBetterBlockAsm
	CMPL SI, $0x00010000
	JLT  three_bytes_match_emit_encodeBetterBlockAsm
	CMPL SI, $0x01000000
	JLT  four_bytes_match_emit_encodeBetterBlockAsm

	// Tag 0xfc followed by count-1 as 4 bytes.
	MOVB $0xfc, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

four_bytes_match_emit_encodeBetterBlockAsm:
	// Tag 0xf8 followed by count-1 as 3 bytes.
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

three_bytes_match_emit_encodeBetterBlockAsm:
	// Tag 0xf4 followed by count-1 as 2 bytes.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

two_bytes_match_emit_encodeBetterBlockAsm:
	// Tag 0xf0 followed by count-1 as 1 byte. Up to 64 literals still use the
	// short copy routine.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeBetterBlockAsm
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

one_byte_match_emit_encodeBetterBlockAsm:
	// Single tag byte: (count-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm:
	// SI = dst pointer after the literals.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Counts of 1..4 are all copied as 4 bytes. The extra source bytes read
	// here belong to the match that follows these literals (at least 4 bytes).
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
	// Overlapping head and tail loads cover every length in the range.
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm

memmove_long_match_emit_encodeBetterBlockAsm:
	// SI = dst pointer after the literals.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front (X0-X3) and stored
	// last with unaligned stores. The loops in between store with MOVOA;
	// R14 = 64 - (AX & 31) makes -32(AX)(R14*1) a 32-byte aligned address.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while R14 <= R9.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeBetterBlockAsm:
	// s += extension; R12 = total match length; nextEmit = s.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// Offsets below 65536 use the 2-byte-offset encodings.
	CMPL R8, $0x00010000
	JL   two_byte_offset_match_nolit_encodeBetterBlockAsm

four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
	// 4-byte offset. More than 64 bytes: emit a 64-byte copy (tag 0xff) and
	// encode the rest as a repeat, or as another copy if fewer than 4 remain.
	CMPL R12, $0x40
	JLE  four_bytes_remain_match_nolit_encodeBetterBlockAsm
	MOVB $0xff, (AX)
	MOVL R8, 1(AX)
	LEAL -64(R12), R12
	ADDQ $0x05, AX
	CMPL R12, $0x04
	JL   four_bytes_remain_match_nolit_encodeBetterBlockAsm

	// emitRepeat
	// SI = length, R12 = length-4. The encoding is chosen by length (and by
	// offset < 2048 for the short two-byte form).
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL R12, $0x00010100
	JLT  repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL R12, $0x0100ffff
	JLT  repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy

	// Too long for one 5-byte repeat: emit the maximum one and go around.
	LEAL -16842747(R12), R12
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy

repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
	// 5 bytes: 0x1d 0x00, then (length-4-65536) as 3 bytes. R8 is reused as
	// scratch; the offset is not needed after this point.
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (AX)
	MOVW R12, 2(AX)
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
	// 4 bytes: 0x19 0x00, then (length-4-256) as 2 bytes.
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
	// 3 bytes: 0x15 0x00, then (length-8) as 1 byte.
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
	// 2 bytes: ((length-4) << 2 | 1) stored as a 16-bit word.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
	// 2 bytes: tag = 1 + (length-4)*4 | (offset>>8)<<5, then low offset byte.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

four_bytes_remain_match_nolit_encodeBetterBlockAsm:
	// Final 4-byte-offset copy: tag = 3 + (length-1)*4, then 4 offset bytes.
	// Only BL is initialised; just the low byte of the sum is stored.
	TESTL R12, R12
	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm
	MOVB  $0x03, BL
	LEAL  -4(BX)(R12*4), R12
	MOVB  R12, (AX)
	MOVL  R8, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm

two_byte_offset_match_nolit_encodeBetterBlockAsm:
	// 2-byte offset. More than 64 bytes: emit a 60-byte copy (tag 0xee) and
	// encode the rest as a repeat.
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX

	// emitRepeat
	// Same encoding choices as the repeat emitter in the 4-byte-offset path.
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL R12, $0x00010100
	JLT  repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL R12, $0x0100ffff
	JLT  repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
	LEAL -16842747(R12), R12
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short

repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (AX)
	MOVW R12, 2(AX)
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
	// Length <= 64. The 2-byte form needs length < 12 and offset < 2048:
	// tag = 1 + (length-4)*4 | (offset>>8)<<5, then the low offset byte.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

emit_copy_three_match_nolit_encodeBetterBlockAsm:
	// 3-byte form: tag = 2 + (length-1)*4, then the offset as 2 bytes.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

match_is_repeat_encodeBetterBlockAsm:
	// The offset equals the previous match offset: emit pending literals the
	// same way as above, then encode the match as a repeat.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm
	CMPL SI, $0x00010000
	JLT  three_bytes_match_emit_repeat_encodeBetterBlockAsm
	CMPL SI, $0x01000000
	JLT  four_bytes_match_emit_repeat_encodeBetterBlockAsm
	MOVB $0xfc, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

four_bytes_match_emit_repeat_encodeBetterBlockAsm:
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

three_bytes_match_emit_repeat_encodeBetterBlockAsm:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

two_bytes_match_emit_repeat_encodeBetterBlockAsm:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_repeat_encodeBetterBlockAsm
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

one_byte_match_emit_repeat_encodeBetterBlockAsm:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// As in the non-repeat path, counts 1..4 are copied as 4 bytes; the extra
	// source bytes belong to the match that follows.
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm

memmove_long_match_emit_repeat_encodeBetterBlockAsm:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Same scheme as the long copy in the non-repeat path: head and tail are
	// loaded first and stored last; the loops use aligned stores.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
	// s += extension; R12 = total match length; nextEmit = s.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitRepeat
emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm
	CMPL R12, $0x00010100
	JLT  repeat_four_match_nolit_repeat_encodeBetterBlockAsm
	CMPL R12, $0x0100ffff
	JLT  repeat_five_match_nolit_repeat_encodeBetterBlockAsm
	LEAL -16842747(R12), R12
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm

repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (AX)
	MOVW R12, 2(AX)
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm:
	// Finish if s reached the search limit; fail (return 0) if the dst
	// pointer reached the dst limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm:
	// Index positions around the match before searching again. DI still holds
	// the match start (base):
	//   long table:  base+1, base+2, s-2, s-1
	//   short table: base+2, base+3, s-1
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x32, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 262168(SP)(R11*4)
	MOVL  R15, 262168(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 262168(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm

emit_remainder_encodeBetterBlockAsm:
	// Emit src[nextEmit:] as literals. Fail (return 0) if dst + remaining
	// bytes + 5 reaches the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm:
	// Nothing to do when nextEmit == len(src). Otherwise SI = literal count,
	// CX = literal source pointer, DX = count-1 for the tag (the src base in
	// DX is no longer needed).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBetterBlockAsm
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm
	CMPL DX, $0x00010000
	JLT  three_bytes_emit_remainder_encodeBetterBlockAsm
	CMPL DX, $0x01000000
	JLT  four_bytes_emit_remainder_encodeBetterBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

four_bytes_emit_remainder_encodeBetterBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

three_bytes_emit_remainder_encodeBetterBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

two_bytes_emit_remainder_encodeBetterBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBetterBlockAsm
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

one_byte_emit_remainder_encodeBetterBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm:
	// DX = dst pointer after the literals, BX = literal count (1..64).
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// The remainder runs up to the last byte of src, so there are no spare
	// source bytes to over-read: counts of 1..3 get exact-width copies rather
	// than a 4-byte load that would read past the end of src.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
	// First and last byte; for a count of 1 these are the same byte.
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
	// For a count of 4 both loads cover the same 4 bytes.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm

memmove_long_emit_remainder_encodeBetterBlockAsm:
	// DX = dst pointer after the literals, BX = literal count (> 64).
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail (32 bytes each) are loaded first and stored last; the
	// loops in between use aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm:
	// Return the number of bytes written: dst write pointer - dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
// Requires: SSE2
TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm4MB:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm4MB
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm4MB:
	MOVL CX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JLE  check_maxskip_ok_encodeBetterBlockAsm4MB
	LEAL 100(CX), SI
	JMP  check_maxskip_cont_encodeBetterBlockAsm4MB

check_maxskip_ok_encodeBetterBlockAsm4MB:
	LEAL 1(CX)(SI*1), SI

check_maxskip_cont_encodeBetterBlockAsm4MB:
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm4MB
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	MOVL  24(SP)(R10*4), SI
	MOVL  262168(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 262168(SP)(R11*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm4MB
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm4MB
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm4MB

candidateS_match_encodeBetterBlockAsm4MB:
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm4MB
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm4MB:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm4MB

match_extend_back_loop_encodeBetterBlockAsm4MB:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBetterBlockAsm4MB
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm4MB
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm4MB
	JMP  match_extend_back_loop_encodeBetterBlockAsm4MB

match_extend_back_end_encodeBetterBlockAsm4MB:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 4(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm4MB:
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm4MB

matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm4MB

matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB

matchlen_single_match_nolit_encodeBetterBlockAsm4MB:
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm4MB

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm4MB
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB

match_nolit_end_encodeBetterBlockAsm4MB:
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm4MB
	CMPL R12, $0x01
	JG   match_length_ok_encodeBetterBlockAsm4MB
	CMPL R8, $0x0000ffff
	JLE  match_length_ok_encodeBetterBlockAsm4MB
	MOVL 20(SP), CX
	INCL CX
	JMP  search_loop_encodeBetterBlockAsm4MB

match_length_ok_encodeBetterBlockAsm4MB:
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeBetterBlockAsm4MB
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeBetterBlockAsm4MB
	CMPL SI, $0x00010000
	JLT  three_bytes_match_emit_encodeBetterBlockAsm4MB
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

three_bytes_match_emit_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

two_bytes_match_emit_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeBetterBlockAsm4MB
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

one_byte_match_emit_encodeBetterBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm4MB:
	// Short copy: moves R9 bytes from (R10) to (AX) and then advances AX by R9.
	// SI = AX + R9, the output position once the copy is finished.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Dispatch on the byte count in R9. Each size class loads everything it
	// needs before storing, and the head/tail accesses may overlap, so no
	// byte-by-byte loop is required.
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
	// R9 <= 4 (signed compare): a fixed 4-byte load/store. Four bytes are read
	// and written even when R9 < 4; AX still advances by R9 only.
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
	// 5..7 bytes: first 4 and last 4 bytes. R10 is reused as a temporary, so
	// the source pointer is gone after this point.
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
	// 8..16 bytes: first 8 and last 8 bytes (R10 is clobbered here as well).
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
	// 17..32 bytes: first 16 and last 16 bytes via unaligned SSE moves.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
	// More than 32 bytes: first 32 and last 32 bytes. This covers the whole
	// range only while R9 <= 64; longer copies are expected to be routed to
	// memmove_long_match_emit_encodeBetterBlockAsm4MB instead.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
	// Advance the output pointer past the bytes just copied.
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm4MB

memmove_long_match_emit_encodeBetterBlockAsm4MB:
	// Long copy: moves R9 bytes from (R10) to (AX) and then advances AX by R9.
	// SI = AX + R9, the output position once the copy is finished.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Keep the first and the last 32 source bytes in X0..X3. They are stored
	// last, after the loops, to fill the edges the aligned stores do not cover.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = R9 / 32.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	// R14 = 64 - (AX & 31). AX + R14 - 32 is therefore a multiple of 32, which
	// keeps the MOVOA (aligned) stores below on aligned destination addresses.
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// NOTE(review): DECQ does not write CF, so this JA sees ZF from DECQ and the
	// CF left behind by the SUBQ above - confirm this is the intended condition.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
	// 32 bytes per iteration from (R11) to (R15): unaligned loads, aligned
	// stores. R11, R15 and R14 advance together.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	// NOTE(review): as above, JNA combines ZF from DECQ with CF from the last ADDQ.
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	// Copy the 32 bytes that end at offset R14, then step R14 by 32; repeat
	// while R9 >= R14 (unsigned compare).
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	// Write the saved head and tail, then advance the output pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
	// Reached from both literal-copy paths above. Advances CX by R12, turns R12
	// into R12 + 4 and records the new CX in 12(SP).
	// NOTE(review): R12 is presumably the match length beyond its first 4 bytes
	// and R8 the match offset; both are computed before this section - confirm.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// Offsets below 65536 use the forms with a 16-bit (or shorter) offset.
	CMPL R8, $0x00010000
	JL   two_byte_offset_match_nolit_encodeBetterBlockAsm4MB

four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
	// Length <= 64 fits a single 5-byte element (see four_bytes_remain).
	CMPL R12, $0x40
	JLE  four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
	// Otherwise emit one 5-byte element for 64 bytes: tag 0xff = 3 + 4*(64-1),
	// followed by the 32-bit offset in R8, and subtract 64 from the length.
	MOVB $0xff, (AX)
	MOVL R8, 1(AX)
	LEAL -64(R12), R12
	ADDQ $0x05, AX
	// Fewer than 4 bytes left: finish with the 5-byte form instead of a repeat.
	CMPL R12, $0x04
	JL   four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB

	// emitRepeat
	// SI = remaining length, R12 = remaining length - 4. The encoding is picked
	// by size; every branch jumps to match_nolit_emitcopy_end when done.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL R12, $0x00010100
	JLT  repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	// 5 bytes: 0x1d, 0x00, then the low 24 bits of (R12 - 65536). R8 is reused
	// as a scratch register for the top byte.
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (AX)
	MOVW R12, 2(AX)
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	// 4 bytes: 0x19, 0x00, then the low 16 bits of (R12 - 256).
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	// 3 bytes: 0x15, 0x00, then the low byte of (R12 - 4).
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	// 2 bytes: the 16-bit value (R12 << 2) | 1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	// 2 bytes carrying the offset: byte 0 = 1 | (R12 << 2) | ((R8 >> 8) << 5),
	// byte 1 = low byte of R8. Only the low byte of R12 is stored.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
	// NOTE(review): the next JMP is unreachable - it directly follows an
	// unconditional JMP and no label intervenes.
	JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB

four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
	// Nothing left to encode when the remaining length is zero.
	TESTL R12, R12
	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
	// 5 bytes: tag = 3 + 4*(R12 - 1), then the 32-bit offset. Only the low byte
	// of the LEAL result is stored, so the upper bits of BX do not matter.
	MOVB  $0x03, BL
	LEAL  -4(BX)(R12*4), R12
	MOVB  R12, (AX)
	MOVL  R8, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
	// Offset < 65536. Length <= 64 is handled by the short forms below.
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
	// Otherwise emit a 3-byte element for 60 bytes: tag 0xee = 2 + 4*(60-1),
	// then the 16-bit offset; the rest of the length is encoded as a repeat.
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX

	// emitRepeat
	// Same size-based selection as in the four-byte-offset path above.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL R12, $0x00010100
	JLT  repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	// 5 bytes: 0x1d, 0x00, then the low 24 bits of (R12 - 65536).
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (AX)
	MOVW R12, 2(AX)
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	// 4 bytes: 0x19, 0x00, then the low 16 bits of (R12 - 256).
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	// 3 bytes: 0x15, 0x00, then the low byte of (R12 - 4).
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	// 2 bytes: the 16-bit value (R12 << 2) | 1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	// 2 bytes carrying the offset: byte 0 = 1 | (R12 << 2) | ((R8 >> 8) << 5),
	// byte 1 = low byte of R8.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
	// NOTE(review): the next JMP is unreachable - it directly follows an
	// unconditional JMP and no label intervenes.
	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB

two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
	// Length >= 12 or offset >= 2048 needs the 3-byte form.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
	// 2 bytes: byte 0 = 1 + 4*(R12 - 4) | ((R8 >> 8) << 5), byte 1 = low byte
	// of R8. Only the low byte of the LEAL result is stored.
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
	// 3 bytes: tag = 2 + 4*(R12 - 1), then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

match_is_repeat_encodeBetterBlockAsm4MB:
	// Emit the pending literal run, i.e. the input bytes between offset 12(SP)
	// and offset DI. Nothing is emitted when the two are equal.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
	// R9 = run length (DI - old 12(SP)), R10 = DX + old 12(SP) = first byte to
	// copy, SI = R9 - 1 (the value stored in the header). 12(SP) becomes DI.
	// NOTE(review): DX is presumably the src base pointer, loaded earlier - confirm.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size depends on SI: < 60 one byte, < 256 two, < 65536 three,
	// otherwise four.
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x00010000
	JLT  three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	// 4-byte header: 0xf8 (62 << 2) followed by the low 24 bits of SI.
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	// 3-byte header: 0xf4 (61 << 2) followed by the 16-bit SI.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	// 2-byte header: 0xf0 (60 << 2) followed by the low byte of SI.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// Runs with SI < 64 (at most 64 bytes) use the short copy, longer ones the
	// long copy.
	CMPL SI, $0x40
	JL   memmove_match_emit_repeat_encodeBetterBlockAsm4MB
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
	// 1-byte header: SI << 2. Falls through into the short copy below.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
	// Short copy: moves R9 bytes from (R10) to (AX) and then advances AX by R9.
	// SI = AX + R9, the output position once the copy is finished.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Dispatch on the byte count in R9; head and tail accesses may overlap and
	// all loads of a size class happen before its stores.
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
	// R9 <= 4: always moves 4 bytes, even when fewer are requested.
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
	// 5..7 bytes: first 4 and last 4 bytes (R10 is clobbered as a temporary).
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
	// 8..16 bytes: first 8 and last 8 bytes (R10 is clobbered as a temporary).
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
	// 17..32 bytes: first 16 and last 16 bytes.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
	// More than 32 bytes: first 32 and last 32 bytes; complete only while
	// R9 <= 64.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
	// Advance the output pointer past the bytes just copied.
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB

memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
	// Long copy: moves R9 bytes from (R10) to (AX) and then advances AX by R9.
	// SI = AX + R9, the output position once the copy is finished.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0..X3 hold the first and last 32 source bytes; they are stored after
	// the loops to fill the edges the aligned stores do not cover.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = R9 / 32; R14 = 64 - (AX & 31), so AX + R14 - 32 is a multiple of 32
	// and the MOVOA stores below land on aligned addresses.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// NOTE(review): DECQ does not write CF, so this JA sees ZF from DECQ and the
	// CF left behind by the SUBQ above - confirm this is the intended condition.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
	// 32 bytes per iteration from (R11) to (R15); R11, R15, R14 advance together.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	// NOTE(review): JNA combines ZF from DECQ with CF from the last ADDQ.
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R14, step R14 by 32, and repeat while
	// R9 >= R14 (unsigned compare).
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	// Write the saved head and tail, then advance the output pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
	// Advances CX by R12, turns R12 into R12 + 4 and records the new CX in
	// 12(SP), then encodes the match as a repeat element.
	// NOTE(review): R12 is presumably the match length beyond its first 4 bytes
	// and R8 the match offset, both computed earlier - confirm.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitRepeat
	// SI = length, R12 = length - 4. The encoding is chosen by size.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL R12, $0x00010100
	JLT  repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
	// 5 bytes: 0x1d, 0x00, then the low 24 bits of (R12 - 65536). R8 is reused
	// as a scratch register for the top byte.
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (AX)
	MOVW R12, 2(AX)
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
	// 4 bytes: 0x19, 0x00, then the low 16 bits of (R12 - 256).
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
	// 3 bytes: 0x15, 0x00, then the low byte of (R12 - 4).
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
	// 2 bytes: the 16-bit value (R12 << 2) | 1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	// 2 bytes carrying the offset: byte 0 = 1 | (R12 << 2) | ((R8 >> 8) << 5),
	// byte 1 = low byte of R8. Execution then falls through to
	// match_nolit_emitcopy_end_encodeBetterBlockAsm4MB.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
	// Once CX has reached the bound stored in 8(SP) (signed compare), stop
	// searching and emit whatever input is left.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm4MB
	// Return 0 when the output pointer has reached the limit stored in (SP).
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm4MB:
	// Refresh both position tables around the match just encoded.
	// SI and R8 are the multipliers of the two hashes:
	//   "long"  index = ((v << 8)  * SI) >> 48  (16 bits), table at 24(SP)
	//   "short" index = ((v << 32) * R8) >> 50  (14 bits), table at 262168(SP)
	// NOTE(review): DI is presumably the start position of the match and DX the
	// src base pointer, both set earlier - confirm.
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	// v = 8 input bytes at position DI+1 (DI is incremented first).
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	// R10 = v, R13 = v >> 8 (long hash inputs); R11 = v >> 8, R12 = v >> 16
	// (short hash inputs). R14 = DI + 1, R15 = DI + 2 are the positions that
	// belong to the shifted values.
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	// R9 is reloaded with the 8 bytes at position CX-2 for the second round.
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x32, R12
	// long[h(v)] = DI, long[h(v>>8)] = DI+1, short[h(v>>8)] = DI+1,
	// short[h(v>>16)] = DI+2.
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 262168(SP)(R11*4)
	MOVL  R15, 262168(SP)(R12*4)
	// Second round with w = 8 bytes at CX-2: R10 = w, R11 = R13 = w >> 8,
	// R9 = CX - 2 and DI = CX - 1 are the matching positions.
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	// long[h(w)] = CX-2, short[h(w>>8)] = CX-1, long[h(w>>8)] = CX-1.
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 262168(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm4MB

emit_remainder_encodeBetterBlockAsm4MB:
	// Check that the remaining input, src_len - 12(SP) bytes, plus 4 more bytes
	// still fits below the output limit stored in (SP); otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 4(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm4MB:
	// Nothing to emit when 12(SP) already equals src_len.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
	// SI = number of bytes to emit (src_len - old 12(SP)), CX = DX + old 12(SP)
	// = first byte to copy, 12(SP) = src_len. DX is then overwritten with
	// SI - 1, the value stored in the header, and no longer points into src.
	// NOTE(review): DX is presumably the src base pointer on entry - confirm.
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Header size depends on DX: < 60 one byte, < 256 two, < 65536 three,
	// otherwise four.
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00010000
	JLT  three_bytes_emit_remainder_encodeBetterBlockAsm4MB
	// 4-byte header: 0xf8 (62 << 2) followed by the low 24 bits of DX.
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	// 3-byte header: 0xf4 (61 << 2) followed by the 16-bit DX.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	// 2-byte header: 0xf0 (60 << 2) followed by the low byte of DX.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	// DX < 64 (at most 64 bytes) uses the short copy, otherwise the long copy.
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBetterBlockAsm4MB
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

one_byte_emit_remainder_encodeBetterBlockAsm4MB:
	// 1-byte header: DX << 2. Falls through into the short copy below.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm4MB:
	// Short copy: moves SI bytes from (CX) to (AX) and then advances AX by SI.
	// DX = AX + SI is the output position after the copy; BX = byte count.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Dispatch on the byte count in BX; head and tail accesses may overlap and
	// all loads of a size class happen before its stores.
	CMPQ BX, $0x04
	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4:
	// BX <= 4: always moves 4 bytes, even when fewer are requested.
	MOVL (CX), SI
	MOVL SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
	// 5..7 bytes: first 4 and last 4 bytes (CX is clobbered as a temporary).
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
	// 8..16 bytes: first 8 and last 8 bytes (CX is clobbered as a temporary).
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
	// 17..32 bytes: first 16 and last 16 bytes.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
	// More than 32 bytes: first 32 and last 32 bytes; complete only while
	// BX <= 64.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
	// Advance the output pointer past the bytes just copied.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB

memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
	// Long copy: moves SI bytes from (CX) to (AX) and then advances AX by SI.
	// DX = AX + SI is the output position after the copy; BX = byte count.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// X0..X3 hold the first and last 32 source bytes; they are stored after
	// the loops to fill the edges the aligned stores do not cover.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = BX / 32; R8 = 64 - (AX & 31), so AX + R8 - 32 is a multiple of 32
	// and the MOVOA stores below land on aligned addresses.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): DECQ does not write CF, so this JA sees ZF from DECQ and the
	// CF left behind by the SUBQ above - confirm this is the intended condition.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
	// 32 bytes per iteration from (SI) to (R9); SI, R9 and R8 advance together.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	// NOTE(review): JNA combines ZF from DECQ with CF from the last ADDQ.
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R8, step R8 by 32, and repeat while
	// BX >= R8 (unsigned compare).
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	// Write the saved head and tail, then advance the output pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
	// Return the number of bytes written: current output pointer - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
// Requires: SSE2
//
// Returns the number of bytes written to dst, or 0 when an output limit
// check fails.
//
// Stack frame, as used by the code of this function:
//   0(SP)      output pointer limit: dst_base + src_len - src_len/32 - 6
//   8(SP)      src_len - 8, the bound checked by the search loop
//   12(SP)     input offset up to which output has been emitted
//   16(SP)     offset of the previous match, compared for repeat detection
//   20(SP)     next search position computed by the search loop
//   24(SP)     table of 16384 32-bit positions, indexed by a 14-bit hash
//   65560(SP)  table of 4096 32-bit positions, indexed by a 12-bit hash
TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
	// AX is the output write pointer for the whole function.
	MOVQ dst_base+0(FP), AX
	// Zero 0x280 * 128 = 81920 bytes starting at 24(SP), i.e. both tables.
	MOVQ $0x00000280, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm12B:
	// 128 bytes per iteration.
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm12B
	// 12(SP) = 0: nothing has been emitted yet.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = src_len - 8; (SP) = dst_base + (src_len - 6 - src_len/32).
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// CX = 1 is the first search position; 16(SP) = 0 means no previous match
	// offset; DX is the src base pointer from here on.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm12B:
	// CX is the current input position. The next position to try is
	// CX + 1 + ((CX - 12(SP)) >> 6); once it reaches the bound in 8(SP)
	// (signed compare) the search ends and the remainder is emitted.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm12B
	// DI = 8 input bytes at CX; the next position is parked in 20(SP).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// R10 = ((DI << 16) * R9) >> 50: 14-bit index into the table at 24(SP).
	// R11 = ((DI << 32) * SI) >> 52: 12-bit index into the table at 65560(SP).
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x34, R11
	// Fetch both candidates (SI from the first table, R8 from the second) and
	// record CX in both slots.
	MOVL  24(SP)(R10*4), SI
	MOVL  65560(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 65560(SP)(R11*4)
	// A candidate matches when its first 4 bytes equal the low 32 bits of DI.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm12B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm12B
	// No match: continue at the position saved in 20(SP).
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm12B

candidateS_match_encodeBetterBlockAsm12B:
	// Only the second-table candidate matched. Before using it, probe the
	// first table for the bytes at CX+1 (DI >> 8), recording CX+1 there.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	// If that candidate matches, continue with CX advanced by one ...
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm12B
	// ... otherwise restore CX and use the second-table candidate R8.
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm12B:
	// SI = candidate position, CX = current position. Extend the match
	// backwards; DI = 12(SP) is the lowest position CX may reach. A candidate
	// at position 0 cannot be extended.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm12B

match_extend_back_loop_encodeBetterBlockAsm12B:
	// Stop once CX <= DI or the bytes before both positions differ. R8 is used
	// as a scratch register here.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBetterBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm12B
	// Step both positions back by one; stop when the candidate reaches 0.
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm12B
	JMP  match_extend_back_loop_encodeBetterBlockAsm12B

match_extend_back_end_encodeBetterBlockAsm12B:
	// Output space check: AX + (CX - 12(SP)) + 3 must stay below the limit in
	// (SP); otherwise return 0.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm12B:
	// DI keeps the match start. Both positions skip 4 bytes, then:
	// R8 = src_len - CX (bytes left), R9 = src + CX, R10 = src + SI.
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 counts the bytes that are equal at R9 and R10.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm12B

matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
	// Compare 8 bytes at a time; a zero XOR means all 8 are equal.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm12B
	// First difference: index of the lowest set bit divided by 8 is the number
	// of equal leading bytes in this word.
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm12B

matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
	// All 8 bytes matched: consume them and loop while at least 8 remain.
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm12B

matchlen_single_match_nolit_encodeBetterBlockAsm12B:
	// Fewer than 8 bytes remain: finish byte by byte.
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm12B

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm12B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B

match_nolit_end_encodeBetterBlockAsm12B:
	// R8 = CX - SI, the distance between current position and candidate.
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// An offset equal to the one stored in 16(SP) takes the repeat path;
	// otherwise the new offset is stored there.
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm12B
	MOVL R8, 16(SP)
	// Emit the pending literal run, i.e. the input between offset 12(SP) and
	// the match start DI. Nothing is emitted when the two are equal.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm12B
	// R9 = run length, R10 = src + old 12(SP) = first byte to copy,
	// SI = R9 - 1 (the value stored in the header). 12(SP) becomes DI.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size depends on SI: < 60 one byte, < 256 two, otherwise three.
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeBetterBlockAsm12B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeBetterBlockAsm12B
	// 3-byte header: 0xf4 (61 << 2) followed by the 16-bit SI.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B

two_bytes_match_emit_encodeBetterBlockAsm12B:
	// 2-byte header: 0xf0 (60 << 2) followed by the low byte of SI.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// SI < 64 (at most 64 bytes) uses the short copy, otherwise the long copy.
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeBetterBlockAsm12B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B

one_byte_match_emit_encodeBetterBlockAsm12B:
	// 1-byte header: SI << 2. Falls through into the short copy below.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm12B:
	// Short copy: moves R9 bytes from (R10) to (AX) and then advances AX by R9.
	// SI = AX + R9, the output position once the copy is finished.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Dispatch on the byte count in R9; head and tail accesses may overlap and
	// all loads of a size class happen before its stores.
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
	// R9 <= 4: always moves 4 bytes, even when fewer are requested.
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
	// 5..7 bytes: first 4 and last 4 bytes (R10 is clobbered as a temporary).
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
	// 8..16 bytes: first 8 and last 8 bytes (R10 is clobbered as a temporary).
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
	// 17..32 bytes: first 16 and last 16 bytes.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
	// More than 32 bytes: first 32 and last 32 bytes; complete only while
	// R9 <= 64.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
	// Advance the output pointer past the bytes just copied.
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm12B

memmove_long_match_emit_encodeBetterBlockAsm12B:
	// Long copy: moves R9 bytes from (R10) to (AX) and then advances AX by R9.
	// SI = AX + R9, the output position once the copy is finished.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0..X3 hold the first and last 32 source bytes; they are stored after
	// the loops to fill the edges the aligned stores do not cover.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = R9 / 32; R14 = 64 - (AX & 31), so AX + R14 - 32 is a multiple of 32
	// and the MOVOA stores below land on aligned addresses.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// NOTE(review): DECQ does not write CF, so this JA sees ZF from DECQ and the
	// CF left behind by the SUBQ above - confirm this is the intended condition.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
	// 32 bytes per iteration from (R11) to (R15); R11, R15, R14 advance together.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	// NOTE(review): JNA combines ZF from DECQ with CF from the last ADDQ.
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R14, step R14 by 32, and repeat while
	// R9 >= R14 (unsigned compare).
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	// Write the saved head and tail, then advance the output pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeBetterBlockAsm12B:
	// CX moves past the matched bytes (CX += R12, the count from matchLen),
	// R12 becomes the full match length (R12 + 4), and 12(SP) records CX as the
	// new start of un-emitted input. R8 is the match offset.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
	// Length <= 64 is handled by the short forms below.
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
	// Otherwise emit a 3-byte element for 60 bytes: tag 0xee = 2 + 4*(60-1),
	// then the 16-bit offset; the rest of the length is encoded as a repeat.
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX

	// emitRepeat
	// SI = remaining length, R12 = remaining length - 4. The encoding is chosen
	// by size.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	// 4 bytes: 0x19, 0x00, then the low 16 bits of (R12 - 256).
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 3 bytes: 0x15, 0x00, then the low byte of (R12 - 4).
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 2 bytes: the 16-bit value (R12 << 2) | 1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 2 bytes carrying the offset: byte 0 = 1 | (R12 << 2) | ((R8 >> 8) << 5),
	// byte 1 = low byte of R8.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
	// NOTE(review): the next JMP is unreachable - it directly follows an
	// unconditional JMP and no label intervenes.
	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
	// Length >= 12 or offset >= 2048 needs the 3-byte form.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
	// 2 bytes: byte 0 = 1 + 4*(R12 - 4) | ((R8 >> 8) << 5), byte 1 = low byte
	// of R8. Only the low byte of the LEAL result is stored, so the upper bits
	// of BX do not matter.
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
	// 3 bytes: tag = 2 + 4*(R12 - 1), then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

match_is_repeat_encodeBetterBlockAsm12B:
	// The match offset equals the previous one. First emit the pending literal
	// run, i.e. the input between offset 12(SP) and the match start DI; nothing
	// is emitted when the two are equal.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
	// R9 = run length, R10 = src + old 12(SP) = first byte to copy,
	// SI = R9 - 1 (the value stored in the header). 12(SP) becomes DI.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size depends on SI: < 60 one byte, < 256 two, otherwise three.
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm12B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
	// 3-byte header: 0xf4 (61 << 2) followed by the 16-bit SI.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B

two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
	// 2-byte header: 0xf0 (60 << 2) followed by the low byte of SI.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// SI < 64 (at most 64 bytes) uses the short copy, otherwise the long copy.
	CMPL SI, $0x40
	JL   memmove_match_emit_repeat_encodeBetterBlockAsm12B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B

one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
	// 1-byte header: SI << 2. Falls through into the short copy below.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm12B:
	// Short copy: moves R9 bytes from (R10) to (AX) and then advances AX by R9.
	// SI = AX + R9, the output position once the copy is finished.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Dispatch on the byte count in R9; head and tail accesses may overlap and
	// all loads of a size class happen before its stores.
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
	// R9 <= 4: always moves 4 bytes, even when fewer are requested.
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
	// 5..7 bytes: first 4 and last 4 bytes (R10 is clobbered as a temporary).
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
	// 8..16 bytes: first 8 and last 8 bytes (R10 is clobbered as a temporary).
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
	// 17..32 bytes: first 16 and last 16 bytes.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
	// More than 32 bytes: first 32 and last 32 bytes; complete only while
	// R9 <= 64.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
	// Advance the output pointer past the bytes just copied.
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B

memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
	// Long copy: moves R9 bytes from (R10) to (AX) and then advances AX by R9.
	// SI = AX + R9, the output position once the copy is finished.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0..X3 hold the first and last 32 source bytes; they are stored after
	// the loops to fill the edges the aligned stores do not cover.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = R9 / 32; R14 = 64 - (AX & 31), so AX + R14 - 32 is a multiple of 32
	// and the MOVOA stores below land on aligned addresses.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// NOTE(review): DECQ does not write CF, so this JA sees ZF from DECQ and the
	// CF left behind by the SUBQ above - confirm this is the intended condition.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
	// 32 bytes per iteration from (R11) to (R15); R11, R15, R14 advance together.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	// NOTE(review): JNA combines ZF from DECQ with CF from the last ADDQ.
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R14, step R14 by 32, and repeat while
	// R9 >= R14 (unsigned compare).
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	// Write the saved head and tail, then advance the output pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
	// CX moves past the matched bytes (CX += R12, the count from matchLen),
	// R12 becomes the full match length (R12 + 4), and 12(SP) records CX as the
	// new start of un-emitted input. R8 is the (repeated) match offset.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitRepeat
	// SI = length, R12 = length - 4. The encoding is chosen by size.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
	// 4 bytes: 0x19, 0x00, then the low 16 bits of (R12 - 256).
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
	// 3 bytes: 0x15, 0x00, then the low byte of (R12 - 4).
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
	// 2 bytes: the 16-bit value (R12 << 2) | 1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
	// 2 bytes carrying the offset: byte 0 = 1 | (R12 << 2) | ((R8 >> 8) << 5),
	// byte 1 = low byte of R8. Execution then falls through to
	// match_nolit_emitcopy_end_encodeBetterBlockAsm12B.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm12B
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm12B:
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x32, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x34, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x34, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 65560(SP)(R11*4)
	MOVL  R15, 65560(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x34, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x32, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 65560(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm12B

emit_remainder_encodeBetterBlockAsm12B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm12B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBetterBlockAsm12B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B

two_bytes_emit_remainder_encodeBetterBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBetterBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B

one_byte_emit_remainder_encodeBetterBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x04
	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4:
	MOVL (CX), SI
	MOVL SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B

memmove_long_emit_remainder_encodeBetterBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written to dst
// (final write pointer minus dst_base). Returns 0 whenever one of the
// output-size checks against the limit in 0(SP) fails.
//
// Stack frame (20504 bytes):
//   0(SP)      8 bytes      dst limit pointer: dst_base + len(src) - 6 - len(src)>>5
//   8(SP)      4 bytes      search limit: len(src) - 8
//   12(SP)     4 bytes      nextEmit: src index of the first literal not yet emitted
//   16(SP)     4 bytes      offset of the previous match (repeat offset)
//   20(SP)     4 bytes      next search position
//   24(SP)     16384 bytes  long hash table, 4096 x uint32 (12-bit hash of 6 bytes)
//   16408(SP)  4096 bytes   short hash table, 1024 x uint32 (10-bit hash of 4 bytes)
//
// Register roles while searching:
//   AX = dst write pointer, DX = src base pointer, CX = current src index (s).
TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
	MOVQ dst_base+0(FP), AX
	// 0xa0 = 160 iterations x 128 bytes = 20480 bytes: clears both hash tables.
	MOVQ $0x000000a0, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm10B
	// nextEmit = 0.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = len(src) - 8; 0(SP) = dst_base + (len(src) - 6 - len(src)>>5).
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// s = 1, repeat offset = 0, DX = src base for the rest of the search.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm10B:
	// Next position = s + 1 + (s - nextEmit)>>5, so the step grows with the
	// number of pending literals. Leave the loop once it reaches the limit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm10B
	// DI = 8 bytes at src[s]; remember the next position in 20(SP).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	// R10 = long hash: low 6 bytes (<<16) times R9, top 12 bits kept.
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	// R11 = short hash: low 4 bytes (<<32) times SI, top 10 bits kept.
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x36, R11
	// SI = long-table candidate, R8 = short-table candidate; both slots then
	// get the current position s.
	MOVL  24(SP)(R10*4), SI
	MOVL  16408(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 16408(SP)(R11*4)
	// Compare 4 bytes at each candidate with the 4 bytes at s.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm10B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm10B
	// No match: advance s to the saved next position.
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm10B

candidateS_match_encodeBetterBlockAsm10B:
	// The short candidate matched. Before using it, look up the long table for
	// the bytes at s+1 (DI>>8); if that candidate matches, use it with s+1.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm10B
	// Otherwise go back to s and take the short candidate.
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm10B:
	// Extend the match backwards. CX = s, SI = candidate, DI = nextEmit.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm10B

match_extend_back_loop_encodeBetterBlockAsm10B:
	// Stop at nextEmit, at candidate index 0, or when the preceding bytes differ.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBetterBlockAsm10B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm10B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm10B
	JMP  match_extend_back_loop_encodeBetterBlockAsm10B

match_extend_back_end_encodeBetterBlockAsm10B:
	// Output-size check: dst pointer + pending literal length + 3 must stay
	// below the limit in 0(SP); otherwise return 0.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm10B:
	// DI = match start. Skip the 4 bytes already compared on both sides.
	// R8 = bytes left in src, R9 = &src[s+4], R10 = &src[candidate+4].
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 counts the matching bytes beyond the first 4.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm10B

matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
	// Compare 8 bytes at a time. On a difference, the index of the lowest set
	// bit of the XOR divided by 8 is the number of equal leading bytes.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm10B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm10B

matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm10B

matchlen_single_match_nolit_encodeBetterBlockAsm10B:
	// Fewer than 8 bytes left: compare byte by byte.
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm10B

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm10B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B

match_nolit_end_encodeBetterBlockAsm10B:
	// R8 = match offset (s - candidate).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm10B
	// New offset: remember it, then emit pending literals src[nextEmit:DI].
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm10B
	// R10 = literal source, R9 = literal length, SI = length - 1.
	// 12(SP) is set to the match start here and rewritten after the copy.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Literal header: 1 byte when length-1 < 60, 2 bytes when < 256, else
	// 3 bytes (0xf4 followed by a 16-bit length-1).
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeBetterBlockAsm10B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B

two_bytes_match_emit_encodeBetterBlockAsm10B:
	// 0xf0 followed by one byte holding length-1; pick the copy routine by size.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeBetterBlockAsm10B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B

one_byte_match_emit_encodeBetterBlockAsm10B:
	// Single header byte: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Dispatch on length R9; every case uses fixed-width, possibly overlapping
	// loads and stores instead of a byte loop.
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
	// Length <= 4: always moves one full 4-byte word.
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
	// First and last 4 bytes (overlapping).
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
	// First and last 8 bytes (overlapping).
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
	// First and last 16 bytes (overlapping).
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
	// First and last 32 bytes (overlapping).
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm10B

memmove_long_match_emit_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0-X3 hold the first and last 32 source bytes; they are stored last with
	// unaligned moves. The middle is copied in 32-byte steps with aligned
	// stores: R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte aligned.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// DECQ does not write CF, so JA/JNA below combine ZF from DECQ with the CF
	// left by the SUBQ above.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while R14 <= length.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	// Store the saved head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeBetterBlockAsm10B:
	// s = end of match; R12 = full match length (extension + 4); nextEmit = s.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// R12 = length, R8 = offset.
two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
	// Length > 64: write 0xee plus the 16-bit offset (3 bytes), subtract 60
	// from the length and encode what is left as a repeat.
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX

	// emitRepeat
	// SI = remaining length L, R12 = L - 4.
	// L <= 8 -> 2-byte form; L < 12 and offset < 2048 -> 2-byte form carrying
	// the offset; otherwise the 3- or 4-byte form.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	// 4 bytes: word 0x0019, then 16-bit (L - 4 - 256).
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 3 bytes: word 0x0015, then byte (L - 8).
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 2 bytes: 16-bit ((L - 4) << 2) | 1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 2 bytes: byte0 = (1 + (L - 4)*4) | (offset>>8)<<5, byte1 = low offset byte.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
	// NOTE(review): the JMP below directly follows an unconditional JMP and has
	// no label, so it is never executed.
	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// Length <= 64. The 2-byte copy needs length < 12 and offset < 2048;
	// everything else takes the 3-byte copy.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
	// byte0 = (1 + (length - 4)*4) | (offset>>8)<<5, byte1 = low offset byte.
	// Only the low byte of R12 is stored, so the upper bits of BX do not matter.
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
	// byte0 = 2 + (length - 1)*4, followed by the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

match_is_repeat_encodeBetterBlockAsm10B:
	// Offset equals the previous one. Emit pending literals src[nextEmit:DI]
	// exactly as in the non-repeat path, then encode the match as a repeat.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
	// R10 = literal source, R9 = literal length, SI = length - 1.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm10B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_repeat_encodeBetterBlockAsm10B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Same size dispatch as the non-repeat literal copy.
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B

memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Head and tail are kept in X0-X3; the middle is copied in 32-byte steps
	// with aligned stores, as in the non-repeat literal copy.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
	// s = end of match; R12 = full match length; nextEmit = s.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitRepeat
	// SI = length L, R12 = L - 4; same form selection as the repeat emitted
	// after a long copy.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
	// 4 bytes: word 0x0019, then 16-bit (L - 4 - 256).
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
	// 3 bytes: word 0x0015, then byte (L - 8).
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
	// 2 bytes: 16-bit ((L - 4) << 2) | 1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	// 2 bytes: byte0 = (1 + (L - 4)*4) | (offset>>8)<<5, byte1 = low offset byte.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
	// Stop searching once s reaches the search limit; return 0 if the dst
	// pointer has reached the limit in 0(SP).
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm10B
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm10B:
	// Index positions around the match. With base = match start (DI before the
	// INCL) and s = CX:
	//   long table:  base+1, base+2, s-2, s-1
	//   short table: base+2, base+3, s-1
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	// R9 = 8 bytes at src[base+1]; R11/R13 = the same shifted by one byte,
	// R12 = shifted by two bytes.
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	// R9 is reloaded with the 8 bytes at src[s-2] for the second group below.
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x34, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x36, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 16408(SP)(R11*4)
	MOVL  R15, 16408(SP)(R12*4)
	// Second group: R10 hashes the bytes at s-2, R11/R13 the bytes at s-1.
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x34, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 16408(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm10B

emit_remainder_encodeBetterBlockAsm10B:
	// Output-size check for the trailing literals src[nextEmit:]:
	// dst pointer + remaining length + 3 must stay below the limit, else return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm10B:
	// Nothing to emit when nextEmit == len(src).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
	// CX = &src[nextEmit], SI = remaining length, DX = length - 1.
	// DX no longer holds the src base from here on.
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Same literal header selection as in the match paths.
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBetterBlockAsm10B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B

two_bytes_emit_remainder_encodeBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBetterBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B

one_byte_emit_remainder_encodeBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm10B:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Size dispatch with overlapping fixed-width moves.
	CMPQ BX, $0x04
	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4:
	// Length <= 4: always moves one full 4-byte word.
	MOVL (CX), SI
	MOVL SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B

memmove_long_emit_remainder_encodeBetterBlockAsm10B:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail are kept in X0-X3; the middle is copied in 32-byte steps
	// with aligned stores (R8 = 64 - (AX & 31)).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
	// Return value = dst write pointer - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
// Requires: SSE2
TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000028, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm8B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm8B:
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm8B
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x38, R11
	MOVL  24(SP)(R10*4), SI
	MOVL  4120(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 4120(SP)(R11*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm8B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm8B
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm8B

candidateS_match_encodeBetterBlockAsm8B:
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm8B
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm8B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm8B

match_extend_back_loop_encodeBetterBlockAsm8B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBetterBlockAsm8B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm8B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm8B
	JMP  match_extend_back_loop_encodeBetterBlockAsm8B

match_extend_back_end_encodeBetterBlockAsm8B:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm8B:
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm8B

matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm8B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm8B

matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm8B

matchlen_single_match_nolit_encodeBetterBlockAsm8B:
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm8B

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm8B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B

match_nolit_end_encodeBetterBlockAsm8B:
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm8B
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeBetterBlockAsm8B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeBetterBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B

two_bytes_match_emit_encodeBetterBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeBetterBlockAsm8B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B

one_byte_match_emit_encodeBetterBlockAsm8B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm8B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm8B

memmove_long_match_emit_encodeBetterBlockAsm8B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeBetterBlockAsm8B:
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm8B
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

match_is_repeat_encodeBetterBlockAsm8B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm8B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_repeat_encodeBetterBlockAsm8B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm8B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	CMPQ R8, $0x04
	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B

memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm8B
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm8B:
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x36, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x38, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 4120(SP)(R11*4)
	MOVL  R15, 4120(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x36, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 4120(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm8B

emit_remainder_encodeBetterBlockAsm8B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm8B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBetterBlockAsm8B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B

two_bytes_emit_remainder_encodeBetterBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBetterBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B

one_byte_emit_remainder_encodeBetterBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x04
	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL (CX), SI
	MOVL SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B

memmove_long_emit_remainder_encodeBetterBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and
// returns the number of bytes written. Returns 0 when the output would
// reach the limit computed at entry (dst_base + len(src) - 9 - len(src)/32).
//
// Stack frame (65560 bytes):
//    0(SP)  8 bytes      dst limit pointer
//    8(SP)  4 bytes      sLimit   = len(src) - 8
//   12(SP)  4 bytes      nextEmit = start of bytes not yet emitted
//   16(SP)  4 bytes      repeat   = offset of the last match (starts at 1)
//   20(SP)  4 bytes      nextS    = position to try after a failed probe
//   24(SP)  65536 bytes  hash table, 16384 uint32 source positions
//
// Register roles until emit_remainder_ok: AX = dst write cursor,
// DX = src base, CX = s (current source position).
//
// NOTE: this file is generated (see the header of the file); the exact-size
// remainder copy below must also be made in the generator or it will be
// lost on regeneration.
TEXT ·encodeSnappyBlockAsm(SB), $65560-56
	MOVQ dst_base+0(FP), AX

	// Clear the hash table: 0x200 iterations of 128 bytes = 65536 bytes.
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm

	// nextEmit = 0
	MOVL  $0x00000000, 12(SP)

	// sLimit = len(src) - 8; dst limit = dst + len(src) - 9 - len(src)>>5
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)

	// s = 1, repeat = 1, DX = src base
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm:
	// nextS = s + 4 + (s - nextEmit)>>6; leave the loop once nextS >= sLimit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBlockAsm

	// DI = 8 bytes at src[s]
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)

	// R10 = 14-bit hash of the 6 bytes at s, R11 = same for s+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11

	// SI = candidate for s, R8 = candidate for s+1; then record s and s+1.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)

	// R10 = hash of the 6 bytes at s+2 (used in no_repeat_found).
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10

	// Repeat check: 4 bytes at s+1 against 4 bytes at s+1-repeat.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm

	// Repeat hit. DI = base = s+1, SI = nextEmit, R8 = base - repeat.
	LEAL  1(CX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm

repeat_extend_back_loop_encodeSnappyBlockAsm:
	// Walk base backwards while it is above nextEmit, the preceding bytes
	// match and the match position has not reached 0.
	CMPL DI, SI
	JLE  repeat_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(DX)(R8*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm

repeat_extend_back_end_encodeSnappyBlockAsm:
	// Emit src[nextEmit:base] as a literal (skipped when empty).
	// R9 = literal source, R8 = literal length, SI = length - 1.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI

	// Pick the literal header size from length-1:
	// < 60 -> 1 byte, < 1<<8 -> 2, < 1<<16 -> 3, < 1<<24 -> 4, else 5.
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x00010000
	JLT  three_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x01000000
	JLT  four_bytes_repeat_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

four_bytes_repeat_emit_encodeSnappyBlockAsm:
	// Tag 0xf8 followed by the low 24 bits of length-1.
	MOVL SI, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

three_bytes_repeat_emit_encodeSnappyBlockAsm:
	// Tag 0xf4 followed by a 16-bit length-1.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

two_bytes_repeat_emit_encodeSnappyBlockAsm:
	// Tag 0xf0 followed by an 8-bit length-1; lengths up to 64 use the
	// short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeSnappyBlockAsm
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

one_byte_repeat_emit_encodeSnappyBlockAsm:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm:
	// SI = dst cursor after the literal.
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	// Lengths <= 8 use a fixed 8-byte move; the search loop guard
	// (nextS < len(src)-8) leaves slack in src after an in-loop literal.
	CMPQ R8, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	// Overlapping head and tail moves cover any length in range.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm

memmove_long_repeat_emit_encodeSnappyBlockAsm:
	// SI = dst cursor after the literal.
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	// X0..X3 hold the first and last 32 bytes; they are stored last with
	// unaligned moves. R12 starts at 64 - (AX & 31) so the loop stores at
	// -32(AX)(R12*1) are aligned.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	// 32 bytes per iteration while length >= R12.
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
	// s += 5 (past the 4 bytes already compared at s+1).
	// R9 = &src[s], SI = &src[s-repeat], R8 = bytes left in src.
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R11 = number of equal bytes at R9 and SI, at most R8.
	XORL R11, R11
	CMPL R8, $0x08
	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
	// Compare 8 bytes at a time; on a difference the index of the lowest
	// set bit of the XOR, divided by 8, is the count of equal bytes.
	MOVQ  (R9)(R11*1), R10
	XORQ  (SI)(R11*1), R10
	TESTQ R10, R10
	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm
	BSFQ  R10, R10
	SARQ  $0x03, R10
	LEAL  (R11)(R10*1), R11
	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm

matchlen_single_repeat_extend_encodeSnappyBlockAsm:
	TESTL R8, R8
	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm:
	// Fewer than 8 bytes left: compare byte by byte.
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm
	LEAL 1(R11), R11
	DECL R8
	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm

repeat_extend_forward_end_encodeSnappyBlockAsm:
	// s += matched; SI = copy length = s - base; DI = repeat offset.
	ADDL R11, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	// Offsets >= 65536 take the 4-byte-offset form.
	CMPL DI, $0x00010000
	JL   two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
	// While length > 64: write tag 0xff plus a 4-byte offset and take 64
	// off the length.
	CMPL SI, $0x40
	JLE  four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(SI), SI
	ADDQ $0x05, AX
	CMPL SI, $0x04
	JL   four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	JMP  four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
	// Tag = 3 + (length-1)*4 (only the low byte is stored), then a
	// 4-byte offset. Nothing is written when length is 0.
	TESTL SI, SI
	JZ    repeat_end_emit_encodeSnappyBlockAsm
	MOVB  $0x03, BL
	LEAL  -4(BX)(SI*4), SI
	MOVB  SI, (AX)
	MOVL  DI, 1(AX)
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_encodeSnappyBlockAsm

two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
	// While length > 64: write tag 0xee plus a 2-byte offset and take 60
	// off the length, so at least 4 bytes remain for the final element.
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
	// Length < 12 and offset < 2048 fit the 2-byte form:
	// tag = 1 + (length-4)*4 | (offset>>8)<<5, then the low offset byte.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
	// 3-byte form: tag = 2 + (length-1)*4, then a 2-byte offset.
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm:
	// nextEmit = s
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm

no_repeat_found_encodeSnappyBlockAsm:
	// Probe the hash candidates: 4 bytes at candidate (SI) against s,
	// candidate2 (R8) against s+1, then the s+2 table entry against s+2.
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm

	// No match: s = nextS.
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm

candidate3_match_encodeSnappyBlockAsm:
	// Match at s+2 (SI already holds that candidate).
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm

candidate2_match_encodeSnappyBlockAsm:
	// Match at s+1: record s+2 in the table, advance s, use candidate2.
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm:
	// CX = s, SI = candidate. Extend the match backwards while s is above
	// nextEmit, the candidate is above 0 and the preceding bytes are equal.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm

match_extend_back_loop_encodeSnappyBlockAsm:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm
	JMP  match_extend_back_loop_encodeSnappyBlockAsm

match_extend_back_end_encodeSnappyBlockAsm:
	// Return 0 unless dst + (s - nextEmit) + 5 stays below the dst limit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 5(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm:
	// Emit src[nextEmit:s] as a literal (skipped when empty).
	// DI = literal source, R9 = literal length, R8 = length - 1.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8

	// Literal header size selection, same thresholds as the repeat path.
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x00010000
	JLT  three_bytes_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x01000000
	JLT  four_bytes_match_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL R8, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

four_bytes_match_emit_encodeSnappyBlockAsm:
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW R8, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

three_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

two_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeSnappyBlockAsm
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

one_byte_match_emit_encodeSnappyBlockAsm:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm:
	// R8 = dst cursor after the literal.
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// Fixed 8-byte move for lengths <= 8, as in the repeat path.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm

memmove_long_match_emit_encodeSnappyBlockAsm:
	// R8 = dst cursor after the literal.
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// Same scheme as the repeat path: head/tail kept in X0..X3, middle
	// copied 32 bytes at a time with aligned stores.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm:
match_nolit_loop_encodeSnappyBlockAsm:
	// CX = s, SI = candidate. repeat = s - candidate. Skip the 4 bytes
	// already known to match, then measure the rest of the match.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes at R8 and SI, at most DI.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm

matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeSnappyBlockAsm

matchlen_loop_match_nolit_encodeSnappyBlockAsm:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm

matchlen_single_match_nolit_encodeSnappyBlockAsm:
	TESTL DI, DI
	JZ    match_nolit_end_encodeSnappyBlockAsm

matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm

match_nolit_end_encodeSnappyBlockAsm:
	// s += matched; SI = offset; R10 = total copy length (matched + 4);
	// nextEmit = s.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	// Same element selection as the repeat path, with SI = offset and
	// R10 = length.
	CMPL SI, $0x00010000
	JL   two_byte_offset_match_nolit_encodeSnappyBlockAsm

four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
	CMPL R10, $0x40
	JLE  four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(R10), R10
	ADDQ $0x05, AX
	CMPL R10, $0x04
	JL   four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm

four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
	TESTL R10, R10
	JZ    match_nolit_emitcopy_end_encodeSnappyBlockAsm
	MOVB  $0x03, BL
	LEAL  -4(BX)(R10*4), R10
	MOVB  R10, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeSnappyBlockAsm

two_byte_offset_match_nolit_encodeSnappyBlockAsm:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm

emit_copy_three_match_nolit_encodeSnappyBlockAsm:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm:
	// Stop searching once s >= sLimit; return 0 if dst reached its limit.
	// DI = 8 bytes at src[s-2], used for the table updates below.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBlockAsm
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm:
	// Record s-2 and s in the hash table and fetch the previous entry for
	// s as the next candidate (R8 = hash at s-2, SI = hash at s).
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)

	// If the candidate's 4 bytes equal those at s, emit another copy
	// straight away with no literal in between; otherwise resume at s+1.
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm

emit_remainder_encodeSnappyBlockAsm:
	// Return 0 unless dst + (len(src) - nextEmit) + 5 stays below the limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm:
	// Emit src[nextEmit:len(src)] as the final literal (skipped when empty).
	// From here CX = literal source, SI = literal length, DX = length - 1.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00010000
	JLT  three_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x01000000
	JLT  four_bytes_emit_remainder_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

four_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

three_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

two_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBlockAsm
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

one_byte_emit_remainder_encodeSnappyBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm:
	// DX = dst cursor after the literal, BX = literal length (>= 1 here).
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// The remainder ends exactly at src+len(src), so a fixed 8-byte move
	// would read past the end of src (and store more than the literal)
	// whenever fewer than 8 bytes remain. Copy exact sizes instead.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
	// First and last byte; for length 1 both moves hit the same byte.
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
	// Overlapping 4-byte head and tail moves.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm

memmove_long_emit_remainder_encodeSnappyBlockAsm:
	// DX = dst cursor after the literal, BX = literal length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head/tail 32 bytes are loaded and stored at exact offsets within the
	// literal; the middle is copied 32 bytes at a time while BX >= R8.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
	// Return the number of bytes written: dst cursor - dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and returns
// the number of bytes written to dst. Returns 0 when the output would reach
// the dst limit stored at 0(SP).
//
// Stack frame (65560 bytes of locals):
//   0(SP)   8 bytes      dst limit pointer: dst_base + len(src) - 9 - len(src)>>5
//   8(SP)   4 bytes      len(src) - 8; the search stops once the position reaches it
//   12(SP)  4 bytes      src offset of the first byte that has not been emitted yet
//   16(SP)  4 bytes      offset (distance) of the most recent match, initially 1
//   20(SP)  4 bytes      src position at which the search resumes after a miss
//   24(SP)  65536 bytes  hash table: 16384 uint32 src positions (14-bit index)
//
// Register roles in the main loop: AX = dst write pointer, CX = current src
// position, DX = src base pointer. DX is reused as scratch in the final
// emit-remainder section.
//
// NOTE(review): copy offsets are stored with 16-bit MOVW, so the caller
// presumably guarantees len(src) <= 64KiB — confirm against the Go caller.
//
// NOTE: this file is generated by gen.go. The bounds fix in the emit-remainder
// short copy (..._memmove_move_1through7) should be mirrored in the generator.
TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
	// AX = dst write pointer. Zero the hash table at 24(SP):
	// 0x200 iterations x 128 bytes = 65536 bytes.
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm64K:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm64K

	// Initialise the frame slots described in the header:
	// 12(SP) = 0, 8(SP) = len(src)-8, (SP) = dst + len(src) - 9 - len(src)>>5,
	// CX = 1 (first search position), 16(SP) = 1, DX = src base.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm64K:
	// Next search position SI = CX + 4 + (CX - 12(SP))>>6, so the step grows
	// with the length of the pending literal run. Leave the loop when it
	// reaches 8(SP) (signed compare).
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBlockAsm64K

	// DI = 8 source bytes at CX. Hash = ((x << 16) * R9) >> 50, i.e. a 14-bit
	// index computed from the low 6 bytes of x.
	// R10 = hash of bytes at CX, R11 = hash of bytes at CX+1.
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11

	// SI = candidate for CX, R8 = candidate for CX+1; then record CX and CX+1
	// in the table. R10 is recomputed as the hash of the bytes at CX+2.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10

	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
	// CX+1-16(SP) (same distance as the previous match).
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm64K

	// Repeat found. DI = match start (CX+1), R8 = DI - 16(SP) = match source,
	// SI = 12(SP) = lower bound for extending backwards.
	LEAL  1(CX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm64K

repeat_extend_back_loop_encodeSnappyBlockAsm64K:
	// Move the match start back one byte at a time while DI > SI, the
	// preceding bytes are equal, and the source index R8 stays non-zero.
	CMPL DI, SI
	JLE  repeat_extend_back_end_encodeSnappyBlockAsm64K
	MOVB -1(DX)(R8*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm64K
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm64K

repeat_extend_back_end_encodeSnappyBlockAsm64K:
	// Emit src[12(SP):DI] as a literal run (skipped when empty).
	// R9 = literal source pointer, R8 = length, SI = length-1.
	// Header forms, chosen by length-1:
	//   < 60     one byte:    (length-1)<<2
	//   < 256    two bytes:   0xf0, length-1
	//   else     three bytes: 0xf4, 16-bit length-1
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm64K
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm64K
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K

two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
	// Lengths up to 64 use the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeSnappyBlockAsm64K
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K

one_byte_repeat_emit_encodeSnappyBlockAsm64K:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm64K:
	// SI = dst pointer after the copy. Dispatch on length R8 (1..64).
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	CMPQ R8, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
	// Always moves 8 bytes, also for shorter runs. The literal run here ends
	// at the match start, which lies below 8(SP) = len(src)-8, so the load
	// stays inside src; the surplus dst bytes are overwritten by later output.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// Two overlapping 8-byte moves: first 8 and last 8 bytes.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
	// Two overlapping 16-byte moves: first 16 and last 16 bytes.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes, overlapping in the middle.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K

memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
	// Long copy (length R8 > 64). X0/X1 hold the first 32 bytes and X2/X3 the
	// last 32 bytes; they are stored unaligned after the middle has been
	// copied. R12 = 64 - (AX & 31), so AX+R12-32 is 32-byte aligned and the
	// middle stores can use MOVOA.
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// NOTE(review): DECQ does not modify CF, so the JA/JNA around
	// big_loop_back also depend on CF left by the preceding SUBQ/ADDQ.
	// The forward loop below covers the whole middle section by itself.
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R12, advance R12 by 32, repeat while
	// R12 <= length.
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
	// Skip the 4 matched bytes that start at CX+1, then extend the match
	// forward. R9 = src+CX, SI = src+CX-16(SP), R8 = bytes left in src.
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R11 = number of equal bytes. Compares 8 bytes at a time while at least
	// 8 bytes remain, then byte by byte.
	XORL R11, R11
	CMPL R8, $0x08
	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm64K

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
	// A non-zero XOR marks a difference; the index of its lowest set bit
	// divided by 8 is the number of equal leading bytes in this word.
	MOVQ  (R9)(R11*1), R10
	XORQ  (SI)(R11*1), R10
	TESTQ R10, R10
	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
	BSFQ  R10, R10
	SARQ  $0x03, R10
	LEAL  (R11)(R10*1), R11
	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K

matchlen_single_repeat_extend_encodeSnappyBlockAsm64K:
	TESTL R8, R8
	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm64K
	LEAL 1(R11), R11
	DECL R8
	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K

repeat_extend_forward_end_encodeSnappyBlockAsm64K:
	// CX = end of match. SI = match length (CX - match start DI),
	// DI = match offset reloaded from 16(SP).
	ADDL R11, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
	// While more than 64 bytes remain: write 0xee plus the 16-bit offset
	// (3 bytes) and reduce the remaining length by 60.
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
	// Length < 12 and offset < 2048 use the 2-byte form:
	//   byte0 = (1 - 16 + 4*length) | (offset>>8)<<5, byte1 = low offset byte.
	// Only BL is set; the upper bits of BX do not matter because just the low
	// byte of the LEAL result is stored.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm64K

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
	// 3-byte form: byte0 = 2 - 4 + 4*length, followed by the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm64K:
	// Everything up to CX has been emitted.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm64K

no_repeat_found_encodeSnappyBlockAsm64K:
	// Try the hash candidates in turn, comparing 4 bytes each:
	//   SI against the bytes at CX, R8 against CX+1, table[R10] against CX+2.
	// The table slot for CX+2 is updated with CX+2 on every path that reaches
	// that step. On a complete miss, resume at the position saved in 20(SP).
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm64K
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm64K
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm64K
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm64K

candidate3_match_encodeSnappyBlockAsm64K:
	// Match at CX+2; SI already holds its candidate.
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm64K

candidate2_match_encodeSnappyBlockAsm64K:
	// Match at CX+1 with candidate R8.
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm64K:
	// CX = match position, SI = candidate position. Extend the match
	// backwards, but not below 12(SP) and not below candidate position 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm64K

match_extend_back_loop_encodeSnappyBlockAsm64K:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBlockAsm64K
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm64K
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm64K
	JMP  match_extend_back_loop_encodeSnappyBlockAsm64K

match_extend_back_end_encodeSnappyBlockAsm64K:
	// Give up (return 0) unless AX + 3 + pending literal length is below the
	// dst limit in (SP).
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm64K:
	// Emit src[12(SP):CX] as a literal run (skipped when empty).
	// DI = literal source pointer, R9 = length, R8 = length-1.
	// The header forms are the same as in the repeat path above.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm64K
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBlockAsm64K
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBlockAsm64K
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K

two_bytes_match_emit_encodeSnappyBlockAsm64K:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeSnappyBlockAsm64K
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K

one_byte_match_emit_encodeSnappyBlockAsm64K:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm64K:
	// R8 = dst pointer after the copy. Dispatch on length R9 (1..64).
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
	// Always moves 8 bytes. The literal run ends at the match position, which
	// lies below len(src)-8, so the load stays inside src.
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm64K

memmove_long_match_emit_encodeSnappyBlockAsm64K:
	// Long copy (length R9 > 64); same structure as the long copy in the
	// repeat path: head and tail kept in X0..X3, aligned 32-byte stores for
	// the middle.
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
match_nolit_loop_encodeSnappyBlockAsm64K:
	// Match at CX against candidate SI with no pending literals.
	// Store the offset CX-SI in 16(SP), skip the 4 bytes already known to be
	// equal and measure how far the match continues.
	// R8 = src+CX, SI = src+candidate, DI = bytes left in src.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes (8 at a time, then byte by byte).
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm64K

matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeSnappyBlockAsm64K

matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K

matchlen_single_match_nolit_encodeSnappyBlockAsm64K:
	TESTL DI, DI
	JZ    match_nolit_end_encodeSnappyBlockAsm64K

matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm64K
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K

match_nolit_end_encodeSnappyBlockAsm64K:
	// CX = end of match, SI = offset, R10 = total match length (extension + 4).
	// 12(SP) is advanced to CX because the copy below covers everything up to it.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
	// Same encoding as the copy emitted in the repeat path:
	// 0xee + 16-bit offset per 60 bytes while more than 64 bytes remain.
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
	// 2-byte form when length < 12 and offset < 2048, otherwise 3-byte form.
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
	// Stop searching once CX has reached 8(SP). Otherwise load the 8 bytes at
	// CX-2 and return 0 if the dst write pointer has reached the limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBlockAsm64K
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm64K:
	// R8 = hash of the bytes at CX-2, SI = hash of the bytes at CX
	// (DI is shifted down by 16 bits so that it holds the bytes at CX).
	// Record CX-2 and CX in the table; SI then becomes the previous entry for
	// the CX hash. If its 4 bytes equal those at CX, continue with another
	// match immediately, otherwise resume the search at CX+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm64K
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm64K

emit_remainder_encodeSnappyBlockAsm64K:
	// Return 0 unless AX + 3 + (len(src) - 12(SP)) is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm64K:
	// Emit src[12(SP):len(src)] as the final literal run (skipped when empty).
	// CX = literal source pointer, SI = length, DX = length-1; the src base in
	// DX is no longer needed from here on.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm64K
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm64K
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBlockAsm64K
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K

one_byte_emit_remainder_encodeSnappyBlockAsm64K:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm64K:
	// DX = dst pointer after the copy, BX = length (1..64).
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8:
	// This literal run ends exactly at src_base+len(src). An 8-byte load is
	// only in bounds when the run is exactly 8 bytes long; shorter runs are
	// copied byte by byte so that nothing past the end of src is read.
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1through7
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1through7:
	// Exact copy of BX (1..7) bytes; BX is non-zero here because an empty
	// remainder skips literal emission entirely. CX, AX and BX are consumed;
	// the final dst pointer is restored from DX afterwards.
	MOVB (CX), R8
	MOVB R8, (AX)
	INCQ CX
	INCQ AX
	DECQ BX
	JNZ  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1through7
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// Both 8-byte loads lie inside [CX, CX+BX) because BX >= 9 here.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
	// Long copy (length BX > 64); same structure as the long copies above,
	// using SI/DI/R8/R9 as scratch.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
// Requires: SSE2
TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000080, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm12B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm12B:
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBlockAsm12B
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	SHLQ  $0x18, R11
	IMULQ R9, R11
	SHRQ  $0x34, R11
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm12B
	LEAL  1(CX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm12B

repeat_extend_back_loop_encodeSnappyBlockAsm12B:
	CMPL DI, SI
	JLE  repeat_extend_back_end_encodeSnappyBlockAsm12B
	MOVB -1(DX)(R8*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm12B
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm12B

repeat_extend_back_end_encodeSnappyBlockAsm12B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm12B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B

two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeSnappyBlockAsm12B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B

one_byte_repeat_emit_encodeSnappyBlockAsm12B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm12B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	CMPQ R8, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B

memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R11, R11
	CMPL R8, $0x08
	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm12B

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
	MOVQ  (R9)(R11*1), R10
	XORQ  (SI)(R11*1), R10
	TESTQ R10, R10
	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
	BSFQ  R10, R10
	SARQ  $0x03, R10
	LEAL  (R11)(R10*1), R11
	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B

matchlen_single_repeat_extend_encodeSnappyBlockAsm12B:
	TESTL R8, R8
	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm12B
	LEAL 1(R11), R11
	DECL R8
	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B

repeat_extend_forward_end_encodeSnappyBlockAsm12B:
	ADDL R11, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm12B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm12B:
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm12B

no_repeat_found_encodeSnappyBlockAsm12B:
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm12B
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm12B
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm12B
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm12B

candidate3_match_encodeSnappyBlockAsm12B:
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm12B

candidate2_match_encodeSnappyBlockAsm12B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm12B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm12B

match_extend_back_loop_encodeSnappyBlockAsm12B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm12B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm12B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm12B

match_extend_back_end_encodeSnappyBlockAsm12B:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm12B:
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBlockAsm12B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B

two_bytes_match_emit_encodeSnappyBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeSnappyBlockAsm12B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B

one_byte_match_emit_encodeSnappyBlockAsm12B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm12B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm12B

memmove_long_match_emit_encodeSnappyBlockAsm12B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
match_nolit_loop_encodeSnappyBlockAsm12B:
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm12B

matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeSnappyBlockAsm12B

matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B

matchlen_single_match_nolit_encodeSnappyBlockAsm12B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeSnappyBlockAsm12B

matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm12B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B

match_nolit_end_encodeSnappyBlockAsm12B:
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm12B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm12B

emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBlockAsm12B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm12B:
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x18, R8
	IMULQ R9, R8
	SHRQ  $0x34, R8
	SHLQ  $0x18, SI
	IMULQ R9, SI
	SHRQ  $0x34, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm12B
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm12B

emit_remainder_encodeSnappyBlockAsm12B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm12B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm12B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B

two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B

one_byte_emit_remainder_encodeSnappyBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B

memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
// Requires: SSE2
//
// Compresses src into dst and returns the number of bytes written to dst, or
// 0 when the output would reach the dst limit computed at entry. The output
// consists of literal runs (tag byte len-1<<2, or 0xf0 / 0xf4 followed by a
// 1- / 2-byte length-1) and copies with a 1- or 2-byte offset.
// Presumably the "10B" suffix refers to the 10-bit hash used here (the top 10
// bits are kept by SHRQ $0x36) - confirm against gen.go.
//
// Stack frame layout (4120 bytes):
//   0(SP)   8 bytes     dst limit: dst_base + (src_len - 9 - src_len>>5)
//   8(SP)   4 bytes     sLimit: src_len - 8; searching stops at this position
//   12(SP)  4 bytes     nextEmit: start of the not-yet-emitted literal run
//   16(SP)  4 bytes     repeat: offset of the most recent match (starts at 1)
//   20(SP)  4 bytes     nextS: position to continue from if nothing matches
//   24(SP)  4096 bytes  hash table: 1024 uint32 source positions
//
// Long-lived registers: AX = dst write pointer, DX = src base pointer,
// CX = current source position s.
TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
	// AX = dst write pointer. Zero the hash table at 24(SP):
	// 32 iterations x 128 bytes = 4096 bytes.
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000020, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm10B
	// Initialise the frame slots: nextEmit = 0, sLimit = src_len - 8,
	// dst limit = dst_base + (src_len - 9 - src_len>>5), s = 1, repeat = 1.
	// DX is then reloaded with the src base pointer.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm10B:
	// nextS = s + 4 + ((s - nextEmit) >> 5): the step grows with the length
	// of the pending literal run. Stop searching once nextS >= sLimit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBlockAsm10B
	// DI = 8 source bytes at s; remember nextS in 20(SP).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// Hash the 4 bytes at s (R10) and at s+1 (R11): move the low 32 bits to
	// the top, multiply by 0x9e3779b1 and keep the top 10 bits.
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x36, R11
	// SI / R8 = previous table entries (candidates) for the two hashes;
	// the entries are replaced with s and s+1.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash of the 4 bytes at s+2 (consumed in no_repeat_found).
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	// Repeat check: compare the 4 bytes at s+1 with the 4 bytes at
	// s+1-repeat.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm10B
	// Repeat matched. DI = match start (s+1), SI = nextEmit,
	// R8 = earlier position (s+1-repeat); skip backward extension if R8 == 0.
	LEAL  1(CX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm10B

	// Extend the repeat match backwards while the preceding bytes are equal,
	// DI is still above nextEmit and R8 has not reached 0.
repeat_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL DI, SI
	JLE  repeat_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(DX)(R8*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm10B

repeat_extend_back_end_encodeSnappyBlockAsm10B:
	// Emit src[nextEmit:DI] as a literal run (nothing to emit if equal).
	// R9 = pointer to the literal bytes, R8 = length, SI = length-1;
	// nextEmit is advanced to DI.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	// Header size depends on length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes (tag 0xf4 + 16-bit length-1).
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm10B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B

two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
	// Tag 0xf0 + one byte holding length-1. Lengths up to 64 use the short
	// copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeSnappyBlockAsm10B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B

one_byte_repeat_emit_encodeSnappyBlockAsm10B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm10B:
	// SI = dst pointer after the literal bytes (R8 = length).
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise up to 64 bytes.
	CMPQ R8, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	// Always moves a full 8 bytes, even when the length is smaller; the
	// write pointer is still advanced by the real length only.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	// Two (possibly overlapping) 8-byte moves: first 8 and last 8 bytes.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	// Two (possibly overlapping) 16-byte moves: first 16 and last 16 bytes.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	// Four 16-byte moves: first 32 and last 32 bytes (may overlap).
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
	// Advance the dst write pointer past the literal bytes.
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B

memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
	// SI = dst pointer after the literal bytes (R8 = length).
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	// Load the first and last 32 source bytes up front (X0-X3); they are
	// stored last and cover the unaligned head and the tail.
	// R11 = length>>5, R12 = 64 - (dst & 31), so dst+R12-32 is 32-byte aligned.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ leaves CF unchanged, so JA sees CF from the SUBQ above (no borrow:
	// 64 minus a value in 0..31) and ZF from DECQ: taken when R11 != 0.
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

	// Copies 32 bytes per pass with aligned stores. JNA repeats the pass
	// when CF (left by the ADDQ) or ZF (from DECQ) is set.
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

	// Copy 32 bytes at offset R12-32 with aligned stores, advancing R12 by 32
	// while length >= R12; then store the saved head and tail blocks.
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
	// s += 5: skip s+1 plus the 4 bytes already compared. Extend the repeat
	// match forwards: R9 = src+s, SI = src+s-repeat, R8 = bytes left in src.
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R11 = number of equal bytes found so far. Compare 8 bytes at a time
	// while at least 8 remain, then byte by byte.
	XORL R11, R11
	CMPL R8, $0x08
	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm10B

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
	// XOR of the two 8-byte words; if non-zero, the index of the lowest set
	// bit divided by 8 is the number of equal leading bytes.
	MOVQ  (R9)(R11*1), R10
	XORQ  (SI)(R11*1), R10
	TESTQ R10, R10
	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
	BSFQ  R10, R10
	SARQ  $0x03, R10
	LEAL  (R11)(R10*1), R11
	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
	// All 8 bytes equal: consume them and continue if 8 or more remain.
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B

matchlen_single_repeat_extend_encodeSnappyBlockAsm10B:
	TESTL R8, R8
	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm10B

	// Fewer than 8 bytes left: compare one byte at a time.
matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm10B
	LEAL 1(R11), R11
	DECL R8
	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B

repeat_extend_forward_end_encodeSnappyBlockAsm10B:
	// s += extension. SI = total match length (s - match start in DI),
	// DI = repeat offset.
	ADDL R11, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	// While length > 64, emit a 3-byte copy of 60 bytes (tag 0xee followed by
	// the 16-bit offset) and reduce the length by 60.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
	// Length >= 12 or offset >= 2048 needs the 3-byte form. Otherwise emit
	// 2 bytes: tag = 1 + (length-4)<<2 + (offset>>8)<<5, then the low offset
	// byte. Only the low byte of the LEAL result is stored.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm10B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
	// 3-byte form: tag = 2 + (length-1)<<2, then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm10B:
	// nextEmit = s; resume searching.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm10B

no_repeat_found_encodeSnappyBlockAsm10B:
	// Candidate 1: do the 4 bytes at table position SI equal the bytes at s?
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm10B
	// Candidate 2: compare the bytes at s+1 with those at R8. SI is reloaded
	// with the table entry for the s+2 hash and R9 = s+2.
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm10B
	// Candidate 3: store s+2 in the table, then compare the bytes at s+2
	// with those at the previous entry (SI).
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm10B
	// Nothing matched: continue from nextS.
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm10B

candidate3_match_encodeSnappyBlockAsm10B:
	// Match starts at s+2; SI already holds the candidate.
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm10B

candidate2_match_encodeSnappyBlockAsm10B:
	// Match starts at s+1 with candidate R8; the s+2 table entry is still
	// updated.
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm10B:
	// CX = match start, SI = candidate position, DI = nextEmit.
	// Skip backward extension if the candidate is at position 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm10B

	// Extend the match backwards while the preceding bytes are equal, CX is
	// still above nextEmit and the candidate has not reached 0.
match_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm10B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm10B

match_extend_back_end_encodeSnappyBlockAsm10B:
	// Bounds check: dst pointer + 3 + pending literal length must stay below
	// the dst limit in (SP); otherwise return 0.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm10B:
	// Emit src[nextEmit:s] as a literal run (nothing to emit if equal).
	// DI = pointer to the literal bytes, R9 = length, R8 = length-1;
	// nextEmit is advanced to s.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// Header size depends on length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes (tag 0xf4 + 16-bit length-1).
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBlockAsm10B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B

two_bytes_match_emit_encodeSnappyBlockAsm10B:
	// Tag 0xf0 + one byte holding length-1. Lengths up to 64 use the short
	// copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeSnappyBlockAsm10B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B

one_byte_match_emit_encodeSnappyBlockAsm10B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm10B:
	// R8 = dst pointer after the literal bytes (R9 = length).
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise up to 64 bytes.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	// Always moves a full 8 bytes, even when the length is smaller; the
	// write pointer is still advanced by the real length only.
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	// Two (possibly overlapping) 8-byte moves: first 8 and last 8 bytes.
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	// Two (possibly overlapping) 16-byte moves: first 16 and last 16 bytes.
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	// Four 16-byte moves: first 32 and last 32 bytes (may overlap).
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
	// Advance the dst write pointer past the literal bytes.
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm10B

memmove_long_match_emit_encodeSnappyBlockAsm10B:
	// R8 = dst pointer after the literal bytes (R9 = length).
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// Load the first and last 32 source bytes up front (X0-X3); they are
	// stored last and cover the unaligned head and the tail.
	// R11 = length>>5, R12 = 64 - (dst & 31), so dst+R12-32 is 32-byte aligned.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ leaves CF unchanged, so JA sees CF from the SUBQ above (no borrow:
	// 64 minus a value in 0..31) and ZF from DECQ: taken when R11 != 0.
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

	// Copies 32 bytes per pass with aligned stores. JNA repeats the pass
	// when CF (left by the ADDQ) or ZF (from DECQ) is set.
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

	// Copy 32 bytes at offset R12-32 with aligned stores, advancing R12 by 32
	// while length >= R12; then store the saved head and tail blocks.
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
match_nolit_loop_encodeSnappyBlockAsm10B:
	// CX = match start s, SI = candidate position. repeat = s - candidate.
	// Skip the 4 bytes already compared, then extend forwards:
	// R8 = src+s, SI = src+candidate, DI = bytes left in src.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes found so far. Compare 8 bytes at a time
	// while at least 8 remain, then byte by byte.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm10B

matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
	// XOR of the two 8-byte words; if non-zero, the index of the lowest set
	// bit divided by 8 is the number of equal leading bytes.
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeSnappyBlockAsm10B

matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
	// All 8 bytes equal: consume them and continue if 8 or more remain.
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B

matchlen_single_match_nolit_encodeSnappyBlockAsm10B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeSnappyBlockAsm10B

	// Fewer than 8 bytes left: compare one byte at a time.
matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm10B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B

match_nolit_end_encodeSnappyBlockAsm10B:
	// s += extension; SI = offset (repeat); R10 = total match length
	// (extension + the 4 bytes skipped above); nextEmit = s.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	// While length > 64, emit a 3-byte copy of 60 bytes (tag 0xee followed by
	// the 16-bit offset) and reduce the length by 60.
two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm10B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
	// Length >= 12 or offset >= 2048 needs the 3-byte form. Otherwise emit
	// 2 bytes: tag = 1 + (length-4)<<2 + (offset>>8)<<5, then the low offset
	// byte. Only the low byte of the LEAL result is stored.
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm10B

emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
	// 3-byte form: tag = 2 + (length-1)<<2, then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
	// Finish with the remainder once s >= sLimit. Otherwise load the 8 bytes
	// at s-2 and return 0 if the dst pointer has reached the dst limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBlockAsm10B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm10B:
	// Hash the bytes at s-2 (R8) and at s (SI, after DI >>= 16), store s-2
	// and s in the table, and take the previous entry for s as candidate.
	// If its 4 bytes equal those at s, emit another match with no literals.
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x36, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x36, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm10B
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm10B

emit_remainder_encodeSnappyBlockAsm10B:
	// Bounds check for the trailing literals: dst pointer + 3 +
	// (src_len - nextEmit) must stay below the dst limit; otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm10B:
	// Emit src[nextEmit:src_len] as a final literal run (nothing to emit if
	// equal). CX = pointer to the literal bytes, SI = length, DX = length-1.
	// The src base in DX is overwritten here; it is not used again.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Header size depends on length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes (tag 0xf4 + 16-bit length-1).
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm10B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B

two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
	// Tag 0xf0 + one byte holding length-1. Lengths up to 64 use the short
	// copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B

one_byte_emit_remainder_encodeSnappyBlockAsm10B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm10B:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise up to 64 bytes.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8:
	// Always moves a full 8 bytes, even when the length is smaller; the
	// write pointer is still advanced by the real length only.
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
	// Two (possibly overlapping) 8-byte moves: first 8 and last 8 bytes.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
	// Two (possibly overlapping) 16-byte moves: first 16 and last 16 bytes.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
	// Four 16-byte moves: first 32 and last 32 bytes (may overlap).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
	// Advance the dst write pointer past the literal bytes.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Load the first and last 32 source bytes up front (X0-X3); they are
	// stored last and cover the unaligned head and the tail.
	// DI = length>>5, R8 = 64 - (dst & 31), so dst+R8-32 is 32-byte aligned.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ leaves CF unchanged, so JA sees CF from the SUBQ above (no borrow:
	// 64 minus a value in 0..31) and ZF from DECQ: taken when DI != 0.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

	// Copies 32 bytes per pass with aligned stores. JNA repeats the pass
	// when CF (left by the ADDQ) or ZF (from DECQ) is set.
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back

	// Copy 32 bytes at offset R8-32 with aligned stores, advancing R8 by 32
	// while length >= R8; then store the saved head and tail blocks.
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
	// Return the number of bytes written: dst write pointer - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
// Requires: SSE2
TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000008, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm8B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm8B:
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBlockAsm8B
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x38, R11
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm8B
	LEAL  1(CX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm8B

repeat_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL DI, SI
	JLE  repeat_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(DX)(R8*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm8B

repeat_extend_back_end_encodeSnappyBlockAsm8B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm8B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B

two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeSnappyBlockAsm8B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B

one_byte_repeat_emit_encodeSnappyBlockAsm8B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	CMPQ R8, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B

memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R11, R11
	CMPL R8, $0x08
	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm8B

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
	MOVQ  (R9)(R11*1), R10
	XORQ  (SI)(R11*1), R10
	TESTQ R10, R10
	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
	BSFQ  R10, R10
	SARQ  $0x03, R10
	LEAL  (R11)(R10*1), R11
	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B

matchlen_single_repeat_extend_encodeSnappyBlockAsm8B:
	TESTL R8, R8
	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm8B
	LEAL 1(R11), R11
	DECL R8
	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B

repeat_extend_forward_end_encodeSnappyBlockAsm8B:
	ADDL R11, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm8B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm8B:
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm8B

no_repeat_found_encodeSnappyBlockAsm8B:
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm8B
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm8B
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm8B
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm8B

candidate3_match_encodeSnappyBlockAsm8B:
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm8B

candidate2_match_encodeSnappyBlockAsm8B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm8B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm8B

match_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm8B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm8B

match_extend_back_end_encodeSnappyBlockAsm8B:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm8B:
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBlockAsm8B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B

two_bytes_match_emit_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeSnappyBlockAsm8B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B

one_byte_match_emit_encodeSnappyBlockAsm8B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm8B

memmove_long_match_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
match_nolit_loop_encodeSnappyBlockAsm8B:
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm8B

matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeSnappyBlockAsm8B

matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B

matchlen_single_match_nolit_encodeSnappyBlockAsm8B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeSnappyBlockAsm8B

matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm8B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B

match_nolit_end_encodeSnappyBlockAsm8B:
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBlockAsm8B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm8B:
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x38, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x38, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm8B
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm8B

emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm8B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B

one_byte_emit_remainder_encodeSnappyBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B

memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and stores the number of bytes written in ret, or 0
// when the output would reach the internal dst limit computed below.
// Only dst_base, src_base and src_len are read from the arguments; dst_len and
// dst_cap are never consulted.
// NOTE(review): presumably the Go caller guarantees that dst is large enough
// and that src has a minimum length (the loop loads 8 bytes at a time) — confirm.
//
// Frame layout (327704 bytes of locals):
//   0(SP)       8 bytes          dst limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)       4 bytes          search limit: src_len - 8
//   12(SP)      4 bytes          nextEmit: src offset of the first byte not yet emitted
//   16(SP)      4 bytes          offset of the last accepted match (written, never read here)
//   20(SP)      4 bytes          nextS: position at which to resume after a miss
//   24(SP)      65536 x 4 bytes  "long" table, 16-bit hash of 7 input bytes
//   262168(SP)  16384 x 4 bytes  "short" table, 14-bit hash of 4 input bytes
// Table entries are 32-bit src positions; 0 is both "empty" and position 0.
//
// Register roles in the main loop:
//   AX = dst write pointer, CX = current src position (s), DX = src base pointer.
TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
	// 0xa00 iterations x 128 bytes = 327680 bytes: clears both hash tables,
	// i.e. everything from 24(SP) to the end of the frame.
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm
	// Initialise scalar state: nextEmit = 0, search limit = src_len - 8,
	// dst limit = dst + src_len - 9 - (src_len >> 5), s (CX) = 1, last offset = 0.
	// DX is repurposed as the src base pointer from here on.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm:
	// Skip heuristic: step = (s - nextEmit) >> 7. While step <= 99 (0x63) the
	// next probe is s + 1 + step; beyond that it is fixed at s + 100.
	MOVL CX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JLE  check_maxskip_ok_encodeSnappyBetterBlockAsm
	LEAL 100(CX), SI
	JMP  check_maxskip_cont_encodeSnappyBetterBlockAsm

check_maxskip_ok_encodeSnappyBetterBlockAsm:
	LEAL 1(CX)(SI*1), SI

check_maxskip_cont_encodeSnappyBetterBlockAsm:
	// Leave the search once nextS reaches the search limit (signed compare).
	// DI = 8 bytes loaded from src[s]; nextS is parked in 20(SP).
	// Long hash (R10): shift out the top byte, multiply by R9, keep the top 16 bits.
	// Short hash (R11): keep the low 4 bytes, multiply by SI, keep the top 14 bits.
	// SI / R8 receive the previous long / short table entries, then both slots
	// are overwritten with s. A candidate is accepted when the 4 bytes at its
	// position equal the low 4 bytes of DI; the long candidate is tried first.
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBetterBlockAsm
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	MOVL  24(SP)(R10*4), SI
	MOVL  262168(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 262168(SP)(R11*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeSnappyBetterBlockAsm
	// Neither candidate matched: continue at nextS.
	MOVL  20(SP), CX
	JMP   search_loop_encodeSnappyBetterBlockAsm

candidateS_match_encodeSnappyBetterBlockAsm:
	// The short-table candidate (R8) matched at s. Before using it, probe the
	// long table for position s+1 (DI >> 8 holds the bytes starting at src[s+1])
	// and store s+1 in that slot. If the long candidate matches, continue with
	// CX = s+1 and SI = long candidate; otherwise restore CX = s and use R8.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm:
	// Extend the match backwards. DI = nextEmit is the lower bound for CX;
	// a candidate at position 0 cannot be extended at all.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm

match_extend_back_loop_encodeSnappyBetterBlockAsm:
	// Step both positions back by one while CX > nextEmit, the preceding bytes
	// are equal, and the candidate position has not reached 0.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm

match_extend_back_end_encodeSnappyBetterBlockAsm:
	// Output-space check: return 0 unless
	// AX + 5 + (CX - nextEmit) is below the dst limit in (SP).
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 5(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm:
	// DI keeps the match start; it is used later for literal emission and for
	// the table updates after the copy. Both positions advance past the 4 bytes
	// that the candidate compare covered. R8 = bytes left in src after CX,
	// R9 / R10 = pointers to the current and candidate data.
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 = number of additional equal bytes, bounded by R8.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
	// Compare 8 bytes at a time. On a difference, BSFQ yields the index of the
	// lowest differing bit and the shift by 3 converts it to a byte count.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm

matchlen_single_match_nolit_encodeSnappyBetterBlockAsm:
	// Fewer than 8 bytes remain: finish byte by byte.
	TESTL R8, R8
	JZ    match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm

match_nolit_end_encodeSnappyBetterBlockAsm:
	// R8 = match offset (current position minus candidate position).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// NOTE(review): the instructions below do not read the stored offset in
	// 16(SP). What they do: when R12 <= 1 and the offset exceeds 0xffff the match
	// is dropped and the search resumes at nextS + 1.
	CMPL R12, $0x01
	JG   match_length_ok_encodeSnappyBetterBlockAsm
	CMPL R8, $0x0000ffff
	JLE  match_length_ok_encodeSnappyBetterBlockAsm
	MOVL 20(SP), CX
	INCL CX
	JMP  search_loop_encodeSnappyBetterBlockAsm

match_length_ok_encodeSnappyBetterBlockAsm:
	// Record the offset, then emit src[nextEmit:DI] as a literal run unless it
	// is empty. R9 = literal length, R10 = literal source pointer, SI = length-1.
	// Header selection by SI: < 60 -> 1 byte (SI << 2); < 256 -> 0xf0 + 1 length
	// byte; < 65536 -> 0xf4 + 2 bytes; < 2^24 -> 0xf8 + 3 bytes; else 0xfc + 4.
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm
	CMPL SI, $0x00010000
	JLT  three_bytes_match_emit_encodeSnappyBetterBlockAsm
	CMPL SI, $0x01000000
	JLT  four_bytes_match_emit_encodeSnappyBetterBlockAsm
	MOVB $0xfc, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

four_bytes_match_emit_encodeSnappyBetterBlockAsm:
	// 3-byte length: low 16 bits via MOVW, bits 16..23 via R11.
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

three_bytes_match_emit_encodeSnappyBetterBlockAsm:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

two_bytes_match_emit_encodeSnappyBetterBlockAsm:
	// Lengths up to 64 bytes (SI < 64) still use the short copy routine.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeSnappyBetterBlockAsm
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

one_byte_match_emit_encodeSnappyBetterBlockAsm:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm:
	// SI = dst pointer after the literal bytes (AX + length).
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copies R9 (<= 64) bytes from (R10) to (AX) using overlapping head/tail
	// moves. The <= 8 case always loads and stores a full 8 bytes, so it may
	// touch bytes beyond the literal run on both sides.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm

memmove_long_match_emit_encodeSnappyBetterBlockAsm:
	// SI = dst pointer after the literal bytes (AX + length).
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0..X3 hold the first and last 32 bytes of the source and are stored
	// unaligned at the very end. In between, 32-byte chunks are stored with
	// MOVOA at AX + R14 - 32, where R14 starts at 64 - (AX & 31), so those
	// stores land on 32-byte aligned addresses. R13 = length / 32.
	// NOTE(review): DECQ does not modify CF, so the JA / JNA after it combine
	// the new ZF with the CF left by the preceding SUBQ / ADDQ — confirm the
	// intended loop conditions against the generator.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while length (R9) >= R14 (unsigned).
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
	// Advance s past the match, turn R12 into the full match length (the 4
	// skipped bytes are added back) and set nextEmit = s.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// R8 = offset, R12 = remaining length. Offsets >= 65536 use 5-byte copies
	// with a 4-byte offset; smaller offsets use the 2- or 3-byte forms below.
	CMPL R8, $0x00010000
	JL   two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm

four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
	// While more than 64 bytes remain, emit tag 0xff + 4-byte offset and
	// subtract 64; leave the loop when 64 or fewer, or fewer than 4, remain.
	CMPL R12, $0x40
	JLE  four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
	MOVB $0xff, (AX)
	MOVL R8, 1(AX)
	LEAL -64(R12), R12
	ADDQ $0x05, AX
	CMPL R12, $0x04
	JL   four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm

four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
	// Final 5-byte copy: tag = 3 + (length - 1) * 4. Only the low byte of the
	// LEAL result is stored, so whatever is in the upper bits of BX is irrelevant.
	TESTL R12, R12
	JZ    match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
	MOVB  $0x03, BL
	LEAL  -4(BX)(R12*4), R12
	MOVB  R12, (AX)
	MOVL  R8, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm

two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
	// While more than 64 bytes remain, emit tag 0xee + 2-byte offset and
	// subtract 60 from the remaining length.
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
	// The 2-byte form is used only when length < 12 and offset < 2048:
	// tag = 1 + (length - 4) * 4 + ((offset >> 8) << 5), followed by the low
	// offset byte. Only the low byte of R12 is stored.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
	// 3-byte form: tag = 2 + (length - 1) * 4, followed by a 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
	// Finish if s reached the search limit; return 0 if dst reached its limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBetterBlockAsm
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
	// Seed the tables with positions around the match just emitted.
	// With b = match start + 1 (DI after INCL) and s = CX:
	//   long table:  b, b+1, s-2, s-1
	//   short table: b+1, b+2, s-1
	// The hashes are taken from the 8-byte loads at src[b] and src[s-2],
	// shifted right by 8 or 16 bits to reach the neighbouring positions.
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x32, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 262168(SP)(R11*4)
	MOVL  R15, 262168(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 262168(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeSnappyBetterBlockAsm

emit_remainder_encodeSnappyBetterBlockAsm:
	// Output-space check for the tail: return 0 unless
	// AX + 5 + (src_len - nextEmit) is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm:
	// Emit src[nextEmit:src_len] as one final literal run, unless it is empty.
	// CX = literal source pointer, SI = length, DX = length - 1; DX stops being
	// the src base pointer here. Header selection is the same as for the
	// literals emitted before a match.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x00010000
	JLT  three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x01000000
	JLT  four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	// 3-byte length: low 16 bits via MOVW, bits 16..23 via BL.
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	// Lengths up to 64 bytes (DX < 64) still use the short copy routine.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copies BX (<= 64) bytes from (CX) to (AX) using overlapping head/tail
	// moves. The <= 8 case always loads and stores a full 8 bytes.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the long copy used for match literals: X0..X3 carry the
	// first and last 32 bytes, the middle is stored in 32-byte chunks with MOVOA
	// at AX + R8 - 32, where R8 starts at 64 - (AX & 31). DI = length / 32.
	// NOTE(review): DECQ does not modify CF, so the JA / JNA after it combine
	// the new ZF with the CF left by the preceding SUBQ / ADDQ — confirm the
	// intended loop conditions against the generator.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while length (BX) >= R8 (unsigned).
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written to dst.
// Returns 0 whenever one of the output-size checks against the limit kept at
// 0(SP) fails.
//
// Frame layout (327704 bytes):
//   0(SP)       8 bytes       dst limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)       4 bytes       search limit: src_len - 8
//   12(SP)      4 bytes       nextEmit: src offset of the first byte not yet emitted
//   16(SP)      4 bytes       offset of the most recent match (written, never read in this function)
//   20(SP)      4 bytes       src position to resume searching at after a miss
//   24(SP)      262144 bytes  65536 uint32 positions, indexed by a 16-bit hash of 7 bytes
//   262168(SP)  65536 bytes   16384 uint32 positions, indexed by a 14-bit hash of 4 bytes
//
// Registers that keep their role for most of the function:
//   AX = dst write pointer, DX = src base pointer, CX = current src position.
TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
	MOVQ dst_base+0(FP), AX
	// Zero both tables: 0xa00 iterations of 128 bytes = 327680 bytes from 24(SP).
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm64K:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm64K
	// nextEmit = 0; compute the search limit and the dst limit (see frame layout).
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Searching starts at src position 1; DX becomes the src base pointer.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm64K:
	// Next position on a miss: s + 1 + ((s - nextEmit) >> 7). Stop searching
	// once that reaches the search limit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x07, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBetterBlockAsm64K
	// DI = 8 bytes loaded from src[s].
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// R10 = top 16 bits of ((DI << 8) * R9): hash over the low 7 bytes.
	// R11 = top 14 bits of ((DI << 32) * SI): hash over the low 4 bytes.
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	// Fetch the previous position from each table (SI from the 7-byte table,
	// R8 from the 4-byte table), then record s in both.
	MOVL  24(SP)(R10*4), SI
	MOVL  262168(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 262168(SP)(R11*4)
	// A candidate is accepted when its first 4 bytes equal the low 4 bytes of DI.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeSnappyBetterBlockAsm64K
	// Miss: continue at the position saved in 20(SP).
	MOVL  20(SP), CX
	JMP   search_loop_encodeSnappyBetterBlockAsm64K

candidateS_match_encodeSnappyBetterBlockAsm64K:
	// Only the 4-byte table matched. Probe the 7-byte table for position s+1
	// (DI >> 8 is the data starting one byte later) and prefer that candidate
	// if its 4 bytes match; otherwise restore s and use the candidate in R8.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm64K:
	// CX = s, SI = candidate position, DI = nextEmit.
	// A candidate at position 0 cannot be extended backwards.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm64K

match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
	// While s > nextEmit and src[candidate-1] == src[s-1], step both back by
	// one byte; also stop when the candidate reaches position 0.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm64K
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm64K

match_extend_back_end_encodeSnappyBetterBlockAsm64K:
	// Return 0 unless dst + (s - nextEmit) + 3 is below the dst limit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm64K:
	// DI = match start. Skip the 4 bytes already compared on both sides, then
	// set up R8 = src_len - s (bytes left), R9 = &src[s], R10 = &src[candidate].
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 counts the equal bytes at R9/R10, limited by R8.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
	// Compare 8 bytes at a time; on a difference the index of the lowest set
	// bit of the XOR, divided by 8, is the number of equal leading bytes.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K:
	// Fewer than 8 bytes left: finish one byte at a time.
	TESTL R8, R8
	JZ    match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm64K
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K

match_nolit_end_encodeSnappyBetterBlockAsm64K:
	// R8 = match offset (s - candidate).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// The offset is saved to 16(SP). If nextEmit equals the match start there
	// are no pending literals and literal emission is skipped.
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
	// R10 = &src[nextEmit], R9 = literal length, SI = length - 1.
	// Header written to dst depends on length - 1:
	//   < 60   : one byte, (length-1) << 2
	//   < 256  : 0xf0 followed by one byte holding length-1
	//   else   : 0xf4 followed by 16 bits holding length-1
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm64K
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 64 uses the short copy below, anything larger the long copy.
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeSnappyBetterBlockAsm64K
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm64K:
	// SI = dst pointer after the literals; AX is set to it once the copy is done.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 bytes from R10 to AX using at most four moves chosen by size class.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
	// Always moves a full 8 bytes, even when R9 is smaller.
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves may overlap.
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	// First 16 and last 16 bytes; the two moves may overlap.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes; the two halves may overlap.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K

memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
	// SI = dst pointer after the literals; AX is set to it once the copy is done.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0..X3 hold the first and last 32 source bytes and are stored last.
	// R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte aligned and the
	// MOVOA stores in the loops below hit aligned addresses.
	// NOTE(review): DECQ does not modify CF, so the JA below and the JNA that
	// closes big_loop_back depend on ZF from DECQ plus the CF left by the
	// preceding SUBQ/ADDQ. Both jumps into this label require length-1 >= 64,
	// so R13 looks to be >= 1 after the DECQ and the JA appears always taken;
	// confirm against the generator before changing anything here.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R14-32 while R9 >= R14 (unsigned),
	// then store the saved head and tail.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
	// s advances past the matched bytes, R12 becomes the full match length
	// (including the 4 bytes skipped earlier) and nextEmit = s.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
	// While more than 64 bytes remain: write 0xee plus the 16-bit offset
	// (3 bytes) and reduce the remaining length by 60.
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
	// The 2-byte form is used only when length < 12 and offset < 2048:
	//   byte 0 = low byte of 1 + (length-4)*4, OR'd with (offset >> 8) << 5
	//   byte 1 = low 8 bits of offset
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
	// 3-byte form: low byte of 2 + (length-1)*4, then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
	// Past the search limit: flush the tail. Otherwise return 0 if the dst
	// pointer has reached the dst limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
	// Record positions around the match in the tables before searching again.
	// With base = match start (DI on entry) and s = CX:
	//   7-byte table: base+1, base+2, s-2, s-1
	//   4-byte table: base+2, base+3, s-1
	// The hashes are derived from two 8-byte loads, at src[base+1] and
	// src[s-2], shifted right by 8 or 16 bits for the following positions.
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x32, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 262168(SP)(R11*4)
	MOVL  R15, 262168(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 262168(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeSnappyBetterBlockAsm64K

emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Return 0 unless dst + (src_len - nextEmit) + 3 is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
	// Emit src[nextEmit:src_len] as literals; nothing to do when nextEmit == src_len.
	// From here CX = &src[nextEmit], SI = literal length and DX = length - 1
	// (the src base in DX is no longer needed). The header bytes follow the
	// same three size classes as the literal emission before a match.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// DX = dst pointer after the literals, BX = number of bytes to copy.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copy BX bytes from CX to AX using at most four moves chosen by size class.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8:
	// Always moves a full 8 bytes, even when BX is smaller.
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves may overlap.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	// First 16 and last 16 bytes; the two moves may overlap.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes; the two halves may overlap.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// DX = dst pointer after the literals, BX = number of bytes to copy.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// X0..X3 hold the first and last 32 source bytes and are stored last.
	// R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned for the MOVOA
	// stores. NOTE(review): the JA/JNA after DECQ depend on ZF from DECQ and
	// on CF left by the preceding SUBQ/ADDQ (DECQ does not modify CF);
	// confirm against the generator before changing anything here.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R8-32 while BX >= R8 (unsigned),
	// then store the saved head and tail.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Return value = dst write pointer - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written to dst.
// Returns 0 whenever one of the output-size checks against the limit kept at
// 0(SP) fails.
//
// Frame layout (81944 bytes):
//   0(SP)      8 bytes      dst limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)      4 bytes      search limit: src_len - 8
//   12(SP)     4 bytes      nextEmit: src offset of the first byte not yet emitted
//   16(SP)     4 bytes      offset of the most recent match (written, never read in this function)
//   20(SP)     4 bytes      src position to resume searching at after a miss
//   24(SP)     65536 bytes  16384 uint32 positions, indexed by a 14-bit hash of 6 bytes
//   65560(SP)  16384 bytes  4096 uint32 positions, indexed by a 12-bit hash of 4 bytes
//
// Registers that keep their role for most of the function:
//   AX = dst write pointer, DX = src base pointer, CX = current src position.
TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
	MOVQ dst_base+0(FP), AX
	// Zero both tables: 0x280 iterations of 128 bytes = 81920 bytes from 24(SP).
	MOVQ $0x00000280, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm12B
	// nextEmit = 0; compute the search limit and the dst limit (see frame layout).
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Searching starts at src position 1; DX becomes the src base pointer.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm12B:
	// Next position on a miss: s + 1 + ((s - nextEmit) >> 6). Stop searching
	// once that reaches the search limit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBetterBlockAsm12B
	// DI = 8 bytes loaded from src[s].
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// R10 = top 14 bits of ((DI << 16) * R9): hash over the low 6 bytes.
	// R11 = top 12 bits of ((DI << 32) * SI): hash over the low 4 bytes.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x34, R11
	// Fetch the previous position from each table (SI from the 6-byte table,
	// R8 from the 4-byte table), then record s in both.
	MOVL  24(SP)(R10*4), SI
	MOVL  65560(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 65560(SP)(R11*4)
	// A candidate is accepted when its first 4 bytes equal the low 4 bytes of DI.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeSnappyBetterBlockAsm12B
	// Miss: continue at the position saved in 20(SP).
	MOVL  20(SP), CX
	JMP   search_loop_encodeSnappyBetterBlockAsm12B

candidateS_match_encodeSnappyBetterBlockAsm12B:
	// Only the 4-byte table matched. Probe the 6-byte table for position s+1
	// (DI >> 8 is the data starting one byte later) and prefer that candidate
	// if its 4 bytes match; otherwise restore s and use the candidate in R8.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm12B:
	// CX = s, SI = candidate position, DI = nextEmit.
	// A candidate at position 0 cannot be extended backwards.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm12B

match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
	// While s > nextEmit and src[candidate-1] == src[s-1], step both back by
	// one byte; also stop when the candidate reaches position 0.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm12B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm12B

match_extend_back_end_encodeSnappyBetterBlockAsm12B:
	// Return 0 unless dst + (s - nextEmit) + 3 is below the dst limit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm12B:
	// DI = match start. Skip the 4 bytes already compared on both sides, then
	// set up R8 = src_len - s (bytes left), R9 = &src[s], R10 = &src[candidate].
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 counts the equal bytes at R9/R10, limited by R8.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
	// Compare 8 bytes at a time; on a difference the index of the lowest set
	// bit of the XOR, divided by 8, is the number of equal leading bytes.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeSnappyBetterBlockAsm12B

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B

matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B:
	// Fewer than 8 bytes left: finish one byte at a time.
	TESTL R8, R8
	JZ    match_nolit_end_encodeSnappyBetterBlockAsm12B

matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm12B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B

match_nolit_end_encodeSnappyBetterBlockAsm12B:
	// R8 = match offset (s - candidate).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// The offset is saved to 16(SP). If nextEmit equals the match start there
	// are no pending literals and literal emission is skipped.
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
	// R10 = &src[nextEmit], R9 = literal length, SI = length - 1.
	// Header written to dst depends on length - 1:
	//   < 60   : one byte, (length-1) << 2
	//   < 256  : 0xf0 followed by one byte holding length-1
	//   else   : 0xf4 followed by 16 bits holding length-1
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm12B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B

two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 64 uses the short copy below, anything larger the long copy.
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeSnappyBetterBlockAsm12B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B

one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm12B:
	// SI = dst pointer after the literals; AX is set to it once the copy is done.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 bytes from R10 to AX using at most four moves chosen by size class.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
	// Always moves a full 8 bytes, even when R9 is smaller.
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves may overlap.
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
	// First 16 and last 16 bytes; the two moves may overlap.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
	// First 32 and last 32 bytes; the two halves may overlap.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B

memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
	// SI = dst pointer after the literals; AX is set to it once the copy is done.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0..X3 hold the first and last 32 source bytes and are stored last.
	// R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte aligned and the
	// MOVOA stores in the loops below hit aligned addresses.
	// NOTE(review): DECQ does not modify CF, so the JA below and the JNA that
	// closes big_loop_back depend on ZF from DECQ plus the CF left by the
	// preceding SUBQ/ADDQ. Both jumps into this label require length-1 >= 64,
	// so R13 looks to be >= 1 after the DECQ and the JA appears always taken;
	// confirm against the generator before changing anything here.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R14-32 while R9 >= R14 (unsigned),
	// then store the saved head and tail.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
	// s advances past the matched bytes, R12 becomes the full match length
	// (including the 4 bytes skipped earlier) and nextEmit = s.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
	// While more than 64 bytes remain: write 0xee plus the 16-bit offset
	// (3 bytes) and reduce the remaining length by 60.
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
	// The 2-byte form is used only when length < 12 and offset < 2048:
	//   byte 0 = low byte of 1 + (length-4)*4, OR'd with (offset >> 8) << 5
	//   byte 1 = low 8 bits of offset
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
	// 3-byte form: low byte of 2 + (length-1)*4, then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
	// Past the search limit: flush the tail. Otherwise return 0 if the dst
	// pointer has reached the dst limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBetterBlockAsm12B
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
	// Record positions around the match in the tables before searching again.
	// With base = match start (DI on entry) and s = CX:
	//   6-byte table: base+1, base+2, s-2, s-1
	//   4-byte table: base+2, base+3, s-1
	// The hashes are derived from two 8-byte loads, at src[base+1] and
	// src[s-2], shifted right by 8 or 16 bits for the following positions.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x32, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x34, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x34, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 65560(SP)(R11*4)
	MOVL  R15, 65560(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x34, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x32, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 65560(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeSnappyBetterBlockAsm12B

emit_remainder_encodeSnappyBetterBlockAsm12B:
	// Return 0 unless dst + (src_len - nextEmit) + 3 is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
	// Emit src[nextEmit:src_len] as literals; nothing to do when nextEmit == src_len.
	// From here CX = &src[nextEmit], SI = literal length and DX = length - 1
	// (the src base in DX is no longer needed). The header bytes follow the
	// same three size classes as the literal emission before a match.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// DX = dst pointer after the literals, BX = number of bytes to copy.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copy BX bytes from CX to AX using at most four moves chosen by size class.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8:
	// Always moves a full 8 bytes, even when BX is smaller.
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves may overlap.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
	// First 16 and last 16 bytes; the two moves may overlap.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
	// First 32 and last 32 bytes; the two halves may overlap.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// DX = dst pointer after the literals, BX = number of bytes to copy.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// X0..X3 hold the first and last 32 source bytes and are stored last.
	// R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned for the MOVOA
	// stores. NOTE(review): the JA/JNA after DECQ depend on ZF from DECQ and
	// on CF left by the preceding SUBQ/ADDQ (DECQ does not modify CF);
	// confirm against the generator before changing anything here.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R8-32 while BX >= R8 (unsigned),
	// then store the saved head and tail.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// Return value = dst write pointer - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and
// returns the number of bytes written to dst. Returns 0 when the output
// position would reach the limit pointer stored at 0(SP), which is
// dst + len(src) - 9 - len(src)/32.
// NOTE(review): the caller presumably treats 0 as "not compressible" and
// guarantees a minimum len(src) and a large enough dst - confirm there.
//
// Stack frame (20504 bytes):
//   0(SP)      output limit pointer (see above)
//   8(SP)      len(src) - 8; matching stops at or beyond this position
//   12(SP)     position in src of the first byte not yet emitted
//   16(SP)     offset of the most recent match (written here, never read
//              inside this function)
//   20(SP)     next position the search loop will try
//   24(SP)     table of 4096 32-bit positions, indexed by a 12-bit hash of
//              6 input bytes
//   16408(SP)  table of 1024 32-bit positions, indexed by a 10-bit hash of
//              4 input bytes
//
// Registers in the main loop: AX = output write pointer, CX = current
// position in src, DX = src base pointer.
TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x000000a0, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear both tables: 0xa0 iterations of 128 bytes = 20480 bytes from 24(SP).
zero_loop_encodeSnappyBetterBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm10B
	// Initialise the scalar slots described in the header; the search
	// starts at position 1 with nothing emitted yet.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm10B:
	// Next position to try = s + 1 + ((s - 12(SP)) >> 5): the step grows
	// with the number of pending unmatched bytes. Leave the loop once that
	// position reaches the limit in 8(SP).
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBetterBlockAsm10B
	// DI = 8 input bytes at the current position.
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	// R10 = index into the 24(SP) table: the shift by 16 discards the top
	// two bytes, the product with R9 is reduced to its top 12 bits.
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	// R11 = index into the 16408(SP) table: low 4 bytes only, top 10 bits
	// of the product with SI.
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x36, R11
	// Fetch the previously stored position for each index (SI and R8),
	// then record the current position in both tables.
	MOVL  24(SP)(R10*4), SI
	MOVL  16408(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 16408(SP)(R11*4)
	// A candidate qualifies when its first 4 bytes equal the low 4 bytes
	// of DI; the 24(SP) candidate is tested first.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeSnappyBetterBlockAsm10B
	MOVL  20(SP), CX
	JMP   search_loop_encodeSnappyBetterBlockAsm10B

candidateS_match_encodeSnappyBetterBlockAsm10B:
	// Only the 16408(SP) candidate matched. Probe the 24(SP) table for
	// position s+1 (DI shifted down one byte); the probe also stores s+1
	// in that table. If the probed candidate matches, continue with it at
	// s+1; otherwise undo the increment and use the candidate saved in R8.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm10B:
	// SI = candidate position, CX = current position. Extend the match
	// backwards one byte at a time; skipped entirely when the candidate is
	// at position 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm10B

match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
	// Stop at the start of the pending literals, on the first differing
	// byte, or when the candidate reaches position 0.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm10B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm10B

match_extend_back_end_encodeSnappyBetterBlockAsm10B:
	// Give up (return 0) unless AX + 3 + pending literal length stays
	// below the output limit at 0(SP).
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm10B:
	// DI keeps the match start. The first 4 bytes are already known to be
	// equal, so both positions advance by 4. R8 = bytes left in src,
	// R9 / R10 = pointers to the two sequences being compared.
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 counts equal bytes beyond the first 4.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
	// Compare 8 bytes at once; on a difference the index of the lowest set
	// bit of the XOR, divided by 8, is the number of equal leading bytes.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B

matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B:
	// Fewer than 8 bytes remain: finish byte by byte.
	TESTL R8, R8
	JZ    match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm10B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B

match_nolit_end_encodeSnappyBetterBlockAsm10B:
	// R8 = match offset (distance between current and candidate position).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// The offset is saved in 16(SP). If bytes are pending between 12(SP)
	// and the match start (DI), emit them as a literal first:
	// R10 = source pointer, R9 = length, SI = length - 1.
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size depends on length-1: below 60 it fits in the tag byte,
	// below 256 it takes one extra byte (tag 0xf0), otherwise two extra
	// bytes (tag 0xf4).
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm10B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// Lengths up to 64 use the short copy, longer ones the long copy.
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeSnappyBetterBlockAsm10B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm10B:
	// SI = output pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copies R9 (at most 64) bytes from R10 to AX using overlapping head
	// and tail moves. Lengths of 8 or less always move a full 8 bytes.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B

memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
	// SI = output pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 before anything is
	// stored and written back (unaligned) after the loop. The loop moves
	// 32 bytes per iteration with aligned stores; R14 starts at
	// 64 - (AX & 31), so the first store address AX + R14 - 32 is 32-byte
	// aligned.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// DECQ leaves CF as set by the SUBQ above (no borrow: the subtrahend
	// is at most 31), so JA is taken whenever the decremented count is
	// non-zero. Both jumps into this label carry length-1 >= 64, so the
	// fall-through block below does not appear to be reached from here.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R14 while R14 <= length.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
	// Advance the position past the whole match and record it as the new
	// start of pending bytes. R12 becomes the full match length (the 4
	// pre-verified bytes included).
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// R8 = offset, R12 = remaining length. While more than 64 bytes
	// remain, emit a 3-byte element (tag 0xee + 16-bit offset) and reduce
	// the length by 60.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
	// The 2-byte form is used only for length < 12 and offset < 2048:
	// tag = 1 + (length-4)*4 + (offset>>8)<<5, then the low offset byte.
	// Only the low byte of R12 is stored, so the upper bytes of BX do not
	// matter.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
	// 3-byte form: tag = 2 + (length-1)*4, followed by the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
	// Finish when the position reached the limit in 8(SP); return 0 when
	// the output pointer reached the limit in 0(SP).
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
	// Add table entries around the match just emitted. With m = match
	// start (DI on entry) and s = current position (CX):
	//   24(SP) table:    m+1, m+2, s-2, s-1
	//   16408(SP) table: m+2, m+3, s-1
	// Each hash is computed from the 8 bytes loaded at m+1 or s-2,
	// shifted down by the matching number of bytes.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x34, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x36, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 16408(SP)(R11*4)
	MOVL  R15, 16408(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x34, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 16408(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeSnappyBetterBlockAsm10B

emit_remainder_encodeSnappyBetterBlockAsm10B:
	// Return 0 unless AX + 3 + (len(src) - 12(SP)) stays below the output
	// limit at 0(SP).
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
	// Emit everything from 12(SP) to the end of src as one literal (none
	// when they are equal). From here CX = source pointer, SI = length,
	// DX = length - 1; the src base in DX is no longer needed.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Same header selection as for literals emitted before a match.
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// DX = output pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copies BX (at most 64) bytes from CX to AX; lengths of 8 or less
	// always move a full 8 bytes.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// DX = output pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail (32 bytes each) are held in X0-X3 and stored last; the
	// loop in between moves 32 bytes per iteration with 32-byte-aligned
	// stores, R8 starting at 64 - (AX & 31).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// JA is taken whenever the decremented count is non-zero (CF is still
	// clear from the SUBQ above, which cannot borrow).
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R8 while R8 <= length.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// Result = bytes written = final output pointer - dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and
// returns the number of bytes written to dst. Returns 0 when the output
// position would reach the limit pointer stored at 0(SP), which is
// dst + len(src) - 9 - len(src)/32.
// NOTE(review): the caller presumably treats 0 as "not compressible" and
// restricts len(src) so that every match offset is below 2048 (the copy
// emitter here does not check the offset) - confirm at the call site.
//
// Stack frame (5144 bytes):
//   0(SP)      output limit pointer (see above)
//   8(SP)      len(src) - 8; matching stops at or beyond this position
//   12(SP)     position in src of the first byte not yet emitted
//   16(SP)     offset of the most recent match (written here, never read
//              inside this function)
//   20(SP)     next position the search loop will try
//   24(SP)     table of 1024 32-bit positions, indexed by a 10-bit hash of
//              6 input bytes
//   4120(SP)   table of 256 32-bit positions, indexed by an 8-bit hash of
//              4 input bytes
//
// Registers in the main loop: AX = output write pointer, CX = current
// position in src, DX = src base pointer.
TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000028, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear both tables: 0x28 iterations of 128 bytes = 5120 bytes from 24(SP).
zero_loop_encodeSnappyBetterBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm8B
	// Initialise the scalar slots described in the header; the search
	// starts at position 1 with nothing emitted yet.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm8B:
	// Next position to try = s + 1 + ((s - 12(SP)) >> 4): the step grows
	// with the number of pending unmatched bytes. Leave the loop once that
	// position reaches the limit in 8(SP).
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBetterBlockAsm8B
	// DI = 8 input bytes at the current position.
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	// R10 = index into the 24(SP) table: the shift by 16 discards the top
	// two bytes, the product with R9 is reduced to its top 10 bits.
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	// R11 = index into the 4120(SP) table: low 4 bytes only, top 8 bits of
	// the product with SI.
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x38, R11
	// Fetch the previously stored position for each index (SI and R8),
	// then record the current position in both tables.
	MOVL  24(SP)(R10*4), SI
	MOVL  4120(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 4120(SP)(R11*4)
	// A candidate qualifies when its first 4 bytes equal the low 4 bytes
	// of DI; the 24(SP) candidate is tested first.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeSnappyBetterBlockAsm8B
	MOVL  20(SP), CX
	JMP   search_loop_encodeSnappyBetterBlockAsm8B

candidateS_match_encodeSnappyBetterBlockAsm8B:
	// Only the 4120(SP) candidate matched. Probe the 24(SP) table for
	// position s+1 (DI shifted down one byte); the probe also stores s+1
	// in that table. If the probed candidate matches, continue with it at
	// s+1; otherwise undo the increment and use the candidate saved in R8.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm8B:
	// SI = candidate position, CX = current position. Extend the match
	// backwards one byte at a time; skipped entirely when the candidate is
	// at position 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm8B

match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
	// Stop at the start of the pending literals, on the first differing
	// byte, or when the candidate reaches position 0.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm8B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm8B

match_extend_back_end_encodeSnappyBetterBlockAsm8B:
	// Give up (return 0) unless AX + 3 + pending literal length stays
	// below the output limit at 0(SP).
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm8B:
	// DI keeps the match start. The first 4 bytes are already known to be
	// equal, so both positions advance by 4. R8 = bytes left in src,
	// R9 / R10 = pointers to the two sequences being compared.
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 counts equal bytes beyond the first 4.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Compare 8 bytes at once; on a difference the index of the lowest set
	// bit of the XOR, divided by 8, is the number of equal leading bytes.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B

matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Fewer than 8 bytes remain: finish byte by byte.
	TESTL R8, R8
	JZ    match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm8B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B

match_nolit_end_encodeSnappyBetterBlockAsm8B:
	// R8 = match offset (distance between current and candidate position).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// The offset is saved in 16(SP). If bytes are pending between 12(SP)
	// and the match start (DI), emit them as a literal first:
	// R10 = source pointer, R9 = length, SI = length - 1.
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size depends on length-1: below 60 it fits in the tag byte,
	// below 256 it takes one extra byte (tag 0xf0), otherwise two extra
	// bytes (tag 0xf4).
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm8B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B

two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// Lengths up to 64 use the short copy, longer ones the long copy.
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeSnappyBetterBlockAsm8B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B

one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm8B:
	// SI = output pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copies R9 (at most 64) bytes from R10 to AX using overlapping head
	// and tail moves. Lengths of 8 or less always move a full 8 bytes.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B

memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
	// SI = output pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 before anything is
	// stored and written back (unaligned) after the loop. The loop moves
	// 32 bytes per iteration with aligned stores; R14 starts at
	// 64 - (AX & 31), so the first store address AX + R14 - 32 is 32-byte
	// aligned.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// DECQ leaves CF as set by the SUBQ above (no borrow: the subtrahend
	// is at most 31), so JA is taken whenever the decremented count is
	// non-zero. Both jumps into this label carry length-1 >= 64, so the
	// fall-through block below does not appear to be reached from here.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R14 while R14 <= length.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
	// Advance the position past the whole match and record it as the new
	// start of pending bytes. R12 becomes the full match length (the 4
	// pre-verified bytes included).
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// R8 = offset, R12 = remaining length. While more than 64 bytes
	// remain, emit a 3-byte element (tag 0xee + 16-bit offset) and reduce
	// the length by 60.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Lengths below 12 use the 2-byte form:
	// tag = 1 + (length-4)*4 + (offset>>8)<<5, then the low offset byte.
	// The offset itself is not range-checked here. Only the low byte of
	// R12 is stored, so the upper bytes of BX do not matter.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
	// 3-byte form: tag = 2 + (length-1)*4, followed by the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
	// Finish when the position reached the limit in 8(SP); return 0 when
	// the output pointer reached the limit in 0(SP).
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBetterBlockAsm8B
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
	// Add table entries around the match just emitted. With m = match
	// start (DI on entry) and s = current position (CX):
	//   24(SP) table:   m+1, m+2, s-2, s-1
	//   4120(SP) table: m+2, m+3, s-1
	// Each hash is computed from the 8 bytes loaded at m+1 or s-2,
	// shifted down by the matching number of bytes.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x36, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x38, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 4120(SP)(R11*4)
	MOVL  R15, 4120(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x36, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 4120(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeSnappyBetterBlockAsm8B

emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Return 0 unless AX + 3 + (len(src) - 12(SP)) stays below the output
	// limit at 0(SP).
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
	// Emit everything from 12(SP) to the end of src as one literal (none
	// when they are equal). From here CX = source pointer, SI = length,
	// DX = length - 1; the src base in DX is no longer needed.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Same header selection as for literals emitted before a match.
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// DX = output pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copies BX (at most 64) bytes from CX to AX; lengths of 8 or less
	// always move a full 8 bytes.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// DX = output pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail (32 bytes each) are held in X0-X3 and stored last; the
	// loop in between moves 32 bytes per iteration with 32-byte-aligned
	// stores, R8 starting at 64 - (AX & 31).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// JA is taken whenever the decremented count is non-zero (CF is still
	// clear from the SUBQ above, which cannot borrow).
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R8 while R8 <= length.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Result = bytes written = final output pointer - dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
//
// Writes a literal header followed by the bytes of lit to dst and returns
// the total number of bytes written (header size + len(lit)). Returns 0 and
// writes nothing when lit is empty.
//
// Header size is chosen from len(lit)-1:
//   < 60        1 byte   tag = (len-1)<<2
//   < 1<<8      2 bytes  tag 0xf0 + 1 length byte
//   < 1<<16     3 bytes  tag 0xf4 + 2 length bytes
//   < 1<<24     4 bytes  tag 0xf8 + 3 length bytes
//   otherwise   5 bytes  tag 0xfc + 4 length bytes
//
// No bounds checks are performed on dst.
// NOTE(review): the length comparisons are 32-bit and signed, so len(lit)
// is presumably expected to stay below 1<<31 - confirm at the call sites.
//
// Registers: AX = output pointer, CX = lit pointer, DX = len(lit),
// BX = return value (len(lit) plus header size), SI = len(lit) - 1.
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
	MOVQ  lit_len+32(FP), DX
	MOVQ  dst_base+0(FP), AX
	MOVQ  lit_base+24(FP), CX
	TESTQ DX, DX
	JZ    emit_literal_end_standalone_skip
	MOVL  DX, BX
	LEAL  -1(DX), SI
	CMPL  SI, $0x3c
	JLT   one_byte_standalone
	CMPL  SI, $0x00000100
	JLT   two_bytes_standalone
	CMPL  SI, $0x00010000
	JLT   three_bytes_standalone
	CMPL  SI, $0x01000000
	JLT   four_bytes_standalone
	MOVB  $0xfc, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   memmove_long_standalone

four_bytes_standalone:
	// 24-bit length: low 16 bits via MOVW, bits 16-23 as a separate byte.
	MOVL SI, DI
	SHRL $0x10, DI
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB DI, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  memmove_long_standalone

three_bytes_standalone:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  memmove_long_standalone

two_bytes_standalone:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	// Lengths up to 64 use the short copy, longer ones the long copy.
	CMPL SI, $0x40
	JL   memmove_standalone
	JMP  memmove_long_standalone

one_byte_standalone:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

memmove_standalone:
	// genMemMoveShort
	// Copies exactly DX (1..64) bytes from CX to AX. Each size class loads
	// the head and the tail of the range (overlapping when needed) before
	// storing, so no byte beyond DX is read or written.
	CMPQ DX, $0x03
	JB   emit_lit_memmove_standalone_memmove_move_1or2
	JE   emit_lit_memmove_standalone_memmove_move_3
	CMPQ DX, $0x08
	JB   emit_lit_memmove_standalone_memmove_move_4through7
	CMPQ DX, $0x10
	JBE  emit_lit_memmove_standalone_memmove_move_8through16
	CMPQ DX, $0x20
	JBE  emit_lit_memmove_standalone_memmove_move_17through32
	JMP  emit_lit_memmove_standalone_memmove_move_33through64

emit_lit_memmove_standalone_memmove_move_1or2:
	// First and last byte; they are the same byte when DX is 1.
	MOVB (CX), SI
	MOVB -1(CX)(DX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

memmove_long_standalone:
	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 before anything is
	// stored and written back (unaligned) after the loop. The loop moves
	// 32 bytes per iteration with aligned stores; R8 starts at
	// 64 - (AX & 31), so the first store address AX + R8 - 32 is 32-byte
	// aligned.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVQ  DX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ leaves CF as set by the SUBQ above (no borrow: the subtrahend
	// is at most 31), so JA is taken whenever the decremented count is
	// non-zero.
	DECQ  DI
	JA    emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_standalonelarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_standalonelarge_big_loop_back

emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R8 while R8 <= length.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  DX, R8
	JAE   emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	// Jump over the empty-input path, which would clear the result.
	JMP   emit_literal_end_standalone

emit_literal_end_standalone_skip:
	// Empty literal: nothing written, result is 0.
	XORQ BX, BX

emit_literal_end_standalone:
	MOVQ BX, ret+48(FP)
	RET

// func emitRepeat(dst []byte, offset int, length int) int
//
// Writes one or more repeat elements covering length bytes to dst and
// returns the number of bytes written. offset is only encoded in the
// 2-byte form used when length is 9..11 and offset is below 2048.
// No bounds checks are performed on dst.
//
// Registers: AX = output pointer, BX = bytes written so far,
// CX = offset, DX = remaining length.
TEXT ·emitRepeat(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX
	XORL BX, BX

	// Every pass first takes 4 off the remaining length; SI keeps the
	// value from before that subtraction for the two short-form checks.
repeat_next_chunk:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $8
	JLE  repeat_len_in_tag
	CMPL SI, $12
	JGE  repeat_pick_long_form
	CMPL CX, $2048
	JLT  repeat_len_and_offset

repeat_pick_long_form:
	// Choose by the reduced length: 1, 2 or 3 extra length bytes.
	CMPL DX, $260
	JLT  repeat_one_len_byte
	CMPL DX, $65792
	JLT  repeat_two_len_bytes
	CMPL DX, $16842751
	JLT  repeat_three_len_bytes

	// Too long for a single element: write a 5-byte element with the
	// fixed length bytes fb ff ff, take 16842747 off the length and go
	// round again for what is left.
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	LEAL -16842747(DX), DX
	ADDQ $5, AX
	ADDQ $5, BX
	JMP  repeat_next_chunk

repeat_three_len_bytes:
	// Bytes 1d 00, then (length - 65536) as 3 little-endian bytes.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $16, CX
	MOVB CL, 4(AX)
	ADDQ $5, AX
	ADDQ $5, BX
	JMP  repeat_done

repeat_two_len_bytes:
	// Bytes 19 00, then (length - 256) as 2 little-endian bytes.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $4, AX
	ADDQ $4, BX
	JMP  repeat_done

repeat_one_len_byte:
	// Bytes 15 00, then (length - 4) as a single byte.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $3, AX
	ADDQ $3, BX
	JMP  repeat_done

repeat_len_in_tag:
	// 2-byte form without offset: 16-bit store of length<<2 | 1.
	SHLL $2, DX
	ORL  $1, DX
	MOVW DX, (AX)
	ADDQ $2, AX
	ADDQ $2, BX
	JMP  repeat_done

repeat_len_and_offset:
	// 2-byte form with offset: byte 0 = 1 + length*4 + (offset>>8)<<5,
	// byte 1 = low 8 bits of offset.
	XORL SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $8, CX
	SHLL $5, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $2, AX
	ADDQ $2, BX

repeat_done:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopy(dst []byte, offset int, length int) int
//
// Writes encoded copy op(s) for (offset, length) starting at dst and returns
// the number of bytes written. When length > 64 a fixed-size first op is
// written and the remainder is encoded by an inlined copy of the emitRepeat
// logic (one copy per offset-size path).
//
// Register roles:
//   AX = write cursor into dst, advanced past every op written
//   BX = running count of bytes written; stored to ret on exit
//   CX = offset (only the low 32 bits are examined)
//   DX = length still to encode (only the low 32 bits are examined)
//   SI = scratch
//
// dst_len and dst_cap are never read, so dst is written without any bounds
// check. All comparisons use signed condition codes on 32-bit values.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	// offset < 65536 is handled by the 2-byte-offset path further down.
	CMPL CX, $0x00010000
	JL   two_byte_offset_standalone

	// offset >= 65536. If length > 64, write a 5-byte op (0xff followed by the
	// 32-bit little-endian offset) and reduce length by 64.
four_bytes_loop_back_standalone:
	CMPL DX, $0x40
	JLE  four_bytes_remain_standalone
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	// Fewer than 4 left: finish with one more 5-byte op (or nothing if 0 left).
	CMPL DX, $0x04
	JL   four_bytes_remain_standalone

	// emitRepeat
	// Inlined repeat emitter for the remaining length in DX (>= 4 here).
	// SI keeps that length; DX becomes length-4 (LEAL does not touch flags).
emit_repeat_again_standalone_emit_copy:
	MOVL DX, SI
	LEAL -4(DX), DX
	// length <= 8: 2-byte form without offset bits.
	CMPL SI, $0x08
	JLE  repeat_two_standalone_emit_copy
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_standalone_emit_copy
	// This path is only entered with CX >= 0x10000 and CX is unchanged up to
	// here, so the branch below is never taken in this copy of the code.
	CMPL CX, $0x00000800
	JLT  repeat_two_offset_standalone_emit_copy

	// Select the op size from DX = length-4:
	//   DX < 0x104      -> 3 bytes
	//   DX < 0x10100    -> 4 bytes
	//   DX < 0x0100ffff -> 5 bytes
	//   otherwise       -> emit a fixed 5-byte op and loop for the remainder
cant_repeat_two_offset_standalone_emit_copy:
	CMPL DX, $0x00000104
	JLT  repeat_three_standalone_emit_copy
	CMPL DX, $0x00010100
	JLT  repeat_four_standalone_emit_copy
	CMPL DX, $0x0100ffff
	JLT  repeat_five_standalone_emit_copy
	// DX -= 0x0100fffb; the result is the length handled by the next pass.
	LEAL -16842747(DX), DX
	// Fixed 5-byte op: 0x1d, 0x00, then 0xfffffb as a 24-bit little-endian value.
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy

	// 5-byte op: 0x1d, 0x00, then (length-4-65536) as 24-bit little-endian.
	// CX (offset) is overwritten here; it is not needed again on this path.
repeat_five_standalone_emit_copy:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

	// 4-byte op: 0x19, 0x00, then (length-4-256) as 16-bit little-endian.
repeat_four_standalone_emit_copy:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

	// 3-byte op: 0x15, 0x00, then the low byte of (length-8).
repeat_three_standalone_emit_copy:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

	// 2-byte op: 16-bit store of ((length-4)<<2 | 1). For 4 <= length <= 8 the
	// value fits in one byte, so the second byte written is 0.
repeat_two_standalone_emit_copy:
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

	// 2-byte op with offset bits:
	//   byte 1 = low 8 bits of offset
	//   byte 0 = low byte of ((offset>>8)<<5 | (length-4)<<2 | 1)
repeat_two_offset_standalone_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end
	// NOTE(review): the JMP below is unreachable (it follows an unconditional
	// JMP and has no label), and it is the only reference to
	// four_bytes_loop_back_standalone in this function. Generator artifact.
	JMP four_bytes_loop_back_standalone

	// offset >= 65536 with at most 64 bytes left (or fewer than 4 after the op
	// above). Nothing is written when 0 remain; otherwise write a 5-byte op:
	// low byte of ((length-1)<<2 | 3) followed by the 32-bit little-endian offset.
	// MOVB sets only the low byte of SI; that is sufficient because only DL is
	// stored afterwards.
four_bytes_remain_standalone:
	TESTL DX, DX
	JZ    gen_emit_copy_end
	MOVB  $0x03, SI
	LEAL  -4(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   gen_emit_copy_end

	// offset < 65536. If length > 64, write a 3-byte op (0xee followed by the
	// 16-bit little-endian offset), reduce length by 60, and encode the rest
	// with the inlined repeat emitter below (at least 5 remain at that point).
two_byte_offset_standalone:
	CMPL DX, $0x40
	JLE  two_byte_offset_short_standalone
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX

	// emitRepeat
	// Inlined repeat emitter for the remaining length in DX.
	// SI keeps that length; DX becomes length-4 (LEAL does not touch flags).
emit_repeat_again_standalone_emit_copy_short:
	MOVL DX, SI
	LEAL -4(DX), DX
	// length <= 8: 2-byte form without offset bits.
	CMPL SI, $0x08
	JLE  repeat_two_standalone_emit_copy_short
	// length >= 12: the 2-byte form that carries offset bits is not used.
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_standalone_emit_copy_short
	// 9 <= length <= 11 and offset < 2048: 2-byte form with offset bits.
	CMPL CX, $0x00000800
	JLT  repeat_two_offset_standalone_emit_copy_short

	// Select the op size from DX = length-4:
	//   DX < 0x104      -> 3 bytes
	//   DX < 0x10100    -> 4 bytes
	//   DX < 0x0100ffff -> 5 bytes
	//   otherwise       -> emit a fixed 5-byte op and loop for the remainder
cant_repeat_two_offset_standalone_emit_copy_short:
	CMPL DX, $0x00000104
	JLT  repeat_three_standalone_emit_copy_short
	CMPL DX, $0x00010100
	JLT  repeat_four_standalone_emit_copy_short
	CMPL DX, $0x0100ffff
	JLT  repeat_five_standalone_emit_copy_short
	// DX -= 0x0100fffb; the result is the length handled by the next pass.
	LEAL -16842747(DX), DX
	// Fixed 5-byte op: 0x1d, 0x00, then 0xfffffb as a 24-bit little-endian value.
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy_short

	// 5-byte op: 0x1d, 0x00, then (length-4-65536) as 24-bit little-endian.
	// CX (offset) is overwritten here; it is not needed again on this path.
repeat_five_standalone_emit_copy_short:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

	// 4-byte op: 0x19, 0x00, then (length-4-256) as 16-bit little-endian.
repeat_four_standalone_emit_copy_short:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

	// 3-byte op: 0x15, 0x00, then the low byte of (length-8).
repeat_three_standalone_emit_copy_short:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

	// 2-byte op: 16-bit store of ((length-4)<<2 | 1). For 4 <= length <= 8 the
	// value fits in one byte, so the second byte written is 0.
repeat_two_standalone_emit_copy_short:
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

	// 2-byte op with offset bits:
	//   byte 1 = low 8 bits of offset
	//   byte 0 = low byte of ((offset>>8)<<5 | (length-4)<<2 | 1)
repeat_two_offset_standalone_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end
	// NOTE(review): the JMP below is unreachable (it follows an unconditional
	// JMP and has no label). Generator artifact.
	JMP two_byte_offset_standalone

	// offset < 65536 and length <= 64. Use the 2-byte op only when
	// length < 12 and offset < 2048; otherwise use the 3-byte op.
two_byte_offset_short_standalone:
	CMPL DX, $0x0c
	JGE  emit_copy_three_standalone
	CMPL CX, $0x00000800
	JGE  emit_copy_three_standalone
	// 2-byte op:
	//   byte 1 = low 8 bits of offset
	//   byte 0 = low byte of ((offset>>8)<<5 | (length-4)<<2 | 1)
	// MOVB sets only the low byte of SI; only DL is stored afterwards.
	MOVB $0x01, SI
	LEAL -16(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

	// 3-byte op: low byte of ((length-1)<<2 | 2) followed by the 16-bit
	// little-endian offset. Falls through to the common exit.
emit_copy_three_standalone:
	MOVB $0x02, SI
	LEAL -4(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

	// Common exit: return the byte count accumulated in BX.
gen_emit_copy_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// Writes encoded copy op(s) for (offset, length) starting at dst and returns
// the number of bytes written. Unlike emitCopy, lengths above 64 are handled
// by looping and writing further copy ops; no repeat ops are produced.
//
// Register roles:
//   AX = write cursor into dst, advanced past every op written
//   BX = running count of bytes written; stored to ret on exit
//   CX = offset (only the low 32 bits are examined)
//   DX = length still to encode (only the low 32 bits are examined)
//   SI = scratch
//
// dst_len and dst_cap are never read, so dst is written without any bounds
// check. All comparisons use signed condition codes on 32-bit values.
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	// offset < 65536 is handled by the 2-byte-offset path further down.
	CMPL CX, $0x00010000
	JL   two_byte_offset_standalone_snappy

	// offset >= 65536. While length > 64, write a 5-byte op (0xff followed by
	// the 32-bit little-endian offset) and reduce length by 64.
four_bytes_loop_back_standalone_snappy:
	CMPL DX, $0x40
	JLE  four_bytes_remain_standalone_snappy
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	// Fewer than 4 left: go straight to the final op; otherwise re-test the loop.
	CMPL DX, $0x04
	JL   four_bytes_remain_standalone_snappy
	JMP  four_bytes_loop_back_standalone_snappy

	// Final op for offset >= 65536. Nothing is written when 0 remain; otherwise
	// write a 5-byte op: low byte of ((length-1)<<2 | 3) followed by the 32-bit
	// little-endian offset.
	// MOVB sets only the low byte of SI; that is sufficient because only DL is
	// stored afterwards.
four_bytes_remain_standalone_snappy:
	TESTL DX, DX
	JZ    gen_emit_copy_end_snappy
	MOVB  $0x03, SI
	LEAL  -4(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   gen_emit_copy_end_snappy

	// offset < 65536. While length > 64, write a 3-byte op (0xee followed by the
	// 16-bit little-endian offset) and reduce length by 60, so at least 5 remain
	// when the loop exits.
two_byte_offset_standalone_snappy:
	CMPL DX, $0x40
	JLE  two_byte_offset_short_standalone_snappy
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	JMP  two_byte_offset_standalone_snappy

	// offset < 65536 and length <= 64. Use the 2-byte op only when
	// length < 12 and offset < 2048; otherwise use the 3-byte op.
two_byte_offset_short_standalone_snappy:
	CMPL DX, $0x0c
	JGE  emit_copy_three_standalone_snappy
	CMPL CX, $0x00000800
	JGE  emit_copy_three_standalone_snappy
	// 2-byte op:
	//   byte 1 = low 8 bits of offset
	//   byte 0 = low byte of ((offset>>8)<<5 | (length-4)<<2 | 1)
	// MOVB sets only the low byte of SI; only DL is stored afterwards.
	MOVB $0x01, SI
	LEAL -16(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end_snappy

	// 3-byte op: low byte of ((length-1)<<2 | 2) followed by the 16-bit
	// little-endian offset. Falls through to the common exit.
emit_copy_three_standalone_snappy:
	MOVB $0x02, SI
	LEAL -4(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

	// Common exit: return the byte count accumulated in BX.
gen_emit_copy_end_snappy:
	MOVQ BX, ret+40(FP)
	RET

// func matchLen(a []byte, b []byte) int
//
// Returns the number of leading bytes at which a and b are equal, examining at
// most len(a) bytes.
//
// Register roles:
//   AX = base pointer of a
//   CX = base pointer of b
//   DX = bytes of a still to compare (only the low 32 bits are used)
//   SI = number of equal bytes found so far; also the index into both slices
//   BX = scratch
//
// b_len is never read and b is indexed exactly like a, so the caller
// presumably guarantees len(b) >= len(a) - TODO confirm at the call sites.
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ b_base+24(FP), CX
	MOVQ a_len+8(FP), DX

	// matchLen
	// Start with zero matched bytes; inputs shorter than 8 bytes (signed
	// compare) skip the 8-byte loop.
	XORL SI, SI
	CMPL DX, $0x08
	JL   matchlen_single_standalone

	// Compare 8 bytes at a time: XOR of the two 8-byte loads is zero exactly
	// when all eight bytes are equal.
matchlen_loopback_standalone:
	MOVQ  (AX)(SI*1), BX
	XORQ  (CX)(SI*1), BX
	TESTQ BX, BX
	JZ    matchlen_loop_standalone
	// Mismatch inside this 8-byte group. BSFQ yields the index of the lowest set
	// bit; because the load is little-endian, dividing it by 8 gives the index
	// of the first differing byte, which is added to SI.
	BSFQ  BX, BX
	SARQ  $0x03, BX
	LEAL  (SI)(BX*1), SI
	JMP   gen_match_len_end

	// All eight bytes matched: advance and continue while at least 8 remain.
matchlen_loop_standalone:
	LEAL -8(DX), DX
	LEAL 8(SI), SI
	CMPL DX, $0x08
	JGE  matchlen_loopback_standalone

	// Fewer than 8 bytes remain; finished if none are left.
matchlen_single_standalone:
	TESTL DX, DX
	JZ    gen_match_len_end

	// Byte-at-a-time tail: stop at the first mismatch or when DX reaches zero.
matchlen_single_loopback_standalone:
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE  gen_match_len_end
	LEAL 1(SI), SI
	DECL DX
	JNZ  matchlen_single_loopback_standalone

	// SI is only ever written by 32-bit instructions, so its upper 32 bits are
	// zero when the full 64-bit register is stored as the result.
gen_match_len_end:
	MOVQ SI, ret+48(FP)
	RET