diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2')
-rw-r--r-- | vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go | 4 | ||||
-rw-r--r-- | vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s | 1873 |
2 files changed, 1450 insertions, 427 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go index c8cf7b69..d9312e5b 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -1,7 +1,7 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. -//go:build !appengine && !noasm && gc -// +build !appengine,!noasm,gc +//go:build !appengine && !noasm && gc && !noasm +// +build !appengine,!noasm,gc,!noasm package s2 diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s index 1ac65a0e..729dbf53 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -1,13 +1,12 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. -// +build !appengine -// +build !noasm -// +build gc +//go:build !appengine && !noasm && gc && !noasm +// +build !appengine,!noasm,gc,!noasm #include "textflag.h" // func encodeBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -243,35 +242,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm + JL matchlen_match4_repeat_extend_encodeBlockAsm matchlen_loopback_repeat_extend_encodeBlockAsm: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm matchlen_loop_repeat_extend_encodeBlockAsm: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm - -matchlen_single_repeat_extend_encodeBlockAsm: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm - -matchlen_single_loopback_repeat_extend_encodeBlockAsm: + JZ repeat_extend_forward_end_encodeBlockAsm + +matchlen_match4_repeat_extend_encodeBlockAsm: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm repeat_extend_forward_end_encodeBlockAsm: ADDL R12, CX @@ -748,35 +780,68 @@ match_nolit_loop_encodeBlockAsm: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm + JL matchlen_match4_match_nolit_encodeBlockAsm matchlen_loopback_match_nolit_encodeBlockAsm: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm matchlen_loop_match_nolit_encodeBlockAsm: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm - -matchlen_single_match_nolit_encodeBlockAsm: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm - -matchlen_single_loopback_match_nolit_encodeBlockAsm: + JZ match_nolit_end_encodeBlockAsm + +matchlen_match4_match_nolit_encodeBlockAsm: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm match_nolit_end_encodeBlockAsm: ADDL R10, CX @@ -1162,7 +1227,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm: RET // func encodeBlockAsm4MB(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm4MB(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -1390,35 +1455,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm4MB: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm4MB + JL matchlen_match4_repeat_extend_encodeBlockAsm4MB matchlen_loopback_repeat_extend_encodeBlockAsm4MB: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm4MB matchlen_loop_repeat_extend_encodeBlockAsm4MB: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB - -matchlen_single_repeat_extend_encodeBlockAsm4MB: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB: + JZ repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_match4_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm4MB + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm4MB + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm4MB MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm4MB LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB repeat_extend_forward_end_encodeBlockAsm4MB: ADDL R12, CX @@ -1854,35 +1952,68 @@ match_nolit_loop_encodeBlockAsm4MB: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm4MB + JL matchlen_match4_match_nolit_encodeBlockAsm4MB matchlen_loopback_match_nolit_encodeBlockAsm4MB: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm4MB - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm4MB matchlen_loop_match_nolit_encodeBlockAsm4MB: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB - -matchlen_single_match_nolit_encodeBlockAsm4MB: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm4MB - -matchlen_single_loopback_match_nolit_encodeBlockAsm4MB: + JZ match_nolit_end_encodeBlockAsm4MB + +matchlen_match4_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm4MB + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm4MB + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm4MB + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm4MB + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm4MB MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm4MB LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB match_nolit_end_encodeBlockAsm4MB: ADDL R10, CX @@ -2238,7 +2369,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm4MB: RET // func encodeBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000080, CX @@ -2455,35 +2586,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm12B: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm12B + JL matchlen_match4_repeat_extend_encodeBlockAsm12B matchlen_loopback_repeat_extend_encodeBlockAsm12B: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm12B matchlen_loop_repeat_extend_encodeBlockAsm12B: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B - -matchlen_single_repeat_extend_encodeBlockAsm12B: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: + JZ repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_match4_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm12B + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm12B + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm12B + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm12B + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm12B MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm12B LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B repeat_extend_forward_end_encodeBlockAsm12B: ADDL R12, CX @@ -2804,35 +2968,68 @@ match_nolit_loop_encodeBlockAsm12B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm12B + JL matchlen_match4_match_nolit_encodeBlockAsm12B matchlen_loopback_match_nolit_encodeBlockAsm12B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm12B matchlen_loop_match_nolit_encodeBlockAsm12B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm12B - -matchlen_single_match_nolit_encodeBlockAsm12B: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeBlockAsm12B: + JZ match_nolit_end_encodeBlockAsm12B + +matchlen_match4_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm12B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm12B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm12B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm12B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm12B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm12B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B match_nolit_end_encodeBlockAsm12B: ADDL R10, CX @@ -3085,7 +3282,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm12B: RET // func encodeBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000020, CX @@ -3302,35 +3499,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm10B: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm10B + JL matchlen_match4_repeat_extend_encodeBlockAsm10B matchlen_loopback_repeat_extend_encodeBlockAsm10B: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm10B matchlen_loop_repeat_extend_encodeBlockAsm10B: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B - -matchlen_single_repeat_extend_encodeBlockAsm10B: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: + JZ repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_match4_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm10B + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm10B + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm10B + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm10B + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm10B MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm10B LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B repeat_extend_forward_end_encodeBlockAsm10B: ADDL R12, CX @@ -3651,35 +3881,68 @@ match_nolit_loop_encodeBlockAsm10B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm10B + JL matchlen_match4_match_nolit_encodeBlockAsm10B matchlen_loopback_match_nolit_encodeBlockAsm10B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm10B matchlen_loop_match_nolit_encodeBlockAsm10B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm10B - -matchlen_single_match_nolit_encodeBlockAsm10B: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeBlockAsm10B: + JZ match_nolit_end_encodeBlockAsm10B + +matchlen_match4_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm10B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm10B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm10B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm10B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm10B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm10B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B match_nolit_end_encodeBlockAsm10B: ADDL R10, CX @@ -3932,7 +4195,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm10B: RET // func encodeBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm8B(SB), $1048-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000008, CX @@ -4149,35 +4412,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm8B: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm8B + JL matchlen_match4_repeat_extend_encodeBlockAsm8B matchlen_loopback_repeat_extend_encodeBlockAsm8B: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm8B matchlen_loop_repeat_extend_encodeBlockAsm8B: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B - -matchlen_single_repeat_extend_encodeBlockAsm8B: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: + JZ repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_match4_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm8B + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm8B + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm8B + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm8B + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm8B MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm8B LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B repeat_extend_forward_end_encodeBlockAsm8B: ADDL R12, CX @@ -4488,35 +4784,68 @@ match_nolit_loop_encodeBlockAsm8B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm8B + JL matchlen_match4_match_nolit_encodeBlockAsm8B matchlen_loopback_match_nolit_encodeBlockAsm8B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm8B matchlen_loop_match_nolit_encodeBlockAsm8B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm8B - -matchlen_single_match_nolit_encodeBlockAsm8B: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeBlockAsm8B: + JZ match_nolit_end_encodeBlockAsm8B + +matchlen_match4_match_nolit_encodeBlockAsm8B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm8B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm8B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm8B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm8B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm8B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm8B: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm8B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm8B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B match_nolit_end_encodeBlockAsm8B: ADDL R10, CX @@ -4763,7 +5092,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B: RET // func encodeBetterBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX @@ -4885,35 +5214,68 @@ match_dst_size_check_encodeBetterBlockAsm: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm + JL matchlen_match4_match_nolit_encodeBetterBlockAsm matchlen_loopback_match_nolit_encodeBetterBlockAsm: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm matchlen_loop_match_nolit_encodeBetterBlockAsm: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm - -matchlen_single_match_nolit_encodeBetterBlockAsm: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: + JZ match_nolit_end_encodeBetterBlockAsm + +matchlen_match4_match_nolit_encodeBetterBlockAsm: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm match_nolit_end_encodeBetterBlockAsm: MOVL CX, R8 @@ -5719,7 +6081,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm: RET // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX @@ -5841,35 +6203,68 @@ match_dst_size_check_encodeBetterBlockAsm4MB: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB + JL matchlen_match4_match_nolit_encodeBetterBlockAsm4MB matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm4MB matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB - -matchlen_single_match_nolit_encodeBetterBlockAsm4MB: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB: + JZ match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm4MB + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm4MB: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm4MB + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm4MB: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm4MB MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm4MB LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB match_nolit_end_encodeBetterBlockAsm4MB: MOVL CX, R8 @@ -6618,7 +7013,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: RET // func encodeBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000280, CX @@ -6732,35 +7127,68 @@ match_dst_size_check_encodeBetterBlockAsm12B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm12B + JL matchlen_match4_match_nolit_encodeBetterBlockAsm12B matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm12B matchlen_loop_match_nolit_encodeBetterBlockAsm12B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B - -matchlen_single_match_nolit_encodeBetterBlockAsm12B: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: + JZ match_nolit_end_encodeBetterBlockAsm12B + +matchlen_match4_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm12B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm12B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm12B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm12B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B match_nolit_end_encodeBetterBlockAsm12B: MOVL CX, R8 @@ -7363,7 +7791,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: RET // func encodeBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 MOVQ dst_base+0(FP), AX MOVQ $0x000000a0, CX @@ -7477,35 +7905,68 @@ match_dst_size_check_encodeBetterBlockAsm10B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm10B + JL matchlen_match4_match_nolit_encodeBetterBlockAsm10B matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm10B matchlen_loop_match_nolit_encodeBetterBlockAsm10B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B - -matchlen_single_match_nolit_encodeBetterBlockAsm10B: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: + JZ match_nolit_end_encodeBetterBlockAsm10B + +matchlen_match4_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm10B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm10B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm10B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm10B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B match_nolit_end_encodeBetterBlockAsm10B: MOVL CX, R8 @@ -8108,7 +8569,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: RET // func encodeBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000028, CX @@ -8222,35 +8683,68 @@ match_dst_size_check_encodeBetterBlockAsm8B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm8B + JL matchlen_match4_match_nolit_encodeBetterBlockAsm8B matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm8B matchlen_loop_match_nolit_encodeBetterBlockAsm8B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B - -matchlen_single_match_nolit_encodeBetterBlockAsm8B: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: + JZ match_nolit_end_encodeBetterBlockAsm8B + +matchlen_match4_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm8B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm8B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm8B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm8B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B match_nolit_end_encodeBetterBlockAsm8B: MOVL CX, R8 @@ -8843,7 +9337,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: RET // func encodeSnappyBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -9079,35 +9573,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_loop_repeat_extend_encodeSnappyBlockAsm: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm - -matchlen_single_repeat_extend_encodeSnappyBlockAsm: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm repeat_extend_forward_end_encodeSnappyBlockAsm: ADDL R11, CX @@ -9380,35 +9907,68 @@ match_nolit_loop_encodeSnappyBlockAsm: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm + JL matchlen_match4_match_nolit_encodeSnappyBlockAsm matchlen_loopback_match_nolit_encodeSnappyBlockAsm: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm matchlen_loop_match_nolit_encodeSnappyBlockAsm: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm - -matchlen_single_match_nolit_encodeSnappyBlockAsm: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: + JZ match_nolit_end_encodeSnappyBlockAsm + +matchlen_match4_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm match_nolit_end_encodeSnappyBlockAsm: ADDL R10, CX @@ -9660,7 +10220,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm: RET // func encodeSnappyBlockAsm64K(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -9877,35 +10437,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K - -matchlen_single_repeat_extend_encodeSnappyBlockAsm64K: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm64K MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K repeat_extend_forward_end_encodeSnappyBlockAsm64K: ADDL R11, CX @@ -10135,35 +10728,68 @@ match_nolit_loop_encodeSnappyBlockAsm64K: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K + JL matchlen_match4_match_nolit_encodeSnappyBlockAsm64K matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm64K + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm64K matchlen_loop_match_nolit_encodeSnappyBlockAsm64K: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K - -matchlen_single_match_nolit_encodeSnappyBlockAsm64K: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K: + JZ match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm64K + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm64K + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm64K MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm64K LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K match_nolit_end_encodeSnappyBlockAsm64K: ADDL R10, CX @@ -10372,7 +10998,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: RET // func encodeSnappyBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000080, CX @@ -10589,35 +11215,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B - -matchlen_single_repeat_extend_encodeSnappyBlockAsm12B: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm12B MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B repeat_extend_forward_end_encodeSnappyBlockAsm12B: ADDL R11, CX @@ -10847,35 +11506,68 @@ match_nolit_loop_encodeSnappyBlockAsm12B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B + JL matchlen_match4_match_nolit_encodeSnappyBlockAsm12B matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B - -matchlen_single_match_nolit_encodeSnappyBlockAsm12B: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B: + JZ match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm12B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm12B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm12B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm12B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B match_nolit_end_encodeSnappyBlockAsm12B: ADDL R10, CX @@ -11084,7 +11776,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: RET // func encodeSnappyBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000020, CX @@ -11301,35 +11993,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B - -matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm10B MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B repeat_extend_forward_end_encodeSnappyBlockAsm10B: ADDL R11, CX @@ -11559,35 +12284,68 @@ match_nolit_loop_encodeSnappyBlockAsm10B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B + JL matchlen_match4_match_nolit_encodeSnappyBlockAsm10B matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm10B matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B - -matchlen_single_match_nolit_encodeSnappyBlockAsm10B: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B: + JZ match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm10B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm10B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm10B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm10B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B match_nolit_end_encodeSnappyBlockAsm10B: ADDL R10, CX @@ -11796,7 +12554,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: RET // func encodeSnappyBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000008, CX @@ -12013,35 +12771,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B - -matchlen_single_repeat_extend_encodeSnappyBlockAsm8B: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm8B MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B repeat_extend_forward_end_encodeSnappyBlockAsm8B: ADDL R11, CX @@ -12269,35 +13060,68 @@ match_nolit_loop_encodeSnappyBlockAsm8B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B + JL matchlen_match4_match_nolit_encodeSnappyBlockAsm8B matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B - -matchlen_single_match_nolit_encodeSnappyBlockAsm8B: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B: + JZ match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm8B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm8B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm8B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm8B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B match_nolit_end_encodeSnappyBlockAsm8B: ADDL R10, CX @@ -12504,7 +13328,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: RET // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX @@ -12626,35 +13450,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm: + JZ match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x01 + JL match_nolit_end_encodeSnappyBetterBlockAsm MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm match_nolit_end_encodeSnappyBetterBlockAsm: MOVL CX, R8 @@ -13086,7 +13943,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: RET // func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX @@ -13200,35 +14057,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm64K: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm64K - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: + JZ match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x01 + JL match_nolit_end_encodeSnappyBetterBlockAsm64K MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm64K LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K match_nolit_end_encodeSnappyBetterBlockAsm64K: MOVL CX, R8 @@ -13589,7 +14479,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: RET // func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000280, CX @@ -13703,35 +14593,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm12B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: + JZ match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x01 + JL match_nolit_end_encodeSnappyBetterBlockAsm12B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm12B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B match_nolit_end_encodeSnappyBetterBlockAsm12B: MOVL CX, R8 @@ -14092,7 +15015,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: RET // func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 MOVQ dst_base+0(FP), AX MOVQ $0x000000a0, CX @@ -14206,35 +15129,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm10B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: + JZ match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x01 + JL match_nolit_end_encodeSnappyBetterBlockAsm10B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm10B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B match_nolit_end_encodeSnappyBetterBlockAsm10B: MOVL CX, R8 @@ -14595,7 +15551,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: RET // func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000028, CX @@ -14709,35 +15665,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm8B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: + JZ match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x01 + JL match_nolit_end_encodeSnappyBetterBlockAsm8B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm8B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B match_nolit_end_encodeSnappyBetterBlockAsm8B: MOVL CX, R8 @@ -15635,6 +16624,7 @@ gen_emit_copy_end_snappy: RET // func matchLen(a []byte, b []byte) int +// Requires: BMI TEXT ·matchLen(SB), NOSPLIT, $0-56 MOVQ a_base+0(FP), AX MOVQ b_base+24(FP), CX @@ -15643,35 +16633,68 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56 // matchLen XORL SI, SI CMPL DX, $0x08 - JL matchlen_single_standalone + JL matchlen_match4_standalone matchlen_loopback_standalone: MOVQ (AX)(SI*1), BX XORQ (CX)(SI*1), BX TESTQ BX, BX JZ matchlen_loop_standalone - BSFQ BX, BX - SARQ $0x03, BX - LEAL (SI)(BX*1), SI - JMP gen_match_len_end + +#ifdef GOAMD64_v3 + TZCNTQ BX, BX + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef GOAMD64_v4 + TZCNTQ BX, BX + +#define TZCNTQ_EMITTED 1 +#endif + +#ifdef TZCNTQ_EMITTED +#undef TZCNTQ_EMITTED +#else + BSFQ BX, BX + +#endif + SARQ $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end matchlen_loop_standalone: LEAL -8(DX), DX LEAL 8(SI), SI CMPL DX, $0x08 JGE matchlen_loopback_standalone + JZ gen_match_len_end -matchlen_single_standalone: - TESTL DX, DX - JZ gen_match_len_end - -matchlen_single_loopback_standalone: +matchlen_match4_standalone: + CMPL DX, $0x04 + JL matchlen_match2_standalone + MOVL (AX)(SI*1), BX + CMPL (CX)(SI*1), BX + JNE matchlen_match2_standalone + SUBL $0x04, DX + LEAL 4(SI), SI + +matchlen_match2_standalone: + CMPL DX, $0x02 + JL matchlen_match1_standalone + MOVW (AX)(SI*1), BX + CMPW (CX)(SI*1), BX + JNE matchlen_match1_standalone + SUBL $0x02, DX + LEAL 2(SI), SI + +matchlen_match1_standalone: + CMPL DX, $0x01 + JL gen_match_len_end MOVB (AX)(SI*1), BL CMPB (CX)(SI*1), BL JNE gen_match_len_end LEAL 1(SI), SI - DECL DX - JNZ matchlen_single_loopback_standalone gen_match_len_end: MOVQ SI, ret+48(FP) |