summary | refs | log | tree | commit | diff | stats
path: root/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
diff options
context:
space:
mode:
author: Wim <wim@42.be> 2022-04-12 00:30:21 +0200
committer: GitHub <noreply@github.com> 2022-04-12 00:30:21 +0200
commit: 281ef53e7de5d30114dbf57a4b506b2d8d2720cc (patch)
tree: 5fe13b85ffe312053452e0d7107ca4b174a412e0 /vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
parent: f044b948e257814e8e1f70d4b66821bfd9c2ff06 (diff)
download: matterbridge-msglm-281ef53e7de5d30114dbf57a4b506b2d8d2720cc.tar.gz
matterbridge-msglm-281ef53e7de5d30114dbf57a4b506b2d8d2720cc.tar.bz2
matterbridge-msglm-281ef53e7de5d30114dbf57a4b506b2d8d2720cc.zip
Update dependencies (#1800)
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s')
-rw-r--r-- vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s 1873
1 files changed, 1448 insertions, 425 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
index 1ac65a0e..729dbf53 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -1,13 +1,12 @@
// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
-// +build !appengine
-// +build !noasm
-// +build gc
+//go:build !appengine && !noasm && gc && !noasm
+// +build !appengine,!noasm,gc,!noasm
#include "textflag.h"
// func encodeBlockAsm(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBlockAsm(SB), $65560-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000200, CX
@@ -243,35 +242,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm:
// matchLen
XORL R12, R12
CMPL R9, $0x08
- JL matchlen_single_repeat_extend_encodeBlockAsm
+ JL matchlen_match4_repeat_extend_encodeBlockAsm
matchlen_loopback_repeat_extend_encodeBlockAsm:
MOVQ (R10)(R12*1), R11
XORQ (SI)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_repeat_extend_encodeBlockAsm
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP repeat_extend_forward_end_encodeBlockAsm
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm
matchlen_loop_repeat_extend_encodeBlockAsm:
LEAL -8(R9), R9
LEAL 8(R12), R12
CMPL R9, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm
-
-matchlen_single_repeat_extend_encodeBlockAsm:
- TESTL R9, R9
- JZ repeat_extend_forward_end_encodeBlockAsm
-
-matchlen_single_loopback_repeat_extend_encodeBlockAsm:
+ JZ repeat_extend_forward_end_encodeBlockAsm
+
+matchlen_match4_repeat_extend_encodeBlockAsm:
+ CMPL R9, $0x04
+ JL matchlen_match2_repeat_extend_encodeBlockAsm
+ MOVL (R10)(R12*1), R11
+ CMPL (SI)(R12*1), R11
+ JNE matchlen_match2_repeat_extend_encodeBlockAsm
+ SUBL $0x04, R9
+ LEAL 4(R12), R12
+
+matchlen_match2_repeat_extend_encodeBlockAsm:
+ CMPL R9, $0x02
+ JL matchlen_match1_repeat_extend_encodeBlockAsm
+ MOVW (R10)(R12*1), R11
+ CMPW (SI)(R12*1), R11
+ JNE matchlen_match1_repeat_extend_encodeBlockAsm
+ SUBL $0x02, R9
+ LEAL 2(R12), R12
+
+matchlen_match1_repeat_extend_encodeBlockAsm:
+ CMPL R9, $0x01
+ JL repeat_extend_forward_end_encodeBlockAsm
MOVB (R10)(R12*1), R11
CMPB (SI)(R12*1), R11
JNE repeat_extend_forward_end_encodeBlockAsm
LEAL 1(R12), R12
- DECL R9
- JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm
repeat_extend_forward_end_encodeBlockAsm:
ADDL R12, CX
@@ -748,35 +780,68 @@ match_nolit_loop_encodeBlockAsm:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeBlockAsm
+ JL matchlen_match4_match_nolit_encodeBlockAsm
matchlen_loopback_match_nolit_encodeBlockAsm:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeBlockAsm
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeBlockAsm
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm
matchlen_loop_match_nolit_encodeBlockAsm:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm
-
-matchlen_single_match_nolit_encodeBlockAsm:
- TESTL DI, DI
- JZ match_nolit_end_encodeBlockAsm
-
-matchlen_single_loopback_match_nolit_encodeBlockAsm:
+ JZ match_nolit_end_encodeBlockAsm
+
+matchlen_match4_match_nolit_encodeBlockAsm:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeBlockAsm
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeBlockAsm
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeBlockAsm:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeBlockAsm
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeBlockAsm
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeBlockAsm:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeBlockAsm
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeBlockAsm
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm
match_nolit_end_encodeBlockAsm:
ADDL R10, CX
@@ -1162,7 +1227,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm:
RET
// func encodeBlockAsm4MB(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBlockAsm4MB(SB), $65560-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000200, CX
@@ -1390,35 +1455,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm4MB:
// matchLen
XORL R12, R12
CMPL R9, $0x08
- JL matchlen_single_repeat_extend_encodeBlockAsm4MB
+ JL matchlen_match4_repeat_extend_encodeBlockAsm4MB
matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
MOVQ (R10)(R12*1), R11
XORQ (SI)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP repeat_extend_forward_end_encodeBlockAsm4MB
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm4MB
matchlen_loop_repeat_extend_encodeBlockAsm4MB:
LEAL -8(R9), R9
LEAL 8(R12), R12
CMPL R9, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB
-
-matchlen_single_repeat_extend_encodeBlockAsm4MB:
- TESTL R9, R9
- JZ repeat_extend_forward_end_encodeBlockAsm4MB
-
-matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB:
+ JZ repeat_extend_forward_end_encodeBlockAsm4MB
+
+matchlen_match4_repeat_extend_encodeBlockAsm4MB:
+ CMPL R9, $0x04
+ JL matchlen_match2_repeat_extend_encodeBlockAsm4MB
+ MOVL (R10)(R12*1), R11
+ CMPL (SI)(R12*1), R11
+ JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB
+ SUBL $0x04, R9
+ LEAL 4(R12), R12
+
+matchlen_match2_repeat_extend_encodeBlockAsm4MB:
+ CMPL R9, $0x02
+ JL matchlen_match1_repeat_extend_encodeBlockAsm4MB
+ MOVW (R10)(R12*1), R11
+ CMPW (SI)(R12*1), R11
+ JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB
+ SUBL $0x02, R9
+ LEAL 2(R12), R12
+
+matchlen_match1_repeat_extend_encodeBlockAsm4MB:
+ CMPL R9, $0x01
+ JL repeat_extend_forward_end_encodeBlockAsm4MB
MOVB (R10)(R12*1), R11
CMPB (SI)(R12*1), R11
JNE repeat_extend_forward_end_encodeBlockAsm4MB
LEAL 1(R12), R12
- DECL R9
- JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB
repeat_extend_forward_end_encodeBlockAsm4MB:
ADDL R12, CX
@@ -1854,35 +1952,68 @@ match_nolit_loop_encodeBlockAsm4MB:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeBlockAsm4MB
+ JL matchlen_match4_match_nolit_encodeBlockAsm4MB
matchlen_loopback_match_nolit_encodeBlockAsm4MB:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeBlockAsm4MB
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeBlockAsm4MB
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm4MB
matchlen_loop_match_nolit_encodeBlockAsm4MB:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB
-
-matchlen_single_match_nolit_encodeBlockAsm4MB:
- TESTL DI, DI
- JZ match_nolit_end_encodeBlockAsm4MB
-
-matchlen_single_loopback_match_nolit_encodeBlockAsm4MB:
+ JZ match_nolit_end_encodeBlockAsm4MB
+
+matchlen_match4_match_nolit_encodeBlockAsm4MB:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeBlockAsm4MB
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeBlockAsm4MB
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeBlockAsm4MB:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeBlockAsm4MB
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeBlockAsm4MB
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeBlockAsm4MB:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeBlockAsm4MB
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeBlockAsm4MB
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB
match_nolit_end_encodeBlockAsm4MB:
ADDL R10, CX
@@ -2238,7 +2369,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm4MB:
RET
// func encodeBlockAsm12B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBlockAsm12B(SB), $16408-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000080, CX
@@ -2455,35 +2586,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm12B:
// matchLen
XORL R12, R12
CMPL R9, $0x08
- JL matchlen_single_repeat_extend_encodeBlockAsm12B
+ JL matchlen_match4_repeat_extend_encodeBlockAsm12B
matchlen_loopback_repeat_extend_encodeBlockAsm12B:
MOVQ (R10)(R12*1), R11
XORQ (SI)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_repeat_extend_encodeBlockAsm12B
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP repeat_extend_forward_end_encodeBlockAsm12B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm12B
matchlen_loop_repeat_extend_encodeBlockAsm12B:
LEAL -8(R9), R9
LEAL 8(R12), R12
CMPL R9, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B
-
-matchlen_single_repeat_extend_encodeBlockAsm12B:
- TESTL R9, R9
- JZ repeat_extend_forward_end_encodeBlockAsm12B
-
-matchlen_single_loopback_repeat_extend_encodeBlockAsm12B:
+ JZ repeat_extend_forward_end_encodeBlockAsm12B
+
+matchlen_match4_repeat_extend_encodeBlockAsm12B:
+ CMPL R9, $0x04
+ JL matchlen_match2_repeat_extend_encodeBlockAsm12B
+ MOVL (R10)(R12*1), R11
+ CMPL (SI)(R12*1), R11
+ JNE matchlen_match2_repeat_extend_encodeBlockAsm12B
+ SUBL $0x04, R9
+ LEAL 4(R12), R12
+
+matchlen_match2_repeat_extend_encodeBlockAsm12B:
+ CMPL R9, $0x02
+ JL matchlen_match1_repeat_extend_encodeBlockAsm12B
+ MOVW (R10)(R12*1), R11
+ CMPW (SI)(R12*1), R11
+ JNE matchlen_match1_repeat_extend_encodeBlockAsm12B
+ SUBL $0x02, R9
+ LEAL 2(R12), R12
+
+matchlen_match1_repeat_extend_encodeBlockAsm12B:
+ CMPL R9, $0x01
+ JL repeat_extend_forward_end_encodeBlockAsm12B
MOVB (R10)(R12*1), R11
CMPB (SI)(R12*1), R11
JNE repeat_extend_forward_end_encodeBlockAsm12B
LEAL 1(R12), R12
- DECL R9
- JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B
repeat_extend_forward_end_encodeBlockAsm12B:
ADDL R12, CX
@@ -2804,35 +2968,68 @@ match_nolit_loop_encodeBlockAsm12B:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeBlockAsm12B
+ JL matchlen_match4_match_nolit_encodeBlockAsm12B
matchlen_loopback_match_nolit_encodeBlockAsm12B:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeBlockAsm12B
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeBlockAsm12B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm12B
matchlen_loop_match_nolit_encodeBlockAsm12B:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm12B
-
-matchlen_single_match_nolit_encodeBlockAsm12B:
- TESTL DI, DI
- JZ match_nolit_end_encodeBlockAsm12B
-
-matchlen_single_loopback_match_nolit_encodeBlockAsm12B:
+ JZ match_nolit_end_encodeBlockAsm12B
+
+matchlen_match4_match_nolit_encodeBlockAsm12B:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeBlockAsm12B
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeBlockAsm12B
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeBlockAsm12B:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeBlockAsm12B
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeBlockAsm12B
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeBlockAsm12B:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeBlockAsm12B
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeBlockAsm12B
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B
match_nolit_end_encodeBlockAsm12B:
ADDL R10, CX
@@ -3085,7 +3282,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm12B:
RET
// func encodeBlockAsm10B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBlockAsm10B(SB), $4120-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000020, CX
@@ -3302,35 +3499,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm10B:
// matchLen
XORL R12, R12
CMPL R9, $0x08
- JL matchlen_single_repeat_extend_encodeBlockAsm10B
+ JL matchlen_match4_repeat_extend_encodeBlockAsm10B
matchlen_loopback_repeat_extend_encodeBlockAsm10B:
MOVQ (R10)(R12*1), R11
XORQ (SI)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_repeat_extend_encodeBlockAsm10B
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP repeat_extend_forward_end_encodeBlockAsm10B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm10B
matchlen_loop_repeat_extend_encodeBlockAsm10B:
LEAL -8(R9), R9
LEAL 8(R12), R12
CMPL R9, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B
-
-matchlen_single_repeat_extend_encodeBlockAsm10B:
- TESTL R9, R9
- JZ repeat_extend_forward_end_encodeBlockAsm10B
-
-matchlen_single_loopback_repeat_extend_encodeBlockAsm10B:
+ JZ repeat_extend_forward_end_encodeBlockAsm10B
+
+matchlen_match4_repeat_extend_encodeBlockAsm10B:
+ CMPL R9, $0x04
+ JL matchlen_match2_repeat_extend_encodeBlockAsm10B
+ MOVL (R10)(R12*1), R11
+ CMPL (SI)(R12*1), R11
+ JNE matchlen_match2_repeat_extend_encodeBlockAsm10B
+ SUBL $0x04, R9
+ LEAL 4(R12), R12
+
+matchlen_match2_repeat_extend_encodeBlockAsm10B:
+ CMPL R9, $0x02
+ JL matchlen_match1_repeat_extend_encodeBlockAsm10B
+ MOVW (R10)(R12*1), R11
+ CMPW (SI)(R12*1), R11
+ JNE matchlen_match1_repeat_extend_encodeBlockAsm10B
+ SUBL $0x02, R9
+ LEAL 2(R12), R12
+
+matchlen_match1_repeat_extend_encodeBlockAsm10B:
+ CMPL R9, $0x01
+ JL repeat_extend_forward_end_encodeBlockAsm10B
MOVB (R10)(R12*1), R11
CMPB (SI)(R12*1), R11
JNE repeat_extend_forward_end_encodeBlockAsm10B
LEAL 1(R12), R12
- DECL R9
- JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B
repeat_extend_forward_end_encodeBlockAsm10B:
ADDL R12, CX
@@ -3651,35 +3881,68 @@ match_nolit_loop_encodeBlockAsm10B:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeBlockAsm10B
+ JL matchlen_match4_match_nolit_encodeBlockAsm10B
matchlen_loopback_match_nolit_encodeBlockAsm10B:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeBlockAsm10B
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeBlockAsm10B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm10B
matchlen_loop_match_nolit_encodeBlockAsm10B:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm10B
-
-matchlen_single_match_nolit_encodeBlockAsm10B:
- TESTL DI, DI
- JZ match_nolit_end_encodeBlockAsm10B
-
-matchlen_single_loopback_match_nolit_encodeBlockAsm10B:
+ JZ match_nolit_end_encodeBlockAsm10B
+
+matchlen_match4_match_nolit_encodeBlockAsm10B:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeBlockAsm10B
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeBlockAsm10B
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeBlockAsm10B:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeBlockAsm10B
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeBlockAsm10B
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeBlockAsm10B:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeBlockAsm10B
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeBlockAsm10B
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B
match_nolit_end_encodeBlockAsm10B:
ADDL R10, CX
@@ -3932,7 +4195,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm10B:
RET
// func encodeBlockAsm8B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBlockAsm8B(SB), $1048-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000008, CX
@@ -4149,35 +4412,68 @@ emit_literal_done_repeat_emit_encodeBlockAsm8B:
// matchLen
XORL R12, R12
CMPL R9, $0x08
- JL matchlen_single_repeat_extend_encodeBlockAsm8B
+ JL matchlen_match4_repeat_extend_encodeBlockAsm8B
matchlen_loopback_repeat_extend_encodeBlockAsm8B:
MOVQ (R10)(R12*1), R11
XORQ (SI)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_repeat_extend_encodeBlockAsm8B
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP repeat_extend_forward_end_encodeBlockAsm8B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP repeat_extend_forward_end_encodeBlockAsm8B
matchlen_loop_repeat_extend_encodeBlockAsm8B:
LEAL -8(R9), R9
LEAL 8(R12), R12
CMPL R9, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B
-
-matchlen_single_repeat_extend_encodeBlockAsm8B:
- TESTL R9, R9
- JZ repeat_extend_forward_end_encodeBlockAsm8B
-
-matchlen_single_loopback_repeat_extend_encodeBlockAsm8B:
+ JZ repeat_extend_forward_end_encodeBlockAsm8B
+
+matchlen_match4_repeat_extend_encodeBlockAsm8B:
+ CMPL R9, $0x04
+ JL matchlen_match2_repeat_extend_encodeBlockAsm8B
+ MOVL (R10)(R12*1), R11
+ CMPL (SI)(R12*1), R11
+ JNE matchlen_match2_repeat_extend_encodeBlockAsm8B
+ SUBL $0x04, R9
+ LEAL 4(R12), R12
+
+matchlen_match2_repeat_extend_encodeBlockAsm8B:
+ CMPL R9, $0x02
+ JL matchlen_match1_repeat_extend_encodeBlockAsm8B
+ MOVW (R10)(R12*1), R11
+ CMPW (SI)(R12*1), R11
+ JNE matchlen_match1_repeat_extend_encodeBlockAsm8B
+ SUBL $0x02, R9
+ LEAL 2(R12), R12
+
+matchlen_match1_repeat_extend_encodeBlockAsm8B:
+ CMPL R9, $0x01
+ JL repeat_extend_forward_end_encodeBlockAsm8B
MOVB (R10)(R12*1), R11
CMPB (SI)(R12*1), R11
JNE repeat_extend_forward_end_encodeBlockAsm8B
LEAL 1(R12), R12
- DECL R9
- JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B
repeat_extend_forward_end_encodeBlockAsm8B:
ADDL R12, CX
@@ -4488,35 +4784,68 @@ match_nolit_loop_encodeBlockAsm8B:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeBlockAsm8B
+ JL matchlen_match4_match_nolit_encodeBlockAsm8B
matchlen_loopback_match_nolit_encodeBlockAsm8B:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeBlockAsm8B
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeBlockAsm8B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeBlockAsm8B
matchlen_loop_match_nolit_encodeBlockAsm8B:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm8B
-
-matchlen_single_match_nolit_encodeBlockAsm8B:
- TESTL DI, DI
- JZ match_nolit_end_encodeBlockAsm8B
-
-matchlen_single_loopback_match_nolit_encodeBlockAsm8B:
+ JZ match_nolit_end_encodeBlockAsm8B
+
+matchlen_match4_match_nolit_encodeBlockAsm8B:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeBlockAsm8B
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeBlockAsm8B
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeBlockAsm8B:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeBlockAsm8B
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeBlockAsm8B
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeBlockAsm8B:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeBlockAsm8B
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeBlockAsm8B
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B
match_nolit_end_encodeBlockAsm8B:
ADDL R10, CX
@@ -4763,7 +5092,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B:
RET
// func encodeBetterBlockAsm(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm(SB), $327704-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000a00, CX
@@ -4885,35 +5214,68 @@ match_dst_size_check_encodeBetterBlockAsm:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeBetterBlockAsm
+ JL matchlen_match4_match_nolit_encodeBetterBlockAsm
matchlen_loopback_match_nolit_encodeBetterBlockAsm:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeBetterBlockAsm
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm
matchlen_loop_match_nolit_encodeBetterBlockAsm:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm
-
-matchlen_single_match_nolit_encodeBetterBlockAsm:
- TESTL R8, R8
- JZ match_nolit_end_encodeBetterBlockAsm
-
-matchlen_single_loopback_match_nolit_encodeBetterBlockAsm:
+ JZ match_nolit_end_encodeBetterBlockAsm
+
+matchlen_match4_match_nolit_encodeBetterBlockAsm:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeBetterBlockAsm
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeBetterBlockAsm:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeBetterBlockAsm
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeBetterBlockAsm:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeBetterBlockAsm
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeBetterBlockAsm
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm
match_nolit_end_encodeBetterBlockAsm:
MOVL CX, R8
@@ -5719,7 +6081,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm:
RET
// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000a00, CX
@@ -5841,35 +6203,68 @@ match_dst_size_check_encodeBetterBlockAsm4MB:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB
+ JL matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeBetterBlockAsm4MB
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm4MB
matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB
-
-matchlen_single_match_nolit_encodeBetterBlockAsm4MB:
- TESTL R8, R8
- JZ match_nolit_end_encodeBetterBlockAsm4MB
-
-matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB:
+ JZ match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeBetterBlockAsm4MB
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeBetterBlockAsm4MB
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB
match_nolit_end_encodeBetterBlockAsm4MB:
MOVL CX, R8
@@ -6618,7 +7013,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
RET
// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000280, CX
@@ -6732,35 +7127,68 @@ match_dst_size_check_encodeBetterBlockAsm12B:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeBetterBlockAsm12B
+ JL matchlen_match4_match_nolit_encodeBetterBlockAsm12B
matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeBetterBlockAsm12B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm12B
matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B
-
-matchlen_single_match_nolit_encodeBetterBlockAsm12B:
- TESTL R8, R8
- JZ match_nolit_end_encodeBetterBlockAsm12B
-
-matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B:
+ JZ match_nolit_end_encodeBetterBlockAsm12B
+
+matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeBetterBlockAsm12B
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeBetterBlockAsm12B
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeBetterBlockAsm12B
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeBetterBlockAsm12B
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B
match_nolit_end_encodeBetterBlockAsm12B:
MOVL CX, R8
@@ -7363,7 +7791,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
RET
// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
MOVQ dst_base+0(FP), AX
MOVQ $0x000000a0, CX
@@ -7477,35 +7905,68 @@ match_dst_size_check_encodeBetterBlockAsm10B:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeBetterBlockAsm10B
+ JL matchlen_match4_match_nolit_encodeBetterBlockAsm10B
matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeBetterBlockAsm10B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm10B
matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
-
-matchlen_single_match_nolit_encodeBetterBlockAsm10B:
- TESTL R8, R8
- JZ match_nolit_end_encodeBetterBlockAsm10B
-
-matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B:
+ JZ match_nolit_end_encodeBetterBlockAsm10B
+
+matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeBetterBlockAsm10B
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeBetterBlockAsm10B
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeBetterBlockAsm10B
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeBetterBlockAsm10B
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B
match_nolit_end_encodeBetterBlockAsm10B:
MOVL CX, R8
@@ -8108,7 +8569,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
RET
// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000028, CX
@@ -8222,35 +8683,68 @@ match_dst_size_check_encodeBetterBlockAsm8B:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeBetterBlockAsm8B
+ JL matchlen_match4_match_nolit_encodeBetterBlockAsm8B
matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeBetterBlockAsm8B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeBetterBlockAsm8B
matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B
-
-matchlen_single_match_nolit_encodeBetterBlockAsm8B:
- TESTL R8, R8
- JZ match_nolit_end_encodeBetterBlockAsm8B
-
-matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B:
+ JZ match_nolit_end_encodeBetterBlockAsm8B
+
+matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeBetterBlockAsm8B
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeBetterBlockAsm8B
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeBetterBlockAsm8B
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeBetterBlockAsm8B
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B
match_nolit_end_encodeBetterBlockAsm8B:
MOVL CX, R8
@@ -8843,7 +9337,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
RET
// func encodeSnappyBlockAsm(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm(SB), $65560-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000200, CX
@@ -9079,35 +9573,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
// matchLen
XORL R11, R11
CMPL R8, $0x08
- JL matchlen_single_repeat_extend_encodeSnappyBlockAsm
+ JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
MOVQ (R9)(R11*1), R10
XORQ (SI)(R11*1), R10
TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm
- BSFQ R10, R10
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
- JMP repeat_extend_forward_end_encodeSnappyBlockAsm
+
+#ifdef GOAMD64_v3
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R10, R10
+
+#endif
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm
matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
LEAL -8(R8), R8
LEAL 8(R11), R11
CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm
-
-matchlen_single_repeat_extend_encodeSnappyBlockAsm:
- TESTL R8, R8
- JZ repeat_extend_forward_end_encodeSnappyBlockAsm
-
-matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm:
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm
+
+matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
+ CMPL R8, $0x04
+ JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm
+ MOVL (R9)(R11*1), R10
+ CMPL (SI)(R11*1), R10
+ JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
+
+matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
+ CMPL R8, $0x02
+ JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm
+ MOVW (R9)(R11*1), R10
+ CMPW (SI)(R11*1), R10
+ JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
+
+matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
+ CMPL R8, $0x01
+ JL repeat_extend_forward_end_encodeSnappyBlockAsm
MOVB (R9)(R11*1), R10
CMPB (SI)(R11*1), R10
JNE repeat_extend_forward_end_encodeSnappyBlockAsm
LEAL 1(R11), R11
- DECL R8
- JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm
repeat_extend_forward_end_encodeSnappyBlockAsm:
ADDL R11, CX
@@ -9380,35 +9907,68 @@ match_nolit_loop_encodeSnappyBlockAsm:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBlockAsm
+ JL matchlen_match4_match_nolit_encodeSnappyBlockAsm
matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeSnappyBlockAsm
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm
matchlen_loop_match_nolit_encodeSnappyBlockAsm:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm
-
-matchlen_single_match_nolit_encodeSnappyBlockAsm:
- TESTL DI, DI
- JZ match_nolit_end_encodeSnappyBlockAsm
-
-matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm:
+ JZ match_nolit_end_encodeSnappyBlockAsm
+
+matchlen_match4_match_nolit_encodeSnappyBlockAsm:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBlockAsm
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeSnappyBlockAsm:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBlockAsm
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeSnappyBlockAsm:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeSnappyBlockAsm
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeSnappyBlockAsm
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm
match_nolit_end_encodeSnappyBlockAsm:
ADDL R10, CX
@@ -9660,7 +10220,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
RET
// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000200, CX
@@ -9877,35 +10437,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
// matchLen
XORL R11, R11
CMPL R8, $0x08
- JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K
+ JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
MOVQ (R9)(R11*1), R10
XORQ (SI)(R11*1), R10
TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
- BSFQ R10, R10
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
- JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
+
+#ifdef GOAMD64_v3
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R10, R10
+
+#endif
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
LEAL -8(R8), R8
LEAL 8(R11), R11
CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
-
-matchlen_single_repeat_extend_encodeSnappyBlockAsm64K:
- TESTL R8, R8
- JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
-
-matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K:
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
+
+matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
+ CMPL R8, $0x04
+ JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
+ MOVL (R9)(R11*1), R10
+ CMPL (SI)(R11*1), R10
+ JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
+
+matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
+ CMPL R8, $0x02
+ JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
+ MOVW (R9)(R11*1), R10
+ CMPW (SI)(R11*1), R10
+ JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
+
+matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
+ CMPL R8, $0x01
+ JL repeat_extend_forward_end_encodeSnappyBlockAsm64K
MOVB (R9)(R11*1), R10
CMPB (SI)(R11*1), R10
JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
LEAL 1(R11), R11
- DECL R8
- JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K
repeat_extend_forward_end_encodeSnappyBlockAsm64K:
ADDL R11, CX
@@ -10135,35 +10728,68 @@ match_nolit_loop_encodeSnappyBlockAsm64K:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K
+ JL matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeSnappyBlockAsm64K
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm64K
matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
-
-matchlen_single_match_nolit_encodeSnappyBlockAsm64K:
- TESTL DI, DI
- JZ match_nolit_end_encodeSnappyBlockAsm64K
-
-matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K:
+ JZ match_nolit_end_encodeSnappyBlockAsm64K
+
+matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeSnappyBlockAsm64K
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeSnappyBlockAsm64K
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K
match_nolit_end_encodeSnappyBlockAsm64K:
ADDL R10, CX
@@ -10372,7 +10998,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
RET
// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000080, CX
@@ -10589,35 +11215,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
// matchLen
XORL R11, R11
CMPL R8, $0x08
- JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B
+ JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
MOVQ (R9)(R11*1), R10
XORQ (SI)(R11*1), R10
TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
- BSFQ R10, R10
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
- JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R10, R10
+
+#endif
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
LEAL -8(R8), R8
LEAL 8(R11), R11
CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
-
-matchlen_single_repeat_extend_encodeSnappyBlockAsm12B:
- TESTL R8, R8
- JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
-
-matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B:
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
+
+matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
+ CMPL R8, $0x04
+ JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
+ MOVL (R9)(R11*1), R10
+ CMPL (SI)(R11*1), R10
+ JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
+
+matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
+ CMPL R8, $0x02
+ JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
+ MOVW (R9)(R11*1), R10
+ CMPW (SI)(R11*1), R10
+ JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
+
+matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
+ CMPL R8, $0x01
+ JL repeat_extend_forward_end_encodeSnappyBlockAsm12B
MOVB (R9)(R11*1), R10
CMPB (SI)(R11*1), R10
JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
LEAL 1(R11), R11
- DECL R8
- JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B
repeat_extend_forward_end_encodeSnappyBlockAsm12B:
ADDL R11, CX
@@ -10847,35 +11506,68 @@ match_nolit_loop_encodeSnappyBlockAsm12B:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B
+ JL matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeSnappyBlockAsm12B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm12B
matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
-
-matchlen_single_match_nolit_encodeSnappyBlockAsm12B:
- TESTL DI, DI
- JZ match_nolit_end_encodeSnappyBlockAsm12B
-
-matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B:
+ JZ match_nolit_end_encodeSnappyBlockAsm12B
+
+matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeSnappyBlockAsm12B
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeSnappyBlockAsm12B
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B
match_nolit_end_encodeSnappyBlockAsm12B:
ADDL R10, CX
@@ -11084,7 +11776,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
RET
// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000020, CX
@@ -11301,35 +11993,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
// matchLen
XORL R11, R11
CMPL R8, $0x08
- JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B
+ JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
MOVQ (R9)(R11*1), R10
XORQ (SI)(R11*1), R10
TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
- BSFQ R10, R10
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
- JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R10, R10
+
+#endif
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
LEAL -8(R8), R8
LEAL 8(R11), R11
CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
-
-matchlen_single_repeat_extend_encodeSnappyBlockAsm10B:
- TESTL R8, R8
- JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
-
-matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B:
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
+
+matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
+ CMPL R8, $0x04
+ JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
+ MOVL (R9)(R11*1), R10
+ CMPL (SI)(R11*1), R10
+ JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
+
+matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
+ CMPL R8, $0x02
+ JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
+ MOVW (R9)(R11*1), R10
+ CMPW (SI)(R11*1), R10
+ JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
+
+matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
+ CMPL R8, $0x01
+ JL repeat_extend_forward_end_encodeSnappyBlockAsm10B
MOVB (R9)(R11*1), R10
CMPB (SI)(R11*1), R10
JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
LEAL 1(R11), R11
- DECL R8
- JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B
repeat_extend_forward_end_encodeSnappyBlockAsm10B:
ADDL R11, CX
@@ -11559,35 +12284,68 @@ match_nolit_loop_encodeSnappyBlockAsm10B:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B
+ JL matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeSnappyBlockAsm10B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm10B
matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
-
-matchlen_single_match_nolit_encodeSnappyBlockAsm10B:
- TESTL DI, DI
- JZ match_nolit_end_encodeSnappyBlockAsm10B
-
-matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B:
+ JZ match_nolit_end_encodeSnappyBlockAsm10B
+
+matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeSnappyBlockAsm10B
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeSnappyBlockAsm10B
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B
match_nolit_end_encodeSnappyBlockAsm10B:
ADDL R10, CX
@@ -11796,7 +12554,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
RET
// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000008, CX
@@ -12013,35 +12771,68 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
// matchLen
XORL R11, R11
CMPL R8, $0x08
- JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B
+ JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
MOVQ (R9)(R11*1), R10
XORQ (SI)(R11*1), R10
TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
- BSFQ R10, R10
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
- JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R10, R10
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R10, R10
+
+#endif
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
+ JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
LEAL -8(R8), R8
LEAL 8(R11), R11
CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
-
-matchlen_single_repeat_extend_encodeSnappyBlockAsm8B:
- TESTL R8, R8
- JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
-
-matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B:
+ JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
+
+matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
+ CMPL R8, $0x04
+ JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
+ MOVL (R9)(R11*1), R10
+ CMPL (SI)(R11*1), R10
+ JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
+
+matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
+ CMPL R8, $0x02
+ JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
+ MOVW (R9)(R11*1), R10
+ CMPW (SI)(R11*1), R10
+ JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
+
+matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
+ CMPL R8, $0x01
+ JL repeat_extend_forward_end_encodeSnappyBlockAsm8B
MOVB (R9)(R11*1), R10
CMPB (SI)(R11*1), R10
JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
LEAL 1(R11), R11
- DECL R8
- JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B
repeat_extend_forward_end_encodeSnappyBlockAsm8B:
ADDL R11, CX
@@ -12269,35 +13060,68 @@ match_nolit_loop_encodeSnappyBlockAsm8B:
// matchLen
XORL R10, R10
CMPL DI, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B
+ JL matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
MOVQ (R8)(R10*1), R9
XORQ (SI)(R10*1), R9
TESTQ R9, R9
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
- BSFQ R9, R9
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
- JMP match_nolit_end_encodeSnappyBlockAsm8B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R9, R9
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
+ JMP match_nolit_end_encodeSnappyBlockAsm8B
matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
LEAL -8(DI), DI
LEAL 8(R10), R10
CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
-
-matchlen_single_match_nolit_encodeSnappyBlockAsm8B:
- TESTL DI, DI
- JZ match_nolit_end_encodeSnappyBlockAsm8B
-
-matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B:
+ JZ match_nolit_end_encodeSnappyBlockAsm8B
+
+matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
+ CMPL DI, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
+ MOVL (R8)(R10*1), R9
+ CMPL (SI)(R10*1), R9
+ JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
+ CMPL DI, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
+ MOVW (R8)(R10*1), R9
+ CMPW (SI)(R10*1), R9
+ JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
+ CMPL DI, $0x01
+ JL match_nolit_end_encodeSnappyBlockAsm8B
MOVB (R8)(R10*1), R9
CMPB (SI)(R10*1), R9
JNE match_nolit_end_encodeSnappyBlockAsm8B
LEAL 1(R10), R10
- DECL DI
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B
match_nolit_end_encodeSnappyBlockAsm8B:
ADDL R10, CX
@@ -12504,7 +13328,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
RET
// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000a00, CX
@@ -12626,35 +13450,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm
+ JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeSnappyBetterBlockAsm
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
-
-matchlen_single_match_nolit_encodeSnappyBetterBlockAsm:
- TESTL R8, R8
- JZ match_nolit_end_encodeSnappyBetterBlockAsm
-
-matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm:
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm
+
+matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeSnappyBetterBlockAsm
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeSnappyBetterBlockAsm
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm
match_nolit_end_encodeSnappyBetterBlockAsm:
MOVL CX, R8
@@ -13086,7 +13943,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
RET
// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000a00, CX
@@ -13200,35 +14057,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm64K:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K
+ JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
-
-matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K:
- TESTL R8, R8
- JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
-
-matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
+
+matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeSnappyBetterBlockAsm64K
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
match_nolit_end_encodeSnappyBetterBlockAsm64K:
MOVL CX, R8
@@ -13589,7 +14479,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
RET
// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000280, CX
@@ -13703,35 +14593,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm12B:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B
+ JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
-
-matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B:
- TESTL R8, R8
- JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
-
-matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
+
+matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeSnappyBetterBlockAsm12B
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
match_nolit_end_encodeSnappyBetterBlockAsm12B:
MOVL CX, R8
@@ -14092,7 +15015,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
RET
// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
MOVQ dst_base+0(FP), AX
MOVQ $0x000000a0, CX
@@ -14206,35 +15129,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm10B:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B
+ JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
-
-matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B:
- TESTL R8, R8
- JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
-
-matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
+
+matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeSnappyBetterBlockAsm10B
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
match_nolit_end_encodeSnappyBetterBlockAsm10B:
MOVL CX, R8
@@ -14595,7 +15551,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
RET
// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
-// Requires: SSE2
+// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
MOVQ dst_base+0(FP), AX
MOVQ $0x00000028, CX
@@ -14709,35 +15665,68 @@ match_dst_size_check_encodeSnappyBetterBlockAsm8B:
// matchLen
XORL R12, R12
CMPL R8, $0x08
- JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B
+ JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
MOVQ (R9)(R12*1), R11
XORQ (R10)(R12*1), R11
TESTQ R11, R11
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
- BSFQ R11, R11
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
- JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
+
+#ifdef GOAMD64_v3
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ R11, R11
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ R11, R11
+
+#endif
+ SARQ $0x03, R11
+ LEAL (R12)(R11*1), R12
+ JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
LEAL -8(R8), R8
LEAL 8(R12), R12
CMPL R8, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
-
-matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B:
- TESTL R8, R8
- JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
-
-matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
+ JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
+
+matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:
+ CMPL R8, $0x04
+ JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
+ MOVL (R9)(R12*1), R11
+ CMPL (R10)(R12*1), R11
+ JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
+ SUBL $0x04, R8
+ LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:
+ CMPL R8, $0x02
+ JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
+ MOVW (R9)(R12*1), R11
+ CMPW (R10)(R12*1), R11
+ JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
+ SUBL $0x02, R8
+ LEAL 2(R12), R12
+
+matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
+ CMPL R8, $0x01
+ JL match_nolit_end_encodeSnappyBetterBlockAsm8B
MOVB (R9)(R12*1), R11
CMPB (R10)(R12*1), R11
JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
LEAL 1(R12), R12
- DECL R8
- JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
match_nolit_end_encodeSnappyBetterBlockAsm8B:
MOVL CX, R8
@@ -15635,6 +16624,7 @@ gen_emit_copy_end_snappy:
RET
// func matchLen(a []byte, b []byte) int
+// Requires: BMI
TEXT ·matchLen(SB), NOSPLIT, $0-56
MOVQ a_base+0(FP), AX
MOVQ b_base+24(FP), CX
@@ -15643,35 +16633,68 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56
// matchLen
XORL SI, SI
CMPL DX, $0x08
- JL matchlen_single_standalone
+ JL matchlen_match4_standalone
matchlen_loopback_standalone:
MOVQ (AX)(SI*1), BX
XORQ (CX)(SI*1), BX
TESTQ BX, BX
JZ matchlen_loop_standalone
- BSFQ BX, BX
- SARQ $0x03, BX
- LEAL (SI)(BX*1), SI
- JMP gen_match_len_end
+
+#ifdef GOAMD64_v3
+ TZCNTQ BX, BX
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef GOAMD64_v4
+ TZCNTQ BX, BX
+
+#define TZCNTQ_EMITTED 1
+#endif
+
+#ifdef TZCNTQ_EMITTED
+#undef TZCNTQ_EMITTED
+#else
+ BSFQ BX, BX
+
+#endif
+ SARQ $0x03, BX
+ LEAL (SI)(BX*1), SI
+ JMP gen_match_len_end
matchlen_loop_standalone:
LEAL -8(DX), DX
LEAL 8(SI), SI
CMPL DX, $0x08
JGE matchlen_loopback_standalone
+ JZ gen_match_len_end
-matchlen_single_standalone:
- TESTL DX, DX
- JZ gen_match_len_end
-
-matchlen_single_loopback_standalone:
+matchlen_match4_standalone:
+ CMPL DX, $0x04
+ JL matchlen_match2_standalone
+ MOVL (AX)(SI*1), BX
+ CMPL (CX)(SI*1), BX
+ JNE matchlen_match2_standalone
+ SUBL $0x04, DX
+ LEAL 4(SI), SI
+
+matchlen_match2_standalone:
+ CMPL DX, $0x02
+ JL matchlen_match1_standalone
+ MOVW (AX)(SI*1), BX
+ CMPW (CX)(SI*1), BX
+ JNE matchlen_match1_standalone
+ SUBL $0x02, DX
+ LEAL 2(SI), SI
+
+matchlen_match1_standalone:
+ CMPL DX, $0x01
+ JL gen_match_len_end
MOVB (AX)(SI*1), BL
CMPB (CX)(SI*1), BL
JNE gen_match_len_end
LEAL 1(SI), SI
- DECL DX
- JNZ matchlen_single_loopback_standalone
gen_match_len_end:
MOVQ SI, ret+48(FP)