path: root/vendor/github.com/klauspost/compress/s2/decode_arm64.s
author:    dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>  2022-01-18 20:24:14 +0100
committer: GitHub <noreply@github.com>  2022-01-18 20:24:14 +0100
commit:    aad60c882e16cd2c8769a49e6d9f87a040590d62 (patch)
tree:      3bfe1f8953b40f9beb39c69db3a7647ea6de54d2 /vendor/github.com/klauspost/compress/s2/decode_arm64.s
parent:    fecca575078a21dedb0cab213dde7fd97161c0fa (diff)
Bump github.com/mattermost/mattermost-server/v6 from 6.1.0 to 6.3.0 (#1686)
Bumps [github.com/mattermost/mattermost-server/v6](https://github.com/mattermost/mattermost-server) from 6.1.0 to 6.3.0.

- [Release notes](https://github.com/mattermost/mattermost-server/releases)
- [Changelog](https://github.com/mattermost/mattermost-server/blob/master/CHANGELOG.md)
- [Commits](https://github.com/mattermost/mattermost-server/compare/v6.1.0...v6.3.0)

---
updated-dependencies:
- dependency-name: github.com/mattermost/mattermost-server/v6
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/decode_arm64.s')
-rw-r--r--  vendor/github.com/klauspost/compress/s2/decode_arm64.s  574
1 file changed, 574 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
new file mode 100644
index 00000000..4b63d508
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
@@ -0,0 +1,574 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+#define R_TMP0 R2
+#define R_TMP1 R3
+#define R_LEN R4
+#define R_OFF R5
+#define R_SRC R6
+#define R_DST R7
+#define R_DBASE R8
+#define R_DLEN R9
+#define R_DEND R10
+#define R_SBASE R11
+#define R_SLEN R12
+#define R_SEND R13
+#define R_TMP2 R14
+#define R_TMP3 R15
+
+// TEST_SRC checks that R_SRC is <= R_SEND, branching to errCorrupt otherwise.
+#define TEST_SRC() \
+ CMP R_SEND, R_SRC \
+ BGT errCorrupt
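+// In Go terms this is (a sketch mirroring decode_other.go):
+//
+//	if uint(s) > uint(len(src)) { return decodeErrCodeCorrupt }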
+
+// An alternative, index-based form of the same check (used inline in
+// tagCopy4 below):
+//
+// MOVD R_SRC, R_TMP1
+// SUB R_SBASE, R_TMP1, R_TMP1
+// CMP R_SLEN, R_TMP1
+// BGT errCorrupt
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func s2Decode(dst, src []byte) int
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+// - R_TMP0 scratch
+// - R_TMP1 scratch
+// - R_LEN length or x
+// - R_OFF offset
+// - R_SRC &src[s]
+// - R_DST &dst[d]
+// + R_DBASE dst_base
+// + R_DLEN dst_len
+// + R_DEND dst_base + dst_len
+// + R_SBASE src_base
+// + R_SLEN src_len
+// + R_SEND src_base + src_len
+// - R_TMP2 used by doCopy
+// - R_TMP3 used by doCopy
+//
+// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
+// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
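+//
+// For orientation, a rough Go sketch of the control flow this file implements
+// (names mirror decode_other.go; this is illustrative, not the exact source):
+//
+//	for s < len(src) {
+//		switch src[s] & 0x03 {
+//		case tagLiteral:
+//			// decode x, copy x+1 literal bytes from src[s:] to dst[d:]
+//		case tagCopy1, tagCopy2, tagCopy4:
+//			// decode (length, offset), forward-copy within dst
+//		}
+//	}
+//	if d != len(dst) {
+//		return decodeErrCodeCorrupt
+//	}
+//	return 0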
+TEXT ·s2Decode(SB), NOSPLIT, $56-64
+ // Initialize R_SRC, R_DST and R_DBASE-R_SEND.
+ MOVD dst_base+0(FP), R_DBASE
+ MOVD dst_len+8(FP), R_DLEN
+ MOVD R_DBASE, R_DST
+ MOVD R_DBASE, R_DEND
+ ADD R_DLEN, R_DEND, R_DEND
+ MOVD src_base+24(FP), R_SBASE
+ MOVD src_len+32(FP), R_SLEN
+ MOVD R_SBASE, R_SRC
+ MOVD R_SBASE, R_SEND
+ ADD R_SLEN, R_SEND, R_SEND
+ MOVD $0, R_OFF
+
+loop:
+ // for s < len(src)
+ CMP R_SEND, R_SRC
+ BEQ end
+
+ // R_LEN = uint32(src[s])
+ //
+ // switch src[s] & 0x03
+ MOVBU (R_SRC), R_LEN
+ MOVW R_LEN, R_TMP1
+ ANDW $3, R_TMP1
+ MOVW $1, R1
+ CMPW R1, R_TMP1
+ BGE tagCopy
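+
+ // In Go terms (sketch): tagLiteral is 0 and tagCopy1/2/4 are 1/2/3, so any
+ // tag with non-zero low two bits takes the copy path:
+ //
+ // if src[s]&0x03 != tagLiteral { goto tagCopy }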
+
+ // ----------------------------------------
+ // The code below handles literal tags.
+
+ // case tagLiteral:
+ // x := uint32(src[s] >> 2)
+ // switch
+ MOVW $60, R1
+ LSRW $2, R_LEN, R_LEN
+ CMPW R_LEN, R1
+ BLS tagLit60Plus
+
+ // case x < 60:
+ // s++
+ ADD $1, R_SRC, R_SRC
+
+doLit:
+ // This is the end of the inner "switch", when we have a literal tag.
+ //
+ // We assume that R_LEN == x and x fits in a uint32, where x is the variable
+ // used in the pure Go decode_other.go code.
+
+ // length = int(x) + 1
+ //
+ // Unlike the pure Go code, we don't need to check if length <= 0 because
+ // R_LEN can hold 64 bits, so the increment cannot overflow.
+ ADD $1, R_LEN, R_LEN
+
+ // Prepare to check if copying length bytes will run past the end of dst or
+ // src.
+ //
+ // R_TMP0 = len(dst) - d
+ // R_TMP1 = len(src) - s
+ MOVD R_DEND, R_TMP0
+ SUB R_DST, R_TMP0, R_TMP0
+ MOVD R_SEND, R_TMP1
+ SUB R_SRC, R_TMP1, R_TMP1
+
+ // !!! Try a faster technique for short (16 or fewer bytes) copies.
+ //
+ // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+ // goto callMemmove // Fall back on calling runtime·memmove.
+ // }
+ //
+ // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+ // against 21 instead of 16, because it cannot assume that all of its input
+ // is contiguous in memory and so it needs to leave enough source bytes to
+ // read the next tag without refilling buffers, but Go's Decode assumes
+ // contiguousness (the src argument is a []byte).
+ CMP $16, R_LEN
+ BGT callMemmove
+ CMP $16, R_TMP0
+ BLT callMemmove
+ CMP $16, R_TMP1
+ BLT callMemmove
+
+ // !!! Implement the copy from src to dst as a 16-byte load and store.
+ // (Decode's documentation says that dst and src must not overlap.)
+ //
+ // This always copies 16 bytes, instead of only length bytes, but that's
+ // OK. If the input is a valid Snappy encoding then subsequent iterations
+ // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+ // non-nil error), so the overrun will be ignored.
+ //
+ // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
+ // 16-byte loads and stores. This technique probably wouldn't be as
+ // effective on architectures that are fussier about alignment.
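+ //
+ // For example (illustrative): if length == 3, the STP below still writes
+ // 16 bytes, but R_DST only advances by 3, so dst[d+3:d+16] is simply
+ // rewritten by whatever later iterations emit.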
+ LDP 0(R_SRC), (R_TMP2, R_TMP3)
+ STP (R_TMP2, R_TMP3), 0(R_DST)
+
+ // d += length
+ // s += length
+ ADD R_LEN, R_DST, R_DST
+ ADD R_LEN, R_SRC, R_SRC
+ B loop
+
+callMemmove:
+ // if length > len(dst)-d || length > len(src)-s { etc }
+ CMP R_TMP0, R_LEN
+ BGT errCorrupt
+ CMP R_TMP1, R_LEN
+ BGT errCorrupt
+
+ // copy(dst[d:], src[s:s+length])
+ //
+ // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+ // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
+ // three registers to the stack, to save local variables across the CALL.
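+ //
+ // Frame layout (derived from the stores below): 8/16/24(RSP) hold the
+ // memmove arguments (to, from, n), and 32..56(RSP) are the spill slots
+ // for R_DST, R_SRC, R_LEN and R_OFF.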
+ MOVD R_DST, 8(RSP)
+ MOVD R_SRC, 16(RSP)
+ MOVD R_LEN, 24(RSP)
+ MOVD R_DST, 32(RSP)
+ MOVD R_SRC, 40(RSP)
+ MOVD R_LEN, 48(RSP)
+ MOVD R_OFF, 56(RSP)
+ CALL runtime·memmove(SB)
+
+ // Restore local variables: unspill registers from the stack and
+ // re-calculate R_DBASE-R_SEND.
+ MOVD 32(RSP), R_DST
+ MOVD 40(RSP), R_SRC
+ MOVD 48(RSP), R_LEN
+ MOVD 56(RSP), R_OFF
+ MOVD dst_base+0(FP), R_DBASE
+ MOVD dst_len+8(FP), R_DLEN
+ MOVD R_DBASE, R_DEND
+ ADD R_DLEN, R_DEND, R_DEND
+ MOVD src_base+24(FP), R_SBASE
+ MOVD src_len+32(FP), R_SLEN
+ MOVD R_SBASE, R_SEND
+ ADD R_SLEN, R_SEND, R_SEND
+
+ // d += length
+ // s += length
+ ADD R_LEN, R_DST, R_DST
+ ADD R_LEN, R_SRC, R_SRC
+ B loop
+
+tagLit60Plus:
+ // !!! This fragment does the
+ //
+ // s += x - 58; if uint(s) > uint(len(src)) { etc }
+ //
+ // checks. In the asm version, we code it once instead of once per switch case.
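+ //
+ // Worked example: for x == 60 the literal length occupies 1 extra byte,
+ // so s advances by 2 (tag byte + 1 length byte) == x - 58; for x == 63 it
+ // advances by 5 (tag byte + 4 length bytes).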
+ ADD R_LEN, R_SRC, R_SRC
+ SUB $58, R_SRC, R_SRC
+ TEST_SRC()
+
+ // case x == 60:
+ MOVW $61, R1
+ CMPW R1, R_LEN
+ BEQ tagLit61
+ BGT tagLit62Plus
+
+ // x = uint32(src[s-1])
+ MOVBU -1(R_SRC), R_LEN
+ B doLit
+
+tagLit61:
+ // case x == 61:
+ // x = uint32(src[s-2]) | uint32(src[s-1])<<8
+ MOVHU -2(R_SRC), R_LEN
+ B doLit
+
+tagLit62Plus:
+ CMPW $62, R_LEN
+ BHI tagLit63
+
+ // case x == 62:
+ // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+ MOVHU -3(R_SRC), R_LEN
+ MOVBU -1(R_SRC), R_TMP1
+ ORR R_TMP1<<16, R_LEN
+ B doLit
+
+tagLit63:
+ // case x == 63:
+ // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+ MOVWU -4(R_SRC), R_LEN
+ B doLit
+
+ // The code above handles literal tags.
+ // ----------------------------------------
+ // The code below handles copy tags.
+
+tagCopy4:
+ // case tagCopy4:
+ // s += 5
+ ADD $5, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ MOVD R_SRC, R_TMP1
+ SUB R_SBASE, R_TMP1, R_TMP1
+ CMP R_SLEN, R_TMP1
+ BGT errCorrupt
+
+ // length = 1 + int(src[s-5])>>2
+ MOVD $1, R1
+ ADD R_LEN>>2, R1, R_LEN
+
+ // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+ MOVWU -4(R_SRC), R_OFF
+ B doCopy
+
+tagCopy2:
+ // case tagCopy2:
+ // s += 3
+ ADD $3, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // length = 1 + int(src[s-3])>>2
+ MOVD $1, R1
+ ADD R_LEN>>2, R1, R_LEN
+
+ // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+ MOVHU -2(R_SRC), R_OFF
+ B doCopy
+
+tagCopy:
+ // We have a copy tag. We assume that:
+ // - R_TMP1 == src[s] & 0x03
+ // - R_LEN == src[s]
+ CMP $2, R_TMP1
+ BEQ tagCopy2
+ BGT tagCopy4
+
+ // case tagCopy1:
+ // s += 2
+ ADD $2, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+ // Calculate offset in R_TMP0 in case it is a repeat.
+ MOVD R_LEN, R_TMP0
+ AND $0xe0, R_TMP0
+ MOVBU -1(R_SRC), R_TMP1
+ ORR R_TMP0<<3, R_TMP1, R_TMP0
+
+ // length = 4 + int(src[s-2])>>2&0x7
+ MOVD $7, R1
+ AND R_LEN>>2, R1, R_LEN
+ ADD $4, R_LEN, R_LEN
+
+ // Check for a repeat code: a tagCopy1 whose decoded offset is 0 reuses
+ // the previous offset.
+ CMP $0, R_TMP0
+ BEQ repeatCode
+
+ // This is a regular copy, transfer our temporary value to R_OFF (offset)
+ MOVD R_TMP0, R_OFF
+ B doCopy
+
+ // This is a repeat code.
+repeatCode:
+ // If length < 9, reuse last offset, with the length already calculated.
+ CMP $9, R_LEN
+ BLT doCopyRepeat
+ BEQ repeatLen1
+ CMP $10, R_LEN
+ BEQ repeatLen2
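+
+ // Repeat-length encoding handled below: the 3 length bits in the tag give
+ // a base of 4..11; bases 4..8 are used as-is, while 9, 10 and 11 signal
+ // 1, 2 or 3 extra length bytes with biases of +8, +260 and +65540.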
+
+repeatLen3:
+ // s += 3
+ ADD $3, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // length = (uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16)) + 65540
+ MOVBU -1(R_SRC), R_TMP0
+ MOVHU -3(R_SRC), R_LEN
+ ORR R_TMP0<<16, R_LEN, R_LEN
+ ADD $65540, R_LEN, R_LEN
+ B doCopyRepeat
+
+repeatLen2:
+ // s += 2
+ ADD $2, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // length = (uint32(src[s-2]) | (uint32(src[s-1])<<8)) + 260
+ MOVHU -2(R_SRC), R_LEN
+ ADD $260, R_LEN, R_LEN
+ B doCopyRepeat
+
+repeatLen1:
+ // s += 1
+ ADD $1, R_SRC, R_SRC
+
+ // if uint(s) > uint(len(src)) { etc }
+ TEST_SRC()
+
+ // length = src[s-1] + 8
+ MOVBU -1(R_SRC), R_LEN
+ ADD $8, R_LEN, R_LEN
+ B doCopyRepeat
+
+doCopy:
+ // This is the end of the outer "switch", when we have a copy tag.
+ //
+ // We assume that:
+ // - R_LEN == length && R_LEN > 0
+ // - R_OFF == offset
+
+ // if d < offset { etc }
+ MOVD R_DST, R_TMP1
+ SUB R_DBASE, R_TMP1, R_TMP1
+ CMP R_OFF, R_TMP1
+ BLT errCorrupt
+
+ // Repeat values can skip the test above: the reused offset was validated
+ // when it was first decoded, and d has only grown since, so offset <= d
+ // still holds.
+doCopyRepeat:
+
+ // if offset <= 0 { etc }
+ CMP $0, R_OFF
+ BLE errCorrupt
+
+ // if length > len(dst)-d { etc }
+ MOVD R_DEND, R_TMP1
+ SUB R_DST, R_TMP1, R_TMP1
+ CMP R_TMP1, R_LEN
+ BGT errCorrupt
+
+ // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+ //
+ // Set:
+ // - R_TMP2 = len(dst)-d
+ // - R_TMP3 = &dst[d-offset]
+ MOVD R_DEND, R_TMP2
+ SUB R_DST, R_TMP2, R_TMP2
+ MOVD R_DST, R_TMP3
+ SUB R_OFF, R_TMP3, R_TMP3
+
+ // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+ //
+ // First, try using two 8-byte load/stores, similar to the doLit technique
+ // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+ // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+ // and not one 16-byte load/store, and the first store has to be before the
+ // second load, due to the overlap if offset is in the range [8, 16).
+ //
+ // if length > 16 || offset < 8 || len(dst)-d < 16 {
+ // goto slowForwardCopy
+ // }
+ // copy 16 bytes
+ // d += length
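+ //
+ // Overlap example (illustrative): with offset == 10 and length == 14, the
+ // first 8-byte store writes dst[d:d+8] from dst[d-10:], and the second
+ // 8-byte load reads dst[d-2:d+6], which must observe that store; hence
+ // store-before-load.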
+ CMP $16, R_LEN
+ BGT slowForwardCopy
+ CMP $8, R_OFF
+ BLT slowForwardCopy
+ CMP $16, R_TMP2
+ BLT slowForwardCopy
+ MOVD 0(R_TMP3), R_TMP0
+ MOVD R_TMP0, 0(R_DST)
+ MOVD 8(R_TMP3), R_TMP1
+ MOVD R_TMP1, 8(R_DST)
+ ADD R_LEN, R_DST, R_DST
+ B loop
+
+slowForwardCopy:
+ // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+ // can still try 8-byte load/stores, provided we can overrun up to 10 extra
+ // bytes. As above, the overrun will be fixed up by subsequent iterations
+ // of the outermost loop.
+ //
+ // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+ // commentary says:
+ //
+ // ----
+ //
+ // The main part of this loop is a simple copy of eight bytes at a time
+ // until we've copied (at least) the requested amount of bytes. However,
+ // if d and d-offset are less than eight bytes apart (indicating a
+ // repeating pattern of length < 8), we first need to expand the pattern in
+ // order to get the correct results. For instance, if the buffer looks like
+ // this, with the eight-byte <d-offset> and <d> patterns marked as
+ // intervals:
+ //
+ // abxxxxxxxxxxxx
+ // [------] d-offset
+ // [------] d
+ //
+ // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+ // once, after which we can move <d> two bytes without moving <d-offset>:
+ //
+ // ababxxxxxxxxxx
+ // [------] d-offset
+ // [------] d
+ //
+ // and repeat the exercise until the two no longer overlap.
+ //
+ // This allows us to do very well in the special case of one single byte
+ // repeated many times, without taking a big hit for more general cases.
+ //
+ // The worst case of extra writing past the end of the match occurs when
+ // offset == 1 and length == 1; the last copy will read from byte positions
+ // [0..7] and write to [4..11], whereas it was only supposed to write to
+ // position 1. Thus, ten excess bytes.
+ //
+ // ----
+ //
+ // That "10 byte overrun" worst case is confirmed by Go's
+ // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+ // and finishSlowForwardCopy algorithm.
+ //
+ // if length > len(dst)-d-10 {
+ // goto verySlowForwardCopy
+ // }
+ SUB $10, R_TMP2, R_TMP2
+ CMP R_TMP2, R_LEN
+ BGT verySlowForwardCopy
+
+ // We want to keep the offset, so we use R_TMP2 from here.
+ MOVD R_OFF, R_TMP2
+
+makeOffsetAtLeast8:
+ // !!! As above, expand the pattern so that offset >= 8 and we can use
+ // 8-byte load/stores.
+ //
+ // for offset < 8 {
+ // copy 8 bytes from dst[d-offset:] to dst[d:]
+ // length -= offset
+ // d += offset
+ // offset += offset
+ // // The two previous lines together mean that d-offset, and therefore
+ // // R_TMP3, is unchanged.
+ // }
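+ //
+ // Example (illustrative): offset == 3 expands 3 -> 6 -> 12 over two copy
+ // passes; each pass advances d by the old offset and shrinks length, while
+ // d-offset (and therefore R_TMP3) stays fixed.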
+ CMP $8, R_TMP2
+ BGE fixUpSlowForwardCopy
+ MOVD (R_TMP3), R_TMP1
+ MOVD R_TMP1, (R_DST)
+ SUB R_TMP2, R_LEN, R_LEN
+ ADD R_TMP2, R_DST, R_DST
+ ADD R_TMP2, R_TMP2, R_TMP2
+ B makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+ // !!! Add length (which might be negative now) to d (implied by R_DST being
+ // &dst[d]) so that d ends up at the right place when we jump back to the
+ // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
+ // length is positive, copying the remaining length bytes will write to the
+ // right place.
+ MOVD R_DST, R_TMP0
+ ADD R_LEN, R_DST, R_DST
+
+finishSlowForwardCopy:
+ // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+ // length means that we overrun, but as above, that will be fixed up by
+ // subsequent iterations of the outermost loop.
+ MOVD $0, R1
+ CMP R1, R_LEN
+ BLE loop
+ MOVD (R_TMP3), R_TMP1
+ MOVD R_TMP1, (R_TMP0)
+ ADD $8, R_TMP3, R_TMP3
+ ADD $8, R_TMP0, R_TMP0
+ SUB $8, R_LEN, R_LEN
+ B finishSlowForwardCopy
+
+verySlowForwardCopy:
+ // verySlowForwardCopy is a simple implementation of forward copy. In C
+ // parlance, this is a do/while loop instead of a while loop, since we know
+ // that length > 0. In Go syntax:
+ //
+ // for {
+ // dst[d] = dst[d - offset]
+ // d++
+ // length--
+ // if length == 0 {
+ // break
+ // }
+ // }
+ MOVB (R_TMP3), R_TMP1
+ MOVB R_TMP1, (R_DST)
+ ADD $1, R_TMP3, R_TMP3
+ ADD $1, R_DST, R_DST
+ SUB $1, R_LEN, R_LEN
+ CBNZ R_LEN, verySlowForwardCopy
+ B loop
+
+ // The code above handles copy tags.
+ // ----------------------------------------
+
+end:
+ // This is the end of the "for s < len(src)".
+ //
+ // if d != len(dst) { etc }
+ CMP R_DEND, R_DST
+ BNE errCorrupt
+
+ // return 0
+ MOVD $0, ret+48(FP)
+ RET
+
+errCorrupt:
+ // return decodeErrCodeCorrupt
+ MOVD $1, R_TMP0
+ MOVD R_TMP0, ret+48(FP)
+ RET
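+
+// For reference, the Go prototype matching the TEXT directive and argument
+// offsets above would be (a sketch; the real declaration lives in a .go file
+// in this package):
+//
+//	//go:noescape
+//	func s2Decode(dst, src []byte) int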