summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/zstd/internal/xxhash
diff options
context:
space:
mode:
authorWim <wim@42.be>2023-03-09 22:48:00 +0100
committerGitHub <noreply@github.com>2023-03-09 22:48:00 +0100
commit08779c29099e8940493df56d28d8aa131ac8342e (patch)
tree7ad8ce25cf371e582137e1706dd671a6bf4342d0 /vendor/github.com/klauspost/compress/zstd/internal/xxhash
parentd5f9cdf912d43cd2a5cb243e086fbdab9a9073b0 (diff)
downloadmatterbridge-msglm-08779c29099e8940493df56d28d8aa131ac8342e.tar.gz
matterbridge-msglm-08779c29099e8940493df56d28d8aa131ac8342e.tar.bz2
matterbridge-msglm-08779c29099e8940493df56d28d8aa131ac8342e.zip
Update dependencies (#2007)
* Update dependencies
Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd/internal/xxhash')
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md49
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go47
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s336
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s140
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go2
-rw-r--r--vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go19
6 files changed, 295 insertions, 298 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
index 69aa3bb5..777290d4 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
@@ -2,12 +2,7 @@
VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package.
-
-[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
-[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash)
-
-xxhash is a Go implementation of the 64-bit
-[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
+xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a
high-quality hashing algorithm that is much faster than anything in the Go
standard library.
@@ -28,31 +23,49 @@ func (*Digest) WriteString(string) (int, error)
func (*Digest) Sum64() uint64
```
-This implementation provides a fast pure-Go implementation and an even faster
-assembly implementation for amd64.
+The package is written with optimized pure Go and also contains even faster
+assembly implementations for amd64 and arm64. If desired, the `purego` build tag
+opts into using the Go code even on those architectures.
+
+[xxHash]: http://cyan4973.github.io/xxHash/
+
+## Compatibility
+
+This package is in a module and the latest code is in version 2 of the module.
+You need a version of Go with at least "minimal module compatibility" to use
+github.com/cespare/xxhash/v2:
+
+* 1.9.7+ for Go 1.9
+* 1.10.3+ for Go 1.10
+* Go 1.11 or later
+
+I recommend using the latest release of Go.
## Benchmarks
Here are some quick benchmarks comparing the pure-Go and assembly
implementations of Sum64.
-| input size | purego | asm |
-| --- | --- | --- |
-| 5 B | 979.66 MB/s | 1291.17 MB/s |
-| 100 B | 7475.26 MB/s | 7973.40 MB/s |
-| 4 KB | 17573.46 MB/s | 17602.65 MB/s |
-| 10 MB | 17131.46 MB/s | 17142.16 MB/s |
+| input size | purego | asm |
+| ---------- | --------- | --------- |
+| 4 B | 1.3 GB/s | 1.2 GB/s |
+| 16 B | 2.9 GB/s | 3.5 GB/s |
+| 100 B | 6.9 GB/s | 8.1 GB/s |
+| 4 KB | 11.7 GB/s | 16.7 GB/s |
+| 10 MB | 12.0 GB/s | 17.3 GB/s |
-These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using
-the following commands under Go 1.11.2:
+These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C
+CPU using the following commands under Go 1.19.2:
```
-$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes'
-$ go test -benchtime 10s -bench '/xxhash,direct,bytes'
+benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$')
+benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$')
```
## Projects using this package
- [InfluxDB](https://github.com/influxdata/influxdb)
- [Prometheus](https://github.com/prometheus/prometheus)
+- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
- [FreeCache](https://github.com/coocood/freecache)
+- [FastCache](https://github.com/VictoriaMetrics/fastcache)
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
index 2c112a0a..fc40c820 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
@@ -18,19 +18,11 @@ const (
prime5 uint64 = 2870177450012600261
)
-// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
-// possible in the Go code is worth a small (but measurable) performance boost
-// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
-// convenience in the Go code in a few places where we need to intentionally
-// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
-// result overflows a uint64).
-var (
- prime1v = prime1
- prime2v = prime2
- prime3v = prime3
- prime4v = prime4
- prime5v = prime5
-)
+// Store the primes in an array as well.
+//
+// The consts are used when possible in Go code to avoid MOVs but we need a
+// contiguous array of the assembly code.
+var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
// Digest implements hash.Hash64.
type Digest struct {
@@ -52,10 +44,10 @@ func New() *Digest {
// Reset clears the Digest's state so that it can be reused.
func (d *Digest) Reset() {
- d.v1 = prime1v + prime2
+ d.v1 = primes[0] + prime2
d.v2 = prime2
d.v3 = 0
- d.v4 = -prime1v
+ d.v4 = -primes[0]
d.total = 0
d.n = 0
}
@@ -71,21 +63,23 @@ func (d *Digest) Write(b []byte) (n int, err error) {
n = len(b)
d.total += uint64(n)
+ memleft := d.mem[d.n&(len(d.mem)-1):]
+
if d.n+n < 32 {
// This new data doesn't even fill the current block.
- copy(d.mem[d.n:], b)
+ copy(memleft, b)
d.n += n
return
}
if d.n > 0 {
// Finish off the partial block.
- copy(d.mem[d.n:], b)
+ c := copy(memleft, b)
d.v1 = round(d.v1, u64(d.mem[0:8]))
d.v2 = round(d.v2, u64(d.mem[8:16]))
d.v3 = round(d.v3, u64(d.mem[16:24]))
d.v4 = round(d.v4, u64(d.mem[24:32]))
- b = b[32-d.n:]
+ b = b[c:]
d.n = 0
}
@@ -135,21 +129,20 @@ func (d *Digest) Sum64() uint64 {
h += d.total
- i, end := 0, d.n
- for ; i+8 <= end; i += 8 {
- k1 := round(0, u64(d.mem[i:i+8]))
+ b := d.mem[:d.n&(len(d.mem)-1)]
+ for ; len(b) >= 8; b = b[8:] {
+ k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
- if i+4 <= end {
- h ^= uint64(u32(d.mem[i:i+4])) * prime1
+ if len(b) >= 4 {
+ h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
- i += 4
+ b = b[4:]
}
- for i < end {
- h ^= uint64(d.mem[i]) * prime5
+ for ; len(b) > 0; b = b[1:] {
+ h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
- i++
}
h ^= h >> 33
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
index cea17856..ddb63aa9 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
@@ -1,3 +1,4 @@
+//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
@@ -5,212 +6,205 @@
#include "textflag.h"
-// Register allocation:
-// AX h
-// SI pointer to advance through b
-// DX n
-// BX loop end
-// R8 v1, k1
-// R9 v2
-// R10 v3
-// R11 v4
-// R12 tmp
-// R13 prime1v
-// R14 prime2v
-// DI prime4v
-
-// round reads from and advances the buffer pointer in SI.
-// It assumes that R13 has prime1v and R14 has prime2v.
-#define round(r) \
- MOVQ (SI), R12 \
- ADDQ $8, SI \
- IMULQ R14, R12 \
- ADDQ R12, r \
- ROLQ $31, r \
- IMULQ R13, r
-
-// mergeRound applies a merge round on the two registers acc and val.
-// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
-#define mergeRound(acc, val) \
- IMULQ R14, val \
- ROLQ $31, val \
- IMULQ R13, val \
- XORQ val, acc \
- IMULQ R13, acc \
- ADDQ DI, acc
+// Registers:
+#define h AX
+#define d AX
+#define p SI // pointer to advance through b
+#define n DX
+#define end BX // loop end
+#define v1 R8
+#define v2 R9
+#define v3 R10
+#define v4 R11
+#define x R12
+#define prime1 R13
+#define prime2 R14
+#define prime4 DI
+
+#define round(acc, x) \
+ IMULQ prime2, x \
+ ADDQ x, acc \
+ ROLQ $31, acc \
+ IMULQ prime1, acc
+
+// round0 performs the operation x = round(0, x).
+#define round0(x) \
+ IMULQ prime2, x \
+ ROLQ $31, x \
+ IMULQ prime1, x
+
+// mergeRound applies a merge round on the two registers acc and x.
+// It assumes that prime1, prime2, and prime4 have been loaded.
+#define mergeRound(acc, x) \
+ round0(x) \
+ XORQ x, acc \
+ IMULQ prime1, acc \
+ ADDQ prime4, acc
+
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that there is at least one block
+// to process.
+#define blockLoop() \
+loop: \
+ MOVQ +0(p), x \
+ round(v1, x) \
+ MOVQ +8(p), x \
+ round(v2, x) \
+ MOVQ +16(p), x \
+ round(v3, x) \
+ MOVQ +24(p), x \
+ round(v4, x) \
+ ADDQ $32, p \
+ CMPQ p, end \
+ JLE loop
// func Sum64(b []byte) uint64
-TEXT ·Sum64(SB), NOSPLIT, $0-32
+TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
// Load fixed primes.
- MOVQ ·prime1v(SB), R13
- MOVQ ·prime2v(SB), R14
- MOVQ ·prime4v(SB), DI
+ MOVQ ·primes+0(SB), prime1
+ MOVQ ·primes+8(SB), prime2
+ MOVQ ·primes+24(SB), prime4
// Load slice.
- MOVQ b_base+0(FP), SI
- MOVQ b_len+8(FP), DX
- LEAQ (SI)(DX*1), BX
+ MOVQ b_base+0(FP), p
+ MOVQ b_len+8(FP), n
+ LEAQ (p)(n*1), end
// The first loop limit will be len(b)-32.
- SUBQ $32, BX
+ SUBQ $32, end
// Check whether we have at least one block.
- CMPQ DX, $32
+ CMPQ n, $32
JLT noBlocks
// Set up initial state (v1, v2, v3, v4).
- MOVQ R13, R8
- ADDQ R14, R8
- MOVQ R14, R9
- XORQ R10, R10
- XORQ R11, R11
- SUBQ R13, R11
-
- // Loop until SI > BX.
-blockLoop:
- round(R8)
- round(R9)
- round(R10)
- round(R11)
-
- CMPQ SI, BX
- JLE blockLoop
-
- MOVQ R8, AX
- ROLQ $1, AX
- MOVQ R9, R12
- ROLQ $7, R12
- ADDQ R12, AX
- MOVQ R10, R12
- ROLQ $12, R12
- ADDQ R12, AX
- MOVQ R11, R12
- ROLQ $18, R12
- ADDQ R12, AX
-
- mergeRound(AX, R8)
- mergeRound(AX, R9)
- mergeRound(AX, R10)
- mergeRound(AX, R11)
+ MOVQ prime1, v1
+ ADDQ prime2, v1
+ MOVQ prime2, v2
+ XORQ v3, v3
+ XORQ v4, v4
+ SUBQ prime1, v4
+
+ blockLoop()
+
+ MOVQ v1, h
+ ROLQ $1, h
+ MOVQ v2, x
+ ROLQ $7, x
+ ADDQ x, h
+ MOVQ v3, x
+ ROLQ $12, x
+ ADDQ x, h
+ MOVQ v4, x
+ ROLQ $18, x
+ ADDQ x, h
+
+ mergeRound(h, v1)
+ mergeRound(h, v2)
+ mergeRound(h, v3)
+ mergeRound(h, v4)
JMP afterBlocks
noBlocks:
- MOVQ ·prime5v(SB), AX
+ MOVQ ·primes+32(SB), h
afterBlocks:
- ADDQ DX, AX
-
- // Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
- ADDQ $24, BX
-
- CMPQ SI, BX
- JG fourByte
-
-wordLoop:
- // Calculate k1.
- MOVQ (SI), R8
- ADDQ $8, SI
- IMULQ R14, R8
- ROLQ $31, R8
- IMULQ R13, R8
-
- XORQ R8, AX
- ROLQ $27, AX
- IMULQ R13, AX
- ADDQ DI, AX
-
- CMPQ SI, BX
- JLE wordLoop
-
-fourByte:
- ADDQ $4, BX
- CMPQ SI, BX
- JG singles
-
- MOVL (SI), R8
- ADDQ $4, SI
- IMULQ R13, R8
- XORQ R8, AX
-
- ROLQ $23, AX
- IMULQ R14, AX
- ADDQ ·prime3v(SB), AX
-
-singles:
- ADDQ $4, BX
- CMPQ SI, BX
+ ADDQ n, h
+
+ ADDQ $24, end
+ CMPQ p, end
+ JG try4
+
+loop8:
+ MOVQ (p), x
+ ADDQ $8, p
+ round0(x)
+ XORQ x, h
+ ROLQ $27, h
+ IMULQ prime1, h
+ ADDQ prime4, h
+
+ CMPQ p, end
+ JLE loop8
+
+try4:
+ ADDQ $4, end
+ CMPQ p, end
+ JG try1
+
+ MOVL (p), x
+ ADDQ $4, p
+ IMULQ prime1, x
+ XORQ x, h
+
+ ROLQ $23, h
+ IMULQ prime2, h
+ ADDQ ·primes+16(SB), h
+
+try1:
+ ADDQ $4, end
+ CMPQ p, end
JGE finalize
-singlesLoop:
- MOVBQZX (SI), R12
- ADDQ $1, SI
- IMULQ ·prime5v(SB), R12
- XORQ R12, AX
+loop1:
+ MOVBQZX (p), x
+ ADDQ $1, p
+ IMULQ ·primes+32(SB), x
+ XORQ x, h
+ ROLQ $11, h
+ IMULQ prime1, h
- ROLQ $11, AX
- IMULQ R13, AX
-
- CMPQ SI, BX
- JL singlesLoop
+ CMPQ p, end
+ JL loop1
finalize:
- MOVQ AX, R12
- SHRQ $33, R12
- XORQ R12, AX
- IMULQ R14, AX
- MOVQ AX, R12
- SHRQ $29, R12
- XORQ R12, AX
- IMULQ ·prime3v(SB), AX
- MOVQ AX, R12
- SHRQ $32, R12
- XORQ R12, AX
-
- MOVQ AX, ret+24(FP)
+ MOVQ h, x
+ SHRQ $33, x
+ XORQ x, h
+ IMULQ prime2, h
+ MOVQ h, x
+ SHRQ $29, x
+ XORQ x, h
+ IMULQ ·primes+16(SB), h
+ MOVQ h, x
+ SHRQ $32, x
+ XORQ x, h
+
+ MOVQ h, ret+24(FP)
RET
-// writeBlocks uses the same registers as above except that it uses AX to store
-// the d pointer.
-
// func writeBlocks(d *Digest, b []byte) int
-TEXT ·writeBlocks(SB), NOSPLIT, $0-40
+TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// Load fixed primes needed for round.
- MOVQ ·prime1v(SB), R13
- MOVQ ·prime2v(SB), R14
+ MOVQ ·primes+0(SB), prime1
+ MOVQ ·primes+8(SB), prime2
// Load slice.
- MOVQ b_base+8(FP), SI
- MOVQ b_len+16(FP), DX
- LEAQ (SI)(DX*1), BX
- SUBQ $32, BX
+ MOVQ b_base+8(FP), p
+ MOVQ b_len+16(FP), n
+ LEAQ (p)(n*1), end
+ SUBQ $32, end
// Load vN from d.
- MOVQ d+0(FP), AX
- MOVQ 0(AX), R8 // v1
- MOVQ 8(AX), R9 // v2
- MOVQ 16(AX), R10 // v3
- MOVQ 24(AX), R11 // v4
+ MOVQ s+0(FP), d
+ MOVQ 0(d), v1
+ MOVQ 8(d), v2
+ MOVQ 16(d), v3
+ MOVQ 24(d), v4
// We don't need to check the loop condition here; this function is
// always called with at least one block of data to process.
-blockLoop:
- round(R8)
- round(R9)
- round(R10)
- round(R11)
-
- CMPQ SI, BX
- JLE blockLoop
+ blockLoop()
// Copy vN back to d.
- MOVQ R8, 0(AX)
- MOVQ R9, 8(AX)
- MOVQ R10, 16(AX)
- MOVQ R11, 24(AX)
-
- // The number of bytes written is SI minus the old base pointer.
- SUBQ b_base+8(FP), SI
- MOVQ SI, ret+32(FP)
+ MOVQ v1, 0(d)
+ MOVQ v2, 8(d)
+ MOVQ v3, 16(d)
+ MOVQ v4, 24(d)
+
+ // The number of bytes written is p minus the old base pointer.
+ SUBQ b_base+8(FP), p
+ MOVQ p, ret+32(FP)
RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
index 4d64a17d..17901e08 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
@@ -1,13 +1,17 @@
-// +build gc,!purego,!noasm
+//go:build !appengine && gc && !purego && !noasm
+// +build !appengine
+// +build gc
+// +build !purego
+// +build !noasm
#include "textflag.h"
-// Register allocation.
+// Registers:
#define digest R1
-#define h R2 // Return value.
-#define p R3 // Input pointer.
-#define len R4
-#define nblocks R5 // len / 32.
+#define h R2 // return value
+#define p R3 // input pointer
+#define n R4 // input length
+#define nblocks R5 // n / 32
#define prime1 R7
#define prime2 R8
#define prime3 R9
@@ -25,60 +29,52 @@
#define round(acc, x) \
MADD prime2, acc, x, acc \
ROR $64-31, acc \
- MUL prime1, acc \
+ MUL prime1, acc
-// x = round(0, x).
+// round0 performs the operation x = round(0, x).
#define round0(x) \
MUL prime2, x \
ROR $64-31, x \
- MUL prime1, x \
-
-#define mergeRound(x) \
- round0(x) \
- EOR x, h \
- MADD h, prime4, prime1, h \
-
-// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
-#define blocksLoop() \
- LSR $5, len, nblocks \
- PCALIGN $16 \
- loop: \
- LDP.P 32(p), (x1, x2) \
- round(v1, x1) \
- LDP -16(p), (x3, x4) \
- round(v2, x2) \
- SUB $1, nblocks \
- round(v3, x3) \
- round(v4, x4) \
- CBNZ nblocks, loop \
-
-// The primes are repeated here to ensure that they're stored
-// in a contiguous array, so we can load them with LDP.
-DATA primes<> +0(SB)/8, $11400714785074694791
-DATA primes<> +8(SB)/8, $14029467366897019727
-DATA primes<>+16(SB)/8, $1609587929392839161
-DATA primes<>+24(SB)/8, $9650029242287828579
-DATA primes<>+32(SB)/8, $2870177450012600261
-GLOBL primes<>(SB), NOPTR+RODATA, $40
+ MUL prime1, x
+
+#define mergeRound(acc, x) \
+ round0(x) \
+ EOR x, acc \
+ MADD acc, prime4, prime1, acc
+
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that n >= 32.
+#define blockLoop() \
+ LSR $5, n, nblocks \
+ PCALIGN $16 \
+ loop: \
+ LDP.P 16(p), (x1, x2) \
+ LDP.P 16(p), (x3, x4) \
+ round(v1, x1) \
+ round(v2, x2) \
+ round(v3, x3) \
+ round(v4, x4) \
+ SUB $1, nblocks \
+ CBNZ nblocks, loop
// func Sum64(b []byte) uint64
-TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
- LDP b_base+0(FP), (p, len)
+TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
+ LDP b_base+0(FP), (p, n)
- LDP primes<> +0(SB), (prime1, prime2)
- LDP primes<>+16(SB), (prime3, prime4)
- MOVD primes<>+32(SB), prime5
+ LDP ·primes+0(SB), (prime1, prime2)
+ LDP ·primes+16(SB), (prime3, prime4)
+ MOVD ·primes+32(SB), prime5
- CMP $32, len
- CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
- BLO afterLoop
+ CMP $32, n
+ CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
+ BLT afterLoop
ADD prime1, prime2, v1
MOVD prime2, v2
MOVD $0, v3
NEG prime1, v4
- blocksLoop()
+ blockLoop()
ROR $64-1, v1, x1
ROR $64-7, v2, x2
@@ -88,71 +84,75 @@ TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
ADD x3, x4
ADD x2, x4, h
- mergeRound(v1)
- mergeRound(v2)
- mergeRound(v3)
- mergeRound(v4)
+ mergeRound(h, v1)
+ mergeRound(h, v2)
+ mergeRound(h, v3)
+ mergeRound(h, v4)
afterLoop:
- ADD len, h
+ ADD n, h
- TBZ $4, len, try8
+ TBZ $4, n, try8
LDP.P 16(p), (x1, x2)
round0(x1)
+
+ // NOTE: here and below, sequencing the EOR after the ROR (using a
+ // rotated register) is worth a small but measurable speedup for small
+ // inputs.
ROR $64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
round0(x2)
ROR $64-27, h
- EOR x2 @> 64-27, h
+ EOR x2 @> 64-27, h, h
MADD h, prime4, prime1, h
try8:
- TBZ $3, len, try4
+ TBZ $3, n, try4
MOVD.P 8(p), x1
round0(x1)
ROR $64-27, h
- EOR x1 @> 64-27, h
+ EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
try4:
- TBZ $2, len, try2
+ TBZ $2, n, try2
MOVWU.P 4(p), x2
MUL prime1, x2
ROR $64-23, h
- EOR x2 @> 64-23, h
+ EOR x2 @> 64-23, h, h
MADD h, prime3, prime2, h
try2:
- TBZ $1, len, try1
+ TBZ $1, n, try1
MOVHU.P 2(p), x3
AND $255, x3, x1
LSR $8, x3, x2
MUL prime5, x1
ROR $64-11, h
- EOR x1 @> 64-11, h
+ EOR x1 @> 64-11, h, h
MUL prime1, h
MUL prime5, x2
ROR $64-11, h
- EOR x2 @> 64-11, h
+ EOR x2 @> 64-11, h, h
MUL prime1, h
try1:
- TBZ $0, len, end
+ TBZ $0, n, finalize
MOVBU (p), x4
MUL prime5, x4
ROR $64-11, h
- EOR x4 @> 64-11, h
+ EOR x4 @> 64-11, h, h
MUL prime1, h
-end:
+finalize:
EOR h >> 33, h
MUL prime2, h
EOR h >> 29, h
@@ -163,24 +163,22 @@ end:
RET
// func writeBlocks(d *Digest, b []byte) int
-//
-// Assumes len(b) >= 32.
-TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
- LDP primes<>(SB), (prime1, prime2)
+TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
+ LDP ·primes+0(SB), (prime1, prime2)
// Load state. Assume v[1-4] are stored contiguously.
MOVD d+0(FP), digest
LDP 0(digest), (v1, v2)
LDP 16(digest), (v3, v4)
- LDP b_base+8(FP), (p, len)
+ LDP b_base+8(FP), (p, n)
- blocksLoop()
+ blockLoop()
// Store updated state.
STP (v1, v2), 0(digest)
STP (v3, v4), 16(digest)
- BIC $31, len
- MOVD len, ret+32(FP)
+ BIC $31, n
+ MOVD n, ret+32(FP)
RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
index 1a1fac9c..d4221edf 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
@@ -13,4 +13,4 @@ package xxhash
func Sum64(b []byte) uint64
//go:noescape
-func writeBlocks(d *Digest, b []byte) int
+func writeBlocks(s *Digest, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
index 209cb4a9..0be16cef 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
@@ -15,10 +15,10 @@ func Sum64(b []byte) uint64 {
var h uint64
if n >= 32 {
- v1 := prime1v + prime2
+ v1 := primes[0] + prime2
v2 := prime2
v3 := uint64(0)
- v4 := -prime1v
+ v4 := -primes[0]
for len(b) >= 32 {
v1 = round(v1, u64(b[0:8:len(b)]))
v2 = round(v2, u64(b[8:16:len(b)]))
@@ -37,19 +37,18 @@ func Sum64(b []byte) uint64 {
h += uint64(n)
- i, end := 0, len(b)
- for ; i+8 <= end; i += 8 {
- k1 := round(0, u64(b[i:i+8:len(b)]))
+ for ; len(b) >= 8; b = b[8:] {
+ k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
- if i+4 <= end {
- h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
+ if len(b) >= 4 {
+ h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
- i += 4
+ b = b[4:]
}
- for ; i < end; i++ {
- h ^= uint64(b[i]) * prime5
+ for ; len(b) > 0; b = b[1:] {
+ h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
}