summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/huff0/decompress_amd64.s')
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s585
1 files changed, 284 insertions, 301 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index dd1a5aec..c4c7ab2d 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -1,364 +1,352 @@
// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
//go:build amd64 && !appengine && !noasm && gc
-// +build amd64,!appengine,!noasm,gc
// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_main_loop_amd64(SB), $0-8
- XORQ DX, DX
-
// Preload values
MOVQ ctx+0(FP), AX
MOVBQZX 8(AX), DI
- MOVQ 16(AX), SI
- MOVQ 48(AX), BX
- MOVQ 24(AX), R9
- MOVQ 32(AX), R10
- MOVQ (AX), R11
+ MOVQ 16(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 24(AX), R8
+ MOVQ 32(AX), R9
+ MOVQ (AX), R10
// Main loop
main_loop:
- MOVQ SI, R8
- CMPQ R8, BX
+ XORL DX, DX
+ CMPQ BX, SI
SETGE DL
// br0.fillFast32()
- MOVQ 32(R11), R12
- MOVBQZX 40(R11), R13
- CMPQ R13, $0x20
+ MOVQ 32(R10), R11
+ MOVBQZX 40(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill0
- MOVQ 24(R11), AX
- SUBQ $0x20, R13
+ MOVQ 24(R10), AX
+ SUBQ $0x20, R12
SUBQ $0x04, AX
- MOVQ (R11), R14
+ MOVQ (R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(R14*1), R14
- MOVQ R13, CX
- SHLQ CL, R14
- MOVQ AX, 24(R11)
- ORQ R14, R12
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 24(R10)
+ ORQ R13, R11
- // exhausted = exhausted || (br0.off < 4)
- CMPQ AX, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br0.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br0.peekTopBits(peekBits)
MOVQ DI, CX
- MOVQ R12, R14
- SHRQ CL, R14
+ MOVQ R11, R13
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (R8)
+ MOVW AX, (BX)
// update the bitreader structure
- MOVQ R12, 32(R11)
- MOVB R13, 40(R11)
- ADDQ R9, R8
+ MOVQ R11, 32(R10)
+ MOVB R12, 40(R10)
// br1.fillFast32()
- MOVQ 80(R11), R12
- MOVBQZX 88(R11), R13
- CMPQ R13, $0x20
+ MOVQ 80(R10), R11
+ MOVBQZX 88(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill1
- MOVQ 72(R11), AX
- SUBQ $0x20, R13
+ MOVQ 72(R10), AX
+ SUBQ $0x20, R12
SUBQ $0x04, AX
- MOVQ 48(R11), R14
+ MOVQ 48(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(R14*1), R14
- MOVQ R13, CX
- SHLQ CL, R14
- MOVQ AX, 72(R11)
- ORQ R14, R12
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 72(R10)
+ ORQ R13, R11
- // exhausted = exhausted || (br1.off < 4)
- CMPQ AX, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br1.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br1.peekTopBits(peekBits)
MOVQ DI, CX
- MOVQ R12, R14
- SHRQ CL, R14
+ MOVQ R11, R13
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (R8)
+ MOVW AX, (BX)(R8*1)
// update the bitreader structure
- MOVQ R12, 80(R11)
- MOVB R13, 88(R11)
- ADDQ R9, R8
+ MOVQ R11, 80(R10)
+ MOVB R12, 88(R10)
// br2.fillFast32()
- MOVQ 128(R11), R12
- MOVBQZX 136(R11), R13
- CMPQ R13, $0x20
+ MOVQ 128(R10), R11
+ MOVBQZX 136(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill2
- MOVQ 120(R11), AX
- SUBQ $0x20, R13
+ MOVQ 120(R10), AX
+ SUBQ $0x20, R12
SUBQ $0x04, AX
- MOVQ 96(R11), R14
+ MOVQ 96(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(R14*1), R14
- MOVQ R13, CX
- SHLQ CL, R14
- MOVQ AX, 120(R11)
- ORQ R14, R12
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 120(R10)
+ ORQ R13, R11
- // exhausted = exhausted || (br2.off < 4)
- CMPQ AX, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br2.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br2.peekTopBits(peekBits)
MOVQ DI, CX
- MOVQ R12, R14
- SHRQ CL, R14
+ MOVQ R11, R13
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (R8)
+ MOVW AX, (BX)(R8*2)
// update the bitreader structure
- MOVQ R12, 128(R11)
- MOVB R13, 136(R11)
- ADDQ R9, R8
+ MOVQ R11, 128(R10)
+ MOVB R12, 136(R10)
// br3.fillFast32()
- MOVQ 176(R11), R12
- MOVBQZX 184(R11), R13
- CMPQ R13, $0x20
+ MOVQ 176(R10), R11
+ MOVBQZX 184(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill3
- MOVQ 168(R11), AX
- SUBQ $0x20, R13
+ MOVQ 168(R10), AX
+ SUBQ $0x20, R12
SUBQ $0x04, AX
- MOVQ 144(R11), R14
+ MOVQ 144(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(R14*1), R14
- MOVQ R13, CX
- SHLQ CL, R14
- MOVQ AX, 168(R11)
- ORQ R14, R12
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 168(R10)
+ ORQ R13, R11
- // exhausted = exhausted || (br3.off < 4)
- CMPQ AX, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br3.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br3.peekTopBits(peekBits)
MOVQ DI, CX
- MOVQ R12, R14
- SHRQ CL, R14
+ MOVQ R11, R13
+ SHRQ CL, R13
// v1 := table[val1&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (R8)
+ LEAQ (R8)(R8*2), CX
+ MOVW AX, (BX)(CX*1)
// update the bitreader structure
- MOVQ R12, 176(R11)
- MOVB R13, 184(R11)
- ADDQ $0x02, SI
+ MOVQ R11, 176(R10)
+ MOVB R12, 184(R10)
+ ADDQ $0x02, BX
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
- SUBQ 16(AX), SI
- SHLQ $0x02, SI
- MOVQ SI, 40(AX)
+ SUBQ 16(AX), BX
+ SHLQ $0x02, BX
+ MOVQ BX, 40(AX)
RET
// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
- XORQ DX, DX
-
// Preload values
MOVQ ctx+0(FP), CX
MOVBQZX 8(CX), DI
MOVQ 16(CX), BX
MOVQ 48(CX), SI
- MOVQ 24(CX), R9
- MOVQ 32(CX), R10
- MOVQ (CX), R11
+ MOVQ 24(CX), R8
+ MOVQ 32(CX), R9
+ MOVQ (CX), R10
// Main loop
main_loop:
- MOVQ BX, R8
- CMPQ R8, SI
+ XORL DX, DX
+ CMPQ BX, SI
SETGE DL
// br0.fillFast32()
- MOVQ 32(R11), R12
- MOVBQZX 40(R11), R13
- CMPQ R13, $0x20
+ MOVQ 32(R10), R11
+ MOVBQZX 40(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill0
- MOVQ 24(R11), R14
- SUBQ $0x20, R13
- SUBQ $0x04, R14
- MOVQ (R11), R15
+ MOVQ 24(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ (R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R14)(R15*1), R15
- MOVQ R13, CX
- SHLQ CL, R15
- MOVQ R14, 24(R11)
- ORQ R15, R12
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 24(R10)
+ ORQ R14, R11
- // exhausted = exhausted || (br0.off < 4)
- CMPQ R14, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br0.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v1 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry)
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// val2 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v2 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v2.entry)
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val3 := br0.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v3 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br0.advance(uint8(v3.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@@ -366,88 +354,86 @@ skip_fill0:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
- MOVL AX, (R8)
+ MOVL AX, (BX)
// update the bitreader structure
- MOVQ R12, 32(R11)
- MOVB R13, 40(R11)
- ADDQ R9, R8
+ MOVQ R11, 32(R10)
+ MOVB R12, 40(R10)
// br1.fillFast32()
- MOVQ 80(R11), R12
- MOVBQZX 88(R11), R13
- CMPQ R13, $0x20
+ MOVQ 80(R10), R11
+ MOVBQZX 88(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill1
- MOVQ 72(R11), R14
- SUBQ $0x20, R13
- SUBQ $0x04, R14
- MOVQ 48(R11), R15
+ MOVQ 72(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ 48(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R14)(R15*1), R15
- MOVQ R13, CX
- SHLQ CL, R15
- MOVQ R14, 72(R11)
- ORQ R15, R12
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 72(R10)
+ ORQ R14, R11
- // exhausted = exhausted || (br1.off < 4)
- CMPQ R14, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br1.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v1 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry)
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// val2 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v2 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v2.entry)
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val3 := br1.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v3 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br1.advance(uint8(v3.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@@ -455,88 +441,86 @@ skip_fill1:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
- MOVL AX, (R8)
+ MOVL AX, (BX)(R8*1)
// update the bitreader structure
- MOVQ R12, 80(R11)
- MOVB R13, 88(R11)
- ADDQ R9, R8
+ MOVQ R11, 80(R10)
+ MOVB R12, 88(R10)
// br2.fillFast32()
- MOVQ 128(R11), R12
- MOVBQZX 136(R11), R13
- CMPQ R13, $0x20
+ MOVQ 128(R10), R11
+ MOVBQZX 136(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill2
- MOVQ 120(R11), R14
- SUBQ $0x20, R13
- SUBQ $0x04, R14
- MOVQ 96(R11), R15
+ MOVQ 120(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ 96(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R14)(R15*1), R15
- MOVQ R13, CX
- SHLQ CL, R15
- MOVQ R14, 120(R11)
- ORQ R15, R12
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 120(R10)
+ ORQ R14, R11
- // exhausted = exhausted || (br2.off < 4)
- CMPQ R14, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br2.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v1 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry)
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// val2 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v2 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v2.entry)
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val3 := br2.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v3 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br2.advance(uint8(v3.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@@ -544,88 +528,86 @@ skip_fill2:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
- MOVL AX, (R8)
+ MOVL AX, (BX)(R8*2)
// update the bitreader structure
- MOVQ R12, 128(R11)
- MOVB R13, 136(R11)
- ADDQ R9, R8
+ MOVQ R11, 128(R10)
+ MOVB R12, 136(R10)
// br3.fillFast32()
- MOVQ 176(R11), R12
- MOVBQZX 184(R11), R13
- CMPQ R13, $0x20
+ MOVQ 176(R10), R11
+ MOVBQZX 184(R10), R12
+ CMPQ R12, $0x20
JBE skip_fill3
- MOVQ 168(R11), R14
- SUBQ $0x20, R13
- SUBQ $0x04, R14
- MOVQ 144(R11), R15
+ MOVQ 168(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ 144(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R14)(R15*1), R15
- MOVQ R13, CX
- SHLQ CL, R15
- MOVQ R14, 168(R11)
- ORQ R15, R12
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 168(R10)
+ ORQ R14, R11
- // exhausted = exhausted || (br3.off < 4)
- CMPQ R14, $0x04
- SETLT AL
- ORB AL, DL
+ // exhausted += (br3.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v0 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val1 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v1 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry)
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// val2 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v2 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v2.entry)
MOVB CH, AH
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
// val3 := br3.peekTopBits(peekBits)
- MOVQ R12, R14
+ MOVQ R11, R13
MOVQ DI, CX
- SHRQ CL, R14
+ SHRQ CL, R13
// v3 := table[val0&mask]
- MOVW (R10)(R14*2), CX
+ MOVW (R9)(R13*2), CX
// br3.advance(uint8(v3.entry)
MOVB CH, AL
- SHLQ CL, R12
- ADDB CL, R13
+ SHLQ CL, R11
+ ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@@ -633,11 +615,12 @@ skip_fill3:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
- MOVL AX, (R8)
+ LEAQ (R8)(R8*2), CX
+ MOVL AX, (BX)(CX*1)
// update the bitreader structure
- MOVQ R12, 176(R11)
- MOVB R13, 184(R11)
+ MOVQ R11, 176(R10)
+ MOVB R12, 184(R10)
ADDQ $0x04, BX
TESTB DL, DL
JZ main_loop
@@ -653,7 +636,7 @@ TEXT ·decompress1x_main_loop_amd64(SB), $0-8
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
- JB error_max_decoded_size_exeeded
+ JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
@@ -668,7 +651,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
- JGE error_max_decoded_size_exeeded
+ JGE error_max_decoded_size_exceeded
// Decode 4 values
CMPQ R11, $0x20
@@ -745,7 +728,7 @@ loop_condition:
RET
// Report error
-error_max_decoded_size_exeeded:
+error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)
@@ -758,7 +741,7 @@ TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
- JB error_max_decoded_size_exeeded
+ JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
@@ -773,7 +756,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
- JGE error_max_decoded_size_exeeded
+ JGE error_max_decoded_size_exceeded
// Decode 4 values
CMPQ R11, $0x20
@@ -840,7 +823,7 @@ loop_condition:
RET
// Report error
-error_max_decoded_size_exeeded:
+error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)