summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/huff0/decompress_amd64.s')
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s1179
1 files changed, 769 insertions, 410 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index 2edad3ea..6c65c6e2 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -1,506 +1,865 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $8-8
+ XORQ DX, DX
+
+ // Preload values
+ MOVQ ctx+0(FP), AX
+ MOVBQZX 32(AX), SI
+ MOVQ 40(AX), DI
+ MOVQ DI, BX
+ MOVQ 72(AX), CX
+ MOVQ CX, (SP)
+ MOVQ 48(AX), R8
+ MOVQ 56(AX), R9
+ MOVQ (AX), R10
+ MOVQ 8(AX), R11
+ MOVQ 16(AX), R12
+ MOVQ 24(AX), R13
+
+ // Main loop
main_loop:
-
- // const stream = 0
- // br0.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
- MOVQ bitReaderShifted_value(br0), br_value
- MOVQ bitReaderShifted_off(br0), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill0
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br0), AX
+ MOVQ BX, DI
+ CMPQ DI, (SP)
+ SETGE DL
+
+ // br0.fillFast32()
+ MOVQ 32(R10), R14
+ MOVBQZX 40(R10), R15
+ CMPQ R15, $0x20
+ JBE skip_fill0
+ MOVQ 24(R10), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R10), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R10)
+ ORQ BP, R14
// exhausted = exhausted || (br0.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill0:
-
// val0 := br0.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ MOVW (R9)(BP*2), CX
-#endif
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R9)(BP*2), CX
// br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 0(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
// update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
- MOVQ br_value, bitReaderShifted_value(br0)
- MOVQ br_offset, bitReaderShifted_off(br0)
-
- // const stream = 1
- // br1.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
- MOVQ bitReaderShifted_value(br1), br_value
- MOVQ bitReaderShifted_off(br1), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill1
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br1), AX
+ MOVQ R14, 32(R10)
+ MOVB R15, 40(R10)
+ ADDQ R8, DI
+
+ // br1.fillFast32()
+ MOVQ 32(R11), R14
+ MOVBQZX 40(R11), R15
+ CMPQ R15, $0x20
+ JBE skip_fill1
+ MOVQ 24(R11), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R11), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R11)
+ ORQ BP, R14
// exhausted = exhausted || (br1.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill1:
-
// val0 := br1.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ MOVW (R9)(BP*2), CX
-#endif
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R9)(BP*2), CX
// br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 256(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
// update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
- MOVQ br_value, bitReaderShifted_value(br1)
- MOVQ br_offset, bitReaderShifted_off(br1)
-
- // const stream = 2
- // br2.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
- MOVQ bitReaderShifted_value(br2), br_value
- MOVQ bitReaderShifted_off(br2), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill2
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br2), AX
+ MOVQ R14, 32(R11)
+ MOVB R15, 40(R11)
+ ADDQ R8, DI
+
+ // br2.fillFast32()
+ MOVQ 32(R12), R14
+ MOVBQZX 40(R12), R15
+ CMPQ R15, $0x20
+ JBE skip_fill2
+ MOVQ 24(R12), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R12), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R12)
+ ORQ BP, R14
// exhausted = exhausted || (br2.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill2:
-
// val0 := br2.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ MOVW (R9)(BP*2), CX
-#endif
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R9)(BP*2), CX
// br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 512(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
// update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
- MOVQ br_value, bitReaderShifted_value(br2)
- MOVQ br_offset, bitReaderShifted_off(br2)
-
- // const stream = 3
- // br3.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
- MOVQ bitReaderShifted_value(br3), br_value
- MOVQ bitReaderShifted_off(br3), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill3
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br3), AX
+ MOVQ R14, 32(R12)
+ MOVB R15, 40(R12)
+ ADDQ R8, DI
+
+ // br3.fillFast32()
+ MOVQ 32(R13), R14
+ MOVBQZX 40(R13), R15
+ CMPQ R15, $0x20
+ JBE skip_fill3
+ MOVQ 24(R13), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R13), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R13)
+ ORQ BP, R14
// exhausted = exhausted || (br3.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill3:
-
// val0 := br3.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
+ // v0 := table[val0&mask]
+ MOVW (R9)(BP*2), CX
-#endif
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ // v1 := table[val1&mask]
+ MOVW (R9)(BP*2), CX
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
+ // br3.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
-#endif
+ // update the bitrader reader structure
+ MOVQ R14, 32(R13)
+ MOVB R15, 40(R13)
+ ADDQ $0x02, BX
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ MOVQ 40(AX), CX
+ MOVQ BX, DX
+ SUBQ CX, DX
+ SHLQ $0x02, DX
+ MOVQ DX, 64(AX)
+ RET
- ADDQ CX, br_bits_read // bits_read += n
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
+ XORQ DX, DX
+
+ // Preload values
+ MOVQ ctx+0(FP), CX
+ MOVBQZX 32(CX), BX
+ MOVQ 40(CX), SI
+ MOVQ SI, (SP)
+ MOVQ 72(CX), DX
+ MOVQ DX, 8(SP)
+ MOVQ 48(CX), DI
+ MOVQ 56(CX), R8
+ MOVQ (CX), R9
+ MOVQ 8(CX), R10
+ MOVQ 16(CX), R11
+ MOVQ 24(CX), R12
+
+ // Main loop
+main_loop:
+ MOVQ (SP), SI
+ CMPQ SI, 8(SP)
+ SETGE DL
+
+ // br1000.fillFast32()
+ MOVQ 32(R9), R13
+ MOVBQZX 40(R9), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1000
+ MOVQ 24(R9), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R9), BP
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R9)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1000.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1000:
+ // val0 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
-#else
- // val1 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
-#endif
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br0.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br0.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br0.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R9)
+ MOVB R14, 40(R9)
+ ADDQ DI, SI
+
+ // br1001.fillFast32()
+ MOVQ 32(R10), R13
+ MOVBQZX 40(R10), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1001
+ MOVQ 24(R10), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R10), BP
- // br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R10)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1001.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1001:
+ // val0 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
-#endif
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br1.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br1.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br1.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R10)
+ MOVB R14, 40(R10)
+ ADDQ DI, SI
+
+ // br1002.fillFast32()
+ MOVQ 32(R11), R13
+ MOVBQZX 40(R11), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1002
+ MOVQ 24(R11), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R11), BP
- ADDQ CX, br_bits_read // bits_read += n
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R11)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1002.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1002:
+ // val0 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 768(buffer)(off*1)
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
- MOVQ br_value, bitReaderShifted_value(br3)
- MOVQ br_offset, bitReaderShifted_off(br3)
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
- ADDQ $2, off // off += 2
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br2.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br2.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br2.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R11)
+ MOVB R14, 40(R11)
+ ADDQ DI, SI
+
+ // br1003.fillFast32()
+ MOVQ 32(R12), R13
+ MOVBQZX 40(R12), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1003
+ MOVQ 24(R12), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R12), BP
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R12)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1003.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1003:
+ // val0 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
- CMPQ off, $bufoff
- JL main_loop
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
-end:
- MOVQ 0(SP), BP
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
- MOVB off, ret+56(FP)
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br3.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br3.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br3.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R12)
+ MOVB R14, 40(R12)
+ ADDQ $0x04, (SP)
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ MOVQ 40(AX), CX
+ MOVQ (SP), DX
+ SUBQ CX, DX
+ SHLQ $0x02, DX
+ MOVQ DX, 64(AX)
RET
-#undef off
-#undef buffer
-#undef table
+// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+TEXT ·decompress1x_main_loop_amd64(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_1_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_2_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, CX
+ MOVQ 16(AX), DX
+ SUBQ DX, CX
+ MOVQ CX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
-#undef br0
-#undef br1
-#undef br2
-#undef br3
+ // Report error
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
+
+// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+// Requires: BMI2
+TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_1_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_2_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, CX
+ MOVQ 16(AX), DX
+ SUBQ DX, CX
+ MOVQ CX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET