summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/huff0/decompress_amd64.s')
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s506
1 files changed, 506 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
new file mode 100644
index 00000000..2edad3ea
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -0,0 +1,506 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#ifdef GOAMD64_v4
+#ifndef GOAMD64_v3
+#define GOAMD64_v3
+#endif
+#endif
+
+#define bufoff 256 // see decompress.go, we're using [4][256]byte table
+
+// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off R8
+#define buffer DI
+#define table SI
+
+#define br_bits_read R9
+#define br_value R10
+#define br_offset R11
+#define peek_bits R12
+#define exhausted DX
+
+#define br0 R13
+#define br1 R14
+#define br2 R15
+#define br3 BP
+
+ MOVQ BP, 0(SP)
+
+ XORQ exhausted, exhausted // exhausted = false
+ XORQ off, off // off = 0
+
+ MOVBQZX peekBits+32(FP), peek_bits
+ MOVQ buf+40(FP), buffer
+ MOVQ tbl+48(FP), table
+
+ MOVQ pbr0+0(FP), br0
+ MOVQ pbr1+8(FP), br1
+ MOVQ pbr2+16(FP), br2
+ MOVQ pbr3+24(FP), br3
+
+main_loop:
+
+ // const stream = 0
+ // br0.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
+ MOVQ bitReaderShifted_value(br0), br_value
+ MOVQ bitReaderShifted_off(br0), br_offset
+
+ // We must have at least 2 * max tablelog left
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill0
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br0), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br0.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill0:
+
+ // val0 := br0.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br0.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br0.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 0(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
+ MOVQ br_value, bitReaderShifted_value(br0)
+ MOVQ br_offset, bitReaderShifted_off(br0)
+
+ // const stream = 1
+ // br1.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
+ MOVQ bitReaderShifted_value(br1), br_value
+ MOVQ bitReaderShifted_off(br1), br_offset
+
+ // We must have at least 2 * max tablelog left
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill1
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br1), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br1.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill1:
+
+ // val0 := br1.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br1.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br1.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 256(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
+ MOVQ br_value, bitReaderShifted_value(br1)
+ MOVQ br_offset, bitReaderShifted_off(br1)
+
+ // const stream = 2
+ // br2.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
+ MOVQ bitReaderShifted_value(br2), br_value
+ MOVQ bitReaderShifted_off(br2), br_offset
+
+ // We must have at least 2 * max tablelog left
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill2
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br2), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br2.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill2:
+
+ // val0 := br2.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br2.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br2.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 512(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
+ MOVQ br_value, bitReaderShifted_value(br2)
+ MOVQ br_offset, bitReaderShifted_off(br2)
+
+ // const stream = 3
+ // br3.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
+ MOVQ bitReaderShifted_value(br3), br_value
+ MOVQ bitReaderShifted_off(br3), br_offset
+
+ // We must have at least 2 * max tablelog left
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill3
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br3), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br3.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+
+ // }
+skip_fill3:
+
+ // val0 := br3.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br3.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br3.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, 768(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
+ MOVQ br_value, bitReaderShifted_value(br3)
+ MOVQ br_offset, bitReaderShifted_off(br3)
+
+ ADDQ $2, off // off += 2
+
+ TESTB DH, DH // any br[i].ofs < 4?
+ JNZ end
+
+ CMPQ off, $bufoff
+ JL main_loop
+
+end:
+ MOVQ 0(SP), BP
+
+ MOVB off, ret+56(FP)
+ RET
+
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3