summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in')
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in195
1 files changed, 195 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
new file mode 100644
index 00000000..330d86ae
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
@@ -0,0 +1,195 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#ifdef GOAMD64_v4
+#ifndef GOAMD64_v3
+#define GOAMD64_v3
+#endif
+#endif
+
+#define bufoff 256 // see decompress.go, we're using [4][256]byte table
+
+//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off R8
+#define buffer DI
+#define table SI
+
+#define br_bits_read R9
+#define br_value R10
+#define br_offset R11
+#define peek_bits R12
+#define exhausted DX
+
+#define br0 R13
+#define br1 R14
+#define br2 R15
+#define br3 BP
+
+ MOVQ BP, 0(SP)
+
+ XORQ exhausted, exhausted // exhausted = false
+ XORQ off, off // off = 0
+
+ MOVBQZX peekBits+32(FP), peek_bits
+ MOVQ buf+40(FP), buffer
+ MOVQ tbl+48(FP), table
+
+ MOVQ pbr0+0(FP), br0
+ MOVQ pbr1+8(FP), br1
+ MOVQ pbr2+16(FP), br2
+ MOVQ pbr3+24(FP), br3
+
+main_loop:
+{{ define "decode_2_values_x86" }}
+ // const stream = {{ var "id" }}
+ // br{{ var "id"}}.fillFast()
+ MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
+ MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
+ MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
+
+ // We must have at least 2 * max tablelog left
+ CMPQ br_bits_read, $64-22
+ JBE skip_fill{{ var "id" }}
+
+ SUBQ $32, br_bits_read // b.bitsRead -= 32
+ SUBQ $4, br_offset // b.off -= 4
+
+ // v := b.in[b.off-4 : b.off]
+ // v = v[:4]
+ // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+ SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+#else
+ MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+ MOVQ br_bits_read, CX
+ SHLQ CL, AX
+#endif
+
+ ORQ AX, br_value
+
+ // exhausted = exhausted || (br{{ var "id"}}.off < 4)
+ CMPQ br_offset, $4
+ SETLT DL
+ ORB DL, DH
+ // }
+skip_fill{{ var "id" }}:
+
+ // val0 := br{{ var "id"}}.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+#else
+ MOVQ br_value, AX
+ MOVQ peek_bits, CX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+#endif
+
+ // v0 := table[val0&mask]
+ MOVW 0(table)(AX*2), AX // AX - v0
+
+ // br{{ var "id"}}.advance(uint8(v0.entry))
+ MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+
+#ifdef GOAMD64_v3
+ SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+#else
+ // val1 := br{{ var "id"}}.peekTopBits(peekBits)
+ MOVQ peek_bits, CX
+ MOVQ br_value, AX
+ SHRQ CL, AX // AX = (value >> peek_bits) & mask
+#endif
+
+ // v1 := table[val1&mask]
+ MOVW 0(table)(AX*2), AX // AX - v1
+
+ // br{{ var "id"}}.advance(uint8(v1.entry))
+ MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+ MOVBQZX AL, CX
+ SHLXQ AX, br_value, br_value // value <<= n
+#else
+ MOVBQZX AL, CX
+ SHLQ CL, br_value // value <<= n
+#endif
+
+ ADDQ CX, br_bits_read // bits_read += n
+
+
+ // these two writes get coalesced
+ // buf[stream][off] = uint8(v0.entry >> 8)
+ // buf[stream][off+1] = uint8(v1.entry >> 8)
+ MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
+
+ // update the bitrader reader structure
+ MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
+ MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
+ MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
+{{ end }}
+
+ {{ set "id" "0" }}
+ {{ set "ofs" "0" }}
+ {{ set "bufofs" "0" }} {{/* id * bufoff */}}
+ {{ template "decode_2_values_x86" . }}
+
+ {{ set "id" "1" }}
+ {{ set "ofs" "8" }}
+ {{ set "bufofs" "256" }}
+ {{ template "decode_2_values_x86" . }}
+
+ {{ set "id" "2" }}
+ {{ set "ofs" "16" }}
+ {{ set "bufofs" "512" }}
+ {{ template "decode_2_values_x86" . }}
+
+ {{ set "id" "3" }}
+ {{ set "ofs" "24" }}
+ {{ set "bufofs" "768" }}
+ {{ template "decode_2_values_x86" . }}
+
+ ADDQ $2, off // off += 2
+
+ TESTB DH, DH // any br[i].ofs < 4?
+ JNZ end
+
+ CMPQ off, $bufoff
+ JL main_loop
+end:
+ MOVQ 0(SP), BP
+
+ MOVB off, ret+56(FP)
+ RET
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3