diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in')
-rw-r--r-- | vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in | 195 |
1 files changed, 195 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in new file mode 100644 index 00000000..330d86ae --- /dev/null +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in @@ -0,0 +1,195 @@ +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" +#include "funcdata.h" +#include "go_asm.h" + +#ifdef GOAMD64_v4 +#ifndef GOAMD64_v3 +#define GOAMD64_v3 +#endif +#endif + +#define bufoff 256 // see decompress.go, we're using [4][256]byte table + +//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, +// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool) +TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8 +#define off R8 +#define buffer DI +#define table SI + +#define br_bits_read R9 +#define br_value R10 +#define br_offset R11 +#define peek_bits R12 +#define exhausted DX + +#define br0 R13 +#define br1 R14 +#define br2 R15 +#define br3 BP + + MOVQ BP, 0(SP) + + XORQ exhausted, exhausted // exhausted = false + XORQ off, off // off = 0 + + MOVBQZX peekBits+32(FP), peek_bits + MOVQ buf+40(FP), buffer + MOVQ tbl+48(FP), table + + MOVQ pbr0+0(FP), br0 + MOVQ pbr1+8(FP), br1 + MOVQ pbr2+16(FP), br2 + MOVQ pbr3+24(FP), br3 + +main_loop: +{{ define "decode_2_values_x86" }} + // const stream = {{ var "id" }} + // br{{ var "id"}}.fillFast() + MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read + MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value + MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset + + // We must have at least 2 * max tablelog left + CMPQ br_bits_read, $64-22 + JBE skip_fill{{ var "id" }} + + SUBQ $32, br_bits_read // b.bitsRead -= 32 + SUBQ $4, br_offset // b.off -= 4 + + // v := b.in[b.off-4 : b.off] + // v = v[:4] + // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + MOVQ bitReaderShifted_in(br{{ var "id" }}), AX + + // b.value |= uint64(low) << (b.bitsRead & 63) +#ifdef GOAMD64_v3 + SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) +#else + MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) + MOVQ br_bits_read, CX + SHLQ CL, AX +#endif + + ORQ AX, br_value + + // exhausted = exhausted || (br{{ var "id"}}.off < 4) + CMPQ br_offset, $4 + SETLT DL + ORB DL, DH + // } +skip_fill{{ var "id" }}: + + // val0 := br{{ var "id"}}.peekTopBits(peekBits) +#ifdef GOAMD64_v3 + SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask +#else + MOVQ br_value, AX + MOVQ peek_bits, CX + SHRQ CL, AX // AX = (value >> peek_bits) & mask +#endif + + // v0 := table[val0&mask] + MOVW 0(table)(AX*2), AX // AX - v0 + + // br{{ var "id"}}.advance(uint8(v0.entry)) + MOVB AH, BL // BL = uint8(v0.entry >> 8) + +#ifdef GOAMD64_v3 + MOVBQZX AL, CX + SHLXQ AX, br_value, br_value // value <<= n +#else + MOVBQZX AL, CX + SHLQ CL, br_value // value <<= n +#endif + + ADDQ CX, br_bits_read // bits_read += n + + +#ifdef GOAMD64_v3 + SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask +#else + // val1 := br{{ var "id"}}.peekTopBits(peekBits) + MOVQ peek_bits, CX + MOVQ br_value, AX + SHRQ CL, AX // AX = (value >> peek_bits) & mask +#endif + + // v1 := table[val1&mask] + MOVW 0(table)(AX*2), AX // AX - v1 + + // br{{ var "id"}}.advance(uint8(v1.entry)) + MOVB AH, BH // BH = uint8(v1.entry >> 8) + +#ifdef GOAMD64_v3 + MOVBQZX AL, CX + SHLXQ AX, br_value, br_value // value <<= n +#else + MOVBQZX AL, CX + SHLQ CL, br_value // value <<= n +#endif + + ADDQ CX, br_bits_read // bits_read += n + + + // these two writes get coalesced + // buf[stream][off] = uint8(v0.entry >> 8) + // buf[stream][off+1] = uint8(v1.entry >> 8) + MOVW BX, {{ var "bufofs" }}(buffer)(off*1) + + // update the bitrader reader structure + MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }}) + MOVQ br_value, bitReaderShifted_value(br{{ var "id" }}) + MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }}) +{{ end }} + + {{ set "id" "0" }} + {{ set "ofs" "0" }} + {{ set "bufofs" "0" }} {{/* id * bufoff */}} + {{ template "decode_2_values_x86" . }} + + {{ set "id" "1" }} + {{ set "ofs" "8" }} + {{ set "bufofs" "256" }} + {{ template "decode_2_values_x86" . }} + + {{ set "id" "2" }} + {{ set "ofs" "16" }} + {{ set "bufofs" "512" }} + {{ template "decode_2_values_x86" . }} + + {{ set "id" "3" }} + {{ set "ofs" "24" }} + {{ set "bufofs" "768" }} + {{ template "decode_2_values_x86" . }} + + ADDQ $2, off // off += 2 + + TESTB DH, DH // any br[i].ofs < 4? + JNZ end + + CMPQ off, $bufoff + JL main_loop +end: + MOVQ 0(SP), BP + + MOVB off, ret+56(FP) + RET +#undef off +#undef buffer +#undef table + +#undef br_bits_read +#undef br_value +#undef br_offset +#undef peek_bits +#undef exhausted + +#undef br0 +#undef br1 +#undef br2 +#undef br3 |