diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/huff0/decompress_amd64.s')
-rw-r--r-- | vendor/github.com/klauspost/compress/huff0/decompress_amd64.s | 1179 |
1 files changed, 769 insertions, 410 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s index 2edad3ea..6c65c6e2 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s @@ -1,506 +1,865 @@ -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" -#include "funcdata.h" -#include "go_asm.h" - -#ifdef GOAMD64_v4 -#ifndef GOAMD64_v3 -#define GOAMD64_v3 -#endif -#endif - -#define bufoff 256 // see decompress.go, we're using [4][256]byte table - -// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, -// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool) -TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8 -#define off R8 -#define buffer DI -#define table SI - -#define br_bits_read R9 -#define br_value R10 -#define br_offset R11 -#define peek_bits R12 -#define exhausted DX - -#define br0 R13 -#define br1 R14 -#define br2 R15 -#define br3 BP - - MOVQ BP, 0(SP) - - XORQ exhausted, exhausted // exhausted = false - XORQ off, off // off = 0 - - MOVBQZX peekBits+32(FP), peek_bits - MOVQ buf+40(FP), buffer - MOVQ tbl+48(FP), table - - MOVQ pbr0+0(FP), br0 - MOVQ pbr1+8(FP), br1 - MOVQ pbr2+16(FP), br2 - MOVQ pbr3+24(FP), br3 - +// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT. + +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +// func decompress4x_main_loop_amd64(ctx *decompress4xContext) +TEXT ·decompress4x_main_loop_amd64(SB), $8-8 + XORQ DX, DX + + // Preload values + MOVQ ctx+0(FP), AX + MOVBQZX 32(AX), SI + MOVQ 40(AX), DI + MOVQ DI, BX + MOVQ 72(AX), CX + MOVQ CX, (SP) + MOVQ 48(AX), R8 + MOVQ 56(AX), R9 + MOVQ (AX), R10 + MOVQ 8(AX), R11 + MOVQ 16(AX), R12 + MOVQ 24(AX), R13 + + // Main loop main_loop: - - // const stream = 0 - // br0.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read - MOVQ bitReaderShifted_value(br0), br_value - MOVQ bitReaderShifted_off(br0), br_offset - - // We must have at least 2 * max tablelog left - CMPQ br_bits_read, $64-22 - JBE skip_fill0 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br0), AX + MOVQ BX, DI + CMPQ DI, (SP) + SETGE DL + + // br0.fillFast32() + MOVQ 32(R10), R14 + MOVBQZX 40(R10), R15 + CMPQ R15, $0x20 + JBE skip_fill0 + MOVQ 24(R10), AX + SUBQ $0x20, R15 + SUBQ $0x04, AX + MOVQ (R10), BP // b.value |= uint64(low) << (b.bitsRead & 63) -#ifdef GOAMD64_v3 - SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) - -#else - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - MOVQ br_bits_read, CX - SHLQ CL, AX - -#endif - - ORQ AX, br_value + MOVL (AX)(BP*1), BP + MOVQ R15, CX + SHLQ CL, BP + MOVQ AX, 24(R10) + ORQ BP, R14 // exhausted = exhausted || (br0.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL - // } skip_fill0: - // val0 := br0.peekTopBits(peekBits) -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ R14, BP + MOVQ SI, CX + SHRQ CL, BP // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br0.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n + MOVW (R9)(BP*2), CX -#endif + // br0.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R14 + ADDB CL, R15 - ADDQ CX, br_bits_read // bits_read += n - -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else // val1 := br0.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ SI, CX + MOVQ R14, BP + SHRQ CL, BP // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 + MOVW (R9)(BP*2), CX // br0.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - -#endif - - ADDQ CX, br_bits_read // bits_read += n + MOVB CH, AH + SHLQ CL, R14 + ADDB CL, R15 // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 0(buffer)(off*1) + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (DI) // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br0) - MOVQ br_value, bitReaderShifted_value(br0) - MOVQ br_offset, bitReaderShifted_off(br0) - - // const stream = 1 - // br1.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read - MOVQ bitReaderShifted_value(br1), br_value - MOVQ bitReaderShifted_off(br1), br_offset - - // We must have at least 2 * max tablelog left - CMPQ br_bits_read, $64-22 - JBE skip_fill1 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br1), AX + MOVQ R14, 32(R10) + MOVB R15, 40(R10) + ADDQ R8, DI + + // br1.fillFast32() + MOVQ 32(R11), R14 + MOVBQZX 40(R11), R15 + CMPQ R15, $0x20 + JBE skip_fill1 + MOVQ 24(R11), AX + SUBQ $0x20, R15 + SUBQ $0x04, AX + MOVQ (R11), BP // b.value |= uint64(low) << (b.bitsRead & 63) -#ifdef GOAMD64_v3 - SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) - -#else - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - MOVQ br_bits_read, CX - SHLQ CL, AX - -#endif - - ORQ AX, br_value + MOVL (AX)(BP*1), BP + MOVQ R15, CX + SHLQ CL, BP + MOVQ AX, 24(R11) + ORQ BP, R14 // exhausted = exhausted || (br1.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL - // } skip_fill1: - // val0 := br1.peekTopBits(peekBits) -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ R14, BP + MOVQ SI, CX + SHRQ CL, BP // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br1.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n + MOVW (R9)(BP*2), CX -#endif + // br1.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R14 + ADDB CL, R15 - ADDQ CX, br_bits_read // bits_read += n - -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else // val1 := br1.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ SI, CX + MOVQ R14, BP + SHRQ CL, BP // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 + MOVW (R9)(BP*2), CX // br1.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - -#endif - - ADDQ CX, br_bits_read // bits_read += n + MOVB CH, AH + SHLQ CL, R14 + ADDB CL, R15 // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 256(buffer)(off*1) + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (DI) // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br1) - MOVQ br_value, bitReaderShifted_value(br1) - MOVQ br_offset, bitReaderShifted_off(br1) - - // const stream = 2 - // br2.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read - MOVQ bitReaderShifted_value(br2), br_value - MOVQ bitReaderShifted_off(br2), br_offset - - // We must have at least 2 * max tablelog left - CMPQ br_bits_read, $64-22 - JBE skip_fill2 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br2), AX + MOVQ R14, 32(R11) + MOVB R15, 40(R11) + ADDQ R8, DI + + // br2.fillFast32() + MOVQ 32(R12), R14 + MOVBQZX 40(R12), R15 + CMPQ R15, $0x20 + JBE skip_fill2 + MOVQ 24(R12), AX + SUBQ $0x20, R15 + SUBQ $0x04, AX + MOVQ (R12), BP // b.value |= uint64(low) << (b.bitsRead & 63) -#ifdef GOAMD64_v3 - SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) - -#else - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - MOVQ br_bits_read, CX - SHLQ CL, AX - -#endif - - ORQ AX, br_value + MOVL (AX)(BP*1), BP + MOVQ R15, CX + SHLQ CL, BP + MOVQ AX, 24(R12) + ORQ BP, R14 // exhausted = exhausted || (br2.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL - // } skip_fill2: - // val0 := br2.peekTopBits(peekBits) -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ R14, BP + MOVQ SI, CX + SHRQ CL, BP // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br2.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n + MOVW (R9)(BP*2), CX -#endif + // br2.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R14 + ADDB CL, R15 - ADDQ CX, br_bits_read // bits_read += n - -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else // val1 := br2.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ SI, CX + MOVQ R14, BP + SHRQ CL, BP // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 + MOVW (R9)(BP*2), CX // br2.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - -#endif - - ADDQ CX, br_bits_read // bits_read += n + MOVB CH, AH + SHLQ CL, R14 + ADDB CL, R15 // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 512(buffer)(off*1) + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (DI) // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br2) - MOVQ br_value, bitReaderShifted_value(br2) - MOVQ br_offset, bitReaderShifted_off(br2) - - // const stream = 3 - // br3.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read - MOVQ bitReaderShifted_value(br3), br_value - MOVQ bitReaderShifted_off(br3), br_offset - - // We must have at least 2 * max tablelog left - CMPQ br_bits_read, $64-22 - JBE skip_fill3 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br3), AX + MOVQ R14, 32(R12) + MOVB R15, 40(R12) + ADDQ R8, DI + + // br3.fillFast32() + MOVQ 32(R13), R14 + MOVBQZX 40(R13), R15 + CMPQ R15, $0x20 + JBE skip_fill3 + MOVQ 24(R13), AX + SUBQ $0x20, R15 + SUBQ $0x04, AX + MOVQ (R13), BP // b.value |= uint64(low) << (b.bitsRead & 63) -#ifdef GOAMD64_v3 - SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) - -#else - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - MOVQ br_bits_read, CX - SHLQ CL, AX - -#endif - - ORQ AX, br_value + MOVL (AX)(BP*1), BP + MOVQ R15, CX + SHLQ CL, BP + MOVQ AX, 24(R13) + ORQ BP, R14 // exhausted = exhausted || (br3.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL - // } skip_fill3: - // val0 := br3.peekTopBits(peekBits) -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask + MOVQ R14, BP + MOVQ SI, CX + SHRQ CL, BP -#else - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask + // v0 := table[val0&mask] + MOVW (R9)(BP*2), CX -#endif + // br3.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R14 + ADDB CL, R15 - // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 + // val1 := br3.peekTopBits(peekBits) + MOVQ SI, CX + MOVQ R14, BP + SHRQ CL, BP - // br3.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) + // v1 := table[val1&mask] + MOVW (R9)(BP*2), CX -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n + // br3.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R14 + ADDB CL, R15 -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (DI) -#endif + // update the bitrader reader structure + MOVQ R14, 32(R13) + MOVB R15, 40(R13) + ADDQ $0x02, BX + TESTB DL, DL + JZ main_loop + MOVQ ctx+0(FP), AX + MOVQ 40(AX), CX + MOVQ BX, DX + SUBQ CX, DX + SHLQ $0x02, DX + MOVQ DX, 64(AX) + RET - ADDQ CX, br_bits_read // bits_read += n +// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) +TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8 + XORQ DX, DX + + // Preload values + MOVQ ctx+0(FP), CX + MOVBQZX 32(CX), BX + MOVQ 40(CX), SI + MOVQ SI, (SP) + MOVQ 72(CX), DX + MOVQ DX, 8(SP) + MOVQ 48(CX), DI + MOVQ 56(CX), R8 + MOVQ (CX), R9 + MOVQ 8(CX), R10 + MOVQ 16(CX), R11 + MOVQ 24(CX), R12 + + // Main loop +main_loop: + MOVQ (SP), SI + CMPQ SI, 8(SP) + SETGE DL + + // br1000.fillFast32() + MOVQ 32(R9), R13 + MOVBQZX 40(R9), R14 + CMPQ R14, $0x20 + JBE skip_fill1000 + MOVQ 24(R9), R15 + SUBQ $0x20, R14 + SUBQ $0x04, R15 + MOVQ (R9), BP -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R15)(BP*1), BP + MOVQ R14, CX + SHLQ CL, BP + MOVQ R15, 24(R9) + ORQ BP, R13 + + // exhausted = exhausted || (br1000.off < 4) + CMPQ R15, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1000: + // val0 := br0.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 -#else - // val1 := br3.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask + // v0 := table[val0&mask] + MOVW (R8)(R15*2), CX -#endif + // br0.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 - // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 + // val1 := br0.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v1 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br0.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // val2 := br0.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v2 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br0.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + + // val3 := br0.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v3 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br0.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (SI) + + // update the bitreader reader structure + MOVQ R13, 32(R9) + MOVB R14, 40(R9) + ADDQ DI, SI + + // br1001.fillFast32() + MOVQ 32(R10), R13 + MOVBQZX 40(R10), R14 + CMPQ R14, $0x20 + JBE skip_fill1001 + MOVQ 24(R10), R15 + SUBQ $0x20, R14 + SUBQ $0x04, R15 + MOVQ (R10), BP - // br3.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R15)(BP*1), BP + MOVQ R14, CX + SHLQ CL, BP + MOVQ R15, 24(R10) + ORQ BP, R13 + + // exhausted = exhausted || (br1001.off < 4) + CMPQ R15, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1001: + // val0 := br1.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n + // v0 := table[val0&mask] + MOVW (R8)(R15*2), CX -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n + // br1.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 -#endif + // val1 := br1.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v1 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br1.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // val2 := br1.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v2 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br1.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + + // val3 := br1.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v3 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br1.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (SI) + + // update the bitreader reader structure + MOVQ R13, 32(R10) + MOVB R14, 40(R10) + ADDQ DI, SI + + // br1002.fillFast32() + MOVQ 32(R11), R13 + MOVBQZX 40(R11), R14 + CMPQ R14, $0x20 + JBE skip_fill1002 + MOVQ 24(R11), R15 + SUBQ $0x20, R14 + SUBQ $0x04, R15 + MOVQ (R11), BP - ADDQ CX, br_bits_read // bits_read += n + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R15)(BP*1), BP + MOVQ R14, CX + SHLQ CL, BP + MOVQ R15, 24(R11) + ORQ BP, R13 + + // exhausted = exhausted || (br1002.off < 4) + CMPQ R15, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1002: + // val0 := br2.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 - // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 768(buffer)(off*1) + // v0 := table[val0&mask] + MOVW (R8)(R15*2), CX - // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br3) - MOVQ br_value, bitReaderShifted_value(br3) - MOVQ br_offset, bitReaderShifted_off(br3) + // br2.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 - ADDQ $2, off // off += 2 + // val1 := br2.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v1 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br2.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // val2 := br2.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v2 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br2.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + + // val3 := br2.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v3 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br2.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (SI) + + // update the bitreader reader structure + MOVQ R13, 32(R11) + MOVB R14, 40(R11) + ADDQ DI, SI + + // br1003.fillFast32() + MOVQ 32(R12), R13 + MOVBQZX 40(R12), R14 + CMPQ R14, $0x20 + JBE skip_fill1003 + MOVQ 24(R12), R15 + SUBQ $0x20, R14 + SUBQ $0x04, R15 + MOVQ (R12), BP - TESTB DH, DH // any br[i].ofs < 4? - JNZ end + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R15)(BP*1), BP + MOVQ R14, CX + SHLQ CL, BP + MOVQ R15, 24(R12) + ORQ BP, R13 + + // exhausted = exhausted || (br1003.off < 4) + CMPQ R15, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1003: + // val0 := br3.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 - CMPQ off, $bufoff - JL main_loop + // v0 := table[val0&mask] + MOVW (R8)(R15*2), CX -end: - MOVQ 0(SP), BP + // br3.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 - MOVB off, ret+56(FP) + // val1 := br3.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v1 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br3.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // val2 := br3.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v2 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br3.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + + // val3 := br3.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v3 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br3.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (SI) + + // update the bitreader reader structure + MOVQ R13, 32(R12) + MOVB R14, 40(R12) + ADDQ $0x04, (SP) + TESTB DL, DL + JZ main_loop + MOVQ ctx+0(FP), AX + MOVQ 40(AX), CX + MOVQ (SP), DX + SUBQ CX, DX + SHLQ $0x02, DX + MOVQ DX, 64(AX) RET -#undef off -#undef buffer -#undef table +// func decompress1x_main_loop_amd64(ctx *decompress1xContext) +TEXT ·decompress1x_main_loop_amd64(SB), $0-8 + MOVQ ctx+0(FP), CX + MOVQ 16(CX), DX + MOVQ 24(CX), BX + CMPQ BX, $0x04 + JB error_max_decoded_size_exeeded + LEAQ (DX)(BX*1), BX + MOVQ (CX), SI + MOVQ (SI), R8 + MOVQ 24(SI), R9 + MOVQ 32(SI), R10 + MOVBQZX 40(SI), R11 + MOVQ 32(CX), SI + MOVBQZX 8(CX), DI + JMP loop_condition -#undef br_bits_read -#undef br_value -#undef br_offset -#undef peek_bits -#undef exhausted +main_loop: + // Check if we have room for 4 bytes in the output buffer + LEAQ 4(DX), CX + CMPQ CX, BX + JGE error_max_decoded_size_exeeded + + // Decode 4 values + CMPQ R11, $0x20 + JL bitReader_fillFast_1_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), R12 + MOVQ R11, CX + SHLQ CL, R12 + ORQ R12, R10 + +bitReader_fillFast_1_end: + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + BSWAPL AX + CMPQ R11, $0x20 + JL bitReader_fillFast_2_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), R12 + MOVQ R11, CX + SHLQ CL, R12 + ORQ R12, R10 + +bitReader_fillFast_2_end: + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + BSWAPL AX + + // Store the decoded values + MOVL AX, (DX) + ADDQ $0x04, DX + +loop_condition: + CMPQ R9, $0x08 + JGE main_loop + + // Update ctx structure + MOVQ ctx+0(FP), AX + MOVQ DX, CX + MOVQ 16(AX), DX + SUBQ DX, CX + MOVQ CX, 40(AX) + MOVQ (AX), AX + MOVQ R9, 24(AX) + MOVQ R10, 32(AX) + MOVB R11, 40(AX) + RET -#undef br0 -#undef br1 -#undef br2 -#undef br3 + // Report error +error_max_decoded_size_exeeded: + MOVQ ctx+0(FP), AX + MOVQ $-1, CX + MOVQ CX, 40(AX) + RET + +// func decompress1x_main_loop_bmi2(ctx *decompress1xContext) +// Requires: BMI2 +TEXT ·decompress1x_main_loop_bmi2(SB), $0-8 + MOVQ ctx+0(FP), CX + MOVQ 16(CX), DX + MOVQ 24(CX), BX + CMPQ BX, $0x04 + JB error_max_decoded_size_exeeded + LEAQ (DX)(BX*1), BX + MOVQ (CX), SI + MOVQ (SI), R8 + MOVQ 24(SI), R9 + MOVQ 32(SI), R10 + MOVBQZX 40(SI), R11 + MOVQ 32(CX), SI + MOVBQZX 8(CX), DI + JMP loop_condition + +main_loop: + // Check if we have room for 4 bytes in the output buffer + LEAQ 4(DX), CX + CMPQ CX, BX + JGE error_max_decoded_size_exeeded + + // Decode 4 values + CMPQ R11, $0x20 + JL bitReader_fillFast_1_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), CX + SHLXQ R11, CX, CX + ORQ CX, R10 + +bitReader_fillFast_1_end: + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + BSWAPL AX + CMPQ R11, $0x20 + JL bitReader_fillFast_2_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), CX + SHLXQ R11, CX, CX + ORQ CX, R10 + +bitReader_fillFast_2_end: + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + BSWAPL AX + + // Store the decoded values + MOVL AX, (DX) + ADDQ $0x04, DX + +loop_condition: + CMPQ R9, $0x08 + JGE main_loop + + // Update ctx structure + MOVQ ctx+0(FP), AX + MOVQ DX, CX + MOVQ 16(AX), DX + SUBQ DX, CX + MOVQ CX, 40(AX) + MOVQ (AX), AX + MOVQ R9, 24(AX) + MOVQ R10, 32(AX) + MOVB R11, 40(AX) + RET + + // Report error +error_max_decoded_size_exeeded: + MOVQ ctx+0(FP), AX + MOVQ $-1, CX + MOVQ CX, 40(AX) + RET |