diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/huff0/decompress_amd64.s')
-rw-r--r-- | vendor/github.com/klauspost/compress/huff0/decompress_amd64.s | 585 |
1 files changed, 284 insertions, 301 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s index dd1a5aec..c4c7ab2d 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s @@ -1,364 +1,352 @@ // Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT. //go:build amd64 && !appengine && !noasm && gc -// +build amd64,!appengine,!noasm,gc // func decompress4x_main_loop_amd64(ctx *decompress4xContext) TEXT ·decompress4x_main_loop_amd64(SB), $0-8 - XORQ DX, DX - // Preload values MOVQ ctx+0(FP), AX MOVBQZX 8(AX), DI - MOVQ 16(AX), SI - MOVQ 48(AX), BX - MOVQ 24(AX), R9 - MOVQ 32(AX), R10 - MOVQ (AX), R11 + MOVQ 16(AX), BX + MOVQ 48(AX), SI + MOVQ 24(AX), R8 + MOVQ 32(AX), R9 + MOVQ (AX), R10 // Main loop main_loop: - MOVQ SI, R8 - CMPQ R8, BX + XORL DX, DX + CMPQ BX, SI SETGE DL // br0.fillFast32() - MOVQ 32(R11), R12 - MOVBQZX 40(R11), R13 - CMPQ R13, $0x20 + MOVQ 32(R10), R11 + MOVBQZX 40(R10), R12 + CMPQ R12, $0x20 JBE skip_fill0 - MOVQ 24(R11), AX - SUBQ $0x20, R13 + MOVQ 24(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ (R11), R14 + MOVQ (R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 24(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 24(R10) + ORQ R13, R11 - // exhausted = exhausted || (br0.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br0.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br0.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + MOVW AX, (BX) // update the bitreader structure - MOVQ R12, 32(R11) - MOVB R13, 40(R11) - ADDQ R9, R8 + MOVQ R11, 32(R10) + MOVB R12, 40(R10) // br1.fillFast32() - MOVQ 80(R11), R12 - MOVBQZX 88(R11), R13 - CMPQ R13, $0x20 + MOVQ 80(R10), R11 + MOVBQZX 88(R10), R12 + CMPQ R12, $0x20 JBE skip_fill1 - MOVQ 72(R11), AX - SUBQ $0x20, R13 + MOVQ 72(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ 48(R11), R14 + MOVQ 48(R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 72(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 72(R10) + ORQ R13, R11 - // exhausted = exhausted || (br1.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br1.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br1.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + MOVW AX, (BX)(R8*1) // update the bitreader structure - MOVQ R12, 80(R11) - MOVB R13, 88(R11) - ADDQ R9, R8 + MOVQ R11, 80(R10) + MOVB R12, 88(R10) // br2.fillFast32() - MOVQ 128(R11), R12 - MOVBQZX 136(R11), R13 - CMPQ R13, $0x20 + MOVQ 128(R10), R11 + MOVBQZX 136(R10), R12 + CMPQ R12, $0x20 JBE skip_fill2 - MOVQ 120(R11), AX - SUBQ $0x20, R13 + MOVQ 120(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ 96(R11), R14 + MOVQ 96(R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 120(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 120(R10) + ORQ R13, R11 - // exhausted = exhausted || (br2.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br2.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br2.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + MOVW AX, (BX)(R8*2) // update the bitreader structure - MOVQ R12, 128(R11) - MOVB R13, 136(R11) - ADDQ R9, R8 + MOVQ R11, 128(R10) + MOVB R12, 136(R10) // br3.fillFast32() - MOVQ 176(R11), R12 - MOVBQZX 184(R11), R13 - CMPQ R13, $0x20 + MOVQ 176(R10), R11 + MOVBQZX 184(R10), R12 + CMPQ R12, $0x20 JBE skip_fill3 - MOVQ 168(R11), AX - SUBQ $0x20, R13 + MOVQ 168(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ 144(R11), R14 + MOVQ 144(R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 168(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 168(R10) + ORQ R13, R11 - // exhausted = exhausted || (br3.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br3.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br3.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + LEAQ (R8)(R8*2), CX + MOVW AX, (BX)(CX*1) // update the bitreader structure - MOVQ R12, 176(R11) - MOVB R13, 184(R11) - ADDQ $0x02, SI + MOVQ R11, 176(R10) + MOVB R12, 184(R10) + ADDQ $0x02, BX TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - SUBQ 16(AX), SI - SHLQ $0x02, SI - MOVQ SI, 40(AX) + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) RET // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 - XORQ DX, DX - // Preload values MOVQ ctx+0(FP), CX MOVBQZX 8(CX), DI MOVQ 16(CX), BX MOVQ 48(CX), SI - MOVQ 24(CX), R9 - MOVQ 32(CX), R10 - MOVQ (CX), R11 + MOVQ 24(CX), R8 + MOVQ 32(CX), R9 + MOVQ (CX), R10 // Main loop main_loop: - MOVQ BX, R8 - CMPQ R8, SI + XORL DX, DX + CMPQ BX, SI SETGE DL // br0.fillFast32() - MOVQ 32(R11), R12 - MOVBQZX 40(R11), R13 - CMPQ R13, $0x20 + MOVQ 32(R10), R11 + MOVBQZX 40(R10), R12 + CMPQ R12, $0x20 JBE skip_fill0 - MOVQ 24(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ (R11), R15 + MOVQ 24(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ (R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 24(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 24(R10) + ORQ R14, R11 - // exhausted = exhausted || (br0.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br0.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -366,88 +354,86 @@ skip_fill0: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + MOVL AX, (BX) // update the bitreader structure - MOVQ R12, 32(R11) - MOVB R13, 40(R11) - ADDQ R9, R8 + MOVQ R11, 32(R10) + MOVB R12, 40(R10) // br1.fillFast32() - MOVQ 80(R11), R12 - MOVBQZX 88(R11), R13 - CMPQ R13, $0x20 + MOVQ 80(R10), R11 + MOVBQZX 88(R10), R12 + CMPQ R12, $0x20 JBE skip_fill1 - MOVQ 72(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ 48(R11), R15 + MOVQ 72(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 48(R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 72(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 72(R10) + ORQ R14, R11 - // exhausted = exhausted || (br1.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br1.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -455,88 +441,86 @@ skip_fill1: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + MOVL AX, (BX)(R8*1) // update the bitreader structure - MOVQ R12, 80(R11) - MOVB R13, 88(R11) - ADDQ R9, R8 + MOVQ R11, 80(R10) + MOVB R12, 88(R10) // br2.fillFast32() - MOVQ 128(R11), R12 - MOVBQZX 136(R11), R13 - CMPQ R13, $0x20 + MOVQ 128(R10), R11 + MOVBQZX 136(R10), R12 + CMPQ R12, $0x20 JBE skip_fill2 - MOVQ 120(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ 96(R11), R15 + MOVQ 120(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 96(R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 120(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 120(R10) + ORQ R14, R11 - // exhausted = exhausted || (br2.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br2.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -544,88 +528,86 @@ skip_fill2: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + MOVL AX, (BX)(R8*2) // update the bitreader structure - MOVQ R12, 128(R11) - MOVB R13, 136(R11) - ADDQ R9, R8 + MOVQ R11, 128(R10) + MOVB R12, 136(R10) // br3.fillFast32() - MOVQ 176(R11), R12 - MOVBQZX 184(R11), R13 - CMPQ R13, $0x20 + MOVQ 176(R10), R11 + MOVBQZX 184(R10), R12 + CMPQ R12, $0x20 JBE skip_fill3 - MOVQ 168(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ 144(R11), R15 + MOVQ 168(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 144(R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 168(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 168(R10) + ORQ R14, R11 - // exhausted = exhausted || (br3.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br3.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -633,11 +615,12 @@ skip_fill3: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + LEAQ (R8)(R8*2), CX + MOVL AX, (BX)(CX*1) // update the bitreader structure - MOVQ R12, 176(R11) - MOVB R13, 184(R11) + MOVQ R11, 176(R10) + MOVB R12, 184(R10) ADDQ $0x04, BX TESTB DL, DL JZ main_loop @@ -653,7 +636,7 @@ TEXT ·decompress1x_main_loop_amd64(SB), $0-8 MOVQ 16(CX), DX MOVQ 24(CX), BX CMPQ BX, $0x04 - JB error_max_decoded_size_exeeded + JB error_max_decoded_size_exceeded LEAQ (DX)(BX*1), BX MOVQ (CX), SI MOVQ (SI), R8 @@ -668,7 +651,7 @@ main_loop: // Check if we have room for 4 bytes in the output buffer LEAQ 4(DX), CX CMPQ CX, BX - JGE error_max_decoded_size_exeeded + JGE error_max_decoded_size_exceeded // Decode 4 values CMPQ R11, $0x20 @@ -745,7 +728,7 @@ loop_condition: RET // Report error -error_max_decoded_size_exeeded: +error_max_decoded_size_exceeded: MOVQ ctx+0(FP), AX MOVQ $-1, CX MOVQ CX, 40(AX) @@ -758,7 +741,7 @@ TEXT ·decompress1x_main_loop_bmi2(SB), $0-8 MOVQ 16(CX), DX MOVQ 24(CX), BX CMPQ BX, $0x04 - JB error_max_decoded_size_exeeded + JB error_max_decoded_size_exceeded LEAQ (DX)(BX*1), BX MOVQ (CX), SI MOVQ (SI), R8 @@ -773,7 +756,7 @@ main_loop: // Check if we have room for 4 bytes in the output buffer LEAQ 4(DX), CX CMPQ CX, BX - JGE error_max_decoded_size_exeeded + JGE error_max_decoded_size_exceeded // Decode 4 values CMPQ R11, $0x20 @@ -840,7 +823,7 @@ loop_condition: RET // Report error -error_max_decoded_size_exeeded: +error_max_decoded_size_exceeded: MOVQ ctx+0(FP), AX MOVQ $-1, CX MOVQ CX, 40(AX) |