diff options
author | Wim <wim@42.be> | 2022-08-13 16:14:26 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-08-13 16:14:26 +0200 |
commit | 6a3fc713978a0c1c9290a4afd08b47886b49b635 (patch) | |
tree | aa62cd85cf5671646c75ee38b3fc140ef7edcea8 /vendor/github.com/klauspost/compress/zstd | |
parent | 3c4192ebf6a32e30cdd23a9644c2ceca72a006fa (diff) | |
download | matterbridge-msglm-6a3fc713978a0c1c9290a4afd08b47886b49b635.tar.gz matterbridge-msglm-6a3fc713978a0c1c9290a4afd08b47886b49b635.tar.bz2 matterbridge-msglm-6a3fc713978a0c1c9290a4afd08b47886b49b635.zip |
Update dependencies and go1.18 (#1873)
* Update dependencies and go1.18
* Exclude unnecessary linters and update build to go1.18
Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd')
5 files changed, 1426 insertions, 899 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go index 23333b96..2f8860a7 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go @@ -180,7 +180,6 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error { return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog) } b.advance((bitCount + 7) >> 3) - // println(s.norm[:s.symbolLen], s.symbolLen) return s.buildDtable() } @@ -269,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) { s.dt[0] = symbol } -// buildDtable will build the decoding table. -func (s *fseDecoder) buildDtable() error { - tableSize := uint32(1 << s.actualTableLog) - highThreshold := tableSize - 1 - symbolNext := s.stateTable[:256] - - // Init, lay down lowprob symbols - { - for i, v := range s.norm[:s.symbolLen] { - if v == -1 { - s.dt[highThreshold].setAddBits(uint8(i)) - highThreshold-- - symbolNext[i] = 1 - } else { - symbolNext[i] = uint16(v) - } - } - } - // Spread symbols - { - tableMask := tableSize - 1 - step := tableStep(tableSize) - position := uint32(0) - for ss, v := range s.norm[:s.symbolLen] { - for i := 0; i < int(v); i++ { - s.dt[position].setAddBits(uint8(ss)) - position = (position + step) & tableMask - for position > highThreshold { - // lowprob area - position = (position + step) & tableMask - } - } - } - if position != 0 { - // position must reach all cells once, otherwise normalizedCounter is incorrect - return errors.New("corrupted input (position != 0)") - } - } - - // Build Decoding table - { - tableSize := uint16(1 << s.actualTableLog) - for u, v := range s.dt[:tableSize] { - symbol := v.addBits() - nextState := symbolNext[symbol] - symbolNext[symbol] = nextState + 1 - nBits := s.actualTableLog - byte(highBits(uint32(nextState))) - s.dt[u&maxTableMask].setNBits(nBits) - newState := (nextState << nBits) - tableSize - if newState > tableSize { - return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize) - } - if newState == uint16(u) && nBits == 0 { - // Seems weird that this is possible with nbits > 0. - return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u) - } - s.dt[u&maxTableMask].setNewState(newState) - } - } - return nil -} - // transform will transform the decoder table into a table usable for // decoding without having to apply the transformation while decoding. // The state will contain the base value and the number of bits to read. diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go new file mode 100644 index 00000000..e74df436 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go @@ -0,0 +1,64 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +package zstd + +import ( + "fmt" +) + +type buildDtableAsmContext struct { + // inputs + stateTable *uint16 + norm *int16 + dt *uint64 + + // outputs --- set by the procedure in the case of error; + // for interpretation please see the error handling part below + errParam1 uint64 + errParam2 uint64 +} + +// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable. +// Function returns non-zero exit code on error. +// go:noescape +func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int + +// please keep in sync with _generate/gen_fse.go +const ( + errorCorruptedNormalizedCounter = 1 + errorNewStateTooBig = 2 + errorNewStateNoBits = 3 +) + +// buildDtable will build the decoding table. +func (s *fseDecoder) buildDtable() error { + ctx := buildDtableAsmContext{ + stateTable: (*uint16)(&s.stateTable[0]), + norm: (*int16)(&s.norm[0]), + dt: (*uint64)(&s.dt[0]), + } + code := buildDtable_asm(s, &ctx) + + if code != 0 { + switch code { + case errorCorruptedNormalizedCounter: + position := ctx.errParam1 + return fmt.Errorf("corrupted input (position=%d, expected 0)", position) + + case errorNewStateTooBig: + newState := decSymbol(ctx.errParam1) + size := ctx.errParam2 + return fmt.Errorf("newState (%d) outside table size (%d)", newState, size) + + case errorNewStateNoBits: + newState := decSymbol(ctx.errParam1) + oldState := decSymbol(ctx.errParam2) + return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState) + + default: + return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code) + } + } + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s new file mode 100644 index 00000000..da32b442 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s @@ -0,0 +1,127 @@ +// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !noasm +// +build !appengine,!noasm,gc,!noasm + +// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int +TEXT ·buildDtable_asm(SB), $0-24 + MOVQ ctx+8(FP), CX + MOVQ s+0(FP), DI + + // Load values + MOVBQZX 4098(DI), DX + XORQ AX, AX + BTSQ DX, AX + MOVQ (CX), BX + MOVQ 16(CX), SI + LEAQ -1(AX), R8 + MOVQ 8(CX), CX + MOVWQZX 4096(DI), DI + + // End load values + // Init, lay down lowprob symbols + XORQ R9, R9 + JMP init_main_loop_condition + +init_main_loop: + MOVWQSX (CX)(R9*2), R10 + CMPW R10, $-1 + JNE do_not_update_high_threshold + MOVB R9, 1(SI)(R8*8) + DECQ R8 + MOVQ $0x0000000000000001, R10 + +do_not_update_high_threshold: + MOVW R10, (BX)(R9*2) + INCQ R9 + +init_main_loop_condition: + CMPQ R9, DI + JL init_main_loop + + // Spread symbols + // Calculate table step + MOVQ AX, R9 + SHRQ $0x01, R9 + MOVQ AX, R10 + SHRQ $0x03, R10 + LEAQ 3(R9)(R10*1), R9 + + // Fill add bits values + LEAQ -1(AX), R10 + XORQ R11, R11 + XORQ R12, R12 + JMP spread_main_loop_condition + +spread_main_loop: + XORQ R13, R13 + MOVWQSX (CX)(R12*2), R14 + JMP spread_inner_loop_condition + +spread_inner_loop: + MOVB R12, 1(SI)(R11*8) + +adjust_position: + ADDQ R9, R11 + ANDQ R10, R11 + CMPQ R11, R8 + JG adjust_position + INCQ R13 + +spread_inner_loop_condition: + CMPQ R13, R14 + JL spread_inner_loop + INCQ R12 + +spread_main_loop_condition: + CMPQ R12, DI + JL spread_main_loop + TESTQ R11, R11 + JZ spread_check_ok + MOVQ ctx+8(FP), AX + MOVQ R11, 24(AX) + MOVQ $+1, ret+16(FP) + RET + +spread_check_ok: + // Build Decoding table + XORQ DI, DI + +build_table_main_table: + MOVBQZX 1(SI)(DI*8), CX + MOVWQZX (BX)(CX*2), R8 + LEAQ 1(R8), R9 + MOVW R9, (BX)(CX*2) + MOVQ R8, R9 + BSRQ R9, R9 + MOVQ DX, CX + SUBQ R9, CX + SHLQ CL, R8 + SUBQ AX, R8 + MOVB CL, (SI)(DI*8) + MOVW R8, 2(SI)(DI*8) + CMPQ R8, AX + JLE build_table_check1_ok + MOVQ ctx+8(FP), CX + MOVQ R8, 24(CX) + MOVQ AX, 32(CX) + MOVQ $+2, ret+16(FP) + RET + +build_table_check1_ok: + TESTB CL, CL + JNZ build_table_check2_ok + CMPW R8, DI + JNE build_table_check2_ok + MOVQ ctx+8(FP), AX + MOVQ R8, 24(AX) + MOVQ DI, 32(AX) + MOVQ $+3, ret+16(FP) + RET + +build_table_check2_ok: + INCQ DI + CMPQ DI, AX + JL build_table_main_table + MOVQ $+0, ret+16(FP) + RET diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go new file mode 100644 index 00000000..332e51fe --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go @@ -0,0 +1,72 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +package zstd + +import ( + "errors" + "fmt" +) + +// buildDtable will build the decoding table. +func (s *fseDecoder) buildDtable() error { + tableSize := uint32(1 << s.actualTableLog) + highThreshold := tableSize - 1 + symbolNext := s.stateTable[:256] + + // Init, lay down lowprob symbols + { + for i, v := range s.norm[:s.symbolLen] { + if v == -1 { + s.dt[highThreshold].setAddBits(uint8(i)) + highThreshold-- + symbolNext[i] = 1 + } else { + symbolNext[i] = uint16(v) + } + } + } + + // Spread symbols + { + tableMask := tableSize - 1 + step := tableStep(tableSize) + position := uint32(0) + for ss, v := range s.norm[:s.symbolLen] { + for i := 0; i < int(v); i++ { + s.dt[position].setAddBits(uint8(ss)) + position = (position + step) & tableMask + for position > highThreshold { + // lowprob area + position = (position + step) & tableMask + } + } + } + if position != 0 { + // position must reach all cells once, otherwise normalizedCounter is incorrect + return errors.New("corrupted input (position != 0)") + } + } + + // Build Decoding table + { + tableSize := uint16(1 << s.actualTableLog) + for u, v := range s.dt[:tableSize] { + symbol := v.addBits() + nextState := symbolNext[symbol] + symbolNext[symbol] = nextState + 1 + nBits := s.actualTableLog - byte(highBits(uint32(nextState))) + s.dt[u&maxTableMask].setNBits(nBits) + newState := (nextState << nBits) - tableSize + if newState > tableSize { + return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize) + } + if newState == uint16(u) && nBits == 0 { + // Seems weird that this is possible with nbits > 0. + return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u) + } + s.dt[u&maxTableMask].setNewState(newState) + } + } + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s index 212c6cac..71e64e06 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -134,18 +134,17 @@ sequenceDecs_decode_amd64_fill_2_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -155,18 +154,17 @@ sequenceDecs_decode_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -176,18 +174,17 @@ sequenceDecs_decode_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -416,18 +413,17 @@ sequenceDecs_decode_56_amd64_fill_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -437,18 +433,17 @@ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -458,18 +453,17 @@ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -1181,52 +1175,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1234,53 +1241,74 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 @@ -1382,45 +1410,67 @@ main_loop: // Copy literals TESTQ R11, R11 JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, R11 - JZ copy_1_word - MOVB (SI)(R14*1), R15 - MOVB R15, (BX)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, R11 - JZ copy_1_dword - MOVW (SI)(R14*1), R15 - MOVW R15, (BX)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, R11 - JZ copy_1_qword - MOVL (SI)(R14*1), R15 - MOVL R15, (BX)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, R11 - JZ copy_1_test - MOVQ (SI)(R14*1), R15 - MOVQ R15, (BX)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ R11, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (SI), X0 + MOVUPS X0, (BX) + ADDQ $0x10, SI + ADDQ $0x10, BX + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(SI)(R14*1), SI + LEAQ 16(BX)(R14*1), BX + MOVUPS -16(SI), X0 + MOVUPS X0, -16(BX) + JMP copy_1_end + +copy_1_small: + CMPQ R11, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ R11, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (SI), R14 + MOVB -1(SI)(R11*1), R15 + MOVB R14, (BX) + MOVB R15, -1(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1: - MOVUPS (SI)(R14*1), X0 - MOVUPS X0, (BX)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (SI), R14 + MOVB 2(SI), R15 + MOVW R14, (BX) + MOVB R15, 2(BX) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1_test: - CMPQ R14, R11 - JB copy_1 +copy_1_move_4through7: + MOVL (SI), R14 + MOVL -4(SI)(R11*1), R15 + MOVL R14, (BX) + MOVL R15, -4(BX)(R11*1) ADDQ R11, SI ADDQ R11, BX + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (SI), R14 + MOVQ -8(SI)(R11*1), R15 + MOVQ R14, (BX) + MOVQ R15, -8(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + +copy_1_end: ADDQ R11, DI // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -1432,52 +1482,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1485,99 +1548,143 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, DI - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (R11)(R12*1), R14 - MOVB R14, (BX)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (R11)(R12*1), R14 - MOVW R14, (BX)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (R11)(R12*1), R14 - MOVL R14, (BX)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (R11)(R12*1), R14 - MOVQ R14, (BX)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test + ADDQ R13, DI + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2: - MOVUPS (R11)(R12*1), X0 - MOVUPS X0, (BX)(R12*1) - ADDQ $0x10, R12 +copy_2_loop: + MOVUPS (R11), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R11 + ADDQ $0x10, BX + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(R11)(R12*1), R11 + LEAQ 16(BX)(R12*1), BX + MOVUPS -16(R11), X0 + MOVUPS X0, -16(BX) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (R11), R12 + MOVB -1(R11)(R13*1), R14 + MOVB R12, (BX) + MOVB R14, -1(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_move_3: + MOVW (R11), R12 + MOVB 2(R11), R14 + MOVW R12, (BX) + MOVB R14, 2(BX) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_4through7: + MOVL (R11), R12 + MOVL -4(R11)(R13*1), R14 + MOVL R12, (BX) + MOVL R14, -4(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (R11), R12 + MOVQ -8(R11)(R13*1), R14 + MOVQ R12, (BX) + MOVQ R14, -8(BX)(R13*1) + ADDQ R13, R11 ADDQ R13, BX - JMP handle_loop + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -1773,18 +1880,17 @@ sequenceDecs_decodeSync_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -1794,18 +1900,17 @@ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -1815,18 +1920,17 @@ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -1934,103 +2038,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX @@ -2407,103 +2545,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 @@ -2746,18 +2918,17 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -2767,18 +2938,17 @@ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -2788,18 +2958,17 @@ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -2885,45 +3054,67 @@ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: // Copy literals TESTQ AX, AX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, AX - JZ copy_1_word - MOVB (R11)(R14*1), R15 - MOVB R15, (R10)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, AX - JZ copy_1_dword - MOVW (R11)(R14*1), R15 - MOVW R15, (R10)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, AX - JZ copy_1_qword - MOVL (R11)(R14*1), R15 - MOVL R15, (R10)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, AX - JZ copy_1_test - MOVQ (R11)(R14*1), R15 - MOVQ R15, (R10)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ AX, R14 + SUBQ $0x10, R14 + JB copy_1_small -copy_1: - MOVUPS (R11)(R14*1), X0 - MOVUPS X0, (R10)(R14*1) - ADDQ $0x10, R14 +copy_1_loop: + MOVUPS (R11), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R11 + ADDQ $0x10, R10 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R11)(R14*1), R11 + LEAQ 16(R10)(R14*1), R10 + MOVUPS -16(R11), X0 + MOVUPS X0, -16(R10) + JMP copy_1_end + +copy_1_small: + CMPQ AX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ AX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R11), R14 + MOVB -1(R11)(AX*1), R15 + MOVB R14, (R10) + MOVB R15, -1(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_3: + MOVW (R11), R14 + MOVB 2(R11), R15 + MOVW R14, (R10) + MOVB R15, 2(R10) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R11), R14 + MOVL -4(R11)(AX*1), R15 + MOVL R14, (R10) + MOVL R15, -4(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end -copy_1_test: - CMPQ R14, AX - JB copy_1 +copy_1_move_8through16: + MOVQ (R11), R14 + MOVQ -8(R11)(AX*1), R15 + MOVQ R14, (R10) + MOVQ R15, -8(R10)(AX*1) ADDQ AX, R11 ADDQ AX, R10 + +copy_1_end: ADDQ AX, R12 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -2936,149 +3127,206 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R12 - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (AX)(CX*1), R14 - MOVB R14, (R10)(CX*1) - ADDQ $0x01, CX - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (AX)(CX*1), R14 - MOVW R14, (R10)(CX*1) - ADDQ $0x02, CX - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (AX)(CX*1), R14 - MOVL R14, (R10)(CX*1) - ADDQ $0x04, CX - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (AX)(CX*1), R14 - MOVQ R14, (R10)(CX*1) - ADDQ $0x08, CX - JMP copy_2_test - -copy_2: - MOVUPS (AX)(CX*1), X0 - MOVUPS X0, (R10)(CX*1) - ADDQ $0x10, CX + ADDQ R13, R12 + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_2_small -copy_2_test: - CMPQ CX, R13 - JB copy_2 +copy_2_loop: + MOVUPS (AX), X0 + MOVUPS X0, (R10) + ADDQ $0x10, AX + ADDQ $0x10, R10 + SUBQ $0x10, CX + JAE copy_2_loop + LEAQ 16(AX)(CX*1), AX + LEAQ 16(R10)(CX*1), R10 + MOVUPS -16(AX), X0 + MOVUPS X0, -16(R10) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (AX), CL + MOVB -1(AX)(R13*1), R14 + MOVB CL, (R10) + MOVB R14, -1(R10)(R13*1) + ADDQ R13, AX ADDQ R13, R10 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (AX), CX + MOVB 2(AX), R14 + MOVW CX, (R10) + MOVB R14, 2(R10) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (AX), CX + MOVL -4(AX)(R13*1), R14 + MOVL CX, (R10) + MOVL R14, -4(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (AX), CX + MOVQ -8(AX)(R13*1), R14 + MOVQ CX, (R10) + MOVQ R14, -8(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -3415,45 +3663,67 @@ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: // Copy literals TESTQ CX, CX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, CX - JZ copy_1_word - MOVB (R10)(R14*1), R15 - MOVB R15, (R9)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, CX - JZ copy_1_dword - MOVW (R10)(R14*1), R15 - MOVW R15, (R9)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, CX - JZ copy_1_qword - MOVL (R10)(R14*1), R15 - MOVL R15, (R9)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, CX - JZ copy_1_test - MOVQ (R10)(R14*1), R15 - MOVQ R15, (R9)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ CX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R10), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R10 + ADDQ $0x10, R9 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R10)(R14*1), R10 + LEAQ 16(R9)(R14*1), R9 + MOVUPS -16(R10), X0 + MOVUPS X0, -16(R9) + JMP copy_1_end + +copy_1_small: + CMPQ CX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ CX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R10), R14 + MOVB -1(R10)(CX*1), R15 + MOVB R14, (R9) + MOVB R15, -1(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1: - MOVUPS (R10)(R14*1), X0 - MOVUPS X0, (R9)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (R10), R14 + MOVB 2(R10), R15 + MOVW R14, (R9) + MOVB R15, 2(R9) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R10), R14 + MOVL -4(R10)(CX*1), R15 + MOVL R14, (R9) + MOVL R15, -4(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1_test: - CMPQ R14, CX - JB copy_1 +copy_1_move_8through16: + MOVQ (R10), R14 + MOVQ -8(R10)(CX*1), R15 + MOVQ R14, (R9) + MOVQ R15, -8(R9)(CX*1) ADDQ CX, R10 ADDQ CX, R9 + +copy_1_end: ADDQ CX, R11 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -3466,149 +3736,206 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R11 - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (CX)(R12*1), R14 - MOVB R14, (R9)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (CX)(R12*1), R14 - MOVW R14, (R9)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (CX)(R12*1), R14 - MOVL R14, (R9)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (CX)(R12*1), R14 - MOVQ R14, (R9)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test - -copy_2: - MOVUPS (CX)(R12*1), X0 - MOVUPS X0, (R9)(R12*1) - ADDQ $0x10, R12 + ADDQ R13, R11 + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_loop: + MOVUPS (CX), X0 + MOVUPS X0, (R9) + ADDQ $0x10, CX + ADDQ $0x10, R9 + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(CX)(R12*1), CX + LEAQ 16(R9)(R12*1), R9 + MOVUPS -16(CX), X0 + MOVUPS X0, -16(R9) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (CX), R12 + MOVB -1(CX)(R13*1), R14 + MOVB R12, (R9) + MOVB R14, -1(R9)(R13*1) + ADDQ R13, CX ADDQ R13, R9 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (CX), R12 + MOVB 2(CX), R14 + MOVW R12, (R9) + MOVB R14, 2(R9) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (CX), R12 + MOVL -4(CX)(R13*1), R14 + MOVL R12, (R9) + MOVL R14, -4(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (CX), R12 + MOVQ -8(CX)(R13*1), R14 + MOVQ R12, (R9) + MOVQ R14, -8(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: |