From 20f6c05ec50739d31f4dbe9fde0d223f2c43f6e8 Mon Sep 17 00:00:00 2001 From: Wim Date: Sat, 16 Oct 2021 23:11:32 +0200 Subject: Update vendor --- vendor/github.com/minio/md5-simd/block8_amd64.s | 281 ++++++++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 vendor/github.com/minio/md5-simd/block8_amd64.s (limited to 'vendor/github.com/minio/md5-simd/block8_amd64.s') diff --git a/vendor/github.com/minio/md5-simd/block8_amd64.s b/vendor/github.com/minio/md5-simd/block8_amd64.s new file mode 100644 index 00000000..f57db17a --- /dev/null +++ b/vendor/github.com/minio/md5-simd/block8_amd64.s @@ -0,0 +1,281 @@ +//+build !noasm,!appengine,gc + +// Copyright (c) 2018 Igneous Systems +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Copyright (c) 2020 MinIO Inc. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. 
+
+// This is the AVX2 implementation of the MD5 block function (8-way parallel)
+//
+// Each YMM register holds eight 32-bit lanes, one lane per independent
+// message stream, so a single pass over the 64 MD5 steps advances eight
+// digests at once.
+
+// block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
+//
+// Arguments (as read from the frame below):
+//   state - four consecutive 32-byte vectors (a, b, c, d) at byte offsets
+//           0, 32, 64 and 96; read on entry and written back on exit.
+//   base  - common base address added to every per-lane offset.
+//   bufs  - 32 bytes read as eight 32-bit per-lane offsets relative to base.
+//           Only lanes whose offset is > 0 (signed) are gathered.
+//   cache - scratch area; after being rounded up to a 32-byte boundary,
+//           16 slots of 32 bytes (512 bytes) are written and re-read.
+//   n     - byte count; the loop consumes 64 bytes per lane per iteration.
+//           NOTE(review): the loop only exits when the count reaches exactly
+//           zero, so n is assumed to be a non-zero multiple of 64 - confirm
+//           in the Go caller.
+//
+// TEXT flag 4 is NOSPLIT (see textflag.h); $0-40 declares no local frame and
+// 40 bytes of arguments.
+TEXT ·block8(SB), 4, $0-40
+	MOVQ state+0(FP), BX
+	MOVQ base+8(FP), SI
+	MOVQ bufs+16(FP), AX
+	MOVQ cache+24(FP), CX
+	MOVQ n+32(FP), DX
+	// Loads the first 8 bytes stored at the avx256md5consts symbol and uses
+	// them as the address of the round-constant table (presumably the data
+	// pointer of a Go slice - confirm against its declaration).
+	MOVQ ·avx256md5consts+0(SB), DI
+
+	// Align cache (which is stack allocated by the compiler)
+	// to a 256 bit boundary (ymm register alignment)
+	// The cache8 type is deliberately oversized to permit this.
+	// Adding 31 and then clearing the low five address bits rounds CX up to
+	// the next multiple of 32.
+	ADDQ $31, CX
+	ANDB $-32, CL
+
+// Working MD5 state, one 32-bit lane per stream.
+#define a Y0
+#define b Y1
+#define c Y2
+#define d Y3
+
+// Copy of the state taken at the top of each block, added back at the end.
+#define sa Y4
+#define sb Y5
+#define sc Y6
+#define sd Y7
+
+// Scratch registers that carry partial round-function values from one step
+// macro to the next (see the entry/exit conditions on each ROUNDn macro).
+#define tmp Y8
+#define tmp2 Y9
+
+// mask: per-lane gather enable; off: per-lane source offsets.
+#define mask Y10
+#define off Y11
+
+// All bits set; XOR with it yields a bitwise NOT.
+#define ones Y12
+
+// rtmp1: scratch for roll; rtmp2: disposable copy of mask for each gather.
+#define rtmp1 Y13
+#define rtmp2 Y14
+
+// Message word consumed by the current step (eight lanes).
+#define mem Y15
+
+#define dig BX
+#define cache CX
+#define count DX
+#define base SI
+#define consts DI
+
+// prepmask: mask = all-ones in every lane where off > 0 (signed compare
+// against zero), zero elsewhere.
+#define prepmask \
+	VPXOR mask, mask, mask \
+	VPCMPGTD mask, off, mask
+
+// prep(index): gather 32-bit message word `index` of the current block for
+// each enabled lane from base + off[lane] + index*4 into mem. VPGATHERDD
+// overwrites its mask operand, so a fresh copy of mask is made in rtmp2.
+#define prep(index) \
+	VMOVAPD mask, rtmp2 \
+	VPGATHERDD rtmp2, index*4(base)(off*1), mem
+
+// load(index): reload message word `index` from the aligned cache into mem.
+#define load(index) \
+	VMOVAPD index*32(cache), mem
+
+// store(index): save mem into cache slot `index` (32 bytes per slot).
+#define store(index) \
+	VMOVAPD mem, index*32(cache)
+
+// roll(shift, a): rotate every 32-bit lane of a left by `shift` bits,
+// clobbering rtmp1.
+#define roll(shift, a) \
+	VPSLLD $shift, a, rtmp1 \
+	VPSRLD $32-shift, a, a \
+	VPOR rtmp1, a, a
+
+// All ROUNDn macros share one shape:
+//   a += constant[const] + mem + f(b, c, d); a = rotl(a, shift) + b
+// where each table entry is 32 bytes wide (32*const(consts)). While the
+// current step is computed, the message word for the NEXT step (`index`) is
+// fetched into mem, so `index` always names the following step's word.
+//
+// ROUND1: f = d ^ (b & (c ^ d)).
+// Entry: tmp == d. Exit: tmp == c, which is `d` for the next step under the
+// argument rotation used in the loop. The next word is gathered from the
+// source buffers with prep.
+#define ROUND1(a, b, c, d, index, const, shift) \
+	VPXOR c, tmp, tmp \
+	VPADDD 32*const(consts), a, a \
+	VPADDD mem, a, a \
+	VPAND b, tmp, tmp \
+	VPXOR d, tmp, tmp \
+	prep(index) \
+	VPADDD tmp, a, a \
+	roll(shift,a) \
+	VMOVAPD c, tmp \
+	VPADDD b, a, a
+
+// ROUND1load: same computation as ROUND1, but the next word is reloaded from
+// the cache instead of gathered (used for the last step of round 1, whose
+// successor is a round 2 step). The first XOR is spelled VXORPD here rather
+// than VPXOR; both are plain bitwise XORs.
+#define ROUND1load(a, b, c, d, index, const, shift) \
+	VXORPD c, tmp, tmp \
+	VPADDD 32*const(consts), a, a \
+	VPADDD mem, a, a \
+	VPAND b, tmp, tmp \
+	VPXOR d, tmp, tmp \
+	load(index) \
+	VPADDD tmp, a, a \
+	roll(shift,a) \
+	VMOVAPD c, tmp \
+	VPADDD b, a, a
+
+// ROUND2: f = (b & d) | (c & ~d).
+// Entry: tmp == d and tmp2 == d. Exit: tmp == c and tmp2 == c.
+#define ROUND2(a, b, c, d, index, const, shift) \
+	VPADDD 32*const(consts), a, a \
+	VPADDD mem, a, a \
+	VPAND b, tmp2, tmp2 \
+	VANDNPD c, tmp, tmp \
+	load(index) \
+	VPOR tmp, tmp2, tmp2 \
+	VMOVAPD c, tmp \
+	VPADDD tmp2, a, a \
+	VMOVAPD c, tmp2 \
+	roll(shift,a) \
+	VPADDD b, a, a
+
+// ROUND3: f = b ^ c ^ d.
+// Entry: tmp == c. Exit: tmp == b, which is `c` for the next step.
+#define ROUND3(a, b, c, d, index, const, shift) \
+	VPADDD 32*const(consts), a, a \
+	VPADDD mem, a, a \
+	load(index) \
+	VPXOR d, tmp, tmp \
+	VPXOR b, tmp, tmp \
+	VPADDD tmp, a, a \
+	roll(shift,a) \
+	VMOVAPD b, tmp \
+	VPADDD b, a, a
+
+// ROUND4: f = c ^ (b | ~d).
+// Entry: tmp == ~d. Exit: tmp == ~c, which is `~d` for the next step.
+#define ROUND4(a, b, c, d, index, const, shift) \
+	VPADDD 32*const(consts), a, a \
+	VPADDD mem, a, a \
+	VPOR b, tmp, tmp \
+	VPXOR c, tmp, tmp \
+	VPADDD tmp, a, a \
+	load(index) \
+	roll(shift,a) \
+	VPXOR c, ones, tmp \
+	VPADDD b, a, a
+
+	// load digest into state registers
+	VMOVUPD (dig), a
+	VMOVUPD 32(dig), b
+	VMOVUPD 64(dig), c
+	VMOVUPD 96(dig), d
+
+	// load source buffer offsets
+	VMOVUPD (AX), off
+
+	// Build the gather mask once, and set every bit of `ones`.
+	prepmask
+	VPCMPEQD ones, ones, ones
+
+// One iteration processes one 64-byte block per lane.
+loop:
+	VMOVAPD a, sa
+	VMOVAPD b, sb
+	VMOVAPD c, sc
+	VMOVAPD d, sd
+
+	// Fetch word 0, seed tmp with d for the first ROUND1 step, and start
+	// filling the cache.
+	prep(0)
+	VMOVAPD d, tmp
+	store(0)
+
+	// Round 1: each step gathers the next word, which is then cached so
+	// rounds 2-4 can reload it without touching the source buffers again.
+	ROUND1(a,b,c,d, 1,0x00, 7)
+	store(1)
+	ROUND1(d,a,b,c, 2,0x01,12)
+	store(2)
+	ROUND1(c,d,a,b, 3,0x02,17)
+	store(3)
+	ROUND1(b,c,d,a, 4,0x03,22)
+	store(4)
+	ROUND1(a,b,c,d, 5,0x04, 7)
+	store(5)
+	ROUND1(d,a,b,c, 6,0x05,12)
+	store(6)
+	ROUND1(c,d,a,b, 7,0x06,17)
+	store(7)
+	ROUND1(b,c,d,a, 8,0x07,22)
+	store(8)
+	ROUND1(a,b,c,d, 9,0x08, 7)
+	store(9)
+	ROUND1(d,a,b,c,10,0x09,12)
+	store(10)
+	ROUND1(c,d,a,b,11,0x0a,17)
+	store(11)
+	ROUND1(b,c,d,a,12,0x0b,22)
+	store(12)
+	ROUND1(a,b,c,d,13,0x0c, 7)
+	store(13)
+	ROUND1(d,a,b,c,14,0x0d,12)
+	store(14)
+	ROUND1(c,d,a,b,15,0x0e,17)
+	store(15)
+	ROUND1load(b,c,d,a, 1,0x0f,22)
+
+	// Round 2 entry condition: tmp == tmp2 == d. mem already holds word 1.
+	VMOVAPD d, tmp
+	VMOVAPD d, tmp2
+
+	ROUND2(a,b,c,d, 6,0x10, 5)
+	ROUND2(d,a,b,c,11,0x11, 9)
+	ROUND2(c,d,a,b, 0,0x12,14)
+	ROUND2(b,c,d,a, 5,0x13,20)
+	ROUND2(a,b,c,d,10,0x14, 5)
+	ROUND2(d,a,b,c,15,0x15, 9)
+	ROUND2(c,d,a,b, 4,0x16,14)
+	ROUND2(b,c,d,a, 9,0x17,20)
+	ROUND2(a,b,c,d,14,0x18, 5)
+	ROUND2(d,a,b,c, 3,0x19, 9)
+	ROUND2(c,d,a,b, 8,0x1a,14)
+	ROUND2(b,c,d,a,13,0x1b,20)
+	ROUND2(a,b,c,d, 2,0x1c, 5)
+	ROUND2(d,a,b,c, 7,0x1d, 9)
+	ROUND2(c,d,a,b,12,0x1e,14)
+	ROUND2(b,c,d,a, 0,0x1f,20)
+
+	// Round 3 starts with word 5 (this replaces the word 0 loaded by the
+	// last ROUND2 step) and needs tmp == c.
+	load(5)
+	VMOVAPD c, tmp
+
+	ROUND3(a,b,c,d, 8,0x20, 4)
+	ROUND3(d,a,b,c,11,0x21,11)
+	ROUND3(c,d,a,b,14,0x22,16)
+	ROUND3(b,c,d,a, 1,0x23,23)
+	ROUND3(a,b,c,d, 4,0x24, 4)
+	ROUND3(d,a,b,c, 7,0x25,11)
+	ROUND3(c,d,a,b,10,0x26,16)
+	ROUND3(b,c,d,a,13,0x27,23)
+	ROUND3(a,b,c,d, 0,0x28, 4)
+	ROUND3(d,a,b,c, 3,0x29,11)
+	ROUND3(c,d,a,b, 6,0x2a,16)
+	ROUND3(b,c,d,a, 9,0x2b,23)
+	ROUND3(a,b,c,d,12,0x2c, 4)
+	ROUND3(d,a,b,c,15,0x2d,11)
+	ROUND3(c,d,a,b, 2,0x2e,16)
+	ROUND3(b,c,d,a, 0,0x2f,23)
+
+	// Round 4 starts with word 0 and needs tmp == ~d.
+	load(0)
+	VPXOR d, ones, tmp
+
+	ROUND4(a,b,c,d, 7,0x30, 6)
+	ROUND4(d,a,b,c,14,0x31,10)
+	ROUND4(c,d,a,b, 5,0x32,15)
+	ROUND4(b,c,d,a,12,0x33,21)
+	ROUND4(a,b,c,d, 3,0x34, 6)
+	ROUND4(d,a,b,c,10,0x35,10)
+	ROUND4(c,d,a,b, 1,0x36,15)
+	ROUND4(b,c,d,a, 8,0x37,21)
+	ROUND4(a,b,c,d,15,0x38, 6)
+	ROUND4(d,a,b,c, 6,0x39,10)
+	ROUND4(c,d,a,b,13,0x3a,15)
+	ROUND4(b,c,d,a, 4,0x3b,21)
+	ROUND4(a,b,c,d,11,0x3c, 6)
+	ROUND4(d,a,b,c, 2,0x3d,10)
+	ROUND4(c,d,a,b, 9,0x3e,15)
+	ROUND4(b,c,d,a, 0,0x3f,21)
+
+	// Add the state saved at the top of the block back into the result.
+	VPADDD sa, a, a
+	VPADDD sb, b, b
+	VPADDD sc, c, c
+	VPADDD sd, d, d
+
+	// Advance every lane by one 64-byte block; repeat until count hits zero.
+	LEAQ 64(base), base
+	SUBQ $64, count
+	JNE loop
+
+	// Write the updated digests back to state.
+	VMOVUPD a, (dig)
+	VMOVUPD b, 32(dig)
+	VMOVUPD c, 64(dig)
+	VMOVUPD d, 96(dig)
+
+	// Clear the upper YMM halves before returning to non-AVX code.
+	VZEROUPPER
+	RET
--
cgit v1.2.3