diff options
Diffstat (limited to 'vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.go')
-rw-r--r-- | vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.go | 500 |
1 files changed, 500 insertions, 0 deletions
diff --git a/vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.go b/vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.go new file mode 100644 index 00000000..b7d7c163 --- /dev/null +++ b/vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.go @@ -0,0 +1,500 @@ +//+build !noasm,!appengine,gc + +/* + * Minio Cloud Storage, (C) 2017 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package sha256 + +import ( + "encoding/binary" + "errors" + "hash" + "sort" + "sync/atomic" + "time" +) + +//go:noescape +func sha256X16Avx512(digests *[512]byte, scratch *[512]byte, table *[512]uint64, mask []uint64, inputs [16][]byte) + +// Avx512ServerUID - Do not start at 0 but next multiple of 16 so as to be able to +// differentiate with default initialiation value of 0 +const Avx512ServerUID = 16 + +var uidCounter uint64 + +// NewAvx512 - initialize sha256 Avx512 implementation. +func NewAvx512(a512srv *Avx512Server) hash.Hash { + uid := atomic.AddUint64(&uidCounter, 1) + return &Avx512Digest{uid: uid, a512srv: a512srv} +} + +// Avx512Digest - Type for computing SHA256 using Avx512 +type Avx512Digest struct { + uid uint64 + a512srv *Avx512Server + x [chunk]byte + nx int + len uint64 + final bool + result [Size]byte +} + +// Size - Return size of checksum +func (d *Avx512Digest) Size() int { return Size } + +// BlockSize - Return blocksize of checksum +func (d Avx512Digest) BlockSize() int { return BlockSize } + +// Reset - reset sha digest to its initial values +func (d *Avx512Digest) Reset() { + d.a512srv.blocksCh <- blockInput{uid: d.uid, reset: true} + d.nx = 0 + d.len = 0 + d.final = false +} + +// Write to digest +func (d *Avx512Digest) Write(p []byte) (nn int, err error) { + + if d.final { + return 0, errors.New("Avx512Digest already finalized. Reset first before writing again") + } + + nn = len(p) + d.len += uint64(nn) + if d.nx > 0 { + n := copy(d.x[d.nx:], p) + d.nx += n + if d.nx == chunk { + d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: d.x[:]} + d.nx = 0 + } + p = p[n:] + } + if len(p) >= chunk { + n := len(p) &^ (chunk - 1) + d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: p[:n]} + p = p[n:] + } + if len(p) > 0 { + d.nx = copy(d.x[:], p) + } + return +} + +// Sum - Return sha256 sum in bytes +func (d *Avx512Digest) Sum(in []byte) (result []byte) { + + if d.final { + return append(in, d.result[:]...) + } + + trail := make([]byte, 0, 128) + trail = append(trail, d.x[:d.nx]...) + + len := d.len + // Padding. Add a 1 bit and 0 bits until 56 bytes mod 64. + var tmp [64]byte + tmp[0] = 0x80 + if len%64 < 56 { + trail = append(trail, tmp[0:56-len%64]...) + } else { + trail = append(trail, tmp[0:64+56-len%64]...) + } + d.nx = 0 + + // Length in bits. + len <<= 3 + for i := uint(0); i < 8; i++ { + tmp[i] = byte(len >> (56 - 8*i)) + } + trail = append(trail, tmp[0:8]...) + + sumCh := make(chan [Size]byte) + d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: trail, final: true, sumCh: sumCh} + d.result = <-sumCh + d.final = true + return append(in, d.result[:]...) +} + +var table = [512]uint64{ + 0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98, + 0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98, + 0x7137449171374491, 0x7137449171374491, 0x7137449171374491, 0x7137449171374491, + 0x7137449171374491, 0x7137449171374491, 0x7137449171374491, 0x7137449171374491, + 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, + 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, + 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, + 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, + 0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b, + 0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b, + 0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1, + 0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1, + 0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4, + 0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4, + 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, + 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, + 0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98, + 0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98, + 0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01, + 0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01, + 0x243185be243185be, 0x243185be243185be, 0x243185be243185be, 0x243185be243185be, + 0x243185be243185be, 0x243185be243185be, 0x243185be243185be, 0x243185be243185be, + 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, + 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, + 0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74, + 0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74, + 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, + 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, + 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, + 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, + 0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174, + 0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174, + 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, + 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, + 0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786, + 0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786, + 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, + 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, + 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, + 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, + 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, + 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, + 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, + 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, + 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, + 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, + 0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da, + 0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da, + 0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152, + 0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152, + 0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d, + 0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d, + 0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8, + 0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8, + 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, + 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, + 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, + 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, + 0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147, + 0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147, + 0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351, + 0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351, + 0x1429296714292967, 0x1429296714292967, 0x1429296714292967, 0x1429296714292967, + 0x1429296714292967, 0x1429296714292967, 0x1429296714292967, 0x1429296714292967, + 0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85, + 0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85, + 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, + 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, + 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, + 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, + 0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13, + 0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13, + 0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354, + 0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354, + 0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb, + 0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb, + 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, + 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, + 0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85, + 0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85, + 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, + 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, + 0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b, + 0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b, + 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, + 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, + 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, + 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, + 0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819, + 0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819, + 0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624, + 0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624, + 0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585, + 0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585, + 0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070, + 0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070, + 0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116, + 0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116, + 0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08, + 0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08, + 0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c, + 0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c, + 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, + 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, + 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, + 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, + 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, + 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, + 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, + 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, + 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, + 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, + 0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee, + 0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee, + 0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f, + 0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f, + 0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814, + 0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814, + 0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208, + 0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208, + 0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa, + 0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa, + 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, + 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, + 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, + 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, + 0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2, + 0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2} + +// Interface function to assembly ode +func blockAvx512(digests *[512]byte, input [16][]byte, mask []uint64) [16][Size]byte { + + scratch := [512]byte{} + sha256X16Avx512(digests, &scratch, &table, mask, input) + + output := [16][Size]byte{} + for i := 0; i < 16; i++ { + output[i] = getDigest(i, digests[:]) + } + + return output +} + +func getDigest(index int, state []byte) (sum [Size]byte) { + for j := 0; j < 16; j += 2 { + for i := index*4 + j*Size; i < index*4+(j+1)*Size; i += Size { + binary.BigEndian.PutUint32(sum[j*2:], binary.LittleEndian.Uint32(state[i:i+4])) + } + } + return +} + +// Message to send across input channel +type blockInput struct { + uid uint64 + msg []byte + reset bool + final bool + sumCh chan [Size]byte +} + +// Avx512Server - Type to implement 16x parallel handling of SHA256 invocations +type Avx512Server struct { + blocksCh chan blockInput // Input channel + totalIn int // Total number of inputs waiting to be processed + lanes [16]Avx512LaneInfo // Array with info per lane (out of 16) + digests map[uint64][Size]byte // Map of uids to (interim) digest results +} + +// Avx512LaneInfo - Info for each lane +type Avx512LaneInfo struct { + uid uint64 // unique identification for this SHA processing + block []byte // input block to be processed + outputCh chan [Size]byte // channel for output result +} + +// NewAvx512Server - Create new object for parallel processing handling +func NewAvx512Server() *Avx512Server { + a512srv := &Avx512Server{} + a512srv.digests = make(map[uint64][Size]byte) + a512srv.blocksCh = make(chan blockInput) + + // Start a single thread for reading from the input channel + go a512srv.Process() + return a512srv +} + +// Process - Sole handler for reading from the input channel +func (a512srv *Avx512Server) Process() { + for { + select { + case block := <-a512srv.blocksCh: + if block.reset { + a512srv.reset(block.uid) + continue + } + index := block.uid & 0xf + // fmt.Println("Adding message:", block.uid, index) + + if a512srv.lanes[index].block != nil { // If slot is already filled, process all inputs + //fmt.Println("Invoking Blocks()") + a512srv.blocks() + } + a512srv.totalIn++ + a512srv.lanes[index] = Avx512LaneInfo{uid: block.uid, block: block.msg} + if block.final { + a512srv.lanes[index].outputCh = block.sumCh + } + if a512srv.totalIn == len(a512srv.lanes) { + // fmt.Println("Invoking Blocks() while FULL: ") + a512srv.blocks() + } + + // TODO: test with larger timeout + case <-time.After(1 * time.Microsecond): + for _, lane := range a512srv.lanes { + if lane.block != nil { // check if there is any input to process + // fmt.Println("Invoking Blocks() on TIMEOUT: ") + a512srv.blocks() + break // we are done + } + } + } + } +} + +// Do a reset for this calculation +func (a512srv *Avx512Server) reset(uid uint64) { + + // Check if there is a message still waiting to be processed (and remove if so) + for i, lane := range a512srv.lanes { + if lane.uid == uid { + if lane.block != nil { + a512srv.lanes[i] = Avx512LaneInfo{} // clear message + a512srv.totalIn-- + } + } + } + + // Delete entry from hash map + delete(a512srv.digests, uid) +} + +// Invoke assembly and send results back +func (a512srv *Avx512Server) blocks() { + + inputs := [16][]byte{} + for i := range inputs { + inputs[i] = a512srv.lanes[i].block + } + + mask := expandMask(genMask(inputs)) + outputs := blockAvx512(a512srv.getDigests(), inputs, mask) + + a512srv.totalIn = 0 + for i := 0; i < len(outputs); i++ { + uid, outputCh := a512srv.lanes[i].uid, a512srv.lanes[i].outputCh + a512srv.digests[uid] = outputs[i] + a512srv.lanes[i] = Avx512LaneInfo{} + + if outputCh != nil { + // Send back result + outputCh <- outputs[i] + delete(a512srv.digests, uid) // Delete entry from hashmap + } + } +} + +func (a512srv *Avx512Server) Write(uid uint64, p []byte) (nn int, err error) { + a512srv.blocksCh <- blockInput{uid: uid, msg: p} + return len(p), nil +} + +// Sum - return sha256 sum in bytes for a given sum id. +func (a512srv *Avx512Server) Sum(uid uint64, p []byte) [32]byte { + sumCh := make(chan [32]byte) + a512srv.blocksCh <- blockInput{uid: uid, msg: p, final: true, sumCh: sumCh} + return <-sumCh +} + +func (a512srv *Avx512Server) getDigests() *[512]byte { + digests := [512]byte{} + for i, lane := range a512srv.lanes { + a, ok := a512srv.digests[lane.uid] + if ok { + binary.BigEndian.PutUint32(digests[(i+0*16)*4:], binary.LittleEndian.Uint32(a[0:4])) + binary.BigEndian.PutUint32(digests[(i+1*16)*4:], binary.LittleEndian.Uint32(a[4:8])) + binary.BigEndian.PutUint32(digests[(i+2*16)*4:], binary.LittleEndian.Uint32(a[8:12])) + binary.BigEndian.PutUint32(digests[(i+3*16)*4:], binary.LittleEndian.Uint32(a[12:16])) + binary.BigEndian.PutUint32(digests[(i+4*16)*4:], binary.LittleEndian.Uint32(a[16:20])) + binary.BigEndian.PutUint32(digests[(i+5*16)*4:], binary.LittleEndian.Uint32(a[20:24])) + binary.BigEndian.PutUint32(digests[(i+6*16)*4:], binary.LittleEndian.Uint32(a[24:28])) + binary.BigEndian.PutUint32(digests[(i+7*16)*4:], binary.LittleEndian.Uint32(a[28:32])) + } else { + binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0) + binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1) + binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2) + binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3) + binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4) + binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5) + binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6) + binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7) + } + } + return &digests +} + +// Helper struct for sorting blocks based on length +type lane struct { + len uint + pos uint +} + +type lanes []lane + +func (lns lanes) Len() int { return len(lns) } +func (lns lanes) Swap(i, j int) { lns[i], lns[j] = lns[j], lns[i] } +func (lns lanes) Less(i, j int) bool { return lns[i].len < lns[j].len } + +// Helper struct for +type maskRounds struct { + mask uint64 + rounds uint64 +} + +func genMask(input [16][]byte) [16]maskRounds { + + // Sort on blocks length small to large + var sorted [16]lane + for c, inpt := range input { + sorted[c] = lane{uint(len(inpt)), uint(c)} + } + sort.Sort(lanes(sorted[:])) + + // Create mask array including 'rounds' between masks + m, round, index := uint64(0xffff), uint64(0), 0 + var mr [16]maskRounds + for _, s := range sorted { + if s.len > 0 { + if uint64(s.len)>>6 > round { + mr[index] = maskRounds{m, (uint64(s.len) >> 6) - round} + index++ + } + round = uint64(s.len) >> 6 + } + m = m & ^(1 << uint(s.pos)) + } + + return mr +} + +// TODO: remove function +func expandMask(mr [16]maskRounds) []uint64 { + size := uint64(0) + for _, r := range mr { + size += r.rounds + } + result, index := make([]uint64, size), 0 + for _, r := range mr { + for j := uint64(0); j < r.rounds; j++ { + result[index] = r.mask + index++ + } + } + return result +} |