From 3893a035be347a7687a41d2054dd1b274d3a0504 Mon Sep 17 00:00:00 2001 From: Wim Date: Sun, 12 Dec 2021 00:05:15 +0100 Subject: Update dependencies/vendor (#1659) --- vendor/golang.org/x/crypto/acme/acme.go | 2 +- vendor/golang.org/x/crypto/acme/rfc8555.go | 26 ++ .../x/crypto/internal/poly1305/bits_compat.go | 40 ++ .../x/crypto/internal/poly1305/bits_go1.13.go | 22 + .../x/crypto/internal/poly1305/mac_noasm.go | 10 + .../x/crypto/internal/poly1305/poly1305.go | 99 ++++ .../x/crypto/internal/poly1305/sum_amd64.go | 48 ++ .../x/crypto/internal/poly1305/sum_amd64.s | 109 +++++ .../x/crypto/internal/poly1305/sum_generic.go | 310 +++++++++++++ .../x/crypto/internal/poly1305/sum_ppc64le.go | 48 ++ .../x/crypto/internal/poly1305/sum_ppc64le.s | 182 ++++++++ .../x/crypto/internal/poly1305/sum_s390x.go | 76 ++++ .../x/crypto/internal/poly1305/sum_s390x.s | 504 +++++++++++++++++++++ .../x/crypto/nacl/secretbox/secretbox.go | 2 +- vendor/golang.org/x/crypto/poly1305/bits_compat.go | 40 -- vendor/golang.org/x/crypto/poly1305/bits_go1.13.go | 22 - vendor/golang.org/x/crypto/poly1305/mac_noasm.go | 10 - vendor/golang.org/x/crypto/poly1305/poly1305.go | 99 ---- vendor/golang.org/x/crypto/poly1305/sum_amd64.go | 48 -- vendor/golang.org/x/crypto/poly1305/sum_amd64.s | 109 ----- vendor/golang.org/x/crypto/poly1305/sum_generic.go | 310 ------------- vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go | 48 -- vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s | 182 -------- vendor/golang.org/x/crypto/poly1305/sum_s390x.go | 76 ---- vendor/golang.org/x/crypto/poly1305/sum_s390x.s | 504 --------------------- vendor/golang.org/x/crypto/ssh/cipher.go | 2 +- 26 files changed, 1477 insertions(+), 1451 deletions(-) create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/poly1305.go create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.go create mode 100644 vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.s delete mode 100644 vendor/golang.org/x/crypto/poly1305/bits_compat.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/bits_go1.13.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/mac_noasm.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/poly1305.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_amd64.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_amd64.s delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_generic.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_s390x.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_s390x.s (limited to 'vendor/golang.org/x/crypto') diff --git a/vendor/golang.org/x/crypto/acme/acme.go b/vendor/golang.org/x/crypto/acme/acme.go index 174cfe8b..73b19ef3 100644 --- a/vendor/golang.org/x/crypto/acme/acme.go +++ b/vendor/golang.org/x/crypto/acme/acme.go @@ -4,7 +4,7 @@ // Package acme provides an implementation of the // Automatic Certificate Management Environment (ACME) spec. -// The intial implementation was based on ACME draft-02 and +// The initial implementation was based on ACME draft-02 and // is now being extended to comply with RFC 8555. // See https://tools.ietf.org/html/draft-ietf-acme-acme-02 // and https://tools.ietf.org/html/rfc8555 for details. diff --git a/vendor/golang.org/x/crypto/acme/rfc8555.go b/vendor/golang.org/x/crypto/acme/rfc8555.go index 073cee58..f9d3011f 100644 --- a/vendor/golang.org/x/crypto/acme/rfc8555.go +++ b/vendor/golang.org/x/crypto/acme/rfc8555.go @@ -410,3 +410,29 @@ func isAlreadyRevoked(err error) bool { e, ok := err.(*Error) return ok && e.ProblemType == "urn:ietf:params:acme:error:alreadyRevoked" } + +// ListCertAlternates retrieves any alternate certificate chain URLs for the +// given certificate chain URL. These alternate URLs can be passed to FetchCert +// in order to retrieve the alternate certificate chains. +// +// If there are no alternate issuer certificate chains, a nil slice will be +// returned. +func (c *Client) ListCertAlternates(ctx context.Context, url string) ([]string, error) { + if _, err := c.Discover(ctx); err != nil { // required by c.accountKID + return nil, err + } + + res, err := c.postAsGet(ctx, url, wantStatus(http.StatusOK)) + if err != nil { + return nil, err + } + defer res.Body.Close() + + // We don't need the body but we need to discard it so we don't end up + // preventing keep-alive + if _, err := io.Copy(ioutil.Discard, res.Body); err != nil { + return nil, fmt.Errorf("acme: cert alternates response stream: %v", err) + } + alts := linkHeader(res.Header, "alternate") + return alts, nil +} diff --git a/vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go b/vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go new file mode 100644 index 00000000..45b5c966 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go @@ -0,0 +1,40 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !go1.13 +// +build !go1.13 + +package poly1305 + +// Generic fallbacks for the math/bits intrinsics, copied from +// src/math/bits/bits.go. They were added in Go 1.12, but Add64 and Sum64 had +// variable time fallbacks until Go 1.13. + +func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { + sum = x + y + carry + carryOut = ((x & y) | ((x | y) &^ sum)) >> 63 + return +} + +func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { + diff = x - y - borrow + borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> 63 + return +} + +func bitsMul64(x, y uint64) (hi, lo uint64) { + const mask32 = 1<<32 - 1 + x0 := x & mask32 + x1 := x >> 32 + y0 := y & mask32 + y1 := y >> 32 + w0 := x0 * y0 + t := x1*y0 + w0>>32 + w1 := t & mask32 + w2 := t >> 32 + w1 += x0 * y1 + hi = x1*y1 + w2 + w1>>32 + lo = x * y + return +} diff --git a/vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go b/vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go new file mode 100644 index 00000000..ed52b341 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go @@ -0,0 +1,22 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.13 +// +build go1.13 + +package poly1305 + +import "math/bits" + +func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { + return bits.Add64(x, y, carry) +} + +func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { + return bits.Sub64(x, y, borrow) +} + +func bitsMul64(x, y uint64) (hi, lo uint64) { + return bits.Mul64(x, y) +} diff --git a/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go b/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go new file mode 100644 index 00000000..f184b67d --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go @@ -0,0 +1,10 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego +// +build !amd64,!ppc64le,!s390x !gc purego + +package poly1305 + +type mac struct{ macGeneric } diff --git a/vendor/golang.org/x/crypto/internal/poly1305/poly1305.go b/vendor/golang.org/x/crypto/internal/poly1305/poly1305.go new file mode 100644 index 00000000..4aaea810 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/poly1305.go @@ -0,0 +1,99 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package poly1305 implements Poly1305 one-time message authentication code as +// specified in https://cr.yp.to/mac/poly1305-20050329.pdf. +// +// Poly1305 is a fast, one-time authentication function. It is infeasible for an +// attacker to generate an authenticator for a message without the key. However, a +// key must only be used for a single message. Authenticating two different +// messages with the same key allows an attacker to forge authenticators for other +// messages with the same key. +// +// Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was +// used with a fixed key in order to generate one-time keys from an nonce. +// However, in this package AES isn't used and the one-time key is specified +// directly. +package poly1305 + +import "crypto/subtle" + +// TagSize is the size, in bytes, of a poly1305 authenticator. +const TagSize = 16 + +// Sum generates an authenticator for msg using a one-time key and puts the +// 16-byte result into out. Authenticating two different messages with the same +// key allows an attacker to forge messages at will. +func Sum(out *[16]byte, m []byte, key *[32]byte) { + h := New(key) + h.Write(m) + h.Sum(out[:0]) +} + +// Verify returns true if mac is a valid authenticator for m with the given key. +func Verify(mac *[16]byte, m []byte, key *[32]byte) bool { + var tmp [16]byte + Sum(&tmp, m, key) + return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1 +} + +// New returns a new MAC computing an authentication +// tag of all data written to it with the given key. +// This allows writing the message progressively instead +// of passing it as a single slice. Common users should use +// the Sum function instead. +// +// The key must be unique for each message, as authenticating +// two different messages with the same key allows an attacker +// to forge messages at will. +func New(key *[32]byte) *MAC { + m := &MAC{} + initialize(key, &m.macState) + return m +} + +// MAC is an io.Writer computing an authentication tag +// of the data written to it. +// +// MAC cannot be used like common hash.Hash implementations, +// because using a poly1305 key twice breaks its security. +// Therefore writing data to a running MAC after calling +// Sum or Verify causes it to panic. +type MAC struct { + mac // platform-dependent implementation + + finalized bool +} + +// Size returns the number of bytes Sum will return. +func (h *MAC) Size() int { return TagSize } + +// Write adds more data to the running message authentication code. +// It never returns an error. +// +// It must not be called after the first call of Sum or Verify. +func (h *MAC) Write(p []byte) (n int, err error) { + if h.finalized { + panic("poly1305: write to MAC after Sum or Verify") + } + return h.mac.Write(p) +} + +// Sum computes the authenticator of all data written to the +// message authentication code. +func (h *MAC) Sum(b []byte) []byte { + var mac [TagSize]byte + h.mac.Sum(&mac) + h.finalized = true + return append(b, mac[:]...) +} + +// Verify returns whether the authenticator of all data written to +// the message authentication code matches the expected value. +func (h *MAC) Verify(expected []byte) bool { + var mac [TagSize]byte + h.mac.Sum(&mac) + h.finalized = true + return subtle.ConstantTimeCompare(expected, mac[:]) == 1 +} diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go new file mode 100644 index 00000000..6d522333 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go @@ -0,0 +1,48 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +package poly1305 + +//go:noescape +func update(state *macState, msg []byte) + +// mac is a wrapper for macGeneric that redirects calls that would have gone to +// updateGeneric to update. +// +// Its Write and Sum methods are otherwise identical to the macGeneric ones, but +// using function pointers would carry a major performance cost. +type mac struct{ macGeneric } + +func (h *mac) Write(p []byte) (int, error) { + nn := len(p) + if h.offset > 0 { + n := copy(h.buffer[h.offset:], p) + if h.offset+n < TagSize { + h.offset += n + return nn, nil + } + p = p[n:] + h.offset = 0 + update(&h.macState, h.buffer[:]) + } + if n := len(p) - (len(p) % TagSize); n > 0 { + update(&h.macState, p[:n]) + p = p[n:] + } + if len(p) > 0 { + h.offset += copy(h.buffer[h.offset:], p) + } + return nn, nil +} + +func (h *mac) Sum(out *[16]byte) { + state := h.macState + if h.offset > 0 { + update(&state, h.buffer[:h.offset]) + } + finalize(out, &state.h, &state.s) +} diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s new file mode 100644 index 00000000..1d74f0f8 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s @@ -0,0 +1,109 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +#include "textflag.h" + +#define POLY1305_ADD(msg, h0, h1, h2) \ + ADDQ 0(msg), h0; \ + ADCQ 8(msg), h1; \ + ADCQ $1, h2; \ + LEAQ 16(msg), msg + +#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ + MOVQ r0, AX; \ + MULQ h0; \ + MOVQ AX, t0; \ + MOVQ DX, t1; \ + MOVQ r0, AX; \ + MULQ h1; \ + ADDQ AX, t1; \ + ADCQ $0, DX; \ + MOVQ r0, t2; \ + IMULQ h2, t2; \ + ADDQ DX, t2; \ + \ + MOVQ r1, AX; \ + MULQ h0; \ + ADDQ AX, t1; \ + ADCQ $0, DX; \ + MOVQ DX, h0; \ + MOVQ r1, t3; \ + IMULQ h2, t3; \ + MOVQ r1, AX; \ + MULQ h1; \ + ADDQ AX, t2; \ + ADCQ DX, t3; \ + ADDQ h0, t2; \ + ADCQ $0, t3; \ + \ + MOVQ t0, h0; \ + MOVQ t1, h1; \ + MOVQ t2, h2; \ + ANDQ $3, h2; \ + MOVQ t2, t0; \ + ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ + ADDQ t0, h0; \ + ADCQ t3, h1; \ + ADCQ $0, h2; \ + SHRQ $2, t3, t2; \ + SHRQ $2, t3; \ + ADDQ t2, h0; \ + ADCQ t3, h1; \ + ADCQ $0, h2 + +// func update(state *[7]uint64, msg []byte) +TEXT ·update(SB), $0-32 + MOVQ state+0(FP), DI + MOVQ msg_base+8(FP), SI + MOVQ msg_len+16(FP), R15 + + MOVQ 0(DI), R8 // h0 + MOVQ 8(DI), R9 // h1 + MOVQ 16(DI), R10 // h2 + MOVQ 24(DI), R11 // r0 + MOVQ 32(DI), R12 // r1 + + CMPQ R15, $16 + JB bytes_between_0_and_15 + +loop: + POLY1305_ADD(SI, R8, R9, R10) + +multiply: + POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) + SUBQ $16, R15 + CMPQ R15, $16 + JAE loop + +bytes_between_0_and_15: + TESTQ R15, R15 + JZ done + MOVQ $1, BX + XORQ CX, CX + XORQ R13, R13 + ADDQ R15, SI + +flush_buffer: + SHLQ $8, BX, CX + SHLQ $8, BX + MOVB -1(SI), R13 + XORQ R13, BX + DECQ SI + DECQ R15 + JNZ flush_buffer + + ADDQ BX, R8 + ADCQ CX, R9 + ADCQ $0, R10 + MOVQ $16, R15 + JMP multiply + +done: + MOVQ R8, 0(DI) + MOVQ R9, 8(DI) + MOVQ R10, 16(DI) + RET diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go new file mode 100644 index 00000000..c942a659 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go @@ -0,0 +1,310 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file provides the generic implementation of Sum and MAC. Other files +// might provide optimized assembly implementations of some of this code. + +package poly1305 + +import "encoding/binary" + +// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag +// for a 64 bytes message is approximately +// +// s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5 +// +// for some secret r and s. It can be computed sequentially like +// +// for len(msg) > 0: +// h += read(msg, 16) +// h *= r +// h %= 2¹³⁰ - 5 +// return h + s +// +// All the complexity is about doing performant constant-time math on numbers +// larger than any available numeric type. + +func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) { + h := newMACGeneric(key) + h.Write(msg) + h.Sum(out) +} + +func newMACGeneric(key *[32]byte) macGeneric { + m := macGeneric{} + initialize(key, &m.macState) + return m +} + +// macState holds numbers in saturated 64-bit little-endian limbs. That is, +// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸. +type macState struct { + // h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but + // can grow larger during and after rounds. It must, however, remain below + // 2 * (2¹³⁰ - 5). + h [3]uint64 + // r and s are the private key components. + r [2]uint64 + s [2]uint64 +} + +type macGeneric struct { + macState + + buffer [TagSize]byte + offset int +} + +// Write splits the incoming message into TagSize chunks, and passes them to +// update. It buffers incomplete chunks. +func (h *macGeneric) Write(p []byte) (int, error) { + nn := len(p) + if h.offset > 0 { + n := copy(h.buffer[h.offset:], p) + if h.offset+n < TagSize { + h.offset += n + return nn, nil + } + p = p[n:] + h.offset = 0 + updateGeneric(&h.macState, h.buffer[:]) + } + if n := len(p) - (len(p) % TagSize); n > 0 { + updateGeneric(&h.macState, p[:n]) + p = p[n:] + } + if len(p) > 0 { + h.offset += copy(h.buffer[h.offset:], p) + } + return nn, nil +} + +// Sum flushes the last incomplete chunk from the buffer, if any, and generates +// the MAC output. It does not modify its state, in order to allow for multiple +// calls to Sum, even if no Write is allowed after Sum. +func (h *macGeneric) Sum(out *[TagSize]byte) { + state := h.macState + if h.offset > 0 { + updateGeneric(&state, h.buffer[:h.offset]) + } + finalize(out, &state.h, &state.s) +} + +// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It +// clears some bits of the secret coefficient to make it possible to implement +// multiplication more efficiently. +const ( + rMask0 = 0x0FFFFFFC0FFFFFFF + rMask1 = 0x0FFFFFFC0FFFFFFC +) + +// initialize loads the 256-bit key into the two 128-bit secret values r and s. +func initialize(key *[32]byte, m *macState) { + m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0 + m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1 + m.s[0] = binary.LittleEndian.Uint64(key[16:24]) + m.s[1] = binary.LittleEndian.Uint64(key[24:32]) +} + +// uint128 holds a 128-bit number as two 64-bit limbs, for use with the +// bits.Mul64 and bits.Add64 intrinsics. +type uint128 struct { + lo, hi uint64 +} + +func mul64(a, b uint64) uint128 { + hi, lo := bitsMul64(a, b) + return uint128{lo, hi} +} + +func add128(a, b uint128) uint128 { + lo, c := bitsAdd64(a.lo, b.lo, 0) + hi, c := bitsAdd64(a.hi, b.hi, c) + if c != 0 { + panic("poly1305: unexpected overflow") + } + return uint128{lo, hi} +} + +func shiftRightBy2(a uint128) uint128 { + a.lo = a.lo>>2 | (a.hi&3)<<62 + a.hi = a.hi >> 2 + return a +} + +// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of +// 128 bits of message, it computes +// +// h₊ = (h + m) * r mod 2¹³⁰ - 5 +// +// If the msg length is not a multiple of TagSize, it assumes the last +// incomplete chunk is the final one. +func updateGeneric(state *macState, msg []byte) { + h0, h1, h2 := state.h[0], state.h[1], state.h[2] + r0, r1 := state.r[0], state.r[1] + + for len(msg) > 0 { + var c uint64 + + // For the first step, h + m, we use a chain of bits.Add64 intrinsics. + // The resulting value of h might exceed 2¹³⁰ - 5, but will be partially + // reduced at the end of the multiplication below. + // + // The spec requires us to set a bit just above the message size, not to + // hide leading zeroes. For full chunks, that's 1 << 128, so we can just + // add 1 to the most significant (2¹²⁸) limb, h2. + if len(msg) >= TagSize { + h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0) + h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c) + h2 += c + 1 + + msg = msg[TagSize:] + } else { + var buf [TagSize]byte + copy(buf[:], msg) + buf[len(msg)] = 1 + + h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0) + h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c) + h2 += c + + msg = nil + } + + // Multiplication of big number limbs is similar to elementary school + // columnar multiplication. Instead of digits, there are 64-bit limbs. + // + // We are multiplying a 3 limbs number, h, by a 2 limbs number, r. + // + // h2 h1 h0 x + // r1 r0 = + // ---------------- + // h2r0 h1r0 h0r0 <-- individual 128-bit products + // + h2r1 h1r1 h0r1 + // ------------------------ + // m3 m2 m1 m0 <-- result in 128-bit overlapping limbs + // ------------------------ + // m3.hi m2.hi m1.hi m0.hi <-- carry propagation + // + m3.lo m2.lo m1.lo m0.lo + // ------------------------------- + // t4 t3 t2 t1 t0 <-- final result in 64-bit limbs + // + // The main difference from pen-and-paper multiplication is that we do + // carry propagation in a separate step, as if we wrote two digit sums + // at first (the 128-bit limbs), and then carried the tens all at once. + + h0r0 := mul64(h0, r0) + h1r0 := mul64(h1, r0) + h2r0 := mul64(h2, r0) + h0r1 := mul64(h0, r1) + h1r1 := mul64(h1, r1) + h2r1 := mul64(h2, r1) + + // Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their + // top 4 bits cleared by rMask{0,1}, we know that their product is not going + // to overflow 64 bits, so we can ignore the high part of the products. + // + // This also means that the product doesn't have a fifth limb (t4). + if h2r0.hi != 0 { + panic("poly1305: unexpected overflow") + } + if h2r1.hi != 0 { + panic("poly1305: unexpected overflow") + } + + m0 := h0r0 + m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again + m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1. + m3 := h2r1 + + t0 := m0.lo + t1, c := bitsAdd64(m1.lo, m0.hi, 0) + t2, c := bitsAdd64(m2.lo, m1.hi, c) + t3, _ := bitsAdd64(m3.lo, m2.hi, c) + + // Now we have the result as 4 64-bit limbs, and we need to reduce it + // modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do + // a cheap partial reduction according to the reduction identity + // + // c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5 + // + // because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is + // likely to be larger than 2¹³⁰ - 5, but still small enough to fit the + // assumptions we make about h in the rest of the code. + // + // See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23 + + // We split the final result at the 2¹³⁰ mark into h and cc, the carry. + // Note that the carry bits are effectively shifted left by 2, in other + // words, cc = c * 4 for the c in the reduction identity. + h0, h1, h2 = t0, t1, t2&maskLow2Bits + cc := uint128{t2 & maskNotLow2Bits, t3} + + // To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c. + + h0, c = bitsAdd64(h0, cc.lo, 0) + h1, c = bitsAdd64(h1, cc.hi, c) + h2 += c + + cc = shiftRightBy2(cc) + + h0, c = bitsAdd64(h0, cc.lo, 0) + h1, c = bitsAdd64(h1, cc.hi, c) + h2 += c + + // h2 is at most 3 + 1 + 1 = 5, making the whole of h at most + // + // 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1 + } + + state.h[0], state.h[1], state.h[2] = h0, h1, h2 +} + +const ( + maskLow2Bits uint64 = 0x0000000000000003 + maskNotLow2Bits uint64 = ^maskLow2Bits +) + +// select64 returns x if v == 1 and y if v == 0, in constant time. +func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y } + +// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order. +const ( + p0 = 0xFFFFFFFFFFFFFFFB + p1 = 0xFFFFFFFFFFFFFFFF + p2 = 0x0000000000000003 +) + +// finalize completes the modular reduction of h and computes +// +// out = h + s mod 2¹²⁸ +// +func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) { + h0, h1, h2 := h[0], h[1], h[2] + + // After the partial reduction in updateGeneric, h might be more than + // 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction + // in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the + // result if the subtraction underflows, and t otherwise. + + hMinusP0, b := bitsSub64(h0, p0, 0) + hMinusP1, b := bitsSub64(h1, p1, b) + _, b = bitsSub64(h2, p2, b) + + // h = h if h < p else h - p + h0 = select64(b, h0, hMinusP0) + h1 = select64(b, h1, hMinusP1) + + // Finally, we compute the last Poly1305 step + // + // tag = h + s mod 2¹²⁸ + // + // by just doing a wide addition with the 128 low bits of h and discarding + // the overflow. + h0, c := bitsAdd64(h0, s[0], 0) + h1, _ = bitsAdd64(h1, s[1], c) + + binary.LittleEndian.PutUint64(out[0:8], h0) + binary.LittleEndian.PutUint64(out[8:16], h1) +} diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go new file mode 100644 index 00000000..4a069941 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go @@ -0,0 +1,48 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +package poly1305 + +//go:noescape +func update(state *macState, msg []byte) + +// mac is a wrapper for macGeneric that redirects calls that would have gone to +// updateGeneric to update. +// +// Its Write and Sum methods are otherwise identical to the macGeneric ones, but +// using function pointers would carry a major performance cost. +type mac struct{ macGeneric } + +func (h *mac) Write(p []byte) (int, error) { + nn := len(p) + if h.offset > 0 { + n := copy(h.buffer[h.offset:], p) + if h.offset+n < TagSize { + h.offset += n + return nn, nil + } + p = p[n:] + h.offset = 0 + update(&h.macState, h.buffer[:]) + } + if n := len(p) - (len(p) % TagSize); n > 0 { + update(&h.macState, p[:n]) + p = p[n:] + } + if len(p) > 0 { + h.offset += copy(h.buffer[h.offset:], p) + } + return nn, nil +} + +func (h *mac) Sum(out *[16]byte) { + state := h.macState + if h.offset > 0 { + update(&state, h.buffer[:h.offset]) + } + finalize(out, &state.h, &state.s) +} diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s new file mode 100644 index 00000000..58422aad --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s @@ -0,0 +1,182 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +#include "textflag.h" + +// This was ported from the amd64 implementation. + +#define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \ + MOVD (msg), t0; \ + MOVD 8(msg), t1; \ + MOVD $1, t2; \ + ADDC t0, h0, h0; \ + ADDE t1, h1, h1; \ + ADDE t2, h2; \ + ADD $16, msg + +#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \ + MULLD r0, h0, t0; \ + MULLD r0, h1, t4; \ + MULHDU r0, h0, t1; \ + MULHDU r0, h1, t5; \ + ADDC t4, t1, t1; \ + MULLD r0, h2, t2; \ + ADDZE t5; \ + MULHDU r1, h0, t4; \ + MULLD r1, h0, h0; \ + ADD t5, t2, t2; \ + ADDC h0, t1, t1; \ + MULLD h2, r1, t3; \ + ADDZE t4, h0; \ + MULHDU r1, h1, t5; \ + MULLD r1, h1, t4; \ + ADDC t4, t2, t2; \ + ADDE t5, t3, t3; \ + ADDC h0, t2, t2; \ + MOVD $-4, t4; \ + MOVD t0, h0; \ + MOVD t1, h1; \ + ADDZE t3; \ + ANDCC $3, t2, h2; \ + AND t2, t4, t0; \ + ADDC t0, h0, h0; \ + ADDE t3, h1, h1; \ + SLD $62, t3, t4; \ + SRD $2, t2; \ + ADDZE h2; \ + OR t4, t2, t2; \ + SRD $2, t3; \ + ADDC t2, h0, h0; \ + ADDE t3, h1, h1; \ + ADDZE h2 + +DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF +DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC +GLOBL ·poly1305Mask<>(SB), RODATA, $16 + +// func update(state *[7]uint64, msg []byte) +TEXT ·update(SB), $0-32 + MOVD state+0(FP), R3 + MOVD msg_base+8(FP), R4 + MOVD msg_len+16(FP), R5 + + MOVD 0(R3), R8 // h0 + MOVD 8(R3), R9 // h1 + MOVD 16(R3), R10 // h2 + MOVD 24(R3), R11 // r0 + MOVD 32(R3), R12 // r1 + + CMP R5, $16 + BLT bytes_between_0_and_15 + +loop: + POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22) + +multiply: + POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21) + ADD $-16, R5 + CMP R5, $16 + BGE loop + +bytes_between_0_and_15: + CMP R5, $0 + BEQ done + MOVD $0, R16 // h0 + MOVD $0, R17 // h1 + +flush_buffer: + CMP R5, $8 + BLE just1 + + MOVD $8, R21 + SUB R21, R5, R21 + + // Greater than 8 -- load the rightmost remaining bytes in msg + // and put into R17 (h1) + MOVD (R4)(R21), R17 + MOVD $16, R22 + + // Find the offset to those bytes + SUB R5, R22, R22 + SLD $3, R22 + + // Shift to get only the bytes in msg + SRD R22, R17, R17 + + // Put 1 at high end + MOVD $1, R23 + SLD $3, R21 + SLD R21, R23, R23 + OR R23, R17, R17 + + // Remainder is 8 + MOVD $8, R5 + +just1: + CMP R5, $8 + BLT less8 + + // Exactly 8 + MOVD (R4), R16 + + CMP R17, $0 + + // Check if we've already set R17; if not + // set 1 to indicate end of msg. + BNE carry + MOVD $1, R17 + BR carry + +less8: + MOVD $0, R16 // h0 + MOVD $0, R22 // shift count + CMP R5, $4 + BLT less4 + MOVWZ (R4), R16 + ADD $4, R4 + ADD $-4, R5 + MOVD $32, R22 + +less4: + CMP R5, $2 + BLT less2 + MOVHZ (R4), R21 + SLD R22, R21, R21 + OR R16, R21, R16 + ADD $16, R22 + ADD $-2, R5 + ADD $2, R4 + +less2: + CMP R5, $0 + BEQ insert1 + MOVBZ (R4), R21 + SLD R22, R21, R21 + OR R16, R21, R16 + ADD $8, R22 + +insert1: + // Insert 1 at end of msg + MOVD $1, R21 + SLD R22, R21, R21 + OR R16, R21, R16 + +carry: + // Add new values to h0, h1, h2 + ADDC R16, R8 + ADDE R17, R9 + ADDZE R10, R10 + MOVD $16, R5 + ADD R5, R4 + BR multiply + +done: + // Save h0, h1, h2 in state + MOVD R8, 0(R3) + MOVD R9, 8(R3) + MOVD R10, 16(R3) + RET diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.go new file mode 100644 index 00000000..62cc9f84 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.go @@ -0,0 +1,76 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +package poly1305 + +import ( + "golang.org/x/sys/cpu" +) + +// updateVX is an assembly implementation of Poly1305 that uses vector +// instructions. It must only be called if the vector facility (vx) is +// available. +//go:noescape +func updateVX(state *macState, msg []byte) + +// mac is a replacement for macGeneric that uses a larger buffer and redirects +// calls that would have gone to updateGeneric to updateVX if the vector +// facility is installed. +// +// A larger buffer is required for good performance because the vector +// implementation has a higher fixed cost per call than the generic +// implementation. +type mac struct { + macState + + buffer [16 * TagSize]byte // size must be a multiple of block size (16) + offset int +} + +func (h *mac) Write(p []byte) (int, error) { + nn := len(p) + if h.offset > 0 { + n := copy(h.buffer[h.offset:], p) + if h.offset+n < len(h.buffer) { + h.offset += n + return nn, nil + } + p = p[n:] + h.offset = 0 + if cpu.S390X.HasVX { + updateVX(&h.macState, h.buffer[:]) + } else { + updateGeneric(&h.macState, h.buffer[:]) + } + } + + tail := len(p) % len(h.buffer) // number of bytes to copy into buffer + body := len(p) - tail // number of bytes to process now + if body > 0 { + if cpu.S390X.HasVX { + updateVX(&h.macState, p[:body]) + } else { + updateGeneric(&h.macState, p[:body]) + } + } + h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0 + return nn, nil +} + +func (h *mac) Sum(out *[TagSize]byte) { + state := h.macState + remainder := h.buffer[:h.offset] + + // Use the generic implementation if we have 2 or fewer blocks left + // to sum. The vector implementation has a higher startup time. + if cpu.S390X.HasVX && len(remainder) > 2*TagSize { + updateVX(&state, remainder) + } else if len(remainder) > 0 { + updateGeneric(&state, remainder) + } + finalize(out, &state.h, &state.s) +} diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.s new file mode 100644 index 00000000..aa9e0494 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.s @@ -0,0 +1,504 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build gc && !purego +// +build gc,!purego + +#include "textflag.h" + +// This implementation of Poly1305 uses the vector facility (vx) +// to process up to 2 blocks (32 bytes) per iteration using an +// algorithm based on the one described in: +// +// NEON crypto, Daniel J. Bernstein & Peter Schwabe +// https://cryptojedi.org/papers/neoncrypto-20120320.pdf +// +// This algorithm uses 5 26-bit limbs to represent a 130-bit +// value. These limbs are, for the most part, zero extended and +// placed into 64-bit vector register elements. Each vector +// register is 128-bits wide and so holds 2 of these elements. +// Using 26-bit limbs allows us plenty of headroom to accommodate +// accumulations before and after multiplication without +// overflowing either 32-bits (before multiplication) or 64-bits +// (after multiplication). +// +// In order to parallelise the operations required to calculate +// the sum we use two separate accumulators and then sum those +// in an extra final step. For compatibility with the generic +// implementation we perform this summation at the end of every +// updateVX call. +// +// To use two accumulators we must multiply the message blocks +// by r² rather than r. Only the final message block should be +// multiplied by r. +// +// Example: +// +// We want to calculate the sum (h) for a 64 byte message (m): +// +// h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r +// +// To do this we split the calculation into the even indices +// and odd indices of the message. These form our SIMD 'lanes': +// +// h = m[ 0:16]r⁴ + m[32:48]r² + <- lane 0 +// m[16:32]r³ + m[48:64]r <- lane 1 +// +// To calculate this iteratively we refactor so that both lanes +// are written in terms of r² and r: +// +// h = (m[ 0:16]r² + m[32:48])r² + <- lane 0 +// (m[16:32]r² + m[48:64])r <- lane 1 +// ^ ^ +// | coefficients for second iteration +// coefficients for first iteration +// +// So in this case we would have two iterations. In the first +// both lanes are multiplied by r². In the second only the +// first lane is multiplied by r² and the second lane is +// instead multiplied by r. This gives use the odd and even +// powers of r that we need from the original equation. +// +// Notation: +// +// h - accumulator +// r - key +// m - message +// +// [a, b] - SIMD register holding two 64-bit values +// [a, b, c, d] - SIMD register holding four 32-bit values +// xᵢ[n] - limb n of variable x with bit width i +// +// Limbs are expressed in little endian order, so for 26-bit +// limbs x₂₆[4] will be the most significant limb and x₂₆[0] +// will be the least significant limb. + +// masking constants +#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits +#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits + +// expansion constants (see EXPAND macro) +#define EX0 V2 +#define EX1 V3 +#define EX2 V4 + +// key (r², r or 1 depending on context) +#define R_0 V5 +#define R_1 V6 +#define R_2 V7 +#define R_3 V8 +#define R_4 V9 + +// precalculated coefficients (5r², 5r or 0 depending on context) +#define R5_1 V10 +#define R5_2 V11 +#define R5_3 V12 +#define R5_4 V13 + +// message block (m) +#define M_0 V14 +#define M_1 V15 +#define M_2 V16 +#define M_3 V17 +#define M_4 V18 + +// accumulator (h) +#define H_0 V19 +#define H_1 V20 +#define H_2 V21 +#define H_3 V22 +#define H_4 V23 + +// temporary registers (for short-lived values) +#define T_0 V24 +#define T_1 V25 +#define T_2 V26 +#define T_3 V27 +#define T_4 V28 + +GLOBL ·constants<>(SB), RODATA, $0x30 +// EX0 +DATA ·constants<>+0x00(SB)/8, $0x0006050403020100 +DATA ·constants<>+0x08(SB)/8, $0x1016151413121110 +// EX1 +DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706 +DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716 +// EX2 +DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d +DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d + +// MULTIPLY multiplies each lane of f and g, partially reduced +// modulo 2¹³⁰ - 5. The result, h, consists of partial products +// in each lane that need to be reduced further to produce the +// final result. +// +// h₁₃₀ = (f₁₃₀g₁₃₀) % 2¹³⁰ + (5f₁₃₀g₁₃₀) / 2¹³⁰ +// +// Note that the multiplication by 5 of the high bits is +// achieved by precalculating the multiplication of four of the +// g coefficients by 5. These are g51-g54. +#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \ + VMLOF f0, g0, h0 \ + VMLOF f0, g3, h3 \ + VMLOF f0, g1, h1 \ + VMLOF f0, g4, h4 \ + VMLOF f0, g2, h2 \ + VMLOF f1, g54, T_0 \ + VMLOF f1, g2, T_3 \ + VMLOF f1, g0, T_1 \ + VMLOF f1, g3, T_4 \ + VMLOF f1, g1, T_2 \ + VMALOF f2, g53, h0, h0 \ + VMALOF f2, g1, h3, h3 \ + VMALOF f2, g54, h1, h1 \ + VMALOF f2, g2, h4, h4 \ + VMALOF f2, g0, h2, h2 \ + VMALOF f3, g52, T_0, T_0 \ + VMALOF f3, g0, T_3, T_3 \ + VMALOF f3, g53, T_1, T_1 \ + VMALOF f3, g1, T_4, T_4 \ + VMALOF f3, g54, T_2, T_2 \ + VMALOF f4, g51, h0, h0 \ + VMALOF f4, g54, h3, h3 \ + VMALOF f4, g52, h1, h1 \ + VMALOF f4, g0, h4, h4 \ + VMALOF f4, g53, h2, h2 \ + VAG T_0, h0, h0 \ + VAG T_3, h3, h3 \ + VAG T_1, h1, h1 \ + VAG T_4, h4, h4 \ + VAG T_2, h2, h2 + +// REDUCE performs the following carry operations in four +// stages, as specified in Bernstein & Schwabe: +// +// 1: h₂₆[0]->h₂₆[1] h₂₆[3]->h₂₆[4] +// 2: h₂₆[1]->h₂₆[2] h₂₆[4]->h₂₆[0] +// 3: h₂₆[0]->h₂₆[1] h₂₆[2]->h₂₆[3] +// 4: h₂₆[3]->h₂₆[4] +// +// The result is that all of the limbs are limited to 26-bits +// except for h₂₆[1] and h₂₆[4] which are limited to 27-bits. +// +// Note that although each limb is aligned at 26-bit intervals +// they may contain values that exceed 2²⁶ - 1, hence the need +// to carry the excess bits in each limb. +#define REDUCE(h0, h1, h2, h3, h4) \ + VESRLG $26, h0, T_0 \ + VESRLG $26, h3, T_1 \ + VN MOD26, h0, h0 \ + VN MOD26, h3, h3 \ + VAG T_0, h1, h1 \ + VAG T_1, h4, h4 \ + VESRLG $26, h1, T_2 \ + VESRLG $26, h4, T_3 \ + VN MOD26, h1, h1 \ + VN MOD26, h4, h4 \ + VESLG $2, T_3, T_4 \ + VAG T_3, T_4, T_4 \ + VAG T_2, h2, h2 \ + VAG T_4, h0, h0 \ + VESRLG $26, h2, T_0 \ + VESRLG $26, h0, T_1 \ + VN MOD26, h2, h2 \ + VN MOD26, h0, h0 \ + VAG T_0, h3, h3 \ + VAG T_1, h1, h1 \ + VESRLG $26, h3, T_2 \ + VN MOD26, h3, h3 \ + VAG T_2, h4, h4 + +// EXPAND splits the 128-bit little-endian values in0 and in1 +// into 26-bit big-endian limbs and places the results into +// the first and second lane of d₂₆[0:4] respectively. +// +// The EX0, EX1 and EX2 constants are arrays of byte indices +// for permutation. The permutation both reverses the bytes +// in the input and ensures the bytes are copied into the +// destination limb ready to be shifted into their final +// position. +#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \ + VPERM in0, in1, EX0, d0 \ + VPERM in0, in1, EX1, d2 \ + VPERM in0, in1, EX2, d4 \ + VESRLG $26, d0, d1 \ + VESRLG $30, d2, d3 \ + VESRLG $4, d2, d2 \ + VN MOD26, d0, d0 \ // [in0₂₆[0], in1₂₆[0]] + VN MOD26, d3, d3 \ // [in0₂₆[3], in1₂₆[3]] + VN MOD26, d1, d1 \ // [in0₂₆[1], in1₂₆[1]] + VN MOD24, d4, d4 \ // [in0₂₆[4], in1₂₆[4]] + VN MOD26, d2, d2 // [in0₂₆[2], in1₂₆[2]] + +// func updateVX(state *macState, msg []byte) +TEXT ·updateVX(SB), NOSPLIT, $0 + MOVD state+0(FP), R1 + LMG msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len + + // load EX0, EX1 and EX2 + MOVD $·constants<>(SB), R5 + VLM (R5), EX0, EX2 + + // generate masks + VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff] + VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff] + + // load h (accumulator) and r (key) from state + VZERO T_1 // [0, 0] + VL 0(R1), T_0 // [h₆₄[0], h₆₄[1]] + VLEG $0, 16(R1), T_1 // [h₆₄[2], 0] + VL 24(R1), T_2 // [r₆₄[0], r₆₄[1]] + VPDI $0, T_0, T_2, T_3 // [h₆₄[0], r₆₄[0]] + VPDI $5, T_0, T_2, T_4 // [h₆₄[1], r₆₄[1]] + + // unpack h and r into 26-bit limbs + // note: h₆₄[2] may have the low 3 bits set, so h₂₆[4] is a 27-bit value + VN MOD26, T_3, H_0 // [h₂₆[0], r₂₆[0]] + VZERO H_1 // [0, 0] + VZERO H_3 // [0, 0] + VGMG $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out + VESLG $24, T_1, T_1 // [h₆₄[2]<<24, 0] + VERIMG $-26&63, T_3, MOD26, H_1 // [h₂₆[1], r₂₆[1]] + VESRLG $+52&63, T_3, H_2 // [h₂₆[2], r₂₆[2]] - low 12 bits only + VERIMG $-14&63, T_4, MOD26, H_3 // [h₂₆[1], r₂₆[1]] + VESRLG $40, T_4, H_4 // [h₂₆[4], r₂₆[4]] - low 24 bits only + VERIMG $+12&63, T_4, T_0, H_2 // [h₂₆[2], r₂₆[2]] - complete + VO T_1, H_4, H_4 // [h₂₆[4], r₂₆[4]] - complete + + // replicate r across all 4 vector elements + VREPF $3, H_0, R_0 // [r₂₆[0], r₂₆[0], r₂₆[0], r₂₆[0]] + VREPF $3, H_1, R_1 // [r₂₆[1], r₂₆[1], r₂₆[1], r₂₆[1]] + VREPF $3, H_2, R_2 // [r₂₆[2], r₂₆[2], r₂₆[2], r₂₆[2]] + VREPF $3, H_3, R_3 // [r₂₆[3], r₂₆[3], r₂₆[3], r₂₆[3]] + VREPF $3, H_4, R_4 // [r₂₆[4], r₂₆[4], r₂₆[4], r₂₆[4]] + + // zero out lane 1 of h + VLEIG $1, $0, H_0 // [h₂₆[0], 0] + VLEIG $1, $0, H_1 // [h₂₆[1], 0] + VLEIG $1, $0, H_2 // [h₂₆[2], 0] + VLEIG $1, $0, H_3 // [h₂₆[3], 0] + VLEIG $1, $0, H_4 // [h₂₆[4], 0] + + // calculate 5r (ignore least significant limb) + VREPIF $5, T_0 + VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r₂₆[1], 5r₂₆[1], 5r₂₆[1]] + VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r₂₆[2], 5r₂₆[2], 5r₂₆[2]] + VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r₂₆[3], 5r₂₆[3], 5r₂₆[3]] + VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r₂₆[4], 5r₂₆[4], 5r₂₆[4]] + + // skip r² calculation if we are only calculating one block + CMPBLE R3, $16, skip + + // calculate r² + MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4) + REDUCE(M_0, M_1, M_2, M_3, M_4) + VGBM $0x0f0f, T_0 + VERIMG $0, M_0, T_0, R_0 // [r₂₆[0], r²₂₆[0], r₂₆[0], r²₂₆[0]] + VERIMG $0, M_1, T_0, R_1 // [r₂₆[1], r²₂₆[1], r₂₆[1], r²₂₆[1]] + VERIMG $0, M_2, T_0, R_2 // [r₂₆[2], r²₂₆[2], r₂₆[2], r²₂₆[2]] + VERIMG $0, M_3, T_0, R_3 // [r₂₆[3], r²₂₆[3], r₂₆[3], r²₂₆[3]] + VERIMG $0, M_4, T_0, R_4 // [r₂₆[4], r²₂₆[4], r₂₆[4], r²₂₆[4]] + + // calculate 5r² (ignore least significant limb) + VREPIF $5, T_0 + VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r²₂₆[1], 5r₂₆[1], 5r²₂₆[1]] + VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r²₂₆[2], 5r₂₆[2], 5r²₂₆[2]] + VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r²₂₆[3], 5r₂₆[3], 5r²₂₆[3]] + VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r²₂₆[4], 5r₂₆[4], 5r²₂₆[4]] + +loop: + CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients + + // load next 2 blocks from message + VLM (R2), T_0, T_1 + + // update message slice + SUB $32, R3 + MOVD $32(R2), R2 + + // unpack message blocks into 26-bit big-endian limbs + EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4) + + // add 2¹²⁸ to each message block value + VLEIB $4, $1, M_4 + VLEIB $12, $1, M_4 + +multiply: + // accumulate the incoming message + VAG H_0, M_0, M_0 + VAG H_3, M_3, M_3 + VAG H_1, M_1, M_1 + VAG H_4, M_4, M_4 + VAG H_2, M_2, M_2 + + // multiply the accumulator by the key coefficient + MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4) + + // carry and partially reduce the partial products + REDUCE(H_0, H_1, H_2, H_3, H_4) + + CMPBNE R3, $0, loop + +finish: + // sum lane 0 and lane 1 and put the result in lane 1 + VZERO T_0 + VSUMQG H_0, T_0, H_0 + VSUMQG H_3, T_0, H_3 + VSUMQG H_1, T_0, H_1 + VSUMQG H_4, T_0, H_4 + VSUMQG H_2, T_0, H_2 + + // reduce again after summation + // TODO(mundaym): there might be a more efficient way to do this + // now that we only have 1 active lane. For example, we could + // simultaneously pack the values as we reduce them. + REDUCE(H_0, H_1, H_2, H_3, H_4) + + // carry h[1] through to h[4] so that only h[4] can exceed 2²⁶ - 1 + // TODO(mundaym): in testing this final carry was unnecessary. + // Needs a proof before it can be removed though. + VESRLG $26, H_1, T_1 + VN MOD26, H_1, H_1 + VAQ T_1, H_2, H_2 + VESRLG $26, H_2, T_2 + VN MOD26, H_2, H_2 + VAQ T_2, H_3, H_3 + VESRLG $26, H_3, T_3 + VN MOD26, H_3, H_3 + VAQ T_3, H_4, H_4 + + // h is now < 2(2¹³⁰ - 5) + // Pack each lane in h₂₆[0:4] into h₁₂₈[0:1]. + VESLG $26, H_1, H_1 + VESLG $26, H_3, H_3 + VO H_0, H_1, H_0 + VO H_2, H_3, H_2 + VESLG $4, H_2, H_2 + VLEIB $7, $48, H_1 + VSLB H_1, H_2, H_2 + VO H_0, H_2, H_0 + VLEIB $7, $104, H_1 + VSLB H_1, H_4, H_3 + VO H_3, H_0, H_0 + VLEIB $7, $24, H_1 + VSRLB H_1, H_4, H_1 + + // update state + VSTEG $1, H_0, 0(R1) + VSTEG $0, H_0, 8(R1) + VSTEG $1, H_1, 16(R1) + RET + +b2: // 2 or fewer blocks remaining + CMPBLE R3, $16, b1 + + // Load the 2 remaining blocks (17-32 bytes remaining). + MOVD $-17(R3), R0 // index of final byte to load modulo 16 + VL (R2), T_0 // load full 16 byte block + VLL R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes + + // The Poly1305 algorithm requires that a 1 bit be appended to + // each message block. If the final block is less than 16 bytes + // long then it is easiest to insert the 1 before the message + // block is split into 26-bit limbs. If, on the other hand, the + // final message block is 16 bytes long then we append the 1 bit + // after expansion as normal. + MOVBZ $1, R0 + MOVD $-16(R3), R3 // index of byte in last block to insert 1 at (could be 16) + CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long + VLVGB R3, R0, T_1 // insert 1 into the byte at index R3 + + // Split both blocks into 26-bit limbs in the appropriate lanes. + EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4) + + // Append a 1 byte to the end of the second to last block. + VLEIB $4, $1, M_4 + + // Append a 1 byte to the end of the last block only if it is a + // full 16 byte block. + CMPBNE R3, $16, 2(PC) + VLEIB $12, $1, M_4 + + // Finally, set up the coefficients for the final multiplication. + // We have previously saved r and 5r in the 32-bit even indexes + // of the R_[0-4] and R5_[1-4] coefficient registers. + // + // We want lane 0 to be multiplied by r² so that can be kept the + // same. We want lane 1 to be multiplied by r so we need to move + // the saved r value into the 32-bit odd index in lane 1 by + // rotating the 64-bit lane by 32. + VGBM $0x00ff, T_0 // [0, 0xffffffffffffffff] - mask lane 1 only + VERIMG $32, R_0, T_0, R_0 // [_, r²₂₆[0], _, r₂₆[0]] + VERIMG $32, R_1, T_0, R_1 // [_, r²₂₆[1], _, r₂₆[1]] + VERIMG $32, R_2, T_0, R_2 // [_, r²₂₆[2], _, r₂₆[2]] + VERIMG $32, R_3, T_0, R_3 // [_, r²₂₆[3], _, r₂₆[3]] + VERIMG $32, R_4, T_0, R_4 // [_, r²₂₆[4], _, r₂₆[4]] + VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²₂₆[1], _, 5r₂₆[1]] + VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²₂₆[2], _, 5r₂₆[2]] + VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²₂₆[3], _, 5r₂₆[3]] + VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²₂₆[4], _, 5r₂₆[4]] + + MOVD $0, R3 + BR multiply + +skip: + CMPBEQ R3, $0, finish + +b1: // 1 block remaining + + // Load the final block (1-16 bytes). This will be placed into + // lane 0. + MOVD $-1(R3), R0 + VLL R0, (R2), T_0 // pad to 16 bytes with zeros + + // The Poly1305 algorithm requires that a 1 bit be appended to + // each message block. If the final block is less than 16 bytes + // long then it is easiest to insert the 1 before the message + // block is split into 26-bit limbs. If, on the other hand, the + // final message block is 16 bytes long then we append the 1 bit + // after expansion as normal. + MOVBZ $1, R0 + CMPBEQ R3, $16, 2(PC) + VLVGB R3, R0, T_0 + + // Set the message block in lane 1 to the value 0 so that it + // can be accumulated without affecting the final result. + VZERO T_1 + + // Split the final message block into 26-bit limbs in lane 0. + // Lane 1 will be contain 0. + EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4) + + // Append a 1 byte to the end of the last block only if it is a + // full 16 byte block. + CMPBNE R3, $16, 2(PC) + VLEIB $4, $1, M_4 + + // We have previously saved r and 5r in the 32-bit even indexes + // of the R_[0-4] and R5_[1-4] coefficient registers. + // + // We want lane 0 to be multiplied by r so we need to move the + // saved r value into the 32-bit odd index in lane 0. We want + // lane 1 to be set to the value 1. This makes multiplication + // a no-op. We do this by setting lane 1 in every register to 0 + // and then just setting the 32-bit index 3 in R_0 to 1. + VZERO T_0 + MOVD $0, R0 + MOVD $0x10111213, R12 + VLVGP R12, R0, T_1 // [_, 0x10111213, _, 0x00000000] + VPERM T_0, R_0, T_1, R_0 // [_, r₂₆[0], _, 0] + VPERM T_0, R_1, T_1, R_1 // [_, r₂₆[1], _, 0] + VPERM T_0, R_2, T_1, R_2 // [_, r₂₆[2], _, 0] + VPERM T_0, R_3, T_1, R_3 // [_, r₂₆[3], _, 0] + VPERM T_0, R_4, T_1, R_4 // [_, r₂₆[4], _, 0] + VPERM T_0, R5_1, T_1, R5_1 // [_, 5r₂₆[1], _, 0] + VPERM T_0, R5_2, T_1, R5_2 // [_, 5r₂₆[2], _, 0] + VPERM T_0, R5_3, T_1, R5_3 // [_, 5r₂₆[3], _, 0] + VPERM T_0, R5_4, T_1, R5_4 // [_, 5r₂₆[4], _, 0] + + // Set the value of lane 1 to be 1. + VLEIF $3, $1, R_0 // [_, r₂₆[0], _, 1] + + MOVD $0, R3 + BR multiply diff --git a/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go b/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go index a98d1bd4..a2973e62 100644 --- a/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go +++ b/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go @@ -35,8 +35,8 @@ This package is interoperable with NaCl: https://nacl.cr.yp.to/secretbox.html. package secretbox // import "golang.org/x/crypto/nacl/secretbox" import ( + "golang.org/x/crypto/internal/poly1305" "golang.org/x/crypto/internal/subtle" - "golang.org/x/crypto/poly1305" "golang.org/x/crypto/salsa20/salsa" ) diff --git a/vendor/golang.org/x/crypto/poly1305/bits_compat.go b/vendor/golang.org/x/crypto/poly1305/bits_compat.go deleted file mode 100644 index 45b5c966..00000000 --- a/vendor/golang.org/x/crypto/poly1305/bits_compat.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.13 -// +build !go1.13 - -package poly1305 - -// Generic fallbacks for the math/bits intrinsics, copied from -// src/math/bits/bits.go. They were added in Go 1.12, but Add64 and Sum64 had -// variable time fallbacks until Go 1.13. - -func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { - sum = x + y + carry - carryOut = ((x & y) | ((x | y) &^ sum)) >> 63 - return -} - -func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { - diff = x - y - borrow - borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> 63 - return -} - -func bitsMul64(x, y uint64) (hi, lo uint64) { - const mask32 = 1<<32 - 1 - x0 := x & mask32 - x1 := x >> 32 - y0 := y & mask32 - y1 := y >> 32 - w0 := x0 * y0 - t := x1*y0 + w0>>32 - w1 := t & mask32 - w2 := t >> 32 - w1 += x0 * y1 - hi = x1*y1 + w2 + w1>>32 - lo = x * y - return -} diff --git a/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go b/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go deleted file mode 100644 index ed52b341..00000000 --- a/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.13 -// +build go1.13 - -package poly1305 - -import "math/bits" - -func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { - return bits.Add64(x, y, carry) -} - -func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { - return bits.Sub64(x, y, borrow) -} - -func bitsMul64(x, y uint64) (hi, lo uint64) { - return bits.Mul64(x, y) -} diff --git a/vendor/golang.org/x/crypto/poly1305/mac_noasm.go b/vendor/golang.org/x/crypto/poly1305/mac_noasm.go deleted file mode 100644 index f184b67d..00000000 --- a/vendor/golang.org/x/crypto/poly1305/mac_noasm.go +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego -// +build !amd64,!ppc64le,!s390x !gc purego - -package poly1305 - -type mac struct{ macGeneric } diff --git a/vendor/golang.org/x/crypto/poly1305/poly1305.go b/vendor/golang.org/x/crypto/poly1305/poly1305.go deleted file mode 100644 index 9d7a6af0..00000000 --- a/vendor/golang.org/x/crypto/poly1305/poly1305.go +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package poly1305 implements Poly1305 one-time message authentication code as -// specified in https://cr.yp.to/mac/poly1305-20050329.pdf. -// -// Poly1305 is a fast, one-time authentication function. It is infeasible for an -// attacker to generate an authenticator for a message without the key. However, a -// key must only be used for a single message. Authenticating two different -// messages with the same key allows an attacker to forge authenticators for other -// messages with the same key. -// -// Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was -// used with a fixed key in order to generate one-time keys from an nonce. -// However, in this package AES isn't used and the one-time key is specified -// directly. -package poly1305 // import "golang.org/x/crypto/poly1305" - -import "crypto/subtle" - -// TagSize is the size, in bytes, of a poly1305 authenticator. -const TagSize = 16 - -// Sum generates an authenticator for msg using a one-time key and puts the -// 16-byte result into out. Authenticating two different messages with the same -// key allows an attacker to forge messages at will. -func Sum(out *[16]byte, m []byte, key *[32]byte) { - h := New(key) - h.Write(m) - h.Sum(out[:0]) -} - -// Verify returns true if mac is a valid authenticator for m with the given key. -func Verify(mac *[16]byte, m []byte, key *[32]byte) bool { - var tmp [16]byte - Sum(&tmp, m, key) - return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1 -} - -// New returns a new MAC computing an authentication -// tag of all data written to it with the given key. -// This allows writing the message progressively instead -// of passing it as a single slice. Common users should use -// the Sum function instead. -// -// The key must be unique for each message, as authenticating -// two different messages with the same key allows an attacker -// to forge messages at will. -func New(key *[32]byte) *MAC { - m := &MAC{} - initialize(key, &m.macState) - return m -} - -// MAC is an io.Writer computing an authentication tag -// of the data written to it. -// -// MAC cannot be used like common hash.Hash implementations, -// because using a poly1305 key twice breaks its security. -// Therefore writing data to a running MAC after calling -// Sum or Verify causes it to panic. -type MAC struct { - mac // platform-dependent implementation - - finalized bool -} - -// Size returns the number of bytes Sum will return. -func (h *MAC) Size() int { return TagSize } - -// Write adds more data to the running message authentication code. -// It never returns an error. -// -// It must not be called after the first call of Sum or Verify. -func (h *MAC) Write(p []byte) (n int, err error) { - if h.finalized { - panic("poly1305: write to MAC after Sum or Verify") - } - return h.mac.Write(p) -} - -// Sum computes the authenticator of all data written to the -// message authentication code. -func (h *MAC) Sum(b []byte) []byte { - var mac [TagSize]byte - h.mac.Sum(&mac) - h.finalized = true - return append(b, mac[:]...) -} - -// Verify returns whether the authenticator of all data written to -// the message authentication code matches the expected value. -func (h *MAC) Verify(expected []byte) bool { - var mac [TagSize]byte - h.mac.Sum(&mac) - h.finalized = true - return subtle.ConstantTimeCompare(expected, mac[:]) == 1 -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go deleted file mode 100644 index 6d522333..00000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -package poly1305 - -//go:noescape -func update(state *macState, msg []byte) - -// mac is a wrapper for macGeneric that redirects calls that would have gone to -// updateGeneric to update. -// -// Its Write and Sum methods are otherwise identical to the macGeneric ones, but -// using function pointers would carry a major performance cost. -type mac struct{ macGeneric } - -func (h *mac) Write(p []byte) (int, error) { - nn := len(p) - if h.offset > 0 { - n := copy(h.buffer[h.offset:], p) - if h.offset+n < TagSize { - h.offset += n - return nn, nil - } - p = p[n:] - h.offset = 0 - update(&h.macState, h.buffer[:]) - } - if n := len(p) - (len(p) % TagSize); n > 0 { - update(&h.macState, p[:n]) - p = p[n:] - } - if len(p) > 0 { - h.offset += copy(h.buffer[h.offset:], p) - } - return nn, nil -} - -func (h *mac) Sum(out *[16]byte) { - state := h.macState - if h.offset > 0 { - update(&state, h.buffer[:h.offset]) - } - finalize(out, &state.h, &state.s) -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s deleted file mode 100644 index 1d74f0f8..00000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -#include "textflag.h" - -#define POLY1305_ADD(msg, h0, h1, h2) \ - ADDQ 0(msg), h0; \ - ADCQ 8(msg), h1; \ - ADCQ $1, h2; \ - LEAQ 16(msg), msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ - MOVQ r0, AX; \ - MULQ h0; \ - MOVQ AX, t0; \ - MOVQ DX, t1; \ - MOVQ r0, AX; \ - MULQ h1; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ r0, t2; \ - IMULQ h2, t2; \ - ADDQ DX, t2; \ - \ - MOVQ r1, AX; \ - MULQ h0; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ DX, h0; \ - MOVQ r1, t3; \ - IMULQ h2, t3; \ - MOVQ r1, AX; \ - MULQ h1; \ - ADDQ AX, t2; \ - ADCQ DX, t3; \ - ADDQ h0, t2; \ - ADCQ $0, t3; \ - \ - MOVQ t0, h0; \ - MOVQ t1, h1; \ - MOVQ t2, h2; \ - ANDQ $3, h2; \ - MOVQ t2, t0; \ - ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ - ADDQ t0, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2; \ - SHRQ $2, t3, t2; \ - SHRQ $2, t3; \ - ADDQ t2, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2 - -// func update(state *[7]uint64, msg []byte) -TEXT ·update(SB), $0-32 - MOVQ state+0(FP), DI - MOVQ msg_base+8(FP), SI - MOVQ msg_len+16(FP), R15 - - MOVQ 0(DI), R8 // h0 - MOVQ 8(DI), R9 // h1 - MOVQ 16(DI), R10 // h2 - MOVQ 24(DI), R11 // r0 - MOVQ 32(DI), R12 // r1 - - CMPQ R15, $16 - JB bytes_between_0_and_15 - -loop: - POLY1305_ADD(SI, R8, R9, R10) - -multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) - SUBQ $16, R15 - CMPQ R15, $16 - JAE loop - -bytes_between_0_and_15: - TESTQ R15, R15 - JZ done - MOVQ $1, BX - XORQ CX, CX - XORQ R13, R13 - ADDQ R15, SI - -flush_buffer: - SHLQ $8, BX, CX - SHLQ $8, BX - MOVB -1(SI), R13 - XORQ R13, BX - DECQ SI - DECQ R15 - JNZ flush_buffer - - ADDQ BX, R8 - ADCQ CX, R9 - ADCQ $0, R10 - MOVQ $16, R15 - JMP multiply - -done: - MOVQ R8, 0(DI) - MOVQ R9, 8(DI) - MOVQ R10, 16(DI) - RET diff --git a/vendor/golang.org/x/crypto/poly1305/sum_generic.go b/vendor/golang.org/x/crypto/poly1305/sum_generic.go deleted file mode 100644 index c942a659..00000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_generic.go +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file provides the generic implementation of Sum and MAC. Other files -// might provide optimized assembly implementations of some of this code. - -package poly1305 - -import "encoding/binary" - -// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag -// for a 64 bytes message is approximately -// -// s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5 -// -// for some secret r and s. It can be computed sequentially like -// -// for len(msg) > 0: -// h += read(msg, 16) -// h *= r -// h %= 2¹³⁰ - 5 -// return h + s -// -// All the complexity is about doing performant constant-time math on numbers -// larger than any available numeric type. - -func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) { - h := newMACGeneric(key) - h.Write(msg) - h.Sum(out) -} - -func newMACGeneric(key *[32]byte) macGeneric { - m := macGeneric{} - initialize(key, &m.macState) - return m -} - -// macState holds numbers in saturated 64-bit little-endian limbs. That is, -// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸. -type macState struct { - // h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but - // can grow larger during and after rounds. It must, however, remain below - // 2 * (2¹³⁰ - 5). - h [3]uint64 - // r and s are the private key components. - r [2]uint64 - s [2]uint64 -} - -type macGeneric struct { - macState - - buffer [TagSize]byte - offset int -} - -// Write splits the incoming message into TagSize chunks, and passes them to -// update. It buffers incomplete chunks. -func (h *macGeneric) Write(p []byte) (int, error) { - nn := len(p) - if h.offset > 0 { - n := copy(h.buffer[h.offset:], p) - if h.offset+n < TagSize { - h.offset += n - return nn, nil - } - p = p[n:] - h.offset = 0 - updateGeneric(&h.macState, h.buffer[:]) - } - if n := len(p) - (len(p) % TagSize); n > 0 { - updateGeneric(&h.macState, p[:n]) - p = p[n:] - } - if len(p) > 0 { - h.offset += copy(h.buffer[h.offset:], p) - } - return nn, nil -} - -// Sum flushes the last incomplete chunk from the buffer, if any, and generates -// the MAC output. It does not modify its state, in order to allow for multiple -// calls to Sum, even if no Write is allowed after Sum. -func (h *macGeneric) Sum(out *[TagSize]byte) { - state := h.macState - if h.offset > 0 { - updateGeneric(&state, h.buffer[:h.offset]) - } - finalize(out, &state.h, &state.s) -} - -// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It -// clears some bits of the secret coefficient to make it possible to implement -// multiplication more efficiently. -const ( - rMask0 = 0x0FFFFFFC0FFFFFFF - rMask1 = 0x0FFFFFFC0FFFFFFC -) - -// initialize loads the 256-bit key into the two 128-bit secret values r and s. -func initialize(key *[32]byte, m *macState) { - m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0 - m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1 - m.s[0] = binary.LittleEndian.Uint64(key[16:24]) - m.s[1] = binary.LittleEndian.Uint64(key[24:32]) -} - -// uint128 holds a 128-bit number as two 64-bit limbs, for use with the -// bits.Mul64 and bits.Add64 intrinsics. -type uint128 struct { - lo, hi uint64 -} - -func mul64(a, b uint64) uint128 { - hi, lo := bitsMul64(a, b) - return uint128{lo, hi} -} - -func add128(a, b uint128) uint128 { - lo, c := bitsAdd64(a.lo, b.lo, 0) - hi, c := bitsAdd64(a.hi, b.hi, c) - if c != 0 { - panic("poly1305: unexpected overflow") - } - return uint128{lo, hi} -} - -func shiftRightBy2(a uint128) uint128 { - a.lo = a.lo>>2 | (a.hi&3)<<62 - a.hi = a.hi >> 2 - return a -} - -// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of -// 128 bits of message, it computes -// -// h₊ = (h + m) * r mod 2¹³⁰ - 5 -// -// If the msg length is not a multiple of TagSize, it assumes the last -// incomplete chunk is the final one. -func updateGeneric(state *macState, msg []byte) { - h0, h1, h2 := state.h[0], state.h[1], state.h[2] - r0, r1 := state.r[0], state.r[1] - - for len(msg) > 0 { - var c uint64 - - // For the first step, h + m, we use a chain of bits.Add64 intrinsics. - // The resulting value of h might exceed 2¹³⁰ - 5, but will be partially - // reduced at the end of the multiplication below. - // - // The spec requires us to set a bit just above the message size, not to - // hide leading zeroes. For full chunks, that's 1 << 128, so we can just - // add 1 to the most significant (2¹²⁸) limb, h2. - if len(msg) >= TagSize { - h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0) - h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c) - h2 += c + 1 - - msg = msg[TagSize:] - } else { - var buf [TagSize]byte - copy(buf[:], msg) - buf[len(msg)] = 1 - - h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0) - h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c) - h2 += c - - msg = nil - } - - // Multiplication of big number limbs is similar to elementary school - // columnar multiplication. Instead of digits, there are 64-bit limbs. - // - // We are multiplying a 3 limbs number, h, by a 2 limbs number, r. - // - // h2 h1 h0 x - // r1 r0 = - // ---------------- - // h2r0 h1r0 h0r0 <-- individual 128-bit products - // + h2r1 h1r1 h0r1 - // ------------------------ - // m3 m2 m1 m0 <-- result in 128-bit overlapping limbs - // ------------------------ - // m3.hi m2.hi m1.hi m0.hi <-- carry propagation - // + m3.lo m2.lo m1.lo m0.lo - // ------------------------------- - // t4 t3 t2 t1 t0 <-- final result in 64-bit limbs - // - // The main difference from pen-and-paper multiplication is that we do - // carry propagation in a separate step, as if we wrote two digit sums - // at first (the 128-bit limbs), and then carried the tens all at once. - - h0r0 := mul64(h0, r0) - h1r0 := mul64(h1, r0) - h2r0 := mul64(h2, r0) - h0r1 := mul64(h0, r1) - h1r1 := mul64(h1, r1) - h2r1 := mul64(h2, r1) - - // Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their - // top 4 bits cleared by rMask{0,1}, we know that their product is not going - // to overflow 64 bits, so we can ignore the high part of the products. - // - // This also means that the product doesn't have a fifth limb (t4). - if h2r0.hi != 0 { - panic("poly1305: unexpected overflow") - } - if h2r1.hi != 0 { - panic("poly1305: unexpected overflow") - } - - m0 := h0r0 - m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again - m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1. - m3 := h2r1 - - t0 := m0.lo - t1, c := bitsAdd64(m1.lo, m0.hi, 0) - t2, c := bitsAdd64(m2.lo, m1.hi, c) - t3, _ := bitsAdd64(m3.lo, m2.hi, c) - - // Now we have the result as 4 64-bit limbs, and we need to reduce it - // modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do - // a cheap partial reduction according to the reduction identity - // - // c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5 - // - // because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is - // likely to be larger than 2¹³⁰ - 5, but still small enough to fit the - // assumptions we make about h in the rest of the code. - // - // See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23 - - // We split the final result at the 2¹³⁰ mark into h and cc, the carry. - // Note that the carry bits are effectively shifted left by 2, in other - // words, cc = c * 4 for the c in the reduction identity. - h0, h1, h2 = t0, t1, t2&maskLow2Bits - cc := uint128{t2 & maskNotLow2Bits, t3} - - // To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c. - - h0, c = bitsAdd64(h0, cc.lo, 0) - h1, c = bitsAdd64(h1, cc.hi, c) - h2 += c - - cc = shiftRightBy2(cc) - - h0, c = bitsAdd64(h0, cc.lo, 0) - h1, c = bitsAdd64(h1, cc.hi, c) - h2 += c - - // h2 is at most 3 + 1 + 1 = 5, making the whole of h at most - // - // 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1 - } - - state.h[0], state.h[1], state.h[2] = h0, h1, h2 -} - -const ( - maskLow2Bits uint64 = 0x0000000000000003 - maskNotLow2Bits uint64 = ^maskLow2Bits -) - -// select64 returns x if v == 1 and y if v == 0, in constant time. -func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y } - -// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order. -const ( - p0 = 0xFFFFFFFFFFFFFFFB - p1 = 0xFFFFFFFFFFFFFFFF - p2 = 0x0000000000000003 -) - -// finalize completes the modular reduction of h and computes -// -// out = h + s mod 2¹²⁸ -// -func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) { - h0, h1, h2 := h[0], h[1], h[2] - - // After the partial reduction in updateGeneric, h might be more than - // 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction - // in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the - // result if the subtraction underflows, and t otherwise. - - hMinusP0, b := bitsSub64(h0, p0, 0) - hMinusP1, b := bitsSub64(h1, p1, b) - _, b = bitsSub64(h2, p2, b) - - // h = h if h < p else h - p - h0 = select64(b, h0, hMinusP0) - h1 = select64(b, h1, hMinusP1) - - // Finally, we compute the last Poly1305 step - // - // tag = h + s mod 2¹²⁸ - // - // by just doing a wide addition with the 128 low bits of h and discarding - // the overflow. - h0, c := bitsAdd64(h0, s[0], 0) - h1, _ = bitsAdd64(h1, s[1], c) - - binary.LittleEndian.PutUint64(out[0:8], h0) - binary.LittleEndian.PutUint64(out[8:16], h1) -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go deleted file mode 100644 index 4a069941..00000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -package poly1305 - -//go:noescape -func update(state *macState, msg []byte) - -// mac is a wrapper for macGeneric that redirects calls that would have gone to -// updateGeneric to update. -// -// Its Write and Sum methods are otherwise identical to the macGeneric ones, but -// using function pointers would carry a major performance cost. -type mac struct{ macGeneric } - -func (h *mac) Write(p []byte) (int, error) { - nn := len(p) - if h.offset > 0 { - n := copy(h.buffer[h.offset:], p) - if h.offset+n < TagSize { - h.offset += n - return nn, nil - } - p = p[n:] - h.offset = 0 - update(&h.macState, h.buffer[:]) - } - if n := len(p) - (len(p) % TagSize); n > 0 { - update(&h.macState, p[:n]) - p = p[n:] - } - if len(p) > 0 { - h.offset += copy(h.buffer[h.offset:], p) - } - return nn, nil -} - -func (h *mac) Sum(out *[16]byte) { - state := h.macState - if h.offset > 0 { - update(&state, h.buffer[:h.offset]) - } - finalize(out, &state.h, &state.s) -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s deleted file mode 100644 index 58422aad..00000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -#include "textflag.h" - -// This was ported from the amd64 implementation. - -#define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \ - MOVD (msg), t0; \ - MOVD 8(msg), t1; \ - MOVD $1, t2; \ - ADDC t0, h0, h0; \ - ADDE t1, h1, h1; \ - ADDE t2, h2; \ - ADD $16, msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \ - MULLD r0, h0, t0; \ - MULLD r0, h1, t4; \ - MULHDU r0, h0, t1; \ - MULHDU r0, h1, t5; \ - ADDC t4, t1, t1; \ - MULLD r0, h2, t2; \ - ADDZE t5; \ - MULHDU r1, h0, t4; \ - MULLD r1, h0, h0; \ - ADD t5, t2, t2; \ - ADDC h0, t1, t1; \ - MULLD h2, r1, t3; \ - ADDZE t4, h0; \ - MULHDU r1, h1, t5; \ - MULLD r1, h1, t4; \ - ADDC t4, t2, t2; \ - ADDE t5, t3, t3; \ - ADDC h0, t2, t2; \ - MOVD $-4, t4; \ - MOVD t0, h0; \ - MOVD t1, h1; \ - ADDZE t3; \ - ANDCC $3, t2, h2; \ - AND t2, t4, t0; \ - ADDC t0, h0, h0; \ - ADDE t3, h1, h1; \ - SLD $62, t3, t4; \ - SRD $2, t2; \ - ADDZE h2; \ - OR t4, t2, t2; \ - SRD $2, t3; \ - ADDC t2, h0, h0; \ - ADDE t3, h1, h1; \ - ADDZE h2 - -DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -GLOBL ·poly1305Mask<>(SB), RODATA, $16 - -// func update(state *[7]uint64, msg []byte) -TEXT ·update(SB), $0-32 - MOVD state+0(FP), R3 - MOVD msg_base+8(FP), R4 - MOVD msg_len+16(FP), R5 - - MOVD 0(R3), R8 // h0 - MOVD 8(R3), R9 // h1 - MOVD 16(R3), R10 // h2 - MOVD 24(R3), R11 // r0 - MOVD 32(R3), R12 // r1 - - CMP R5, $16 - BLT bytes_between_0_and_15 - -loop: - POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22) - -multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21) - ADD $-16, R5 - CMP R5, $16 - BGE loop - -bytes_between_0_and_15: - CMP R5, $0 - BEQ done - MOVD $0, R16 // h0 - MOVD $0, R17 // h1 - -flush_buffer: - CMP R5, $8 - BLE just1 - - MOVD $8, R21 - SUB R21, R5, R21 - - // Greater than 8 -- load the rightmost remaining bytes in msg - // and put into R17 (h1) - MOVD (R4)(R21), R17 - MOVD $16, R22 - - // Find the offset to those bytes - SUB R5, R22, R22 - SLD $3, R22 - - // Shift to get only the bytes in msg - SRD R22, R17, R17 - - // Put 1 at high end - MOVD $1, R23 - SLD $3, R21 - SLD R21, R23, R23 - OR R23, R17, R17 - - // Remainder is 8 - MOVD $8, R5 - -just1: - CMP R5, $8 - BLT less8 - - // Exactly 8 - MOVD (R4), R16 - - CMP R17, $0 - - // Check if we've already set R17; if not - // set 1 to indicate end of msg. - BNE carry - MOVD $1, R17 - BR carry - -less8: - MOVD $0, R16 // h0 - MOVD $0, R22 // shift count - CMP R5, $4 - BLT less4 - MOVWZ (R4), R16 - ADD $4, R4 - ADD $-4, R5 - MOVD $32, R22 - -less4: - CMP R5, $2 - BLT less2 - MOVHZ (R4), R21 - SLD R22, R21, R21 - OR R16, R21, R16 - ADD $16, R22 - ADD $-2, R5 - ADD $2, R4 - -less2: - CMP R5, $0 - BEQ insert1 - MOVBZ (R4), R21 - SLD R22, R21, R21 - OR R16, R21, R16 - ADD $8, R22 - -insert1: - // Insert 1 at end of msg - MOVD $1, R21 - SLD R22, R21, R21 - OR R16, R21, R16 - -carry: - // Add new values to h0, h1, h2 - ADDC R16, R8 - ADDE R17, R9 - ADDZE R10, R10 - MOVD $16, R5 - ADD R5, R4 - BR multiply - -done: - // Save h0, h1, h2 in state - MOVD R8, 0(R3) - MOVD R9, 8(R3) - MOVD R10, 16(R3) - RET diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go deleted file mode 100644 index 62cc9f84..00000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -package poly1305 - -import ( - "golang.org/x/sys/cpu" -) - -// updateVX is an assembly implementation of Poly1305 that uses vector -// instructions. It must only be called if the vector facility (vx) is -// available. -//go:noescape -func updateVX(state *macState, msg []byte) - -// mac is a replacement for macGeneric that uses a larger buffer and redirects -// calls that would have gone to updateGeneric to updateVX if the vector -// facility is installed. -// -// A larger buffer is required for good performance because the vector -// implementation has a higher fixed cost per call than the generic -// implementation. -type mac struct { - macState - - buffer [16 * TagSize]byte // size must be a multiple of block size (16) - offset int -} - -func (h *mac) Write(p []byte) (int, error) { - nn := len(p) - if h.offset > 0 { - n := copy(h.buffer[h.offset:], p) - if h.offset+n < len(h.buffer) { - h.offset += n - return nn, nil - } - p = p[n:] - h.offset = 0 - if cpu.S390X.HasVX { - updateVX(&h.macState, h.buffer[:]) - } else { - updateGeneric(&h.macState, h.buffer[:]) - } - } - - tail := len(p) % len(h.buffer) // number of bytes to copy into buffer - body := len(p) - tail // number of bytes to process now - if body > 0 { - if cpu.S390X.HasVX { - updateVX(&h.macState, p[:body]) - } else { - updateGeneric(&h.macState, p[:body]) - } - } - h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0 - return nn, nil -} - -func (h *mac) Sum(out *[TagSize]byte) { - state := h.macState - remainder := h.buffer[:h.offset] - - // Use the generic implementation if we have 2 or fewer blocks left - // to sum. The vector implementation has a higher startup time. - if cpu.S390X.HasVX && len(remainder) > 2*TagSize { - updateVX(&state, remainder) - } else if len(remainder) > 0 { - updateGeneric(&state, remainder) - } - finalize(out, &state.h, &state.s) -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s deleted file mode 100644 index 69c64f84..00000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s +++ /dev/null @@ -1,504 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -#include "textflag.h" - -// This implementation of Poly1305 uses the vector facility (vx) -// to process up to 2 blocks (32 bytes) per iteration using an -// algorithm based on the one described in: -// -// NEON crypto, Daniel J. Bernstein & Peter Schwabe -// https://cryptojedi.org/papers/neoncrypto-20120320.pdf -// -// This algorithm uses 5 26-bit limbs to represent a 130-bit -// value. These limbs are, for the most part, zero extended and -// placed into 64-bit vector register elements. Each vector -// register is 128-bits wide and so holds 2 of these elements. -// Using 26-bit limbs allows us plenty of headroom to accomodate -// accumulations before and after multiplication without -// overflowing either 32-bits (before multiplication) or 64-bits -// (after multiplication). -// -// In order to parallelise the operations required to calculate -// the sum we use two separate accumulators and then sum those -// in an extra final step. For compatibility with the generic -// implementation we perform this summation at the end of every -// updateVX call. -// -// To use two accumulators we must multiply the message blocks -// by r² rather than r. Only the final message block should be -// multiplied by r. -// -// Example: -// -// We want to calculate the sum (h) for a 64 byte message (m): -// -// h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r -// -// To do this we split the calculation into the even indices -// and odd indices of the message. These form our SIMD 'lanes': -// -// h = m[ 0:16]r⁴ + m[32:48]r² + <- lane 0 -// m[16:32]r³ + m[48:64]r <- lane 1 -// -// To calculate this iteratively we refactor so that both lanes -// are written in terms of r² and r: -// -// h = (m[ 0:16]r² + m[32:48])r² + <- lane 0 -// (m[16:32]r² + m[48:64])r <- lane 1 -// ^ ^ -// | coefficients for second iteration -// coefficients for first iteration -// -// So in this case we would have two iterations. In the first -// both lanes are multiplied by r². In the second only the -// first lane is multiplied by r² and the second lane is -// instead multiplied by r. This gives use the odd and even -// powers of r that we need from the original equation. -// -// Notation: -// -// h - accumulator -// r - key -// m - message -// -// [a, b] - SIMD register holding two 64-bit values -// [a, b, c, d] - SIMD register holding four 32-bit values -// xᵢ[n] - limb n of variable x with bit width i -// -// Limbs are expressed in little endian order, so for 26-bit -// limbs x₂₆[4] will be the most significant limb and x₂₆[0] -// will be the least significant limb. - -// masking constants -#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits -#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits - -// expansion constants (see EXPAND macro) -#define EX0 V2 -#define EX1 V3 -#define EX2 V4 - -// key (r², r or 1 depending on context) -#define R_0 V5 -#define R_1 V6 -#define R_2 V7 -#define R_3 V8 -#define R_4 V9 - -// precalculated coefficients (5r², 5r or 0 depending on context) -#define R5_1 V10 -#define R5_2 V11 -#define R5_3 V12 -#define R5_4 V13 - -// message block (m) -#define M_0 V14 -#define M_1 V15 -#define M_2 V16 -#define M_3 V17 -#define M_4 V18 - -// accumulator (h) -#define H_0 V19 -#define H_1 V20 -#define H_2 V21 -#define H_3 V22 -#define H_4 V23 - -// temporary registers (for short-lived values) -#define T_0 V24 -#define T_1 V25 -#define T_2 V26 -#define T_3 V27 -#define T_4 V28 - -GLOBL ·constants<>(SB), RODATA, $0x30 -// EX0 -DATA ·constants<>+0x00(SB)/8, $0x0006050403020100 -DATA ·constants<>+0x08(SB)/8, $0x1016151413121110 -// EX1 -DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706 -DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716 -// EX2 -DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d -DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d - -// MULTIPLY multiplies each lane of f and g, partially reduced -// modulo 2¹³⁰ - 5. The result, h, consists of partial products -// in each lane that need to be reduced further to produce the -// final result. -// -// h₁₃₀ = (f₁₃₀g₁₃₀) % 2¹³⁰ + (5f₁₃₀g₁₃₀) / 2¹³⁰ -// -// Note that the multiplication by 5 of the high bits is -// achieved by precalculating the multiplication of four of the -// g coefficients by 5. These are g51-g54. -#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \ - VMLOF f0, g0, h0 \ - VMLOF f0, g3, h3 \ - VMLOF f0, g1, h1 \ - VMLOF f0, g4, h4 \ - VMLOF f0, g2, h2 \ - VMLOF f1, g54, T_0 \ - VMLOF f1, g2, T_3 \ - VMLOF f1, g0, T_1 \ - VMLOF f1, g3, T_4 \ - VMLOF f1, g1, T_2 \ - VMALOF f2, g53, h0, h0 \ - VMALOF f2, g1, h3, h3 \ - VMALOF f2, g54, h1, h1 \ - VMALOF f2, g2, h4, h4 \ - VMALOF f2, g0, h2, h2 \ - VMALOF f3, g52, T_0, T_0 \ - VMALOF f3, g0, T_3, T_3 \ - VMALOF f3, g53, T_1, T_1 \ - VMALOF f3, g1, T_4, T_4 \ - VMALOF f3, g54, T_2, T_2 \ - VMALOF f4, g51, h0, h0 \ - VMALOF f4, g54, h3, h3 \ - VMALOF f4, g52, h1, h1 \ - VMALOF f4, g0, h4, h4 \ - VMALOF f4, g53, h2, h2 \ - VAG T_0, h0, h0 \ - VAG T_3, h3, h3 \ - VAG T_1, h1, h1 \ - VAG T_4, h4, h4 \ - VAG T_2, h2, h2 - -// REDUCE performs the following carry operations in four -// stages, as specified in Bernstein & Schwabe: -// -// 1: h₂₆[0]->h₂₆[1] h₂₆[3]->h₂₆[4] -// 2: h₂₆[1]->h₂₆[2] h₂₆[4]->h₂₆[0] -// 3: h₂₆[0]->h₂₆[1] h₂₆[2]->h₂₆[3] -// 4: h₂₆[3]->h₂₆[4] -// -// The result is that all of the limbs are limited to 26-bits -// except for h₂₆[1] and h₂₆[4] which are limited to 27-bits. -// -// Note that although each limb is aligned at 26-bit intervals -// they may contain values that exceed 2²⁶ - 1, hence the need -// to carry the excess bits in each limb. -#define REDUCE(h0, h1, h2, h3, h4) \ - VESRLG $26, h0, T_0 \ - VESRLG $26, h3, T_1 \ - VN MOD26, h0, h0 \ - VN MOD26, h3, h3 \ - VAG T_0, h1, h1 \ - VAG T_1, h4, h4 \ - VESRLG $26, h1, T_2 \ - VESRLG $26, h4, T_3 \ - VN MOD26, h1, h1 \ - VN MOD26, h4, h4 \ - VESLG $2, T_3, T_4 \ - VAG T_3, T_4, T_4 \ - VAG T_2, h2, h2 \ - VAG T_4, h0, h0 \ - VESRLG $26, h2, T_0 \ - VESRLG $26, h0, T_1 \ - VN MOD26, h2, h2 \ - VN MOD26, h0, h0 \ - VAG T_0, h3, h3 \ - VAG T_1, h1, h1 \ - VESRLG $26, h3, T_2 \ - VN MOD26, h3, h3 \ - VAG T_2, h4, h4 - -// EXPAND splits the 128-bit little-endian values in0 and in1 -// into 26-bit big-endian limbs and places the results into -// the first and second lane of d₂₆[0:4] respectively. -// -// The EX0, EX1 and EX2 constants are arrays of byte indices -// for permutation. The permutation both reverses the bytes -// in the input and ensures the bytes are copied into the -// destination limb ready to be shifted into their final -// position. -#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \ - VPERM in0, in1, EX0, d0 \ - VPERM in0, in1, EX1, d2 \ - VPERM in0, in1, EX2, d4 \ - VESRLG $26, d0, d1 \ - VESRLG $30, d2, d3 \ - VESRLG $4, d2, d2 \ - VN MOD26, d0, d0 \ // [in0₂₆[0], in1₂₆[0]] - VN MOD26, d3, d3 \ // [in0₂₆[3], in1₂₆[3]] - VN MOD26, d1, d1 \ // [in0₂₆[1], in1₂₆[1]] - VN MOD24, d4, d4 \ // [in0₂₆[4], in1₂₆[4]] - VN MOD26, d2, d2 // [in0₂₆[2], in1₂₆[2]] - -// func updateVX(state *macState, msg []byte) -TEXT ·updateVX(SB), NOSPLIT, $0 - MOVD state+0(FP), R1 - LMG msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len - - // load EX0, EX1 and EX2 - MOVD $·constants<>(SB), R5 - VLM (R5), EX0, EX2 - - // generate masks - VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff] - VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff] - - // load h (accumulator) and r (key) from state - VZERO T_1 // [0, 0] - VL 0(R1), T_0 // [h₆₄[0], h₆₄[1]] - VLEG $0, 16(R1), T_1 // [h₆₄[2], 0] - VL 24(R1), T_2 // [r₆₄[0], r₆₄[1]] - VPDI $0, T_0, T_2, T_3 // [h₆₄[0], r₆₄[0]] - VPDI $5, T_0, T_2, T_4 // [h₆₄[1], r₆₄[1]] - - // unpack h and r into 26-bit limbs - // note: h₆₄[2] may have the low 3 bits set, so h₂₆[4] is a 27-bit value - VN MOD26, T_3, H_0 // [h₂₆[0], r₂₆[0]] - VZERO H_1 // [0, 0] - VZERO H_3 // [0, 0] - VGMG $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out - VESLG $24, T_1, T_1 // [h₆₄[2]<<24, 0] - VERIMG $-26&63, T_3, MOD26, H_1 // [h₂₆[1], r₂₆[1]] - VESRLG $+52&63, T_3, H_2 // [h₂₆[2], r₂₆[2]] - low 12 bits only - VERIMG $-14&63, T_4, MOD26, H_3 // [h₂₆[1], r₂₆[1]] - VESRLG $40, T_4, H_4 // [h₂₆[4], r₂₆[4]] - low 24 bits only - VERIMG $+12&63, T_4, T_0, H_2 // [h₂₆[2], r₂₆[2]] - complete - VO T_1, H_4, H_4 // [h₂₆[4], r₂₆[4]] - complete - - // replicate r across all 4 vector elements - VREPF $3, H_0, R_0 // [r₂₆[0], r₂₆[0], r₂₆[0], r₂₆[0]] - VREPF $3, H_1, R_1 // [r₂₆[1], r₂₆[1], r₂₆[1], r₂₆[1]] - VREPF $3, H_2, R_2 // [r₂₆[2], r₂₆[2], r₂₆[2], r₂₆[2]] - VREPF $3, H_3, R_3 // [r₂₆[3], r₂₆[3], r₂₆[3], r₂₆[3]] - VREPF $3, H_4, R_4 // [r₂₆[4], r₂₆[4], r₂₆[4], r₂₆[4]] - - // zero out lane 1 of h - VLEIG $1, $0, H_0 // [h₂₆[0], 0] - VLEIG $1, $0, H_1 // [h₂₆[1], 0] - VLEIG $1, $0, H_2 // [h₂₆[2], 0] - VLEIG $1, $0, H_3 // [h₂₆[3], 0] - VLEIG $1, $0, H_4 // [h₂₆[4], 0] - - // calculate 5r (ignore least significant limb) - VREPIF $5, T_0 - VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r₂₆[1], 5r₂₆[1], 5r₂₆[1]] - VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r₂₆[2], 5r₂₆[2], 5r₂₆[2]] - VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r₂₆[3], 5r₂₆[3], 5r₂₆[3]] - VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r₂₆[4], 5r₂₆[4], 5r₂₆[4]] - - // skip r² calculation if we are only calculating one block - CMPBLE R3, $16, skip - - // calculate r² - MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4) - REDUCE(M_0, M_1, M_2, M_3, M_4) - VGBM $0x0f0f, T_0 - VERIMG $0, M_0, T_0, R_0 // [r₂₆[0], r²₂₆[0], r₂₆[0], r²₂₆[0]] - VERIMG $0, M_1, T_0, R_1 // [r₂₆[1], r²₂₆[1], r₂₆[1], r²₂₆[1]] - VERIMG $0, M_2, T_0, R_2 // [r₂₆[2], r²₂₆[2], r₂₆[2], r²₂₆[2]] - VERIMG $0, M_3, T_0, R_3 // [r₂₆[3], r²₂₆[3], r₂₆[3], r²₂₆[3]] - VERIMG $0, M_4, T_0, R_4 // [r₂₆[4], r²₂₆[4], r₂₆[4], r²₂₆[4]] - - // calculate 5r² (ignore least significant limb) - VREPIF $5, T_0 - VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r²₂₆[1], 5r₂₆[1], 5r²₂₆[1]] - VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r²₂₆[2], 5r₂₆[2], 5r²₂₆[2]] - VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r²₂₆[3], 5r₂₆[3], 5r²₂₆[3]] - VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r²₂₆[4], 5r₂₆[4], 5r²₂₆[4]] - -loop: - CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients - - // load next 2 blocks from message - VLM (R2), T_0, T_1 - - // update message slice - SUB $32, R3 - MOVD $32(R2), R2 - - // unpack message blocks into 26-bit big-endian limbs - EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4) - - // add 2¹²⁸ to each message block value - VLEIB $4, $1, M_4 - VLEIB $12, $1, M_4 - -multiply: - // accumulate the incoming message - VAG H_0, M_0, M_0 - VAG H_3, M_3, M_3 - VAG H_1, M_1, M_1 - VAG H_4, M_4, M_4 - VAG H_2, M_2, M_2 - - // multiply the accumulator by the key coefficient - MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4) - - // carry and partially reduce the partial products - REDUCE(H_0, H_1, H_2, H_3, H_4) - - CMPBNE R3, $0, loop - -finish: - // sum lane 0 and lane 1 and put the result in lane 1 - VZERO T_0 - VSUMQG H_0, T_0, H_0 - VSUMQG H_3, T_0, H_3 - VSUMQG H_1, T_0, H_1 - VSUMQG H_4, T_0, H_4 - VSUMQG H_2, T_0, H_2 - - // reduce again after summation - // TODO(mundaym): there might be a more efficient way to do this - // now that we only have 1 active lane. For example, we could - // simultaneously pack the values as we reduce them. - REDUCE(H_0, H_1, H_2, H_3, H_4) - - // carry h[1] through to h[4] so that only h[4] can exceed 2²⁶ - 1 - // TODO(mundaym): in testing this final carry was unnecessary. - // Needs a proof before it can be removed though. - VESRLG $26, H_1, T_1 - VN MOD26, H_1, H_1 - VAQ T_1, H_2, H_2 - VESRLG $26, H_2, T_2 - VN MOD26, H_2, H_2 - VAQ T_2, H_3, H_3 - VESRLG $26, H_3, T_3 - VN MOD26, H_3, H_3 - VAQ T_3, H_4, H_4 - - // h is now < 2(2¹³⁰ - 5) - // Pack each lane in h₂₆[0:4] into h₁₂₈[0:1]. - VESLG $26, H_1, H_1 - VESLG $26, H_3, H_3 - VO H_0, H_1, H_0 - VO H_2, H_3, H_2 - VESLG $4, H_2, H_2 - VLEIB $7, $48, H_1 - VSLB H_1, H_2, H_2 - VO H_0, H_2, H_0 - VLEIB $7, $104, H_1 - VSLB H_1, H_4, H_3 - VO H_3, H_0, H_0 - VLEIB $7, $24, H_1 - VSRLB H_1, H_4, H_1 - - // update state - VSTEG $1, H_0, 0(R1) - VSTEG $0, H_0, 8(R1) - VSTEG $1, H_1, 16(R1) - RET - -b2: // 2 or fewer blocks remaining - CMPBLE R3, $16, b1 - - // Load the 2 remaining blocks (17-32 bytes remaining). - MOVD $-17(R3), R0 // index of final byte to load modulo 16 - VL (R2), T_0 // load full 16 byte block - VLL R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes - - // The Poly1305 algorithm requires that a 1 bit be appended to - // each message block. If the final block is less than 16 bytes - // long then it is easiest to insert the 1 before the message - // block is split into 26-bit limbs. If, on the other hand, the - // final message block is 16 bytes long then we append the 1 bit - // after expansion as normal. - MOVBZ $1, R0 - MOVD $-16(R3), R3 // index of byte in last block to insert 1 at (could be 16) - CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long - VLVGB R3, R0, T_1 // insert 1 into the byte at index R3 - - // Split both blocks into 26-bit limbs in the appropriate lanes. - EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4) - - // Append a 1 byte to the end of the second to last block. - VLEIB $4, $1, M_4 - - // Append a 1 byte to the end of the last block only if it is a - // full 16 byte block. - CMPBNE R3, $16, 2(PC) - VLEIB $12, $1, M_4 - - // Finally, set up the coefficients for the final multiplication. - // We have previously saved r and 5r in the 32-bit even indexes - // of the R_[0-4] and R5_[1-4] coefficient registers. - // - // We want lane 0 to be multiplied by r² so that can be kept the - // same. We want lane 1 to be multiplied by r so we need to move - // the saved r value into the 32-bit odd index in lane 1 by - // rotating the 64-bit lane by 32. - VGBM $0x00ff, T_0 // [0, 0xffffffffffffffff] - mask lane 1 only - VERIMG $32, R_0, T_0, R_0 // [_, r²₂₆[0], _, r₂₆[0]] - VERIMG $32, R_1, T_0, R_1 // [_, r²₂₆[1], _, r₂₆[1]] - VERIMG $32, R_2, T_0, R_2 // [_, r²₂₆[2], _, r₂₆[2]] - VERIMG $32, R_3, T_0, R_3 // [_, r²₂₆[3], _, r₂₆[3]] - VERIMG $32, R_4, T_0, R_4 // [_, r²₂₆[4], _, r₂₆[4]] - VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²₂₆[1], _, 5r₂₆[1]] - VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²₂₆[2], _, 5r₂₆[2]] - VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²₂₆[3], _, 5r₂₆[3]] - VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²₂₆[4], _, 5r₂₆[4]] - - MOVD $0, R3 - BR multiply - -skip: - CMPBEQ R3, $0, finish - -b1: // 1 block remaining - - // Load the final block (1-16 bytes). This will be placed into - // lane 0. - MOVD $-1(R3), R0 - VLL R0, (R2), T_0 // pad to 16 bytes with zeros - - // The Poly1305 algorithm requires that a 1 bit be appended to - // each message block. If the final block is less than 16 bytes - // long then it is easiest to insert the 1 before the message - // block is split into 26-bit limbs. If, on the other hand, the - // final message block is 16 bytes long then we append the 1 bit - // after expansion as normal. - MOVBZ $1, R0 - CMPBEQ R3, $16, 2(PC) - VLVGB R3, R0, T_0 - - // Set the message block in lane 1 to the value 0 so that it - // can be accumulated without affecting the final result. - VZERO T_1 - - // Split the final message block into 26-bit limbs in lane 0. - // Lane 1 will be contain 0. - EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4) - - // Append a 1 byte to the end of the last block only if it is a - // full 16 byte block. - CMPBNE R3, $16, 2(PC) - VLEIB $4, $1, M_4 - - // We have previously saved r and 5r in the 32-bit even indexes - // of the R_[0-4] and R5_[1-4] coefficient registers. - // - // We want lane 0 to be multiplied by r so we need to move the - // saved r value into the 32-bit odd index in lane 0. We want - // lane 1 to be set to the value 1. This makes multiplication - // a no-op. We do this by setting lane 1 in every register to 0 - // and then just setting the 32-bit index 3 in R_0 to 1. - VZERO T_0 - MOVD $0, R0 - MOVD $0x10111213, R12 - VLVGP R12, R0, T_1 // [_, 0x10111213, _, 0x00000000] - VPERM T_0, R_0, T_1, R_0 // [_, r₂₆[0], _, 0] - VPERM T_0, R_1, T_1, R_1 // [_, r₂₆[1], _, 0] - VPERM T_0, R_2, T_1, R_2 // [_, r₂₆[2], _, 0] - VPERM T_0, R_3, T_1, R_3 // [_, r₂₆[3], _, 0] - VPERM T_0, R_4, T_1, R_4 // [_, r₂₆[4], _, 0] - VPERM T_0, R5_1, T_1, R5_1 // [_, 5r₂₆[1], _, 0] - VPERM T_0, R5_2, T_1, R5_2 // [_, 5r₂₆[2], _, 0] - VPERM T_0, R5_3, T_1, R5_3 // [_, 5r₂₆[3], _, 0] - VPERM T_0, R5_4, T_1, R5_4 // [_, 5r₂₆[4], _, 0] - - // Set the value of lane 1 to be 1. - VLEIF $3, $1, R_0 // [_, r₂₆[0], _, 1] - - MOVD $0, R3 - BR multiply diff --git a/vendor/golang.org/x/crypto/ssh/cipher.go b/vendor/golang.org/x/crypto/ssh/cipher.go index 8bd6b3da..bddbde5d 100644 --- a/vendor/golang.org/x/crypto/ssh/cipher.go +++ b/vendor/golang.org/x/crypto/ssh/cipher.go @@ -18,7 +18,7 @@ import ( "io/ioutil" "golang.org/x/crypto/chacha20" - "golang.org/x/crypto/poly1305" + "golang.org/x/crypto/internal/poly1305" ) const ( -- cgit v1.2.3