aboutsummaryrefslogtreecommitdiff
path: root/vendor/golang.org/x/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/golang.org/x/crypto')
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go40
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go22
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go10
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/poly1305.go99
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go48
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s (renamed from vendor/golang.org/x/crypto/poly1305/sum_amd64.s)46
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go310
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go48
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s182
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.go76
-rw-r--r--vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.s504
-rw-r--r--vendor/golang.org/x/crypto/internal/subtle/aliasing.go3
-rw-r--r--vendor/golang.org/x/crypto/internal/subtle/aliasing_purego.go (renamed from vendor/golang.org/x/crypto/internal/subtle/aliasing_appengine.go)3
-rw-r--r--vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go2
-rw-r--r--vendor/golang.org/x/crypto/poly1305/poly1305.go33
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_amd64.go22
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_arm.go22
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_arm.s427
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_noasm.go14
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_ref.go139
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_s390x.go49
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_s390x.s400
-rw-r--r--vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s931
-rw-r--r--vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go6
-rw-r--r--vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s (renamed from vendor/golang.org/x/crypto/salsa20/salsa/salsa2020_amd64.s)246
-rw-r--r--vendor/golang.org/x/crypto/salsa20/salsa/salsa20_noasm.go15
-rw-r--r--vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go9
27 files changed, 1499 insertions, 2207 deletions
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go b/vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go
new file mode 100644
index 0000000..45b5c96
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/bits_compat.go
@@ -0,0 +1,40 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !go1.13
+// +build !go1.13
+
+package poly1305
+
+// Generic fallbacks for the math/bits intrinsics, copied from
+// src/math/bits/bits.go. They were added in Go 1.12, but Add64 and Sum64 had
+// variable time fallbacks until Go 1.13.
+
+func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) {
+ sum = x + y + carry
+ carryOut = ((x & y) | ((x | y) &^ sum)) >> 63
+ return
+}
+
+func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) {
+ diff = x - y - borrow
+ borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> 63
+ return
+}
+
+func bitsMul64(x, y uint64) (hi, lo uint64) {
+ const mask32 = 1<<32 - 1
+ x0 := x & mask32
+ x1 := x >> 32
+ y0 := y & mask32
+ y1 := y >> 32
+ w0 := x0 * y0
+ t := x1*y0 + w0>>32
+ w1 := t & mask32
+ w2 := t >> 32
+ w1 += x0 * y1
+ hi = x1*y1 + w2 + w1>>32
+ lo = x * y
+ return
+}
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go b/vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go
new file mode 100644
index 0000000..ed52b34
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/bits_go1.13.go
@@ -0,0 +1,22 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build go1.13
+// +build go1.13
+
+package poly1305
+
+import "math/bits"
+
+func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) {
+ return bits.Add64(x, y, carry)
+}
+
+func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) {
+ return bits.Sub64(x, y, borrow)
+}
+
+func bitsMul64(x, y uint64) (hi, lo uint64) {
+ return bits.Mul64(x, y)
+}
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go b/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
new file mode 100644
index 0000000..f184b67
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
@@ -0,0 +1,10 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
+// +build !amd64,!ppc64le,!s390x !gc purego
+
+package poly1305
+
+type mac struct{ macGeneric }
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/poly1305.go b/vendor/golang.org/x/crypto/internal/poly1305/poly1305.go
new file mode 100644
index 0000000..4aaea81
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/poly1305.go
@@ -0,0 +1,99 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package poly1305 implements Poly1305 one-time message authentication code as
+// specified in https://cr.yp.to/mac/poly1305-20050329.pdf.
+//
+// Poly1305 is a fast, one-time authentication function. It is infeasible for an
+// attacker to generate an authenticator for a message without the key. However, a
+// key must only be used for a single message. Authenticating two different
+// messages with the same key allows an attacker to forge authenticators for other
+// messages with the same key.
+//
+// Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was
+// used with a fixed key in order to generate one-time keys from an nonce.
+// However, in this package AES isn't used and the one-time key is specified
+// directly.
+package poly1305
+
+import "crypto/subtle"
+
+// TagSize is the size, in bytes, of a poly1305 authenticator.
+const TagSize = 16
+
+// Sum generates an authenticator for msg using a one-time key and puts the
+// 16-byte result into out. Authenticating two different messages with the same
+// key allows an attacker to forge messages at will.
+func Sum(out *[16]byte, m []byte, key *[32]byte) {
+ h := New(key)
+ h.Write(m)
+ h.Sum(out[:0])
+}
+
+// Verify returns true if mac is a valid authenticator for m with the given key.
+func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
+ var tmp [16]byte
+ Sum(&tmp, m, key)
+ return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1
+}
+
+// New returns a new MAC computing an authentication
+// tag of all data written to it with the given key.
+// This allows writing the message progressively instead
+// of passing it as a single slice. Common users should use
+// the Sum function instead.
+//
+// The key must be unique for each message, as authenticating
+// two different messages with the same key allows an attacker
+// to forge messages at will.
+func New(key *[32]byte) *MAC {
+ m := &MAC{}
+ initialize(key, &m.macState)
+ return m
+}
+
+// MAC is an io.Writer computing an authentication tag
+// of the data written to it.
+//
+// MAC cannot be used like common hash.Hash implementations,
+// because using a poly1305 key twice breaks its security.
+// Therefore writing data to a running MAC after calling
+// Sum or Verify causes it to panic.
+type MAC struct {
+ mac // platform-dependent implementation
+
+ finalized bool
+}
+
+// Size returns the number of bytes Sum will return.
+func (h *MAC) Size() int { return TagSize }
+
+// Write adds more data to the running message authentication code.
+// It never returns an error.
+//
+// It must not be called after the first call of Sum or Verify.
+func (h *MAC) Write(p []byte) (n int, err error) {
+ if h.finalized {
+ panic("poly1305: write to MAC after Sum or Verify")
+ }
+ return h.mac.Write(p)
+}
+
+// Sum computes the authenticator of all data written to the
+// message authentication code.
+func (h *MAC) Sum(b []byte) []byte {
+ var mac [TagSize]byte
+ h.mac.Sum(&mac)
+ h.finalized = true
+ return append(b, mac[:]...)
+}
+
+// Verify returns whether the authenticator of all data written to
+// the message authentication code matches the expected value.
+func (h *MAC) Verify(expected []byte) bool {
+ var mac [TagSize]byte
+ h.mac.Sum(&mac)
+ h.finalized = true
+ return subtle.ConstantTimeCompare(expected, mac[:]) == 1
+}
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go
new file mode 100644
index 0000000..6d52233
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.go
@@ -0,0 +1,48 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+// +build gc,!purego
+
+package poly1305
+
+//go:noescape
+func update(state *macState, msg []byte)
+
+// mac is a wrapper for macGeneric that redirects calls that would have gone to
+// updateGeneric to update.
+//
+// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
+// using function pointers would carry a major performance cost.
+type mac struct{ macGeneric }
+
+func (h *mac) Write(p []byte) (int, error) {
+ nn := len(p)
+ if h.offset > 0 {
+ n := copy(h.buffer[h.offset:], p)
+ if h.offset+n < TagSize {
+ h.offset += n
+ return nn, nil
+ }
+ p = p[n:]
+ h.offset = 0
+ update(&h.macState, h.buffer[:])
+ }
+ if n := len(p) - (len(p) % TagSize); n > 0 {
+ update(&h.macState, p[:n])
+ p = p[n:]
+ }
+ if len(p) > 0 {
+ h.offset += copy(h.buffer[h.offset:], p)
+ }
+ return nn, nil
+}
+
+func (h *mac) Sum(out *[16]byte) {
+ state := h.macState
+ if h.offset > 0 {
+ update(&state, h.buffer[:h.offset])
+ }
+ finalize(out, &state.h, &state.s)
+}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
index 2edae63..1d74f0f 100644
--- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64,!gccgo,!appengine
+//go:build gc && !purego
+// +build gc,!purego
#include "textflag.h"
@@ -54,24 +55,17 @@
ADCQ t3, h1; \
ADCQ $0, h2
-DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-GLOBL ·poly1305Mask<>(SB), RODATA, $16
+// func update(state *[7]uint64, msg []byte)
+TEXT ·update(SB), $0-32
+ MOVQ state+0(FP), DI
+ MOVQ msg_base+8(FP), SI
+ MOVQ msg_len+16(FP), R15
-// func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]key)
-TEXT ·poly1305(SB), $0-32
- MOVQ out+0(FP), DI
- MOVQ m+8(FP), SI
- MOVQ mlen+16(FP), R15
- MOVQ key+24(FP), AX
-
- MOVQ 0(AX), R11
- MOVQ 8(AX), R12
- ANDQ ·poly1305Mask<>(SB), R11 // r0
- ANDQ ·poly1305Mask<>+8(SB), R12 // r1
- XORQ R8, R8 // h0
- XORQ R9, R9 // h1
- XORQ R10, R10 // h2
+ MOVQ 0(DI), R8 // h0
+ MOVQ 8(DI), R9 // h1
+ MOVQ 16(DI), R10 // h2
+ MOVQ 24(DI), R11 // r0
+ MOVQ 32(DI), R12 // r1
CMPQ R15, $16
JB bytes_between_0_and_15
@@ -109,17 +103,7 @@ flush_buffer:
JMP multiply
done:
- MOVQ R8, AX
- MOVQ R9, BX
- SUBQ $0xFFFFFFFFFFFFFFFB, AX
- SBBQ $0xFFFFFFFFFFFFFFFF, BX
- SBBQ $3, R10
- CMOVQCS R8, AX
- CMOVQCS R9, BX
- MOVQ key+24(FP), R8
- ADDQ 16(R8), AX
- ADCQ 24(R8), BX
-
- MOVQ AX, 0(DI)
- MOVQ BX, 8(DI)
+ MOVQ R8, 0(DI)
+ MOVQ R9, 8(DI)
+ MOVQ R10, 16(DI)
RET
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go
new file mode 100644
index 0000000..c942a65
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_generic.go
@@ -0,0 +1,310 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file provides the generic implementation of Sum and MAC. Other files
+// might provide optimized assembly implementations of some of this code.
+
+package poly1305
+
+import "encoding/binary"
+
+// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
+// for a 64 bytes message is approximately
+//
+// s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5
+//
+// for some secret r and s. It can be computed sequentially like
+//
+// for len(msg) > 0:
+// h += read(msg, 16)
+// h *= r
+// h %= 2¹³⁰ - 5
+// return h + s
+//
+// All the complexity is about doing performant constant-time math on numbers
+// larger than any available numeric type.
+
+func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
+ h := newMACGeneric(key)
+ h.Write(msg)
+ h.Sum(out)
+}
+
+func newMACGeneric(key *[32]byte) macGeneric {
+ m := macGeneric{}
+ initialize(key, &m.macState)
+ return m
+}
+
+// macState holds numbers in saturated 64-bit little-endian limbs. That is,
+// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
+type macState struct {
+ // h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
+ // can grow larger during and after rounds. It must, however, remain below
+ // 2 * (2¹³⁰ - 5).
+ h [3]uint64
+ // r and s are the private key components.
+ r [2]uint64
+ s [2]uint64
+}
+
+type macGeneric struct {
+ macState
+
+ buffer [TagSize]byte
+ offset int
+}
+
+// Write splits the incoming message into TagSize chunks, and passes them to
+// update. It buffers incomplete chunks.
+func (h *macGeneric) Write(p []byte) (int, error) {
+ nn := len(p)
+ if h.offset > 0 {
+ n := copy(h.buffer[h.offset:], p)
+ if h.offset+n < TagSize {
+ h.offset += n
+ return nn, nil
+ }
+ p = p[n:]
+ h.offset = 0
+ updateGeneric(&h.macState, h.buffer[:])
+ }
+ if n := len(p) - (len(p) % TagSize); n > 0 {
+ updateGeneric(&h.macState, p[:n])
+ p = p[n:]
+ }
+ if len(p) > 0 {
+ h.offset += copy(h.buffer[h.offset:], p)
+ }
+ return nn, nil
+}
+
+// Sum flushes the last incomplete chunk from the buffer, if any, and generates
+// the MAC output. It does not modify its state, in order to allow for multiple
+// calls to Sum, even if no Write is allowed after Sum.
+func (h *macGeneric) Sum(out *[TagSize]byte) {
+ state := h.macState
+ if h.offset > 0 {
+ updateGeneric(&state, h.buffer[:h.offset])
+ }
+ finalize(out, &state.h, &state.s)
+}
+
+// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
+// clears some bits of the secret coefficient to make it possible to implement
+// multiplication more efficiently.
+const (
+ rMask0 = 0x0FFFFFFC0FFFFFFF
+ rMask1 = 0x0FFFFFFC0FFFFFFC
+)
+
+// initialize loads the 256-bit key into the two 128-bit secret values r and s.
+func initialize(key *[32]byte, m *macState) {
+ m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
+ m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
+ m.s[0] = binary.LittleEndian.Uint64(key[16:24])
+ m.s[1] = binary.LittleEndian.Uint64(key[24:32])
+}
+
+// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
+// bits.Mul64 and bits.Add64 intrinsics.
+type uint128 struct {
+ lo, hi uint64
+}
+
+func mul64(a, b uint64) uint128 {
+ hi, lo := bitsMul64(a, b)
+ return uint128{lo, hi}
+}
+
+func add128(a, b uint128) uint128 {
+ lo, c := bitsAdd64(a.lo, b.lo, 0)
+ hi, c := bitsAdd64(a.hi, b.hi, c)
+ if c != 0 {
+ panic("poly1305: unexpected overflow")
+ }
+ return uint128{lo, hi}
+}
+
+func shiftRightBy2(a uint128) uint128 {
+ a.lo = a.lo>>2 | (a.hi&3)<<62
+ a.hi = a.hi >> 2
+ return a
+}
+
+// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
+// 128 bits of message, it computes
+//
+// h₊ = (h + m) * r mod 2¹³⁰ - 5
+//
+// If the msg length is not a multiple of TagSize, it assumes the last
+// incomplete chunk is the final one.
+func updateGeneric(state *macState, msg []byte) {
+ h0, h1, h2 := state.h[0], state.h[1], state.h[2]
+ r0, r1 := state.r[0], state.r[1]
+
+ for len(msg) > 0 {
+ var c uint64
+
+ // For the first step, h + m, we use a chain of bits.Add64 intrinsics.
+ // The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
+ // reduced at the end of the multiplication below.
+ //
+ // The spec requires us to set a bit just above the message size, not to
+ // hide leading zeroes. For full chunks, that's 1 << 128, so we can just
+ // add 1 to the most significant (2¹²⁸) limb, h2.
+ if len(msg) >= TagSize {
+ h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
+ h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
+ h2 += c + 1
+
+ msg = msg[TagSize:]
+ } else {
+ var buf [TagSize]byte
+ copy(buf[:], msg)
+ buf[len(msg)] = 1
+
+ h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
+ h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
+ h2 += c
+
+ msg = nil
+ }
+
+ // Multiplication of big number limbs is similar to elementary school
+ // columnar multiplication. Instead of digits, there are 64-bit limbs.
+ //
+ // We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
+ //
+ // h2 h1 h0 x
+ // r1 r0 =
+ // ----------------
+ // h2r0 h1r0 h0r0 <-- individual 128-bit products
+ // + h2r1 h1r1 h0r1
+ // ------------------------
+ // m3 m2 m1 m0 <-- result in 128-bit overlapping limbs
+ // ------------------------
+ // m3.hi m2.hi m1.hi m0.hi <-- carry propagation
+ // + m3.lo m2.lo m1.lo m0.lo
+ // -------------------------------
+ // t4 t3 t2 t1 t0 <-- final result in 64-bit limbs
+ //
+ // The main difference from pen-and-paper multiplication is that we do
+ // carry propagation in a separate step, as if we wrote two digit sums
+ // at first (the 128-bit limbs), and then carried the tens all at once.
+
+ h0r0 := mul64(h0, r0)
+ h1r0 := mul64(h1, r0)
+ h2r0 := mul64(h2, r0)
+ h0r1 := mul64(h0, r1)
+ h1r1 := mul64(h1, r1)
+ h2r1 := mul64(h2, r1)
+
+ // Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
+ // top 4 bits cleared by rMask{0,1}, we know that their product is not going
+ // to overflow 64 bits, so we can ignore the high part of the products.
+ //
+ // This also means that the product doesn't have a fifth limb (t4).
+ if h2r0.hi != 0 {
+ panic("poly1305: unexpected overflow")
+ }
+ if h2r1.hi != 0 {
+ panic("poly1305: unexpected overflow")
+ }
+
+ m0 := h0r0
+ m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again
+ m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.
+ m3 := h2r1
+
+ t0 := m0.lo
+ t1, c := bitsAdd64(m1.lo, m0.hi, 0)
+ t2, c := bitsAdd64(m2.lo, m1.hi, c)
+ t3, _ := bitsAdd64(m3.lo, m2.hi, c)
+
+ // Now we have the result as 4 64-bit limbs, and we need to reduce it
+ // modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
+ // a cheap partial reduction according to the reduction identity
+ //
+ // c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5
+ //
+ // because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
+ // likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
+ // assumptions we make about h in the rest of the code.
+ //
+ // See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23
+
+ // We split the final result at the 2¹³⁰ mark into h and cc, the carry.
+ // Note that the carry bits are effectively shifted left by 2, in other
+ // words, cc = c * 4 for the c in the reduction identity.
+ h0, h1, h2 = t0, t1, t2&maskLow2Bits
+ cc := uint128{t2 & maskNotLow2Bits, t3}
+
+ // To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.
+
+ h0, c = bitsAdd64(h0, cc.lo, 0)
+ h1, c = bitsAdd64(h1, cc.hi, c)
+ h2 += c
+
+ cc = shiftRightBy2(cc)
+
+ h0, c = bitsAdd64(h0, cc.lo, 0)
+ h1, c = bitsAdd64(h1, cc.hi, c)
+ h2 += c
+
+ // h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
+ //
+ // 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
+ }
+
+ state.h[0], state.h[1], state.h[2] = h0, h1, h2
+}
+
+const (
+ maskLow2Bits uint64 = 0x0000000000000003
+ maskNotLow2Bits uint64 = ^maskLow2Bits
+)
+
+// select64 returns x if v == 1 and y if v == 0, in constant time.
+func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }
+
+// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.
+const (
+ p0 = 0xFFFFFFFFFFFFFFFB
+ p1 = 0xFFFFFFFFFFFFFFFF
+ p2 = 0x0000000000000003
+)
+
+// finalize completes the modular reduction of h and computes
+//
+// out = h + s mod 2¹²⁸
+//
+func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
+ h0, h1, h2 := h[0], h[1], h[2]
+
+ // After the partial reduction in updateGeneric, h might be more than
+ // 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction
+ // in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the
+ // result if the subtraction underflows, and t otherwise.
+
+ hMinusP0, b := bitsSub64(h0, p0, 0)
+ hMinusP1, b := bitsSub64(h1, p1, b)
+ _, b = bitsSub64(h2, p2, b)
+
+ // h = h if h < p else h - p
+ h0 = select64(b, h0, hMinusP0)
+ h1 = select64(b, h1, hMinusP1)
+
+ // Finally, we compute the last Poly1305 step
+ //
+ // tag = h + s mod 2¹²⁸
+ //
+ // by just doing a wide addition with the 128 low bits of h and discarding
+ // the overflow.
+ h0, c := bitsAdd64(h0, s[0], 0)
+ h1, _ = bitsAdd64(h1, s[1], c)
+
+ binary.LittleEndian.PutUint64(out[0:8], h0)
+ binary.LittleEndian.PutUint64(out[8:16], h1)
+}
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
new file mode 100644
index 0000000..4a06994
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
@@ -0,0 +1,48 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+// +build gc,!purego
+
+package poly1305
+
+//go:noescape
+func update(state *macState, msg []byte)
+
+// mac is a wrapper for macGeneric that redirects calls that would have gone to
+// updateGeneric to update.
+//
+// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
+// using function pointers would carry a major performance cost.
+type mac struct{ macGeneric }
+
+func (h *mac) Write(p []byte) (int, error) {
+ nn := len(p)
+ if h.offset > 0 {
+ n := copy(h.buffer[h.offset:], p)
+ if h.offset+n < TagSize {
+ h.offset += n
+ return nn, nil
+ }
+ p = p[n:]
+ h.offset = 0
+ update(&h.macState, h.buffer[:])
+ }
+ if n := len(p) - (len(p) % TagSize); n > 0 {
+ update(&h.macState, p[:n])
+ p = p[n:]
+ }
+ if len(p) > 0 {
+ h.offset += copy(h.buffer[h.offset:], p)
+ }
+ return nn, nil
+}
+
+func (h *mac) Sum(out *[16]byte) {
+ state := h.macState
+ if h.offset > 0 {
+ update(&state, h.buffer[:h.offset])
+ }
+ finalize(out, &state.h, &state.s)
+}
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
new file mode 100644
index 0000000..58422aa
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
@@ -0,0 +1,182 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+// +build gc,!purego
+
+#include "textflag.h"
+
+// This was ported from the amd64 implementation.
+
+#define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \
+ MOVD (msg), t0; \
+ MOVD 8(msg), t1; \
+ MOVD $1, t2; \
+ ADDC t0, h0, h0; \
+ ADDE t1, h1, h1; \
+ ADDE t2, h2; \
+ ADD $16, msg
+
+#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \
+ MULLD r0, h0, t0; \
+ MULLD r0, h1, t4; \
+ MULHDU r0, h0, t1; \
+ MULHDU r0, h1, t5; \
+ ADDC t4, t1, t1; \
+ MULLD r0, h2, t2; \
+ ADDZE t5; \
+ MULHDU r1, h0, t4; \
+ MULLD r1, h0, h0; \
+ ADD t5, t2, t2; \
+ ADDC h0, t1, t1; \
+ MULLD h2, r1, t3; \
+ ADDZE t4, h0; \
+ MULHDU r1, h1, t5; \
+ MULLD r1, h1, t4; \
+ ADDC t4, t2, t2; \
+ ADDE t5, t3, t3; \
+ ADDC h0, t2, t2; \
+ MOVD $-4, t4; \
+ MOVD t0, h0; \
+ MOVD t1, h1; \
+ ADDZE t3; \
+ ANDCC $3, t2, h2; \
+ AND t2, t4, t0; \
+ ADDC t0, h0, h0; \
+ ADDE t3, h1, h1; \
+ SLD $62, t3, t4; \
+ SRD $2, t2; \
+ ADDZE h2; \
+ OR t4, t2, t2; \
+ SRD $2, t3; \
+ ADDC t2, h0, h0; \
+ ADDE t3, h1, h1; \
+ ADDZE h2
+
+DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
+DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
+GLOBL ·poly1305Mask<>(SB), RODATA, $16
+
+// func update(state *[7]uint64, msg []byte)
+TEXT ·update(SB), $0-32
+ MOVD state+0(FP), R3
+ MOVD msg_base+8(FP), R4
+ MOVD msg_len+16(FP), R5
+
+ MOVD 0(R3), R8 // h0
+ MOVD 8(R3), R9 // h1
+ MOVD 16(R3), R10 // h2
+ MOVD 24(R3), R11 // r0
+ MOVD 32(R3), R12 // r1
+
+ CMP R5, $16
+ BLT bytes_between_0_and_15
+
+loop:
+ POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22)
+
+multiply:
+ POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21)
+ ADD $-16, R5
+ CMP R5, $16
+ BGE loop
+
+bytes_between_0_and_15:
+ CMP R5, $0
+ BEQ done
+ MOVD $0, R16 // h0
+ MOVD $0, R17 // h1
+
+flush_buffer:
+ CMP R5, $8
+ BLE just1
+
+ MOVD $8, R21
+ SUB R21, R5, R21
+
+ // Greater than 8 -- load the rightmost remaining bytes in msg
+ // and put into R17 (h1)
+ MOVD (R4)(R21), R17
+ MOVD $16, R22
+
+ // Find the offset to those bytes
+ SUB R5, R22, R22
+ SLD $3, R22
+
+ // Shift to get only the bytes in msg
+ SRD R22, R17, R17
+
+ // Put 1 at high end
+ MOVD $1, R23
+ SLD $3, R21
+ SLD R21, R23, R23
+ OR R23, R17, R17
+
+ // Remainder is 8
+ MOVD $8, R5
+
+just1:
+ CMP R5, $8
+ BLT less8
+
+ // Exactly 8
+ MOVD (R4), R16
+
+ CMP R17, $0
+
+ // Check if we've already set R17; if not
+ // set 1 to indicate end of msg.
+ BNE carry
+ MOVD $1, R17
+ BR carry
+
+less8:
+ MOVD $0, R16 // h0
+ MOVD $0, R22 // shift count
+ CMP R5, $4
+ BLT less4
+ MOVWZ (R4), R16
+ ADD $4, R4
+ ADD $-4, R5
+ MOVD $32, R22
+
+less4:
+ CMP R5, $2
+ BLT less2
+ MOVHZ (R4), R21
+ SLD R22, R21, R21
+ OR R16, R21, R16
+ ADD $16, R22
+ ADD $-2, R5
+ ADD $2, R4
+
+less2:
+ CMP R5, $0
+ BEQ insert1
+ MOVBZ (R4), R21
+ SLD R22, R21, R21
+ OR R16, R21, R16
+ ADD $8, R22
+
+insert1:
+ // Insert 1 at end of msg
+ MOVD $1, R21
+ SLD R22, R21, R21
+ OR R16, R21, R16
+
+carry:
+ // Add new values to h0, h1, h2
+ ADDC R16, R8
+ ADDE R17, R9
+ ADDZE R10, R10
+ MOVD $16, R5
+ ADD R5, R4
+ BR multiply
+
+done:
+ // Save h0, h1, h2 in state
+ MOVD R8, 0(R3)
+ MOVD R9, 8(R3)
+ MOVD R10, 16(R3)
+ RET
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.go
new file mode 100644
index 0000000..62cc9f8
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.go
@@ -0,0 +1,76 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+// +build gc,!purego
+
+package poly1305
+
+import (
+ "golang.org/x/sys/cpu"
+)
+
+// updateVX is an assembly implementation of Poly1305 that uses vector
+// instructions. It must only be called if the vector facility (vx) is
+// available.
+//go:noescape
+func updateVX(state *macState, msg []byte)
+
+// mac is a replacement for macGeneric that uses a larger buffer and redirects
+// calls that would have gone to updateGeneric to updateVX if the vector
+// facility is installed.
+//
+// A larger buffer is required for good performance because the vector
+// implementation has a higher fixed cost per call than the generic
+// implementation.
+type mac struct {
+ macState
+
+ buffer [16 * TagSize]byte // size must be a multiple of block size (16)
+ offset int
+}
+
+func (h *mac) Write(p []byte) (int, error) {
+ nn := len(p)
+ if h.offset > 0 {
+ n := copy(h.buffer[h.offset:], p)
+ if h.offset+n < len(h.buffer) {
+ h.offset += n
+ return nn, nil
+ }
+ p = p[n:]
+ h.offset = 0
+ if cpu.S390X.HasVX {
+ updateVX(&h.macState, h.buffer[:])
+ } else {
+ updateGeneric(&h.macState, h.buffer[:])
+ }
+ }
+
+ tail := len(p) % len(h.buffer) // number of bytes to copy into buffer
+ body := len(p) - tail // number of bytes to process now
+ if body > 0 {
+ if cpu.S390X.HasVX {
+ updateVX(&h.macState, p[:body])
+ } else {
+ updateGeneric(&h.macState, p[:body])
+ }
+ }
+ h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0
+ return nn, nil
+}
+
+func (h *mac) Sum(out *[TagSize]byte) {
+ state := h.macState
+ remainder := h.buffer[:h.offset]
+
+ // Use the generic implementation if we have 2 or fewer blocks left
+ // to sum. The vector implementation has a higher startup time.
+ if cpu.S390X.HasVX && len(remainder) > 2*TagSize {
+ updateVX(&state, remainder)
+ } else if len(remainder) > 0 {
+ updateGeneric(&state, remainder)
+ }
+ finalize(out, &state.h, &state.s)
+}
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.s
new file mode 100644
index 0000000..aa9e049
--- /dev/null
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_s390x.s
@@ -0,0 +1,504 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+// +build gc,!purego
+
+#include "textflag.h"
+
+// This implementation of Poly1305 uses the vector facility (vx)
+// to process up to 2 blocks (32 bytes) per iteration using an
+// algorithm based on the one described in:
+//
+// NEON crypto, Daniel J. Bernstein & Peter Schwabe
+// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+//
+// This algorithm uses 5 26-bit limbs to represent a 130-bit
+// value. These limbs are, for the most part, zero extended and
+// placed into 64-bit vector register elements. Each vector
+// register is 128-bits wide and so holds 2 of these elements.
+// Using 26-bit limbs allows us plenty of headroom to accommodate
+// accumulations before and after multiplication without
+// overflowing either 32-bits (before multiplication) or 64-bits
+// (after multiplication).
+//
+// In order to parallelise the operations required to calculate
+// the sum we use two separate accumulators and then sum those
+// in an extra final step. For compatibility with the generic
+// implementation we perform this summation at the end of every
+// updateVX call.
+//
+// To use two accumulators we must multiply the message blocks
+// by r² rather than r. Only the final message block should be
+// multiplied by r.
+//
+// Example:
+//
+// We want to calculate the sum (h) for a 64 byte message (m):
+//
+// h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r
+//
+// To do this we split the calculation into the even indices
+// and odd indices of the message. These form our SIMD 'lanes':
+//
+// h = m[ 0:16]r⁴ + m[32:48]r² + <- lane 0
+// m[16:32]r³ + m[48:64]r <- lane 1
+//
+// To calculate this iteratively we refactor so that both lanes
+// are written in terms of r² and r:
+//
+// h = (m[ 0:16]r² + m[32:48])r² + <- lane 0
+// (m[16:32]r² + m[48:64])r <- lane 1
+// ^ ^
+// | coefficients for second iteration
+// coefficients for first iteration
+//
+// So in this case we would have two iterations. In the first
+// both lanes are multiplied by r². In the second only the
+// first lane is multiplied by r² and the second lane is
+// instead multiplied by r. This gives use the odd and even
+// powers of r that we need from the original equation.
+//
+// Notation:
+//
+// h - accumulator
+// r - key
+// m - message
+//
+// [a, b] - SIMD register holding two 64-bit values
+// [a, b, c, d] - SIMD register holding four 32-bit values
+// xᵢ[n] - limb n of variable x with bit width i
+//
+// Limbs are expressed in little endian order, so for 26-bit
+// limbs x₂₆[4] will be the most significant limb and x₂₆[0]
+// will be the least significant limb.
+
+// masking constants
+#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits
+#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits
+
+// expansion constants (see EXPAND macro)
+#define EX0 V2
+#define EX1 V3
+#define EX2 V4
+
+// key (r², r or 1 depending on context)
+#define R_0 V5
+#define R_1 V6
+#define R_2 V7
+#define R_3 V8
+#define R_4 V9
+
+// precalculated coefficients (5r², 5r or 0 depending on context)
+#define R5_1 V10
+#define R5_2 V11
+#define R5_3 V12
+#define R5_4 V13
+
+// message block (m)
+#define M_0 V14
+#define M_1 V15
+#define M_2 V16
+#define M_3 V17
+#define M_4 V18
+
+// accumulator (h)
+#define H_0 V19
+#define H_1 V20
+#define H_2 V21
+#define H_3 V22
+#define H_4 V23
+
+// temporary registers (for short-lived values)
+#define T_0 V24
+#define T_1 V25
+#define T_2 V26
+#define T_3 V27
+#define T_4 V28
+
+GLOBL ·constants<>(SB), RODATA, $0x30
+// EX0
+DATA ·constants<>+0x00(SB)/8, $0x0006050403020100
+DATA ·constants<>+0x08(SB)/8, $0x1016151413121110
+// EX1
+DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706
+DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716
+// EX2
+DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d
+DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d
+
+// MULTIPLY multiplies each lane of f and g, partially reduced
+// modulo 2¹³⁰ - 5. The result, h, consists of partial products
+// in each lane that need to be reduced further to produce the
+// final result.
+//
+// h₁₃₀ = (f₁₃₀g₁₃₀) % 2¹³⁰ + (5f₁₃₀g₁₃₀) / 2¹³⁰
+//
+// Note that the multiplication by 5 of the high bits is
+// achieved by precalculating the multiplication of four of the
+// g coefficients by 5. These are g51-g54.
+#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
+ VMLOF f0, g0, h0 \
+ VMLOF f0, g3, h3 \
+ VMLOF f0, g1, h1 \
+ VMLOF f0, g4, h4 \
+ VMLOF f0, g2, h2 \
+ VMLOF f1, g54, T_0 \
+ VMLOF f1, g2, T_3 \
+ VMLOF f1, g0, T_1 \
+ VMLOF f1, g3, T_4 \
+ VMLOF f1, g1, T_2 \
+ VMALOF f2, g53, h0, h0 \
+ VMALOF f2, g1, h3, h3 \
+ VMALOF f2, g54, h1, h1 \
+ VMALOF f2, g2, h4, h4 \
+ VMALOF f2, g0, h2, h2 \
+ VMALOF f3, g52, T_0, T_0 \
+ VMALOF f3, g0, T_3, T_3 \
+ VMALOF f3, g53, T_1, T_1 \
+ VMALOF f3, g1, T_4, T_4 \
+ VMALOF f3, g54, T_2, T_2 \
+ VMALOF f4, g51, h0, h0 \
+ VMALOF f4, g54, h3, h3 \
+ VMALOF f4, g52, h1, h1 \
+ VMALOF f4, g0, h4, h4 \
+ VMALOF f4, g53, h2, h2 \
+ VAG T_0, h0, h0 \
+ VAG T_3, h3, h3 \
+ VAG T_1, h1, h1 \
+ VAG T_4, h4, h4 \
+ VAG T_2, h2, h2
+
+// REDUCE performs the following carry operations in four
+// stages, as specified in Bernstein & Schwabe:
+//
+// 1: h₂₆[0]->h₂₆[1] h₂₆[3]->h₂₆[4]
+// 2: h₂₆[1]->h₂₆[2] h₂₆[4]->h₂₆[0]
+// 3: h₂₆[0]->h₂₆[1] h₂₆[2]->h₂₆[3]
+// 4: h₂₆[3]->h₂₆[4]
+//
+// The result is that all of the limbs are limited to 26-bits
+// except for h₂₆[1] and h₂₆[4] which are limited to 27-bits.
+//
+// Note that although each limb is aligned at 26-bit intervals
+// they may contain values that exceed 2²⁶ - 1, hence the need
+// to carry the excess bits in each limb.
+#define REDUCE(h0, h1, h2, h3, h4) \
+ VESRLG $26, h0, T_0 \
+ VESRLG $26, h3, T_1 \
+ VN MOD26, h0, h0 \
+ VN MOD26, h3, h3 \
+ VAG T_0, h1, h1 \
+ VAG T_1, h4, h4 \
+ VESRLG $26, h1, T_2 \
+ VESRLG $26, h4, T_3 \
+ VN MOD26, h1, h1 \
+ VN MOD26, h4, h4 \
+ VESLG $2, T_3, T_4 \
+ VAG T_3, T_4, T_4 \
+ VAG T_2, h2, h2 \
+ VAG T_4, h0, h0 \
+ VESRLG $26, h2, T_0 \
+ VESRLG $26, h0, T_1 \
+ VN MOD26, h2, h2 \
+ VN MOD26, h0, h0 \
+ VAG T_0, h3, h3 \
+ VAG T_1, h1, h1 \
+ VESRLG $26, h3, T_2 \
+ VN MOD26, h3, h3 \
+ VAG T_2, h4, h4
+
+// EXPAND splits the 128-bit little-endian values in0 and in1
+// into 26-bit big-endian limbs and places the results into
+// the first and second lane of d₂₆[0:4] respectively.
+//
+// The EX0, EX1 and EX2 constants are arrays of byte indices
+// for permutation. The permutation both reverses the bytes
+// in the input and ensures the bytes are copied into the
+// destination limb ready to be shifted into their final
+// position.
+#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
+ VPERM in0, in1, EX0, d0 \
+ VPERM in0, in1, EX1, d2 \
+ VPERM in0, in1, EX2, d4 \
+ VESRLG $26, d0, d1 \
+ VESRLG $30, d2, d3 \
+ VESRLG $4, d2, d2 \
+ VN MOD26, d0, d0 \ // [in0₂₆[0], in1₂₆[0]]
+ VN MOD26, d3, d3 \ // [in0₂₆[3], in1₂₆[3]]
+ VN MOD26, d1, d1 \ // [in0₂₆[1], in1₂₆[1]]
+ VN MOD24, d4, d4 \ // [in0₂₆[4], in1₂₆[4]]
+ VN MOD26, d2, d2 // [in0₂₆[2], in1₂₆[2]]
+
+// func updateVX(state *macState, msg []byte)
+TEXT ·updateVX(SB), NOSPLIT, $0
+ MOVD state+0(FP), R1
+ LMG msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len
+
+ // load EX0, EX1 and EX2
+ MOVD $·constants<>(SB), R5
+ VLM (R5), EX0, EX2
+
+ // generate masks
+ VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff]
+ VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff]
+
+ // load h (accumulator) and r (key) from state
+ VZERO T_1 // [0, 0]
+ VL 0(R1), T_0 // [h₆₄[0], h₆₄[1]]
+ VLEG $0, 16(R1), T_1 // [h₆₄[2], 0]
+ VL 24(R1), T_2 // [r₆₄[0], r₆₄[1]]
+ VPDI $0, T_0, T_2, T_3 // [h₆₄[0], r₆₄[0]]
+ VPDI $5, T_0, T_2, T_4 // [h₆₄[1], r₆₄[1]]
+
+ // unpack h and r into 26-bit limbs
+ // note: h₆₄[2] may have the low 3 bits set, so h₂₆[4] is a 27-bit value
+ VN MOD26, T_3, H_0 // [h₂₆[0], r₂₆[0]]
+ VZERO H_1 // [0, 0]
+ VZERO H_3 // [0, 0]
+ VGMG $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out
+ VESLG $24, T_1, T_1 // [h₆₄[2]<<24, 0]
+ VERIMG $-26&63, T_3, MOD26, H_1 // [h₂₆[1], r₂₆[1]]
+ VESRLG $+52&63, T_3, H_2 // [h₂₆[2], r₂₆[2]] - low 12 bits only
+ VERIMG $-14&63, T_4, MOD26, H_3 // [h₂₆[1], r₂₆[1]]
+ VESRLG $40, T_4, H_4 // [h₂₆[4], r₂₆[4]] - low 24 bits only
+ VERIMG $+12&63, T_4, T_0, H_2 // [h₂₆[2], r₂₆[2]] - complete
+ VO T_1, H_4, H_4 // [h₂₆[4], r₂₆[4]] - complete
+
+ // replicate r across all 4 vector elements
+ VREPF $3, H_0, R_0 // [r₂₆[0], r₂₆[0], r₂₆[0], r₂₆[0]]
+ VREPF $3, H_1, R_1 // [r₂₆[1], r₂₆[1], r₂₆[1], r₂₆[1]]
+ VREPF $3, H_2, R_2 // [r₂₆[2], r₂₆[2], r₂₆[2], r₂₆[2]]
+ VREPF $3, H_3, R_3 // [r₂₆[3], r₂₆[3], r₂₆[3], r₂₆[3]]
+ VREPF $3, H_4, R_4 // [r₂₆[4], r₂₆[4], r₂₆[4], r₂₆[4]]
+
+ // zero out lane 1 of h
+ VLEIG $1, $0, H_0 // [h₂₆[0], 0]
+ VLEIG $1, $0, H_1 // [h₂₆[1], 0]
+ VLEIG $1, $0, H_2 // [h₂₆[2], 0]
+ VLEIG $1, $0, H_3 // [h₂₆[3], 0]
+ VLEIG $1, $0, H_4 // [h₂₆[4], 0]
+
+ // calculate 5r (ignore least significant limb)
+ VREPIF $5, T_0
+ VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r₂₆[1], 5r₂₆[1], 5r₂₆[1]]
+ VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r₂₆[2], 5r₂₆[2], 5r₂₆[2]]
+ VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r₂₆[3], 5r₂₆[3], 5r₂₆[3]]
+ VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r₂₆[4], 5r₂₆[4], 5r₂₆[4]]
+
+ // skip r² calculation if we are only calculating one block
+ CMPBLE R3, $16, skip
+
+ // calculate r²
+ MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4)
+ REDUCE(M_0, M_1, M_2, M_3, M_4)
+ VGBM $0x0f0f, T_0
+ VERIMG $0, M_0, T_0, R_0 // [r₂₆[0], r²₂₆[0], r₂₆[0], r²₂₆[0]]
+ VERIMG $0, M_1, T_0, R_1 // [r₂₆[1], r²₂₆[1], r₂₆[1], r²₂₆[1]]
+ VERIMG $0, M_2, T_0, R_2 // [r₂₆[2], r²₂₆[2], r₂₆[2], r²₂₆[2]]
+ VERIMG $0, M_3, T_0, R_3 // [r₂₆[3], r²₂₆[3], r₂₆[3], r²₂₆[3]]
+ VERIMG $0, M_4, T_0, R_4 // [r₂₆[4], r²₂₆[4], r₂₆[4], r²₂₆[4]]
+
+ // calculate 5r² (ignore least significant limb)
+ VREPIF $5, T_0
+ VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r²₂₆[1], 5r₂₆[1], 5r²₂₆[1]]
+ VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r²₂₆[2], 5r₂₆[2], 5r²₂₆[2]]
+ VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r²₂₆[3], 5r₂₆[3], 5r²₂₆[3]]
+ VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r²₂₆[4], 5r₂₆[4], 5r²₂₆[4]]
+
+loop:
+ CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients
+
+ // load next 2 blocks from message
+ VLM (R2), T_0, T_1
+
+ // update message slice
+ SUB $32, R3
+ MOVD $32(R2), R2
+
+ // unpack message blocks into 26-bit big-endian limbs
+ EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
+
+ // add 2¹²⁸ to each message block value
+ VLEIB $4, $1, M_4
+ VLEIB $12, $1, M_4
+
+multiply:
+ // accumulate the incoming message
+ VAG H_0, M_0, M_0
+ VAG H_3, M_3, M_3
+ VAG H_1, M_1, M_1
+ VAG H_4, M_4, M_4
+ VAG H_2, M_2, M_2
+
+ // multiply the accumulator by the key coefficient
+ MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
+
+ // carry and partially reduce the partial products
+ REDUCE(H_0, H_1, H_2, H_3, H_4)
+
+ CMPBNE R3, $0, loop
+
+finish:
+ // sum lane 0 and lane 1 and put the result in lane 1
+ VZERO T_0
+ VSUMQG H_0, T_0, H_0
+ VSUMQG H_3, T_0, H_3
+ VSUMQG H_1, T_0, H_1
+ VSUMQG H_4, T_0, H_4
+ VSUMQG H_2, T_0, H_2
+
+ // reduce again after summation
+ // TODO(mundaym): there might be a more efficient way to do this
+ // now that we only have 1 active lane. For example, we could
+ // simultaneously pack the values as we reduce them.
+ REDUCE(H_0, H_1, H_2, H_3, H_4)
+
+ // carry h[1] through to h[4] so that only h[4] can exceed 2²⁶ - 1
+ // TODO(mundaym): in testing this final carry was unnecessary.
+ // Needs a proof before it can be removed though.
+ VESRLG $26, H_1, T_1
+ VN MOD26, H_1, H_1
+ VAQ T_1, H_2, H_2
+ VESRLG $26, H_2, T_2
+ VN MOD26, H_2, H_2
+ VAQ T_2, H_3, H_3
+ VESRLG $26, H_3, T_3
+ VN MOD26, H_3, H_3
+ VAQ T_3, H_4, H_4
+
+ // h is now < 2(2¹³⁰ - 5)
+ // Pack each lane in h₂₆[0:4] into h₁₂₈[0:1].
+ VESLG $26, H_1, H_1
+ VESLG $26, H_3, H_3
+ VO H_0, H_1, H_0
+ VO H_2, H_3, H_2
+ VESLG $4, H_2, H_2
+ VLEIB $7, $48, H_1
+ VSLB H_1, H_2, H_2
+ VO H_0, H_2, H_0
+ VLEIB $7, $104, H_1
+ VSLB H_1, H_4, H_3
+ VO H_3, H_0, H_0
+ VLEIB $7, $24, H_1
+ VSRLB H_1, H_4, H_1
+
+ // update state
+ VSTEG $1, H_0, 0(R1)
+ VSTEG $0, H_0, 8(R1)
+ VSTEG $1, H_1, 16(R1)
+ RET
+
+b2: // 2 or fewer blocks remaining
+ CMPBLE R3, $16, b1
+
+ // Load the 2 remaining blocks (17-32 bytes remaining).
+ MOVD $-17(R3), R0 // index of final byte to load modulo 16
+ VL (R2), T_0 // load full 16 byte block
+ VLL R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes
+
+ // The Poly1305 algorithm requires that a 1 bit be appended to
+ // each message block. If the final block is less than 16 bytes
+ // long then it is easiest to insert the 1 before the message
+ // block is split into 26-bit limbs. If, on the other hand, the
+ // final message block is 16 bytes long then we append the 1 bit
+ // after expansion as normal.
+ MOVBZ $1, R0
+ MOVD $-16(R3), R3 // index of byte in last block to insert 1 at (could be 16)
+ CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long
+ VLVGB R3, R0, T_1 // insert 1 into the byte at index R3
+
+ // Split both blocks into 26-bit limbs in the appropriate lanes.
+ EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
+
+ // Append a 1 byte to the end of the second to last block.
+ VLEIB $4, $1, M_4
+
+ // Append a 1 byte to the end of the last block only if it is a
+ // full 16 byte block.
+ CMPBNE R3, $16, 2(PC)
+ VLEIB $12, $1, M_4
+
+ // Finally, set up the coefficients for the final multiplication.
+ // We have previously saved r and 5r in the 32-bit even indexes
+ // of the R_[0-4] and R5_[1-4] coefficient registers.
+ //
+ // We want lane 0 to be multiplied by r² so that can be kept the
+ // same. We want lane 1 to be multiplied by r so we need to move
+ // the saved r value into the 32-bit odd index in lane 1 by
+ // rotating the 64-bit lane by 32.
+ VGBM $0x00ff, T_0 // [0, 0xffffffffffffffff] - mask lane 1 only
+ VERIMG $32, R_0, T_0, R_0 // [_, r²₂₆[0], _, r₂₆[0]]
+ VERIMG $32, R_1, T_0, R_1 // [_, r²₂₆[1], _, r₂₆[1]]
+ VERIMG $32, R_2, T_0, R_2 // [_, r²₂₆[2], _, r₂₆[2]]
+ VERIMG $32, R_3, T_0, R_3 // [_, r²₂₆[3], _, r₂₆[3]]
+ VERIMG $32, R_4, T_0, R_4 // [_, r²₂₆[4], _, r₂₆[4]]
+ VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²₂₆[1], _, 5r₂₆[1]]
+ VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²₂₆[2], _, 5r₂₆[2]]
+ VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²₂₆[3], _, 5r₂₆[3]]
+ VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²₂₆[4], _, 5r₂₆[4]]
+
+ MOVD $0, R3
+ BR multiply
+
+skip:
+ CMPBEQ R3, $0, finish
+
+b1: // 1 block remaining
+
+ // Load the final block (1-16 bytes). This will be placed into
+ // lane 0.
+ MOVD $-1(R3), R0
+ VLL R0, (R2), T_0 // pad to 16 bytes with zeros
+
+ // The Poly1305 algorithm requires that a 1 bit be appended to
+ // each message block. If the final block is less than 16 bytes
+ // long then it is easiest to insert the 1 before the message
+ // block is split into 26-bit limbs. If, on the other hand, the
+ // final message block is 16 bytes long then we append the 1 bit
+ // after expansion as normal.
+ MOVBZ $1, R0
+ CMPBEQ R3, $16, 2(PC)
+ VLVGB R3, R0, T_0
+
+ // Set the message block in lane 1 to the value 0 so that it
+ // can be accumulated without affecting the final result.
+ VZERO T_1
+
+ // Split the final message block into 26-bit limbs in lane 0.
+ // Lane 1 will be contain 0.
+ EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
+
+ // Append a 1 byte to the end of the last block only if it is a
+ // full 16 byte block.
+ CMPBNE R3, $16, 2(PC)
+ VLEIB $4, $1, M_4
+
+ // We have previously saved r and 5r in the 32-bit even indexes
+ // of the R_[0-4] and R5_[1-4] coefficient registers.
+ //
+ // We want lane 0 to be multiplied by r so we need to move the
+ // saved r value into the 32-bit odd index in lane 0. We want
+ // lane 1 to be set to the value 1. This makes multiplication
+ // a no-op. We do this by setting lane 1 in every register to 0
+ // and then just setting the 32-bit index 3 in R_0 to 1.
+ VZERO T_0
+ MOVD $0, R0
+ MOVD $0x10111213, R12
+ VLVGP R12, R0, T_1 // [_, 0x10111213, _, 0x00000000]
+ VPERM T_0, R_0, T_1, R_0 // [_, r₂₆[0], _, 0]
+ VPERM T_0, R_1, T_1, R_1 // [_, r₂₆[1], _, 0]
+ VPERM T_0, R_2, T_1, R_2 // [_, r₂₆[2], _, 0]
+ VPERM T_0, R_3, T_1, R_3 // [_, r₂₆[3], _, 0]
+ VPERM T_0, R_4, T_1, R_4 // [_, r₂₆[4], _, 0]
+ VPERM T_0, R5_1, T_1, R5_1 // [_, 5r₂₆[1], _, 0]
+ VPERM T_0, R5_2, T_1, R5_2 // [_, 5r₂₆[2], _, 0]
+ VPERM T_0, R5_3, T_1, R5_3 // [_, 5r₂₆[3], _, 0]
+ VPERM T_0, R5_4, T_1, R5_4 // [_, 5r₂₆[4], _, 0]
+
+ // Set the value of lane 1 to be 1.
+ VLEIF $3, $1, R_0 // [_, r₂₆[0], _, 1]
+
+ MOVD $0, R3
+ BR multiply
diff --git a/vendor/golang.org/x/crypto/internal/subtle/aliasing.go b/vendor/golang.org/x/crypto/internal/subtle/aliasing.go
index f38797b..4fad24f 100644
--- a/vendor/golang.org/x/crypto/internal/subtle/aliasing.go
+++ b/vendor/golang.org/x/crypto/internal/subtle/aliasing.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !appengine
+//go:build !purego
+// +build !purego
// Package subtle implements functions that are often useful in cryptographic
// code but require careful thought to use correctly.
diff --git a/vendor/golang.org/x/crypto/internal/subtle/aliasing_appengine.go b/vendor/golang.org/x/crypto/internal/subtle/aliasing_purego.go
index 0cc4a8a..80ccbed 100644
--- a/vendor/golang.org/x/crypto/internal/subtle/aliasing_appengine.go
+++ b/vendor/golang.org/x/crypto/internal/subtle/aliasing_purego.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build appengine
+//go:build purego
+// +build purego
// Package subtle implements functions that are often useful in cryptographic
// code but require careful thought to use correctly.
diff --git a/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go b/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go
index a98d1bd..a2973e6 100644
--- a/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go
+++ b/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go
@@ -35,8 +35,8 @@ This package is interoperable with NaCl: https://nacl.cr.yp.to/secretbox.html.
package secretbox // import "golang.org/x/crypto/nacl/secretbox"
import (
+ "golang.org/x/crypto/internal/poly1305"
"golang.org/x/crypto/internal/subtle"
- "golang.org/x/crypto/poly1305"
"golang.org/x/crypto/salsa20/salsa"
)
diff --git a/vendor/golang.org/x/crypto/poly1305/poly1305.go b/vendor/golang.org/x/crypto/poly1305/poly1305.go
deleted file mode 100644
index f562fa5..0000000
--- a/vendor/golang.org/x/crypto/poly1305/poly1305.go
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-/*
-Package poly1305 implements Poly1305 one-time message authentication code as
-specified in https://cr.yp.to/mac/poly1305-20050329.pdf.
-
-Poly1305 is a fast, one-time authentication function. It is infeasible for an
-attacker to generate an authenticator for a message without the key. However, a
-key must only be used for a single message. Authenticating two different
-messages with the same key allows an attacker to forge authenticators for other
-messages with the same key.
-
-Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was
-used with a fixed key in order to generate one-time keys from an nonce.
-However, in this package AES isn't used and the one-time key is specified
-directly.
-*/
-package poly1305 // import "golang.org/x/crypto/poly1305"
-
-import "crypto/subtle"
-
-// TagSize is the size, in bytes, of a poly1305 authenticator.
-const TagSize = 16
-
-// Verify returns true if mac is a valid authenticator for m with the given
-// key.
-func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
- var tmp [16]byte
- Sum(&tmp, m, key)
- return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
deleted file mode 100644
index 4dd72fe..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build amd64,!gccgo,!appengine
-
-package poly1305
-
-// This function is implemented in sum_amd64.s
-//go:noescape
-func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
-
-// Sum generates an authenticator for m using a one-time key and puts the
-// 16-byte result into out. Authenticating two different messages with the same
-// key allows an attacker to forge messages at will.
-func Sum(out *[16]byte, m []byte, key *[32]byte) {
- var mPtr *byte
- if len(m) > 0 {
- mPtr = &m[0]
- }
- poly1305(out, mPtr, uint64(len(m)), key)
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.go b/vendor/golang.org/x/crypto/poly1305/sum_arm.go
deleted file mode 100644
index 5dc321c..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_arm.go
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build arm,!gccgo,!appengine,!nacl
-
-package poly1305
-
-// This function is implemented in sum_arm.s
-//go:noescape
-func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
-
-// Sum generates an authenticator for m using a one-time key and puts the
-// 16-byte result into out. Authenticating two different messages with the same
-// key allows an attacker to forge messages at will.
-func Sum(out *[16]byte, m []byte, key *[32]byte) {
- var mPtr *byte
- if len(m) > 0 {
- mPtr = &m[0]
- }
- poly1305_auth_armv6(out, mPtr, uint32(len(m)), key)
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.s b/vendor/golang.org/x/crypto/poly1305/sum_arm.s
deleted file mode 100644
index f70b4ac..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_arm.s
+++ /dev/null
@@ -1,427 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build arm,!gccgo,!appengine,!nacl
-
-#include "textflag.h"
-
-// This code was translated into a form compatible with 5a from the public
-// domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
-
-DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff
-DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03
-DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff
-DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff
-DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff
-GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20
-
-// Warning: the linker may use R11 to synthesize certain instructions. Please
-// take care and verify that no synthetic instructions use it.
-
-TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
- // Needs 16 bytes of stack and 64 bytes of space pointed to by R0. (It
- // might look like it's only 60 bytes of space but the final four bytes
- // will be written by another function.) We need to skip over four
- // bytes of stack because that's saving the value of 'g'.
- ADD $4, R13, R8
- MOVM.IB [R4-R7], (R8)
- MOVM.IA.W (R1), [R2-R5]
- MOVW $·poly1305_init_constants_armv6<>(SB), R7
- MOVW R2, R8
- MOVW R2>>26, R9
- MOVW R3>>20, g
- MOVW R4>>14, R11
- MOVW R5>>8, R12
- ORR R3<<6, R9, R9
- ORR R4<<12, g, g
- ORR R5<<18, R11, R11
- MOVM.IA (R7), [R2-R6]
- AND R8, R2, R2
- AND R9, R3, R3
- AND g, R4, R4
- AND R11, R5, R5
- AND R12, R6, R6
- MOVM.IA.W [R2-R6], (R0)
- EOR R2, R2, R2
- EOR R3, R3, R3
- EOR R4, R4, R4
- EOR R5, R5, R5
- EOR R6, R6, R6
- MOVM.IA.W [R2-R6], (R0)
- MOVM.IA.W (R1), [R2-R5]
- MOVM.IA [R2-R6], (R0)
- ADD $20, R13, R0
- MOVM.DA (R0), [R4-R7]
- RET
-
-#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
- MOVBU (offset+0)(Rsrc), Rtmp; \
- MOVBU Rtmp, (offset+0)(Rdst); \
- MOVBU (offset+1)(Rsrc), Rtmp; \
- MOVBU Rtmp, (offset+1)(Rdst); \
- MOVBU (offset+2)(Rsrc), Rtmp; \
- MOVBU Rtmp, (offset+2)(Rdst); \
- MOVBU (offset+3)(Rsrc), Rtmp; \
- MOVBU Rtmp, (offset+3)(Rdst)
-
-TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
- // Needs 24 bytes of stack for saved registers and then 88 bytes of
- // scratch space after that. We assume that 24 bytes at (R13) have
- // already been used: four bytes for the link register saved in the
- // prelude of poly1305_auth_armv6, four bytes for saving the value of g
- // in that function and 16 bytes of scratch space used around
- // poly1305_finish_ext_armv6_skip1.
- ADD $24, R13, R12
- MOVM.IB [R4-R8, R14], (R12)
- MOVW R0, 88(R13)
- MOVW R1, 92(R13)
- MOVW R2, 96(R13)
- MOVW R1, R14
- MOVW R2, R12
- MOVW 56(R0), R8
- WORD $0xe1180008 // TST R8, R8 not working see issue 5921
- EOR R6, R6, R6
- MOVW.EQ $(1<<24), R6
- MOVW R6, 84(R13)
- ADD $116, R13, g
- MOVM.IA (R0), [R0-R9]
- MOVM.IA [R0-R4], (g)
- CMP $16, R12
- BLO poly1305_blocks_armv6_done
-
-poly1305_blocks_armv6_mainloop:
- WORD $0xe31e0003 // TST R14, #3 not working see issue 5921
- BEQ poly1305_blocks_armv6_mainloop_aligned
- ADD $100, R13, g
- MOVW_UNALIGNED(R14, g, R0, 0)
- MOVW_UNALIGNED(R14, g, R0, 4)
- MOVW_UNALIGNED(R14, g, R0, 8)
- MOVW_UNALIGNED(R14, g, R0, 12)
- MOVM.IA (g), [R0-R3]
- ADD $16, R14
- B poly1305_blocks_armv6_mainloop_loaded
-
-poly1305_blocks_armv6_mainloop_aligned:
- MOVM.IA.W (R14), [R0-R3]
-
-poly1305_blocks_armv6_mainloop_loaded:
- MOVW R0>>26, g
- MOVW R1>>20, R11
- MOVW R2>>14, R12
- MOVW R14, 92(R13)
- MOVW R3>>8, R4
- ORR R1<<6, g, g
- ORR R2<<12, R11, R11
- ORR R3<<18, R12, R12
- BIC $0xfc000000, R0, R0
- BIC $0xfc000000, g, g
- MOVW 84(R13), R3
- BIC $0xfc000000, R11, R11
- BIC $0xfc000000, R12, R12
- ADD R0, R5, R5
- ADD g, R6, R6
- ORR R3, R4, R4
- ADD R11, R7, R7
- ADD $116, R13, R14
- ADD R12, R8, R8
- ADD R4, R9, R9
- MOVM.IA (R14), [R0-R4]
- MULLU R4, R5, (R11, g)
- MULLU R3, R5, (R14, R12)
- MULALU R3, R6, (R11, g)
- MULALU R2, R6, (R14, R12)
- MULALU R2, R7, (R11, g)
- MULALU R1, R7, (R14, R12)
- ADD R4<<2, R4, R4
- ADD R3<<2, R3, R3
- MULALU R1, R8, (R11, g)
- MULALU R0, R8, (R14, R12)
- MULALU R0, R9, (R11, g)
- MULALU R4, R9, (R14, R12)
- MOVW g, 76(R13)
- MOVW R11, 80(R13)
- MOVW R12, 68(R13)
- MOVW R14, 72(R13)
- MULLU R2, R5, (R11, g)
- MULLU R1, R5, (R14, R12)
- MULALU R1, R6, (R11, g)
- MULALU R0, R6, (R14, R12)
- MULALU R0, R7, (R11, g)
- MULALU R4, R7, (R14, R12)
- ADD R2<<2, R2, R2
- ADD R1<<2, R1, R1
- MULALU R4, R8, (R11, g)
- MULALU R3, R8, (R14, R12)
- MULALU R3, R9, (R11, g)
- MULALU R2, R9, (R14, R12)
- MOVW g, 60(R13)
- MOVW R11, 64(R13)
- MOVW R12, 52(R13)
- MOVW R14, 56(R13)
- MULLU R0, R5, (R11, g)
- MULALU R4, R6, (R11, g)
- MULALU R3, R7, (R11, g)
- MULALU R2, R8, (R11, g)
- MULALU R1, R9, (R11, g)
- ADD $52, R13, R0
- MOVM.IA (R0), [R0-R7]
- MOVW g>>26, R12
- MOVW R4>>26, R14
- ORR R11<<6, R12, R12
- ORR R5<<6, R14, R14
- BIC $0xfc000000, g, g
- BIC $0xfc000000, R4, R4
- ADD.S R12, R0, R0
- ADC $0, R1, R1
- ADD.S R14, R6, R6
- ADC $0, R7, R7
- MOVW R0>>26, R12
- MOVW R6>>26, R14
- ORR R1<<6, R12, R12
- ORR R7<<6, R14, R14
- BIC $0xfc000000, R0, R0
- BIC $0xfc000000, R6, R6
- ADD R14<<2, R14, R14
- ADD.S R12, R2, R2
- ADC $0, R3, R3
- ADD R14, g, g
- MOVW R2>>26, R12
- MOVW g>>26, R14
- ORR R3<<6, R12, R12
- BIC $0xfc000000, g, R5
- BIC $0xfc000000, R2, R7
- ADD R12, R4, R4
- ADD R14, R0, R0
- MOVW R4>>26, R12
- BIC $0xfc000000, R4, R8
- ADD R12, R6, R9
- MOVW 96(R13), R12
- MOVW 92(R13), R14
- MOVW R0, R6
- CMP $32, R12
- SUB $16, R12, R12
- MOVW R12, 96(R13)
- BHS poly1305_blocks_armv6_mainloop
-
-poly1305_blocks_armv6_done:
- MOVW 88(R13), R12
- MOVW R5, 20(R12)
- MOVW R6, 24(R12)
- MOVW R7, 28(R12)
- MOVW R8, 32(R12)
- MOVW R9, 36(R12)
- ADD $48, R13, R0
- MOVM.DA (R0), [R4-R8, R14]
- RET
-
-#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
- MOVBU.P 1(Rsrc), Rtmp; \
- MOVBU.P Rtmp, 1(Rdst); \
- MOVBU.P 1(Rsrc), Rtmp; \
- MOVBU.P Rtmp, 1(Rdst)
-
-#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
- MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
- MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
-
-// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]key)
-TEXT ·poly1305_auth_armv6(SB), $196-16
- // The value 196, just above, is the sum of 64 (the size of the context
- // structure) and 132 (the amount of stack needed).
- //
- // At this point, the stack pointer (R13) has been moved down. It
- // points to the saved link register and there's 196 bytes of free
- // space above it.
- //
- // The stack for this function looks like:
- //
- // +---------------------
- // |
- // | 64 bytes of context structure
- // |
- // +---------------------
- // |
- // | 112 bytes for poly1305_blocks_armv6
- // |
- // +---------------------
- // | 16 bytes of final block, constructed at
- // | poly1305_finish_ext_armv6_skip8
- // +---------------------
- // | four bytes of saved 'g'
- // +---------------------
- // | lr, saved by prelude <- R13 points here
- // +---------------------
- MOVW g, 4(R13)
-
- MOVW out+0(FP), R4
- MOVW m+4(FP), R5
- MOVW mlen+8(FP), R6
- MOVW key+12(FP), R7
-
- ADD $136, R13, R0 // 136 = 4 + 4 + 16 + 112
- MOVW R7, R1
-
- // poly1305_init_ext_armv6 will write to the stack from R13+4, but
- // that's ok because none of the other values have been written yet.
- BL poly1305_init_ext_armv6<>(SB)
- BIC.S $15, R6, R2
- BEQ poly1305_auth_armv6_noblocks
- ADD $136, R13, R0
- MOVW R5, R1
- ADD R2, R5, R5
- SUB R2, R6, R6
- BL poly1305_blocks_armv6<>(SB)
-
-poly1305_auth_armv6_noblocks:
- ADD $136, R13, R0
- MOVW R5, R1
- MOVW R6, R2
- MOVW R4, R3
-
- MOVW R0, R5
- MOVW R1, R6
- MOVW R2, R7
- MOVW R3, R8
- AND.S R2, R2, R2
- BEQ poly1305_finish_ext_armv6_noremaining
- EOR R0, R0
- ADD $8, R13, R9 // 8 = offset to 16 byte scratch space
- MOVW R0, (R9)
- MOVW R0, 4(R9)
- MOVW R0, 8(R9)
- MOVW R0, 12(R9)
- WORD $0xe3110003 // TST R1, #3 not working see issue 5921
- BEQ poly1305_finish_ext_armv6_aligned
- WORD $0xe3120008 // TST R2, #8 not working see issue 5921
- BEQ poly1305_finish_ext_armv6_skip8
- MOVWP_UNALIGNED(R1, R9, g)
- MOVWP_UNALIGNED(R1, R9, g)
-
-poly1305_finish_ext_armv6_skip8:
- WORD $0xe3120004 // TST $4, R2 not working see issue 5921
- BEQ poly1305_finish_ext_armv6_skip4
- MOVWP_UNALIGNED(R1, R9, g)
-
-poly1305_finish_ext_armv6_skip4:
- WORD $0xe3120002 // TST $2, R2 not working see issue 5921
- BEQ poly1305_finish_ext_armv6_skip2
- MOVHUP_UNALIGNED(R1, R9, g)
- B poly1305_finish_ext_armv6_skip2
-
-poly1305_finish_ext_armv6_aligned:
- WORD $0xe3120008 // TST R2, #8 not working see issue 5921
- BEQ poly1305_finish_ext_armv6_skip8_aligned
- MOVM.IA.W (R1), [g-R11]
- MOVM.IA.W [g-R11], (R9)
-
-poly1305_finish_ext_armv6_skip8_aligned:
- WORD $0xe3120004 // TST $4, R2 not working see issue 5921
- BEQ poly1305_finish_ext_armv6_skip4_aligned
- MOVW.P 4(R1), g
- MOVW.P g, 4(R9)
-
-poly1305_finish_ext_armv6_skip4_aligned:
- WORD $0xe3120002 // TST $2, R2 not working see issue 5921
- BEQ poly1305_finish_ext_armv6_skip2
- MOVHU.P 2(R1), g
- MOVH.P g, 2(R9)
-
-poly1305_finish_ext_armv6_skip2:
- WORD $0xe3120001 // TST $1, R2 not working see issue 5921
- BEQ poly1305_finish_ext_armv6_skip1
- MOVBU.P 1(R1), g
- MOVBU.P g, 1(R9)
-
-poly1305_finish_ext_armv6_skip1:
- MOVW $1, R11
- MOVBU R11, 0(R9)
- MOVW R11, 56(R5)
- MOVW R5, R0
- ADD $8, R13, R1
- MOVW $16, R2
- BL poly1305_blocks_armv6<>(SB)
-
-poly1305_finish_ext_armv6_noremaining:
- MOVW 20(R5), R0
- MOVW 24(R5), R1
- MOVW 28(R5), R2
- MOVW 32(R5), R3
- MOVW 36(R5), R4
- MOVW R4>>26, R12
- BIC $0xfc000000, R4, R4
- ADD R12<<2, R12, R12
- ADD R12, R0, R0
- MOVW R0>>26, R12
- BIC $0xfc000000, R0, R0
- ADD R12, R1, R1
- MOVW R1>>26, R12
- BIC $0xfc000000, R1, R1
- ADD R12, R2, R2
- MOVW R2>>26, R12
- BIC $0xfc000000, R2, R2
- ADD R12, R3, R3
- MOVW R3>>26, R12
- BIC $0xfc000000, R3, R3
- ADD R12, R4, R4
- ADD $5, R0, R6
- MOVW R6>>26, R12
- BIC $0xfc000000, R6, R6
- ADD R12, R1, R7
- MOVW R7>>26, R12
- BIC $0xfc000000, R7, R7
- ADD R12, R2, g
- MOVW g>>26, R12
- BIC $0xfc000000, g, g
- ADD R12, R3, R11
- MOVW $-(1<<26), R12
- ADD R11>>26, R12, R12
- BIC $0xfc000000, R11, R11
- ADD R12, R4, R9
- MOVW R9>>31, R12
- SUB $1, R12
- AND R12, R6, R6
- AND R12, R7, R7
- AND R12, g, g
- AND R12, R11, R11
- AND R12, R9, R9
- MVN R12, R12
- AND R12, R0, R0
- AND R12, R1, R1
- AND R12, R2, R2
- AND R12, R3, R3
- AND R12, R4, R4
- ORR R6, R0, R0
- ORR R7, R1, R1
- ORR g, R2, R2
- ORR R11, R3, R3
- ORR R9, R4, R4
- ORR R1<<26, R0, R0
- MOVW R1>>6, R1
- ORR R2<<20, R1, R1
- MOVW R2>>12, R2
- ORR R3<<14, R2, R2
- MOVW R3>>18, R3
- ORR R4<<8, R3, R3
- MOVW 40(R5), R6
- MOVW 44(R5), R7
- MOVW 48(R5), g
- MOVW 52(R5), R11
- ADD.S R6, R0, R0
- ADC.S R7, R1, R1
- ADC.S g, R2, R2
- ADC.S R11, R3, R3
- MOVM.IA [R0-R3], (R8)
- MOVW R5, R12
- EOR R0, R0, R0
- EOR R1, R1, R1
- EOR R2, R2, R2
- EOR R3, R3, R3
- EOR R4, R4, R4
- EOR R5, R5, R5
- EOR R6, R6, R6
- EOR R7, R7, R7
- MOVM.IA.W [R0-R7], (R12)
- MOVM.IA [R0-R7], (R12)
- MOVW 4(R13), g
- RET
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go b/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
deleted file mode 100644
index 751eec5..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,!go1.11 !arm,!amd64,!s390x gccgo appengine nacl
-
-package poly1305
-
-// Sum generates an authenticator for msg using a one-time key and puts the
-// 16-byte result into out. Authenticating two different messages with the same
-// key allows an attacker to forge messages at will.
-func Sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
- sumGeneric(out, msg, key)
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ref.go b/vendor/golang.org/x/crypto/poly1305/sum_ref.go
deleted file mode 100644
index c4d59bd..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_ref.go
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package poly1305
-
-import "encoding/binary"
-
-// sumGeneric generates an authenticator for msg using a one-time key and
-// puts the 16-byte result into out. This is the generic implementation of
-// Sum and should be called if no assembly implementation is available.
-func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
- var (
- h0, h1, h2, h3, h4 uint32 // the hash accumulators
- r0, r1, r2, r3, r4 uint64 // the r part of the key
- )
-
- r0 = uint64(binary.LittleEndian.Uint32(key[0:]) & 0x3ffffff)
- r1 = uint64((binary.LittleEndian.Uint32(key[3:]) >> 2) & 0x3ffff03)
- r2 = uint64((binary.LittleEndian.Uint32(key[6:]) >> 4) & 0x3ffc0ff)
- r3 = uint64((binary.LittleEndian.Uint32(key[9:]) >> 6) & 0x3f03fff)
- r4 = uint64((binary.LittleEndian.Uint32(key[12:]) >> 8) & 0x00fffff)
-
- R1, R2, R3, R4 := r1*5, r2*5, r3*5, r4*5
-
- for len(msg) >= TagSize {
- // h += msg
- h0 += binary.LittleEndian.Uint32(msg[0:]) & 0x3ffffff
- h1 += (binary.LittleEndian.Uint32(msg[3:]) >> 2) & 0x3ffffff
- h2 += (binary.LittleEndian.Uint32(msg[6:]) >> 4) & 0x3ffffff
- h3 += (binary.LittleEndian.Uint32(msg[9:]) >> 6) & 0x3ffffff
- h4 += (binary.LittleEndian.Uint32(msg[12:]) >> 8) | (1 << 24)
-
- // h *= r
- d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1)
- d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2)
- d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3)
- d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4)
- d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0)
-
- // h %= p
- h0 = uint32(d0) & 0x3ffffff
- h1 = uint32(d1) & 0x3ffffff
- h2 = uint32(d2) & 0x3ffffff
- h3 = uint32(d3) & 0x3ffffff
- h4 = uint32(d4) & 0x3ffffff
-
- h0 += uint32(d4>>26) * 5
- h1 += h0 >> 26
- h0 = h0 & 0x3ffffff
-
- msg = msg[TagSize:]
- }
-
- if len(msg) > 0 {
- var block [TagSize]byte
- off := copy(block[:], msg)
- block[off] = 0x01
-
- // h += msg
- h0 += binary.LittleEndian.Uint32(block[0:]) & 0x3ffffff
- h1 += (binary.LittleEndian.Uint32(block[3:]) >> 2) & 0x3ffffff
- h2 += (binary.LittleEndian.Uint32(block[6:]) >> 4) & 0x3ffffff
- h3 += (binary.LittleEndian.Uint32(block[9:]) >> 6) & 0x3ffffff
- h4 += (binary.LittleEndian.Uint32(block[12:]) >> 8)
-
- // h *= r
- d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1)
- d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2)
- d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3)
- d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4)
- d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0)
-
- // h %= p
- h0 = uint32(d0) & 0x3ffffff
- h1 = uint32(d1) & 0x3ffffff
- h2 = uint32(d2) & 0x3ffffff
- h3 = uint32(d3) & 0x3ffffff
- h4 = uint32(d4) & 0x3ffffff
-
- h0 += uint32(d4>>26) * 5
- h1 += h0 >> 26
- h0 = h0 & 0x3ffffff
- }
-
- // h %= p reduction
- h2 += h1 >> 26
- h1 &= 0x3ffffff
- h3 += h2 >> 26
- h2 &= 0x3ffffff
- h4 += h3 >> 26
- h3 &= 0x3ffffff
- h0 += 5 * (h4 >> 26)
- h4 &= 0x3ffffff
- h1 += h0 >> 26
- h0 &= 0x3ffffff
-
- // h - p
- t0 := h0 + 5
- t1 := h1 + (t0 >> 26)
- t2 := h2 + (t1 >> 26)
- t3 := h3 + (t2 >> 26)
- t4 := h4 + (t3 >> 26) - (1 << 26)
- t0 &= 0x3ffffff
- t1 &= 0x3ffffff
- t2 &= 0x3ffffff
- t3 &= 0x3ffffff
-
- // select h if h < p else h - p
- t_mask := (t4 >> 31) - 1
- h_mask := ^t_mask
- h0 = (h0 & h_mask) | (t0 & t_mask)
- h1 = (h1 & h_mask) | (t1 & t_mask)
- h2 = (h2 & h_mask) | (t2 & t_mask)
- h3 = (h3 & h_mask) | (t3 & t_mask)
- h4 = (h4 & h_mask) | (t4 & t_mask)
-
- // h %= 2^128
- h0 |= h1 << 26
- h1 = ((h1 >> 6) | (h2 << 20))
- h2 = ((h2 >> 12) | (h3 << 14))
- h3 = ((h3 >> 18) | (h4 << 8))
-
- // s: the s part of the key
- // tag = (h + s) % (2^128)
- t := uint64(h0) + uint64(binary.LittleEndian.Uint32(key[16:]))
- h0 = uint32(t)
- t = uint64(h1) + uint64(binary.LittleEndian.Uint32(key[20:])) + (t >> 32)
- h1 = uint32(t)
- t = uint64(h2) + uint64(binary.LittleEndian.Uint32(key[24:])) + (t >> 32)
- h2 = uint32(t)
- t = uint64(h3) + uint64(binary.LittleEndian.Uint32(key[28:])) + (t >> 32)
- h3 = uint32(t)
-
- binary.LittleEndian.PutUint32(out[0:], h0)
- binary.LittleEndian.PutUint32(out[4:], h1)
- binary.LittleEndian.PutUint32(out[8:], h2)
- binary.LittleEndian.PutUint32(out[12:], h3)
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
deleted file mode 100644
index 7a266ce..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,go1.11,!gccgo,!appengine
-
-package poly1305
-
-// hasVectorFacility reports whether the machine supports
-// the vector facility (vx).
-func hasVectorFacility() bool
-
-// hasVMSLFacility reports whether the machine supports
-// Vector Multiply Sum Logical (VMSL).
-func hasVMSLFacility() bool
-
-var hasVX = hasVectorFacility()
-var hasVMSL = hasVMSLFacility()
-
-// poly1305vx is an assembly implementation of Poly1305 that uses vector
-// instructions. It must only be called if the vector facility (vx) is
-// available.
-//go:noescape
-func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
-
-// poly1305vmsl is an assembly implementation of Poly1305 that uses vector
-// instructions, including VMSL. It must only be called if the vector facility (vx) is
-// available and if VMSL is supported.
-//go:noescape
-func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
-
-// Sum generates an authenticator for m using a one-time key and puts the
-// 16-byte result into out. Authenticating two different messages with the same
-// key allows an attacker to forge messages at will.
-func Sum(out *[16]byte, m []byte, key *[32]byte) {
- if hasVX {
- var mPtr *byte
- if len(m) > 0 {
- mPtr = &m[0]
- }
- if hasVMSL && len(m) > 256 {
- poly1305vmsl(out, mPtr, uint64(len(m)), key)
- } else {
- poly1305vx(out, mPtr, uint64(len(m)), key)
- }
- } else {
- sumGeneric(out, m, key)
- }
-}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
deleted file mode 100644
index 356c07a..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
+++ /dev/null
@@ -1,400 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,go1.11,!gccgo,!appengine
-
-#include "textflag.h"
-
-// Implementation of Poly1305 using the vector facility (vx).
-
-// constants
-#define MOD26 V0
-#define EX0 V1
-#define EX1 V2
-#define EX2 V3
-
-// temporaries
-#define T_0 V4
-#define T_1 V5
-#define T_2 V6
-#define T_3 V7
-#define T_4 V8
-
-// key (r)
-#define R_0 V9
-#define R_1 V10
-#define R_2 V11
-#define R_3 V12
-#define R_4 V13
-#define R5_1 V14
-#define R5_2 V15
-#define R5_3 V16
-#define R5_4 V17
-#define RSAVE_0 R5
-#define RSAVE_1 R6
-#define RSAVE_2 R7
-#define RSAVE_3 R8
-#define RSAVE_4 R9
-#define R5SAVE_1 V28
-#define R5SAVE_2 V29
-#define R5SAVE_3 V30
-#define R5SAVE_4 V31
-
-// message block
-#define F_0 V18
-#define F_1 V19
-#define F_2 V20
-#define F_3 V21
-#define F_4 V22
-
-// accumulator
-#define H_0 V23
-#define H_1 V24
-#define H_2 V25
-#define H_3 V26
-#define H_4 V27
-
-GLOBL ·keyMask<>(SB), RODATA, $16
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
-
-GLOBL ·bswapMask<>(SB), RODATA, $16
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
-
-GLOBL ·constants<>(SB), RODATA, $64
-// MOD26
-DATA ·constants<>+0(SB)/8, $0x3ffffff
-DATA ·constants<>+8(SB)/8, $0x3ffffff
-// EX0
-DATA ·constants<>+16(SB)/8, $0x0006050403020100
-DATA ·constants<>+24(SB)/8, $0x1016151413121110
-// EX1
-DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
-DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
-// EX2
-DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
-DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
-
-// h = (f*g) % (2**130-5) [partial reduction]
-#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
- VMLOF f0, g0, h0 \
- VMLOF f0, g1, h1 \
- VMLOF f0, g2, h2 \
- VMLOF f0, g3, h3 \
- VMLOF f0, g4, h4 \
- VMLOF f1, g54, T_0 \
- VMLOF f1, g0, T_1 \
- VMLOF f1, g1, T_2 \
- VMLOF f1, g2, T_3 \
- VMLOF f1, g3, T_4 \
- VMALOF f2, g53, h0, h0 \
- VMALOF f2, g54, h1, h1 \
- VMALOF f2, g0, h2, h2 \
- VMALOF f2, g1, h3, h3 \
- VMALOF f2, g2, h4, h4 \
- VMALOF f3, g52, T_0, T_0 \
- VMALOF f3, g53, T_1, T_1 \
- VMALOF f3, g54, T_2, T_2 \
- VMALOF f3, g0, T_3, T_3 \
- VMALOF f3, g1, T_4, T_4 \
- VMALOF f4, g51, h0, h0 \
- VMALOF f4, g52, h1, h1 \
- VMALOF f4, g53, h2, h2 \
- VMALOF f4, g54, h3, h3 \
- VMALOF f4, g0, h4, h4 \
- VAG T_0, h0, h0 \
- VAG T_1, h1, h1 \
- VAG T_2, h2, h2 \
- VAG T_3, h3, h3 \
- VAG T_4, h4, h4
-
-// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
-#define REDUCE(h0, h1, h2, h3, h4) \
- VESRLG $26, h0, T_0 \
- VESRLG $26, h3, T_1 \
- VN MOD26, h0, h0 \
- VN MOD26, h3, h3 \
- VAG T_0, h1, h1 \
- VAG T_1, h4, h4 \
- VESRLG $26, h1, T_2 \
- VESRLG $26, h4, T_3 \
- VN MOD26, h1, h1 \
- VN MOD26, h4, h4 \
- VESLG $2, T_3, T_4 \
- VAG T_3, T_4, T_4 \
- VAG T_2, h2, h2 \
- VAG T_4, h0, h0 \
- VESRLG $26, h2, T_0 \
- VESRLG $26, h0, T_1 \
- VN MOD26, h2, h2 \
- VN MOD26, h0, h0 \
- VAG T_0, h3, h3 \
- VAG T_1, h1, h1 \
- VESRLG $26, h3, T_2 \
- VN MOD26, h3, h3 \
- VAG T_2, h4, h4
-
-// expand in0 into d[0] and in1 into d[1]
-#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
- VGBM $0x0707, d1 \ // d1=tmp
- VPERM in0, in1, EX2, d4 \
- VPERM in0, in1, EX0, d0 \
- VPERM in0, in1, EX1, d2 \
- VN d1, d4, d4 \
- VESRLG $26, d0, d1 \
- VESRLG $30, d2, d3 \
- VESRLG $4, d2, d2 \
- VN MOD26, d0, d0 \
- VN MOD26, d1, d1 \
- VN MOD26, d2, d2 \
- VN MOD26, d3, d3
-
-// pack h4:h0 into h1:h0 (no carry)
-#define PACK(h0, h1, h2, h3, h4) \
- VESLG $26, h1, h1 \
- VESLG $26, h3, h3 \
- VO h0, h1, h0 \
- VO h2, h3, h2 \
- VESLG $4, h2, h2 \
- VLEIB $7, $48, h1 \
- VSLB h1, h2, h2 \
- VO h0, h2, h0 \
- VLEIB $7, $104, h1 \
- VSLB h1, h4, h3 \
- VO h3, h0, h0 \
- VLEIB $7, $24, h1 \
- VSRLB h1, h4, h1
-
-// if h > 2**130-5 then h -= 2**130-5
-#define MOD(h0, h1, t0, t1, t2) \
- VZERO t0 \
- VLEIG $1, $5, t0 \
- VACCQ h0, t0, t1 \
- VAQ h0, t0, t0 \
- VONE t2 \
- VLEIG $1, $-4, t2 \
- VAQ t2, t1, t1 \
- VACCQ h1, t1, t1 \
- VONE t2 \
- VAQ t2, t1, t1 \
- VN h0, t1, t2 \
- VNC t0, t1, t1 \
- VO t1, t2, h0
-
-// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]key)
-TEXT ·poly1305vx(SB), $0-32
- // This code processes up to 2 blocks (32 bytes) per iteration
- // using the algorithm described in:
- // NEON crypto, Daniel J. Bernstein & Peter Schwabe
- // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
- LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
-
- // load MOD26, EX0, EX1 and EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), MOD26, EX2
-
- // setup r
- VL (R4), T_0
- MOVD $·keyMask<>(SB), R6
- VL (R6), T_1
- VN T_0, T_1, T_0
- EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
-
- // setup r*5
- VLEIG $0, $5, T_0
- VLEIG $1, $5, T_0
-
- // store r (for final block)
- VMLOF T_0, R_1, R5SAVE_1
- VMLOF T_0, R_2, R5SAVE_2
- VMLOF T_0, R_3, R5SAVE_3
- VMLOF T_0, R_4, R5SAVE_4
- VLGVG $0, R_0, RSAVE_0
- VLGVG $0, R_1, RSAVE_1
- VLGVG $0, R_2, RSAVE_2
- VLGVG $0, R_3, RSAVE_3
- VLGVG $0, R_4, RSAVE_4
-
- // skip r**2 calculation
- CMPBLE R3, $16, skip
-
- // calculate r**2
- MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
- REDUCE(H_0, H_1, H_2, H_3, H_4)
- VLEIG $0, $5, T_0
- VLEIG $1, $5, T_0
- VMLOF T_0, H_1, R5_1
- VMLOF T_0, H_2, R5_2
- VMLOF T_0, H_3, R5_3
- VMLOF T_0, H_4, R5_4
- VLR H_0, R_0
- VLR H_1, R_1
- VLR H_2, R_2
- VLR H_3, R_3
- VLR H_4, R_4
-
- // initialize h
- VZERO H_0
- VZERO H_1
- VZERO H_2
- VZERO H_3
- VZERO H_4
-
-loop:
- CMPBLE R3, $32, b2
- VLM (R2), T_0, T_1
- SUB $32, R3
- MOVD $32(R2), R2
- EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
- VLEIB $4, $1, F_4
- VLEIB $12, $1, F_4
-
-multiply:
- VAG H_0, F_0, F_0
- VAG H_1, F_1, F_1
- VAG H_2, F_2, F_2
- VAG H_3, F_3, F_3
- VAG H_4, F_4, F_4
- MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
- REDUCE(H_0, H_1, H_2, H_3, H_4)
- CMPBNE R3, $0, loop
-
-finish:
- // sum vectors
- VZERO T_0
- VSUMQG H_0, T_0, H_0
- VSUMQG H_1, T_0, H_1
- VSUMQG H_2, T_0, H_2
- VSUMQG H_3, T_0, H_3
- VSUMQG H_4, T_0, H_4
-
- // h may be >= 2*(2**130-5) so we need to reduce it again
- REDUCE(H_0, H_1, H_2, H_3, H_4)
-
- // carry h1->h4
- VESRLG $26, H_1, T_1
- VN MOD26, H_1, H_1
- VAQ T_1, H_2, H_2
- VESRLG $26, H_2, T_2
- VN MOD26, H_2, H_2
- VAQ T_2, H_3, H_3
- VESRLG $26, H_3, T_3
- VN MOD26, H_3, H_3
- VAQ T_3, H_4, H_4
-
- // h is now < 2*(2**130-5)
- // pack h into h1 (hi) and h0 (lo)
- PACK(H_0, H_1, H_2, H_3, H_4)
-
- // if h > 2**130-5 then h -= 2**130-5
- MOD(H_0, H_1, T_0, T_1, T_2)
-
- // h += s
- MOVD $·bswapMask<>(SB), R5
- VL (R5), T_1
- VL 16(R4), T_0
- VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
- VAQ T_0, H_0, H_0
- VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little)
- VST H_0, (R1)
-
- RET
-
-b2:
- CMPBLE R3, $16, b1
-
- // 2 blocks remaining
- SUB $17, R3
- VL (R2), T_0
- VLL R3, 16(R2), T_1
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, T_1
- EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
- CMPBNE R3, $16, 2(PC)
- VLEIB $12, $1, F_4
- VLEIB $4, $1, F_4
-
- // setup [r²,r]
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, RSAVE_3, R_3
- VLVGG $1, RSAVE_4, R_4
- VPDI $0, R5_1, R5SAVE_1, R5_1
- VPDI $0, R5_2, R5SAVE_2, R5_2
- VPDI $0, R5_3, R5SAVE_3, R5_3
- VPDI $0, R5_4, R5SAVE_4, R5_4
-
- MOVD $0, R3
- BR multiply
-
-skip:
- VZERO H_0
- VZERO H_1
- VZERO H_2
- VZERO H_3
- VZERO H_4
-
- CMPBEQ R3, $0, finish
-
-b1:
- // 1 block remaining
- SUB $1, R3
- VLL R3, (R2), T_0
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, T_0
- VZERO T_1
- EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
- CMPBNE R3, $16, 2(PC)
- VLEIB $4, $1, F_4
- VLEIG $1, $1, R_0
- VZERO R_1
- VZERO R_2
- VZERO R_3
- VZERO R_4
- VZERO R5_1
- VZERO R5_2
- VZERO R5_3
- VZERO R5_4
-
- // setup [r, 1]
- VLVGG $0, RSAVE_0, R_0
- VLVGG $0, RSAVE_1, R_1
- VLVGG $0, RSAVE_2, R_2
- VLVGG $0, RSAVE_3, R_3
- VLVGG $0, RSAVE_4, R_4
- VPDI $0, R5SAVE_1, R5_1, R5_1
- VPDI $0, R5SAVE_2, R5_2, R5_2
- VPDI $0, R5SAVE_3, R5_3, R5_3
- VPDI $0, R5SAVE_4, R5_4, R5_4
-
- MOVD $0, R3
- BR multiply
-
-TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
- MOVD $x-24(SP), R1
- XC $24, 0(R1), 0(R1) // clear the storage
- MOVD $2, R0 // R0 is the number of double words stored -1
- WORD $0xB2B01000 // STFLE 0(R1)
- XOR R0, R0 // reset the value of R0
- MOVBZ z-8(SP), R1
- AND $0x40, R1
- BEQ novector
-
-vectorinstalled:
- // check if the vector instruction has been enabled
- VLEIB $0, $0xF, V16
- VLGVB $0, V16, R1
- CMPBNE R1, $0xF, novector
- MOVB $1, ret+0(FP) // have vx
- RET
-
-novector:
- MOVB $0, ret+0(FP) // no vx
- RET
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s b/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
deleted file mode 100644
index e548020..0000000
--- a/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
+++ /dev/null
@@ -1,931 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,go1.11,!gccgo,!appengine
-
-#include "textflag.h"
-
-// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
-
-// constants
-#define EX0 V1
-#define EX1 V2
-#define EX2 V3
-
-// temporaries
-#define T_0 V4
-#define T_1 V5
-#define T_2 V6
-#define T_3 V7
-#define T_4 V8
-#define T_5 V9
-#define T_6 V10
-#define T_7 V11
-#define T_8 V12
-#define T_9 V13
-#define T_10 V14
-
-// r**2 & r**4
-#define R_0 V15
-#define R_1 V16
-#define R_2 V17
-#define R5_1 V18
-#define R5_2 V19
-// key (r)
-#define RSAVE_0 R7
-#define RSAVE_1 R8
-#define RSAVE_2 R9
-#define R5SAVE_1 R10
-#define R5SAVE_2 R11
-
-// message block
-#define M0 V20
-#define M1 V21
-#define M2 V22
-#define M3 V23
-#define M4 V24
-#define M5 V25
-
-// accumulator
-#define H0_0 V26
-#define H1_0 V27
-#define H2_0 V28
-#define H0_1 V29
-#define H1_1 V30
-#define H2_1 V31
-
-GLOBL ·keyMask<>(SB), RODATA, $16
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
-
-GLOBL ·bswapMask<>(SB), RODATA, $16
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
-
-GLOBL ·constants<>(SB), RODATA, $48
-// EX0
-DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+8(SB)/8, $0x0000050403020100
-// EX1
-DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+24(SB)/8, $0x00000a0908070605
-// EX2
-DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b
-
-GLOBL ·c<>(SB), RODATA, $48
-// EX0
-DATA ·c<>+0(SB)/8, $0x0000050403020100
-DATA ·c<>+8(SB)/8, $0x0000151413121110
-// EX1
-DATA ·c<>+16(SB)/8, $0x00000a0908070605
-DATA ·c<>+24(SB)/8, $0x00001a1918171615
-// EX2
-DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
-DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b
-
-GLOBL ·reduce<>(SB), RODATA, $32
-// 44 bit
-DATA ·reduce<>+0(SB)/8, $0x0
-DATA ·reduce<>+8(SB)/8, $0xfffffffffff
-// 42 bit
-DATA ·reduce<>+16(SB)/8, $0x0
-DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
-
-// h = (f*g) % (2**130-5) [partial reduction]
-// uses T_0...T_9 temporary registers
-// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
-// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
-#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
- \ // Eliminate the dependency for the last 2 VMSLs
- VMSLG m02_0, r_2, m4_2, m4_2 \
- VMSLG m13_0, r_2, m5_2, m5_2 \ // 8 VMSLs pipelined
- VMSLG m02_0, r_0, m4_0, m4_0 \
- VMSLG m02_1, r5_2, V0, T_0 \
- VMSLG m02_0, r_1, m4_1, m4_1 \
- VMSLG m02_1, r_0, V0, T_1 \
- VMSLG m02_1, r_1, V0, T_2 \
- VMSLG m02_2, r5_1, V0, T_3 \
- VMSLG m02_2, r5_2, V0, T_4 \
- VMSLG m13_0, r_0, m5_0, m5_0 \
- VMSLG m13_1, r5_2, V0, T_5 \
- VMSLG m13_0, r_1, m5_1, m5_1 \
- VMSLG m13_1, r_0, V0, T_6 \
- VMSLG m13_1, r_1, V0, T_7 \
- VMSLG m13_2, r5_1, V0, T_8 \
- VMSLG m13_2, r5_2, V0, T_9 \
- VMSLG m02_2, r_0, m4_2, m4_2 \
- VMSLG m13_2, r_0, m5_2, m5_2 \
- VAQ m4_0, T_0, m02_0 \
- VAQ m4_1, T_1, m02_1 \
- VAQ m5_0, T_5, m13_0 \
- VAQ m5_1, T_6, m13_1 \
- VAQ m02_0, T_3, m02_0 \
- VAQ m02_1, T_4, m02_1 \
- VAQ m13_0, T_8, m13_0 \
- VAQ m13_1, T_9, m13_1 \
- VAQ m4_2, T_2, m02_2 \
- VAQ m5_2, T_7, m13_2 \
-
-// SQUARE uses three limbs of r and r_2*5 to output square of r
-// uses T_1, T_5 and T_7 temporary registers
-// input: r_0, r_1, r_2, r5_2
-// temp: TEMP0, TEMP1, TEMP2
-// output: p0, p1, p2
-#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
- VMSLG r_0, r_0, p0, p0 \
- VMSLG r_1, r5_2, V0, TEMP0 \
- VMSLG r_2, r5_2, p1, p1 \
- VMSLG r_0, r_1, V0, TEMP1 \
- VMSLG r_1, r_1, p2, p2 \
- VMSLG r_0, r_2, V0, TEMP2 \
- VAQ TEMP0, p0, p0 \
- VAQ TEMP1, p1, p1 \
- VAQ TEMP2, p2, p2 \
- VAQ TEMP0, p0, p0 \
- VAQ TEMP1, p1, p1 \
- VAQ TEMP2, p2, p2 \
-
-// carry h0->h1->h2->h0 || h3->h4->h5->h3
-// uses T_2, T_4, T_5, T_7, T_8, T_9
-// t6, t7, t8, t9, t10, t11
-// input: h0, h1, h2, h3, h4, h5
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
-// output: h0, h1, h2, h3, h4, h5
-#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
- VLM (R12), t6, t7 \ // 44 and 42 bit clear mask
- VLEIB $7, $0x28, t10 \ // 5 byte shift mask
- VREPIB $4, t8 \ // 4 bit shift mask
- VREPIB $2, t11 \ // 2 bit shift mask
- VSRLB t10, h0, t0 \ // h0 byte shift
- VSRLB t10, h1, t1 \ // h1 byte shift
- VSRLB t10, h2, t2 \ // h2 byte shift
- VSRLB t10, h3, t3 \ // h3 byte shift
- VSRLB t10, h4, t4 \ // h4 byte shift
- VSRLB t10, h5, t5 \ // h5 byte shift
- VSRL t8, t0, t0 \ // h0 bit shift
- VSRL t8, t1, t1 \ // h2 bit shift
- VSRL t11, t2, t2 \ // h2 bit shift
- VSRL t8, t3, t3 \ // h3 bit shift
- VSRL t8, t4, t4 \ // h4 bit shift
- VESLG $2, t2, t9 \ // h2 carry x5
- VSRL t11, t5, t5 \ // h5 bit shift
- VN t6, h0, h0 \ // h0 clear carry
- VAQ t2, t9, t2 \ // h2 carry x5
- VESLG $2, t5, t9 \ // h5 carry x5
- VN t6, h1, h1 \ // h1 clear carry
- VN t7, h2, h2 \ // h2 clear carry
- VAQ t5, t9, t5 \ // h5 carry x5
- VN t6, h3, h3 \ // h3 clear carry
- VN t6, h4, h4 \ // h4 clear carry
- VN t7, h5, h5 \ // h5 clear carry
- VAQ t0, h1, h1 \ // h0->h1
- VAQ t3, h4, h4 \ // h3->h4
- VAQ t1, h2, h2 \ // h1->h2
- VAQ t4, h5, h5 \ // h4->h5
- VAQ t2, h0, h0 \ // h2->h0
- VAQ t5, h3, h3 \ // h5->h3
- VREPG $1, t6, t6 \ // 44 and 42 bit masks across both halves
- VREPG $1, t7, t7 \
- VSLDB $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
- VSLDB $8, h1, h1, h1 \
- VSLDB $8, h2, h2, h2 \
- VO h0, h3, h3 \
- VO h1, h4, h4 \
- VO h2, h5, h5 \
- VESRLG $44, h3, t0 \ // 44 bit shift right
- VESRLG $44, h4, t1 \
- VESRLG $42, h5, t2 \
- VN t6, h3, h3 \ // clear carry bits
- VN t6, h4, h4 \
- VN t7, h5, h5 \
- VESLG $2, t2, t9 \ // multiply carry by 5
- VAQ t9, t2, t2 \
- VAQ t0, h4, h4 \
- VAQ t1, h5, h5 \
- VAQ t2, h3, h3 \
-
-// carry h0->h1->h2->h0
-// input: h0, h1, h2
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
-// output: h0, h1, h2
-#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
- VLEIB $7, $0x28, t3 \ // 5 byte shift mask
- VREPIB $4, t4 \ // 4 bit shift mask
- VREPIB $2, t7 \ // 2 bit shift mask
- VGBM $0x003F, t5 \ // mask to clear carry bits
- VSRLB t3, h0, t0 \
- VSRLB t3, h1, t1 \
- VSRLB t3, h2, t2 \
- VESRLG $4, t5, t5 \ // 44 bit clear mask
- VSRL t4, t0, t0 \
- VSRL t4, t1, t1 \
- VSRL t7, t2, t2 \
- VESRLG $2, t5, t6 \ // 42 bit clear mask
- VESLG $2, t2, t8 \
- VAQ t8, t2, t2 \
- VN t5, h0, h0 \
- VN t5, h1, h1 \
- VN t6, h2, h2 \
- VAQ t0, h1, h1 \
- VAQ t1, h2, h2 \
- VAQ t2, h0, h0 \
- VSRLB t3, h0, t0 \
- VSRLB t3, h1, t1 \
- VSRLB t3, h2, t2 \
- VSRL t4, t0, t0 \
- VSRL t4, t1, t1 \
- VSRL t7, t2, t2 \
- VN t5, h0, h0 \
- VN t5, h1, h1 \
- VESLG $2, t2, t8 \
- VN t6, h2, h2 \
- VAQ t0, h1, h1 \
- VAQ t8, t2, t2 \
- VAQ t1, h2, h2 \
- VAQ t2, h0, h0 \
-
-// expands two message blocks into the lower halfs of the d registers
-// moves the contents of the d registers into upper halfs
-// input: in1, in2, d0, d1, d2, d3, d4, d5
-// temp: TEMP0, TEMP1, TEMP2, TEMP3
-// output: d0, d1, d2, d3, d4, d5
-#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
- VGBM $0xff3f, TEMP0 \
- VGBM $0xff1f, TEMP1 \
- VESLG $4, d1, TEMP2 \
- VESLG $4, d4, TEMP3 \
- VESRLG $4, TEMP0, TEMP0 \
- VPERM in1, d0, EX0, d0 \
- VPERM in2, d3, EX0, d3 \
- VPERM in1, d2, EX2, d2 \
- VPERM in2, d5, EX2, d5 \
- VPERM in1, TEMP2, EX1, d1 \
- VPERM in2, TEMP3, EX1, d4 \
- VN TEMP0, d0, d0 \
- VN TEMP0, d3, d3 \
- VESRLG $4, d1, d1 \
- VESRLG $4, d4, d4 \
- VN TEMP1, d2, d2 \
- VN TEMP1, d5, d5 \
- VN TEMP0, d1, d1 \
- VN TEMP0, d4, d4 \
-
-// expands one message block into the lower halfs of the d registers
-// moves the contents of the d registers into upper halfs
-// input: in, d0, d1, d2
-// temp: TEMP0, TEMP1, TEMP2
-// output: d0, d1, d2
-#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
- VGBM $0xff3f, TEMP0 \
- VESLG $4, d1, TEMP2 \
- VGBM $0xff1f, TEMP1 \
- VPERM in, d0, EX0, d0 \
- VESRLG $4, TEMP0, TEMP0 \
- VPERM in, d2, EX2, d2 \
- VPERM in, TEMP2, EX1, d1 \
- VN TEMP0, d0, d0 \
- VN TEMP1, d2, d2 \
- VESRLG $4, d1, d1 \
- VN TEMP0, d1, d1 \
-
-// pack h2:h0 into h1:h0 (no carry)
-// input: h0, h1, h2
-// output: h0, h1, h2
-#define PACK(h0, h1, h2) \
- VMRLG h1, h2, h2 \ // copy h1 to upper half h2
- VESLG $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
- VO h0, h1, h0 \ // combine h0 with 20 bits from limb 1
- VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
- VLEIG $1, $0, h1 \ // clear h2 stuff from lower half of h1
- VO h0, h1, h0 \ // h0 now has 88 bits (limb 0 and 1)
- VLEIG $0, $0, h2 \ // clear upper half of h2
- VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
- VLEIB $7, $88, h1 \ // for byte shift (11 bytes)
- VSLB h1, h2, h2 \ // shift h2 11 bytes to the left
- VO h0, h2, h0 \ // combine h0 with 20 bits from limb 1
- VLEIG $0, $0, h1 \ // clear upper half of h1
-
-// if h > 2**130-5 then h -= 2**130-5
-// input: h0, h1
-// temp: t0, t1, t2
-// output: h0
-#define MOD(h0, h1, t0, t1, t2) \
- VZERO t0 \
- VLEIG $1, $5, t0 \
- VACCQ h0, t0, t1 \
- VAQ h0, t0, t0 \
- VONE t2 \
- VLEIG $1, $-4, t2 \
- VAQ t2, t1, t1 \
- VACCQ h1, t1, t1 \
- VONE t2 \
- VAQ t2, t1, t1 \
- VN h0, t1, t2 \
- VNC t0, t1, t1 \
- VO t1, t2, h0 \
-
-// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]key)
-TEXT ·poly1305vmsl(SB), $0-32
- // This code processes 6 + up to 4 blocks (32 bytes) per iteration
- // using the algorithm described in:
- // NEON crypto, Daniel J. Bernstein & Peter Schwabe
- // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
- // And as moddified for VMSL as described in
- // Accelerating Poly1305 Cryptographic Message Authentication on the z14
- // O'Farrell et al, CASCON 2017, p48-55
- // https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
-
- LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
- VZERO V0 // c
-
- // load EX0, EX1 and EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), EX0, EX2 // c
-
- // setup r
- VL (R4), T_0
- MOVD $·keyMask<>(SB), R6
- VL (R6), T_1
- VN T_0, T_1, T_0
- VZERO T_2 // limbs for r
- VZERO T_3
- VZERO T_4
- EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)
-
- // T_2, T_3, T_4: [0, r]
-
- // setup r*20
- VLEIG $0, $0, T_0
- VLEIG $1, $20, T_0 // T_0: [0, 20]
- VZERO T_5
- VZERO T_6
- VMSLG T_0, T_3, T_5, T_5
- VMSLG T_0, T_4, T_6, T_6
-
- // store r for final block in GR
- VLGVG $1, T_2, RSAVE_0 // c
- VLGVG $1, T_3, RSAVE_1 // c
- VLGVG $1, T_4, RSAVE_2 // c
- VLGVG $1, T_5, R5SAVE_1 // c
- VLGVG $1, T_6, R5SAVE_2 // c
-
- // initialize h
- VZERO H0_0
- VZERO H1_0
- VZERO H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- // initialize pointer for reduce constants
- MOVD $·reduce<>(SB), R12
-
- // calculate r**2 and 20*(r**2)
- VZERO R_0
- VZERO R_1
- VZERO R_2
- SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
- REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
- VZERO R5_1
- VZERO R5_2
- VMSLG T_0, R_1, R5_1, R5_1
- VMSLG T_0, R_2, R5_2, R5_2
-
- // skip r**4 calculation if 3 blocks or less
- CMPBLE R3, $48, b4
-
- // calculate r**4 and 20*(r**4)
- VZERO T_8
- VZERO T_9
- VZERO T_10
- SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
- REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
- VZERO T_2
- VZERO T_3
- VMSLG T_0, T_9, T_2, T_2
- VMSLG T_0, T_10, T_3, T_3
-
- // put r**2 to the right and r**4 to the left of R_0, R_1, R_2
- VSLDB $8, T_8, T_8, T_8
- VSLDB $8, T_9, T_9, T_9
- VSLDB $8, T_10, T_10, T_10
- VSLDB $8, T_2, T_2, T_2
- VSLDB $8, T_3, T_3, T_3
-
- VO T_8, R_0, R_0
- VO T_9, R_1, R_1
- VO T_10, R_2, R_2
- VO T_2, R5_1, R5_1
- VO T_3, R5_2, R5_2
-
- CMPBLE R3, $80, load // less than or equal to 5 blocks in message
-
- // 6(or 5+1) blocks
- SUB $81, R3
- VLM (R2), M0, M4
- VLL R3, 80(R2), M5
- ADD $1, R3
- MOVBZ $1, R0
- CMPBGE R3, $16, 2(PC)
- VLVGB R3, R0, M5
- MOVD $96(R2), R2
- EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
- EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
- VLEIB $2, $1, H2_0
- VLEIB $2, $1, H2_1
- VLEIB $10, $1, H2_0
- VLEIB $10, $1, H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO T_4
- VZERO T_10
- EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
- VLR T_4, M4
- VLEIB $10, $1, M2
- CMPBLT R3, $16, 2(PC)
- VLEIB $10, $1, T_10
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
-
- SUB $16, R3
- CMPBLE R3, $0, square
-
-load:
- // load EX0, EX1 and EX2
- MOVD $·c<>(SB), R5
- VLM (R5), EX0, EX2
-
-loop:
- CMPBLE R3, $64, add // b4 // last 4 or less blocks left
-
- // next 4 full blocks
- VLM (R2), M2, M5
- SUB $64, R3
- MOVD $64(R2), R2
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)
-
- // expacc in-lined to create [m2, m3] limbs
- VGBM $0x3f3f, T_0 // 44 bit clear mask
- VGBM $0x1f1f, T_1 // 40 bit clear mask
- VPERM M2, M3, EX0, T_3
- VESRLG $4, T_0, T_0 // 44 bit clear mask ready
- VPERM M2, M3, EX1, T_4
- VPERM M2, M3, EX2, T_5
- VN T_0, T_3, T_3
- VESRLG $4, T_4, T_4
- VN T_1, T_5, T_5
- VN T_0, T_4, T_4
- VMRHG H0_1, T_3, H0_0
- VMRHG H1_1, T_4, H1_0
- VMRHG H2_1, T_5, H2_0
- VMRLG H0_1, T_3, H0_1
- VMRLG H1_1, T_4, H1_1
- VMRLG H2_1, T_5, H2_1
- VLEIB $10, $1, H2_0
- VLEIB $10, $1, H2_1
- VPERM M4, M5, EX0, T_3
- VPERM M4, M5, EX1, T_4
- VPERM M4, M5, EX2, T_5
- VN T_0, T_3, T_3
- VESRLG $4, T_4, T_4
- VN T_1, T_5, T_5
- VN T_0, T_4, T_4
- VMRHG V0, T_3, M0
- VMRHG V0, T_4, M1
- VMRHG V0, T_5, M2
- VMRLG V0, T_3, M3
- VMRLG V0, T_4, M4
- VMRLG V0, T_5, M5
- VLEIB $10, $1, M2
- VLEIB $10, $1, M5
-
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- CMPBNE R3, $0, loop
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
-
- // load EX0, EX1, EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), EX0, EX2
-
- // sum vectors
- VAQ H0_0, H0_1, H0_0
- VAQ H1_0, H1_1, H1_0
- VAQ H2_0, H2_1, H2_0
-
- // h may be >= 2*(2**130-5) so we need to reduce it again
- // M0...M4 are used as temps here
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
-next: // carry h1->h2
- VLEIB $7, $0x28, T_1
- VREPIB $4, T_2
- VGBM $0x003F, T_3
- VESRLG $4, T_3
-
- // byte shift
- VSRLB T_1, H1_0, T_4
-
- // bit shift
- VSRL T_2, T_4, T_4
-
- // clear h1 carry bits
- VN T_3, H1_0, H1_0
-
- // add carry
- VAQ T_4, H2_0, H2_0
-
- // h is now < 2*(2**130-5)
- // pack h into h1 (hi) and h0 (lo)
- PACK(H0_0, H1_0, H2_0)
-
- // if h > 2**130-5 then h -= 2**130-5
- MOD(H0_0, H1_0, T_0, T_1, T_2)
-
- // h += s
- MOVD $·bswapMask<>(SB), R5
- VL (R5), T_1
- VL 16(R4), T_0
- VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
- VAQ T_0, H0_0, H0_0
- VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
- VST H0_0, (R1)
- RET
-
-add:
- // load EX0, EX1, EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), EX0, EX2
-
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
- CMPBLE R3, $64, b4
-
-b4:
- CMPBLE R3, $48, b3 // 3 blocks or less
-
- // 4(3+1) blocks remaining
- SUB $49, R3
- VLM (R2), M0, M2
- VLL R3, 48(R2), M3
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M3
- MOVD $64(R2), R2
- EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
- VLEIB $10, $1, H2_0
- VLEIB $10, $1, H2_1
- VZERO M0
- VZERO M1
- VZERO M4
- VZERO M5
- VZERO T_4
- VZERO T_10
- EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
- VLR T_4, M2
- VLEIB $10, $1, M4
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, T_10
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
- SUB $16, R3
- CMPBLE R3, $0, square // this condition must always hold true!
-
-b3:
- CMPBLE R3, $32, b2
-
- // 3 blocks remaining
-
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // H*[r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)
-
- SUB $33, R3
- VLM (R2), M0, M1
- VLL R3, 32(R2), M2
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M2
-
- // H += m0
- VZERO T_1
- VZERO T_2
- VZERO T_3
- EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
- VLEIB $10, $1, T_3
- VAG H0_0, T_1, H0_0
- VAG H1_0, T_2, H1_0
- VAG H2_0, T_3, H2_0
-
- VZERO M0
- VZERO M3
- VZERO M4
- VZERO M5
- VZERO T_10
-
- // (H+m0)*r
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)
-
- // H += m1
- VZERO V0
- VZERO T_1
- VZERO T_2
- VZERO T_3
- EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
- VLEIB $10, $1, T_3
- VAQ H0_0, T_1, H0_0
- VAQ H1_0, T_2, H1_0
- VAQ H2_0, T_3, H2_0
- REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
-
- // [H, m2] * [r**2, r]
- EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, H2_0
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
- SUB $16, R3
- CMPBLE R3, $0, next // this condition must always hold true!
-
-b2:
- CMPBLE R3, $16, b1
-
- // 2 blocks remaining
-
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // H*[r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
-
- // move h to the left and 0s at the right
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
-
- // get message blocks and append 1 to start
- SUB $17, R3
- VL (R2), M0
- VLL R3, 16(R2), M1
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M1
- VZERO T_6
- VZERO T_7
- VZERO T_8
- EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
- EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
- VLEIB $2, $1, T_8
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, T_8
-
- // add [m0, m1] to h
- VAG H0_0, T_6, H0_0
- VAG H1_0, T_7, H1_0
- VAG H2_0, T_8, H2_0
-
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
- VZERO T_10
- VZERO M0
-
- // at this point R_0 .. R5_2 look like [r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
- SUB $16, R3, R3
- CMPBLE R3, $0, next
-
-b1:
- CMPBLE R3, $0, next
-
- // 1 block remaining
-
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // H*[r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
- // set up [0, m0] limbs
- SUB $1, R3
- VLL R3, (R2), M0
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M0
- VZERO T_1
- VZERO T_2
- VZERO T_3
- EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)// limbs: [0, m]
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, T_3
-
- // h+m0
- VAQ H0_0, T_1, H0_0
- VAQ H1_0, T_2, H1_0
- VAQ H2_0, T_3, H2_0
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
- BR next
-
-square:
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // (h0*r**2) + (h1*r)
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
- BR next
-
-TEXT ·hasVMSLFacility(SB), NOSPLIT, $24-1
- MOVD $x-24(SP), R1
- XC $24, 0(R1), 0(R1) // clear the storage
- MOVD $2, R0 // R0 is the number of double words stored -1
- WORD $0xB2B01000 // STFLE 0(R1)
- XOR R0, R0 // reset the value of R0
- MOVBZ z-8(SP), R1
- AND $0x01, R1
- BEQ novmsl
-
-vectorinstalled:
- // check if the vector instruction has been enabled
- VLEIB $0, $0xF, V16
- VLGVB $0, V16, R1
- CMPBNE R1, $0xF, novmsl
- MOVB $1, ret+0(FP) // have vx
- RET
-
-novmsl:
- MOVB $0, ret+0(FP) // no vx
- RET
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go
index f9269c3..c400dfc 100644
--- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go
+++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.go
@@ -2,14 +2,14 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64,!appengine,!gccgo
+//go:build amd64 && !purego && gc
+// +build amd64,!purego,gc
package salsa
-// This function is implemented in salsa2020_amd64.s.
-
//go:noescape
+// salsa2020XORKeyStream is implemented in salsa20_amd64.s.
func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
// XORKeyStream crypts bytes from in to out using the given key and counters.
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa2020_amd64.s b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s
index 22afbdc..c089277 100644
--- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa2020_amd64.s
+++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s
@@ -2,13 +2,14 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64,!appengine,!gccgo
+//go:build amd64 && !purego && gc
+// +build amd64,!purego,gc
// This code was translated into a form compatible with 6a from the public
// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
-// This needs up to 64 bytes at 360(SP); hence the non-obvious frame size.
+// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.
TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVQ out+0(FP),DI
MOVQ in+8(FP),SI
@@ -17,10 +18,8 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVQ key+32(FP),R8
MOVQ SP,R12
- MOVQ SP,R9
- ADDQ $31, R9
- ANDQ $~31, R9
- MOVQ R9, SP
+ ADDQ $31, R12
+ ANDQ $~31, R12
MOVQ DX,R9
MOVQ CX,DX
@@ -32,122 +31,116 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVL 0(R10),R8
MOVL 0(DX),AX
MOVL 16(R10),R11
- MOVL CX,0(SP)
- MOVL R8, 4 (SP)
- MOVL AX, 8 (SP)
- MOVL R11, 12 (SP)
+ MOVL CX,0(R12)
+ MOVL R8, 4 (R12)
+ MOVL AX, 8 (R12)
+ MOVL R11, 12 (R12)
MOVL 8(DX),CX
MOVL 24(R10),R8
MOVL 4(R10),AX
MOVL 4(DX),R11
- MOVL CX,16(SP)
- MOVL R8, 20 (SP)
- MOVL AX, 24 (SP)
- MOVL R11, 28 (SP)
+ MOVL CX,16(R12)
+ MOVL R8, 20 (R12)
+ MOVL AX, 24 (R12)
+ MOVL R11, 28 (R12)
MOVL 12(DX),CX
MOVL 12(R10),DX
MOVL 28(R10),R8
MOVL 8(R10),AX
- MOVL DX,32(SP)
- MOVL CX, 36 (SP)
- MOVL R8, 40 (SP)
- MOVL AX, 44 (SP)
+ MOVL DX,32(R12)
+ MOVL CX, 36 (R12)
+ MOVL R8, 40 (R12)
+ MOVL AX, 44 (R12)
MOVQ $1634760805,DX
MOVQ $857760878,CX
MOVQ $2036477234,R8
MOVQ $1797285236,AX
- MOVL DX,48(SP)
- MOVL CX, 52 (SP)
- MOVL R8, 56 (SP)
- MOVL AX, 60 (SP)
+ MOVL DX,48(R12)
+ MOVL CX, 52 (R12)
+ MOVL R8, 56 (R12)
+ MOVL AX, 60 (R12)
CMPQ R9,$256
JB BYTESBETWEEN1AND255
- MOVOA 48(SP),X0
+ MOVOA 48(R12),X0
PSHUFL $0X55,X0,X1
PSHUFL $0XAA,X0,X2
PSHUFL $0XFF,X0,X3
PSHUFL $0X00,X0,X0
- MOVOA X1,64(SP)
- MOVOA X2,80(SP)
- MOVOA X3,96(SP)
- MOVOA X0,112(SP)
- MOVOA 0(SP),X0
+ MOVOA X1,64(R12)
+ MOVOA X2,80(R12)
+ MOVOA X3,96(R12)
+ MOVOA X0,112(R12)
+ MOVOA 0(R12),X0
PSHUFL $0XAA,X0,X1
PSHUFL $0XFF,X0,X2
PSHUFL $0X00,X0,X3
PSHUFL $0X55,X0,X0
- MOVOA X1,128(SP)
- MOVOA X2,144(SP)
- MOVOA X3,160(SP)
- MOVOA X0,176(SP)
- MOVOA 16(SP),X0
+ MOVOA X1,128(R12)
+ MOVOA X2,144(R12)
+ MOVOA X3,160(R12)
+ MOVOA X0,176(R12)
+ MOVOA 16(R12),X0
PSHUFL $0XFF,X0,X1
PSHUFL $0X55,X0,X2
PSHUFL $0XAA,X0,X0
- MOVOA X1,192(SP)
- MOVOA X2,208(SP)
- MOVOA X0,224(SP)
- MOVOA 32(SP),X0
+ MOVOA X1,192(R12)
+ MOVOA X2,208(R12)
+ MOVOA X0,224(R12)
+ MOVOA 32(R12),X0
PSHUFL $0X00,X0,X1
PSHUFL $0XAA,X0,X2
PSHUFL $0XFF,X0,X0
- MOVOA X1,240(SP)
- MOVOA X2,256(SP)
- MOVOA X0,272(SP)
+ MOVOA X1,240(R12)
+ MOVOA X2,256(R12)
+ MOVOA X0,272(R12)
BYTESATLEAST256:
- MOVL 16(SP),DX
- MOVL 36 (SP),CX
- MOVL DX,288(SP)
- MOVL CX,304(SP)
- ADDQ $1,DX
+ MOVL 16(R12),DX
+ MOVL 36 (R12),CX
+ MOVL DX,288(R12)
+ MOVL CX,304(R12)
SHLQ $32,CX
ADDQ CX,DX
+ ADDQ $1,DX
MOVQ DX,CX
SHRQ $32,CX
- MOVL DX, 292 (SP)
- MOVL CX, 308 (SP)
+ MOVL DX, 292 (R12)
+ MOVL CX, 308 (R12)
ADDQ $1,DX
- SHLQ $32,CX
- ADDQ CX,DX
MOVQ DX,CX
SHRQ $32,CX
- MOVL DX, 296 (SP)
- MOVL CX, 312 (SP)
+ MOVL DX, 296 (R12)
+ MOVL CX, 312 (R12)
ADDQ $1,DX
- SHLQ $32,CX
- ADDQ CX,DX
MOVQ DX,CX
SHRQ $32,CX
- MOVL DX, 300 (SP)
- MOVL CX, 316 (SP)
+ MOVL DX, 300 (R12)
+ MOVL CX, 316 (R12)
ADDQ $1,DX
- SHLQ $32,CX
- ADDQ CX,DX
MOVQ DX,CX
SHRQ $32,CX
- MOVL DX,16(SP)
- MOVL CX, 36 (SP)
- MOVQ R9,352(SP)
+ MOVL DX,16(R12)
+ MOVL CX, 36 (R12)
+ MOVQ R9,352(R12)
MOVQ $20,DX
- MOVOA 64(SP),X0
- MOVOA 80(SP),X1
- MOVOA 96(SP),X2
- MOVOA 256(SP),X3
- MOVOA 272(SP),X4
- MOVOA 128(SP),X5
- MOVOA 144(SP),X6
- MOVOA 176(SP),X7
- MOVOA 192(SP),X8
- MOVOA 208(SP),X9
- MOVOA 224(SP),X10
- MOVOA 304(SP),X11
- MOVOA 112(SP),X12
- MOVOA 160(SP),X13
- MOVOA 240(SP),X14
- MOVOA 288(SP),X15
+ MOVOA 64(R12),X0
+ MOVOA 80(R12),X1
+ MOVOA 96(R12),X2
+ MOVOA 256(R12),X3
+ MOVOA 272(R12),X4
+ MOVOA 128(R12),X5
+ MOVOA 144(R12),X6
+ MOVOA 176(R12),X7
+ MOVOA 192(R12),X8
+ MOVOA 208(R12),X9
+ MOVOA 224(R12),X10
+ MOVOA 304(R12),X11
+ MOVOA 112(R12),X12
+ MOVOA 160(R12),X13
+ MOVOA 240(R12),X14
+ MOVOA 288(R12),X15
MAINLOOP1:
- MOVOA X1,320(SP)
- MOVOA X2,336(SP)
+ MOVOA X1,320(R12)
+ MOVOA X2,336(R12)
MOVOA X13,X1
PADDL X12,X1
MOVOA X1,X2
@@ -197,8 +190,8 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
PXOR X1,X12
PSRLL $14,X2
PXOR X2,X12
- MOVOA 320(SP),X1
- MOVOA X12,320(SP)
+ MOVOA 320(R12),X1
+ MOVOA X12,320(R12)
MOVOA X9,X2
PADDL X7,X2
MOVOA X2,X12
@@ -213,8 +206,8 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
PXOR X2,X3
PSRLL $25,X12
PXOR X12,X3
- MOVOA 336(SP),X2
- MOVOA X0,336(SP)
+ MOVOA 336(R12),X2
+ MOVOA X0,336(R12)
MOVOA X6,X0
PADDL X2,X0
MOVOA X0,X12
@@ -257,8 +250,8 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
PXOR X0,X1
PSRLL $14,X12
PXOR X12,X1
- MOVOA 320(SP),X0
- MOVOA X1,320(SP)
+ MOVOA 320(R12),X0
+ MOVOA X1,320(R12)
MOVOA X4,X1
PADDL X0,X1
MOVOA X1,X12
@@ -273,8 +266,8 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
PXOR X1,X2
PSRLL $14,X12
PXOR X12,X2
- MOVOA 336(SP),X12
- MOVOA X2,336(SP)
+ MOVOA 336(R12),X12
+ MOVOA X2,336(R12)
MOVOA X14,X1
PADDL X12,X1
MOVOA X1,X2
@@ -317,8 +310,8 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
PXOR X1,X0
PSRLL $14,X2
PXOR X2,X0
- MOVOA 320(SP),X1
- MOVOA X0,320(SP)
+ MOVOA 320(R12),X1
+ MOVOA X0,320(R12)
MOVOA X8,X0
PADDL X14,X0
MOVOA X0,X2
@@ -333,8 +326,8 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
PXOR X0,X6
PSRLL $25,X2
PXOR X2,X6
- MOVOA 336(SP),X2
- MOVOA X12,336(SP)
+ MOVOA 336(R12),X2
+ MOVOA X12,336(R12)
MOVOA X3,X0
PADDL X2,X0
MOVOA X0,X12
@@ -384,14 +377,14 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
PXOR X0,X2
PSRLL $14,X12
PXOR X12,X2
- MOVOA 320(SP),X12
- MOVOA 336(SP),X0
+ MOVOA 320(R12),X12
+ MOVOA 336(R12),X0
SUBQ $2,DX
JA MAINLOOP1
- PADDL 112(SP),X12
- PADDL 176(SP),X7
- PADDL 224(SP),X10
- PADDL 272(SP),X4
+ PADDL 112(R12),X12
+ PADDL 176(R12),X7
+ PADDL 224(R12),X10
+ PADDL 272(R12),X4
MOVD X12,DX
MOVD X7,CX
MOVD X10,R8
@@ -452,10 +445,10 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVL CX,196(DI)
MOVL R8,200(DI)
MOVL R9,204(DI)
- PADDL 240(SP),X14
- PADDL 64(SP),X0
- PADDL 128(SP),X5
- PADDL 192(SP),X8
+ PADDL 240(R12),X14
+ PADDL 64(R12),X0
+ PADDL 128(R12),X5
+ PADDL 192(R12),X8
MOVD X14,DX
MOVD X0,CX
MOVD X5,R8
@@ -516,10 +509,10 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVL CX,212(DI)
MOVL R8,216(DI)
MOVL R9,220(DI)
- PADDL 288(SP),X15
- PADDL 304(SP),X11
- PADDL 80(SP),X1
- PADDL 144(SP),X6
+ PADDL 288(R12),X15
+ PADDL 304(R12),X11
+ PADDL 80(R12),X1
+ PADDL 144(R12),X6
MOVD X15,DX
MOVD X11,CX
MOVD X1,R8
@@ -580,10 +573,10 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVL CX,228(DI)
MOVL R8,232(DI)
MOVL R9,236(DI)
- PADDL 160(SP),X13
- PADDL 208(SP),X9
- PADDL 256(SP),X3
- PADDL 96(SP),X2
+ PADDL 160(R12),X13
+ PADDL 208(R12),X9
+ PADDL 256(R12),X3
+ PADDL 96(R12),X2
MOVD X13,DX
MOVD X9,CX
MOVD X3,R8
@@ -644,7 +637,7 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVL CX,244(DI)
MOVL R8,248(DI)
MOVL R9,252(DI)
- MOVQ 352(SP),R9
+ MOVQ 352(R12),R9
SUBQ $256,R9
ADDQ $256,SI
ADDQ $256,DI
@@ -656,17 +649,17 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
CMPQ R9,$64
JAE NOCOPY
MOVQ DI,DX
- LEAQ 360(SP),DI
+ LEAQ 360(R12),DI
MOVQ R9,CX
REP; MOVSB
- LEAQ 360(SP),DI
- LEAQ 360(SP),SI
+ LEAQ 360(R12),DI
+ LEAQ 360(R12),SI
NOCOPY:
- MOVQ R9,352(SP)
- MOVOA 48(SP),X0
- MOVOA 0(SP),X1
- MOVOA 16(SP),X2
- MOVOA 32(SP),X3
+ MOVQ R9,352(R12)
+ MOVOA 48(R12),X0
+ MOVOA 0(R12),X1
+ MOVOA 16(R12),X2
+ MOVOA 32(R12),X3
MOVOA X1,X4
MOVQ $20,CX
MAINLOOP2:
@@ -797,10 +790,10 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
PSHUFL $0X39,X3,X3
PXOR X6,X0
JA MAINLOOP2
- PADDL 48(SP),X0
- PADDL 0(SP),X1
- PADDL 16(SP),X2
- PADDL 32(SP),X3
+ PADDL 48(R12),X0
+ PADDL 0(R12),X1
+ PADDL 16(R12),X2
+ PADDL 32(R12),X3
MOVD X0,CX
MOVD X1,R8
MOVD X2,R9
@@ -861,16 +854,16 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVL R8,44(DI)
MOVL R9,28(DI)
MOVL AX,12(DI)
- MOVQ 352(SP),R9
- MOVL 16(SP),CX
- MOVL 36 (SP),R8
+ MOVQ 352(R12),R9
+ MOVL 16(R12),CX
+ MOVL 36 (R12),R8
ADDQ $1,CX
SHLQ $32,R8
ADDQ R8,CX
MOVQ CX,R8
SHRQ $32,R8
- MOVL CX,16(SP)
- MOVL R8, 36 (SP)
+ MOVL CX,16(R12)
+ MOVL R8, 36 (R12)
CMPQ R9,$64
JA BYTESATLEAST65
JAE BYTESATLEAST64
@@ -880,7 +873,6 @@ TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
REP; MOVSB
BYTESATLEAST64:
DONE:
- MOVQ R12,SP
RET
BYTESATLEAST65:
SUBQ $64,R9
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_noasm.go b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_noasm.go
new file mode 100644
index 0000000..4392cc1
--- /dev/null
+++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_noasm.go
@@ -0,0 +1,15 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || purego || !gc
+// +build !amd64 purego !gc
+
+package salsa
+
+// XORKeyStream crypts bytes from in to out using the given key and counters.
+// In and out must overlap entirely or not at all. Counter
+// contains the raw salsa20 counter bytes (both nonce and block counter).
+func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
+ genericXORKeyStream(out, in, counter, key)
+}
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go
index 22126d1..68169c6 100644
--- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go
+++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go
@@ -2,8 +2,6 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64 appengine gccgo
-
package salsa
const rounds = 20
@@ -202,10 +200,9 @@ func core(out *[64]byte, in *[16]byte, k *[32]byte, c *[16]byte) {
out[63] = byte(x15 >> 24)
}
-// XORKeyStream crypts bytes from in to out using the given key and counters.
-// In and out must overlap entirely or not at all. Counter
-// contains the raw salsa20 counter bytes (both nonce and block counter).
-func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
+// genericXORKeyStream is the generic implementation of XORKeyStream to be used
+// when no assembly implementation is available.
+func genericXORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
var block [64]byte
var counterCopy [16]byte
copy(counterCopy[:], counter[:])