author    Dmitri Shuralyov <dmitshur@golang.org>  2020-05-01 18:58:41 -0400
committer Dmitri Shuralyov <dmitshur@golang.org>  2020-05-04 22:52:07 +0000
commit    b5f7ff4aa9c1fef6437f350595caae4ee4b5708d (patch)
tree      aab04441ac1615260b34983999bd5ba923d4be4b /src/vendor
parent    4c003f6b780b471afbf032438eb6c7519458855b (diff)
all: update vendored dependencies for Go 1.15 release
The Go 1.15 code freeze has just started. This is the time to update all
golang.org/x/... module versions that contribute packages to the std and cmd
modules in the standard library to latest master versions.

Those versions have already gone through code review, and now they will
undergo additional testing during the freeze period. If new issues are
discovered in these dependencies, we have the freeze period to deal with
them. By the end of the freeze period, we will have confidence that the
Go 1.15 release and the dependency versions it has selected are robust.

If one of the Go 1.15.x minor releases requires changing code in one of the
vendored packages, we'll be able to do so on top of the versions that are
selected here, and not be forced to use versions that came from different
time periods, or try to jump across multiple untested versions in a minor
release.

The dependency versions that are selected in this commit are:

	github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3
	github.com/ianlancetaylor/demangle v0.0.0-20200414190113-039b1ae3a340
	golang.org/x/arch v0.0.0-20200312215426-ff8b605520f4
	golang.org/x/crypto v0.0.0-20200429183012-4b2356b1ed79
	golang.org/x/mod v0.2.1-0.20200429172858-859b3ef565e2
	golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5
	golang.org/x/sys v0.0.0-20200501145240-bc7a7d42d5c3
	golang.org/x/text v0.3.3-0.20200430171850-afb9336c4530
	golang.org/x/tools v0.0.0-20200504152539-33427f1b0364
	golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543

github.com/ianlancetaylor/demangle is considered in scope and was updated.
github.com/google/pprof is out of scope and was not updated.

For #36905.

Change-Id: Icb6996eb0df11f16edd9a42e04434012c0336354
Reviewed-on: https://go-review.googlesource.com/c/go/+/231657
Reviewed-by: Bryan C. Mills <bcmills@google.com>
Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/vendor')
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go                   | 119
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20/xor.go                               |  17
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go |  56
-rw-r--r--  src/vendor/golang.org/x/crypto/cryptobyte/asn1.go                            |   2
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go                         |   4
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/poly1305.go                          |  26
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go                         |  11
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_generic.go                       |  21
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_noasm.go                         |  13
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go                       |  11
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go                         |  72
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s                          | 667
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s                     | 909
-rw-r--r--  src/vendor/golang.org/x/text/unicode/bidi/core.go                            |   8
-rw-r--r--  src/vendor/modules.txt                                                       |   7
15 files changed, 594 insertions, 1349 deletions
diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
index 7c498e90d9..a2ecf5c325 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
+++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
@@ -42,10 +42,14 @@ type Cipher struct {
// The last len bytes of buf are leftover key stream bytes from the previous
// XORKeyStream invocation. The size of buf depends on how many blocks are
- // computed at a time.
+ // computed at a time by xorKeyStreamBlocks.
buf [bufSize]byte
len int
+ // overflow is set when the counter overflowed, no more blocks can be
+ // generated, and the next XORKeyStream call should panic.
+ overflow bool
+
// The counter-independent results of the first round are cached after they
// are computed the first time.
precompDone bool
@@ -89,6 +93,7 @@ func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) {
return nil, errors.New("chacha20: wrong nonce size")
}
+ key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint
c.key = [8]uint32{
binary.LittleEndian.Uint32(key[0:4]),
binary.LittleEndian.Uint32(key[4:8]),
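The added re-slice is a bounds check elimination hint: once key and nonce have
their exact lengths, the compiler can prove the fixed-index loads below are in
range. A minimal sketch of the same pattern (sum16 is illustrative, not part of
the package):

	package main

	import "encoding/binary"

	// sum16 reads four little-endian words from b. The re-slice is
	// intended to let the compiler keep one length check instead of
	// checking every fixed-index load.
	func sum16(b []byte) uint32 {
		b = b[:16] // bounds check elimination hint
		return binary.LittleEndian.Uint32(b[0:4]) +
			binary.LittleEndian.Uint32(b[4:8]) +
			binary.LittleEndian.Uint32(b[8:12]) +
			binary.LittleEndian.Uint32(b[12:16])
	}

	func main() { _ = sum16(make([]byte, 32)) }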
@@ -139,15 +144,18 @@ func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
// SetCounter sets the Cipher counter. The next invocation of XORKeyStream will
// behave as if (64 * counter) bytes had been encrypted so far.
//
-// To prevent accidental counter reuse, SetCounter panics if counter is
-// less than the current value.
+// To prevent accidental counter reuse, SetCounter panics if counter is less
+// than the current value.
+//
+// Note that the execution time of XORKeyStream is not independent of the
+// counter value.
func (s *Cipher) SetCounter(counter uint32) {
// Internally, s may buffer multiple blocks, which complicates this
// implementation slightly. When checking whether the counter has rolled
// back, we must use both s.counter and s.len to determine how many blocks
// we have already output.
outputCounter := s.counter - uint32(s.len)/blockSize
- if counter < outputCounter {
+ if s.overflow || counter < outputCounter {
panic("chacha20: SetCounter attempted to rollback counter")
}
@@ -196,34 +204,52 @@ func (s *Cipher) XORKeyStream(dst, src []byte) {
dst[i] = src[i] ^ b
}
s.len -= len(keyStream)
- src = src[len(keyStream):]
- dst = dst[len(keyStream):]
+ dst, src = dst[len(keyStream):], src[len(keyStream):]
+ }
+ if len(src) == 0 {
+ return
}
- const blocksPerBuf = bufSize / blockSize
- numBufs := (uint64(len(src)) + bufSize - 1) / bufSize
- if uint64(s.counter)+numBufs*blocksPerBuf >= 1<<32 {
+ // If we'd need to let the counter overflow and keep generating output,
+ // panic immediately. If instead we'd only reach the last block, remember
+ // not to generate any more output after the buffer is drained.
+ numBlocks := (uint64(len(src)) + blockSize - 1) / blockSize
+ if s.overflow || uint64(s.counter)+numBlocks > 1<<32 {
panic("chacha20: counter overflow")
+ } else if uint64(s.counter)+numBlocks == 1<<32 {
+ s.overflow = true
}
// xorKeyStreamBlocks implementations expect input lengths that are a
// multiple of bufSize. Platform-specific ones process multiple blocks at a
// time, so have bufSizes that are a multiple of blockSize.
- rem := len(src) % bufSize
- full := len(src) - rem
-
+ full := len(src) - len(src)%bufSize
if full > 0 {
s.xorKeyStreamBlocks(dst[:full], src[:full])
}
+ dst, src = dst[full:], src[full:]
+
+ // If using a multi-block xorKeyStreamBlocks would overflow, use the generic
+ // one that does one block at a time.
+ const blocksPerBuf = bufSize / blockSize
+ if uint64(s.counter)+blocksPerBuf > 1<<32 {
+ s.buf = [bufSize]byte{}
+ numBlocks := (len(src) + blockSize - 1) / blockSize
+ buf := s.buf[bufSize-numBlocks*blockSize:]
+ copy(buf, src)
+ s.xorKeyStreamBlocksGeneric(buf, buf)
+ s.len = len(buf) - copy(dst, buf)
+ return
+ }
// If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and
// keep the leftover keystream for the next XORKeyStream invocation.
- if rem > 0 {
+ if len(src) > 0 {
s.buf = [bufSize]byte{}
- copy(s.buf[:], src[full:])
+ copy(s.buf[:], src)
s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
- s.len = bufSize - copy(dst[full:], s.buf[:])
+ s.len = bufSize - copy(dst, s.buf[:])
}
}
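The effect of the new overflow flag is that the final counter value (block
2³² - 1) can be consumed exactly once, and any further output panics instead of
silently reusing key stream. A sketch of the observable behavior, using only
the package's exported API:

	package main

	import (
		"fmt"

		"golang.org/x/crypto/chacha20"
	)

	func main() {
		key := make([]byte, chacha20.KeySize)
		nonce := make([]byte, chacha20.NonceSize)
		c, _ := chacha20.NewUnauthenticatedCipher(key, nonce)

		// Position the cipher at the very last 64-byte block.
		c.SetCounter(1<<32 - 1)
		buf := make([]byte, 64)
		c.XORKeyStream(buf, buf) // final block: allowed, sets overflow

		defer func() { fmt.Println("recovered:", recover()) }()
		c.XORKeyStream(buf, buf) // panics: chacha20: counter overflow
	}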
@@ -260,7 +286,9 @@ func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
s.precompDone = true
}
- for i := 0; i < len(src); i += blockSize {
+ // A condition of len(src) > 0 would be sufficient, but this also
+ // acts as a bounds check elimination hint.
+ for len(src) >= 64 && len(dst) >= 64 {
// The remainder of the first column round.
fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
@@ -285,49 +313,28 @@ func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
}
- // Finally, add back the initial state to generate the key stream.
- x0 += c0
- x1 += c1
- x2 += c2
- x3 += c3
- x4 += c4
- x5 += c5
- x6 += c6
- x7 += c7
- x8 += c8
- x9 += c9
- x10 += c10
- x11 += c11
- x12 += s.counter
- x13 += c13
- x14 += c14
- x15 += c15
+ // Add back the initial state to generate the key stream, then
+ // XOR the key stream with the source and write out the result.
+ addXor(dst[0:4], src[0:4], x0, c0)
+ addXor(dst[4:8], src[4:8], x1, c1)
+ addXor(dst[8:12], src[8:12], x2, c2)
+ addXor(dst[12:16], src[12:16], x3, c3)
+ addXor(dst[16:20], src[16:20], x4, c4)
+ addXor(dst[20:24], src[20:24], x5, c5)
+ addXor(dst[24:28], src[24:28], x6, c6)
+ addXor(dst[28:32], src[28:32], x7, c7)
+ addXor(dst[32:36], src[32:36], x8, c8)
+ addXor(dst[36:40], src[36:40], x9, c9)
+ addXor(dst[40:44], src[40:44], x10, c10)
+ addXor(dst[44:48], src[44:48], x11, c11)
+ addXor(dst[48:52], src[48:52], x12, s.counter)
+ addXor(dst[52:56], src[52:56], x13, c13)
+ addXor(dst[56:60], src[56:60], x14, c14)
+ addXor(dst[60:64], src[60:64], x15, c15)
s.counter += 1
- if s.counter == 0 {
- panic("chacha20: internal error: counter overflow")
- }
- in, out := src[i:], dst[i:]
- in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint
-
- // XOR the key stream with the source and write out the result.
- xor(out[0:], in[0:], x0)
- xor(out[4:], in[4:], x1)
- xor(out[8:], in[8:], x2)
- xor(out[12:], in[12:], x3)
- xor(out[16:], in[16:], x4)
- xor(out[20:], in[20:], x5)
- xor(out[24:], in[24:], x6)
- xor(out[28:], in[28:], x7)
- xor(out[32:], in[32:], x8)
- xor(out[36:], in[36:], x9)
- xor(out[40:], in[40:], x10)
- xor(out[44:], in[44:], x11)
- xor(out[48:], in[48:], x12)
- xor(out[52:], in[52:], x13)
- xor(out[56:], in[56:], x14)
- xor(out[60:], in[60:], x15)
+ src, dst = src[blockSize:], dst[blockSize:]
}
}
diff --git a/src/vendor/golang.org/x/crypto/chacha20/xor.go b/src/vendor/golang.org/x/crypto/chacha20/xor.go
index 0110c9865a..c2d04851e0 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/xor.go
+++ b/src/vendor/golang.org/x/crypto/chacha20/xor.go
@@ -13,10 +13,10 @@ const unaligned = runtime.GOARCH == "386" ||
runtime.GOARCH == "ppc64le" ||
runtime.GOARCH == "s390x"
-// xor reads a little endian uint32 from src, XORs it with u and
+// addXor reads a little endian uint32 from src, XORs it with (a + b) and
// places the result in little endian byte order in dst.
-func xor(dst, src []byte, u uint32) {
- _, _ = src[3], dst[3] // eliminate bounds checks
+func addXor(dst, src []byte, a, b uint32) {
+ _, _ = src[3], dst[3] // bounds check elimination hint
if unaligned {
// The compiler should optimize this code into
// 32-bit unaligned little endian loads and stores.
@@ -27,15 +27,16 @@ func xor(dst, src []byte, u uint32) {
v |= uint32(src[1]) << 8
v |= uint32(src[2]) << 16
v |= uint32(src[3]) << 24
- v ^= u
+ v ^= a + b
dst[0] = byte(v)
dst[1] = byte(v >> 8)
dst[2] = byte(v >> 16)
dst[3] = byte(v >> 24)
} else {
- dst[0] = src[0] ^ byte(u)
- dst[1] = src[1] ^ byte(u>>8)
- dst[2] = src[2] ^ byte(u>>16)
- dst[3] = src[3] ^ byte(u>>24)
+ a += b
+ dst[0] = src[0] ^ byte(a)
+ dst[1] = src[1] ^ byte(a>>8)
+ dst[2] = src[2] ^ byte(a>>16)
+ dst[3] = src[3] ^ byte(a>>24)
}
}
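A reference model of the renamed helper: addXor folds the "add back the initial
state" step of ChaCha20 into the XOR, so the caller never materializes the key
stream word separately. The addXorRef name below is illustrative:

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// addXorRef reads a little-endian uint32 from src, XORs it with
	// a+b, and stores the result little-endian in dst, matching the
	// behavior of the unaligned and byte-by-byte paths above.
	func addXorRef(dst, src []byte, a, b uint32) {
		v := binary.LittleEndian.Uint32(src[:4])
		binary.LittleEndian.PutUint32(dst[:4], v^(a+b))
	}

	func main() {
		src := []byte{0xde, 0xad, 0xbe, 0xef}
		dst := make([]byte, 4)
		addXorRef(dst, src, 0x01020304, 0x05060708)
		fmt.Printf("%x\n", dst)
	}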
diff --git a/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go b/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
index 91b38568ce..fe191d395d 100644
--- a/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
+++ b/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
@@ -12,56 +12,64 @@ import (
"golang.org/x/crypto/poly1305"
)
-func roundTo16(n int) int {
- return 16 * ((n + 15) / 16)
+func writeWithPadding(p *poly1305.MAC, b []byte) {
+ p.Write(b)
+ if rem := len(b) % 16; rem != 0 {
+ var buf [16]byte
+ padLen := 16 - rem
+ p.Write(buf[:padLen])
+ }
+}
+
+func writeUint64(p *poly1305.MAC, n int) {
+ var buf [8]byte
+ binary.LittleEndian.PutUint64(buf[:], uint64(n))
+ p.Write(buf[:])
}
func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte {
ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize)
+ ciphertext, tag := out[:len(plaintext)], out[len(plaintext):]
if subtle.InexactOverlap(out, plaintext) {
panic("chacha20poly1305: invalid buffer overlap")
}
- var polyKey, discardBuf [32]byte
+ var polyKey [32]byte
s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
s.XORKeyStream(polyKey[:], polyKey[:])
- s.XORKeyStream(discardBuf[:], discardBuf[:]) // skip the next 32 bytes
- s.XORKeyStream(out, plaintext)
-
- polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(plaintext))+8+8)
- copy(polyInput, additionalData)
- copy(polyInput[roundTo16(len(additionalData)):], out[:len(plaintext)])
- binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
- binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(plaintext)))
+ s.SetCounter(1) // set the counter to 1, skipping 32 bytes
+ s.XORKeyStream(ciphertext, plaintext)
- var tag [poly1305.TagSize]byte
- poly1305.Sum(&tag, polyInput, &polyKey)
- copy(out[len(plaintext):], tag[:])
+ p := poly1305.New(&polyKey)
+ writeWithPadding(p, additionalData)
+ writeWithPadding(p, ciphertext)
+ writeUint64(p, len(additionalData))
+ writeUint64(p, len(plaintext))
+ p.Sum(tag[:0])
return ret
}
func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
- var tag [poly1305.TagSize]byte
- copy(tag[:], ciphertext[len(ciphertext)-16:])
+ tag := ciphertext[len(ciphertext)-16:]
ciphertext = ciphertext[:len(ciphertext)-16]
- var polyKey, discardBuf [32]byte
+ var polyKey [32]byte
s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
s.XORKeyStream(polyKey[:], polyKey[:])
- s.XORKeyStream(discardBuf[:], discardBuf[:]) // skip the next 32 bytes
+ s.SetCounter(1) // set the counter to 1, skipping 32 bytes
- polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(ciphertext))+8+8)
- copy(polyInput, additionalData)
- copy(polyInput[roundTo16(len(additionalData)):], ciphertext)
- binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
- binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(ciphertext)))
+ p := poly1305.New(&polyKey)
+ writeWithPadding(p, additionalData)
+ writeWithPadding(p, ciphertext)
+ writeUint64(p, len(additionalData))
+ writeUint64(p, len(ciphertext))
ret, out := sliceForAppend(dst, len(ciphertext))
if subtle.InexactOverlap(out, ciphertext) {
panic("chacha20poly1305: invalid buffer overlap")
}
- if !poly1305.Verify(&tag, polyInput, &polyKey) {
+ if !p.Verify(tag) {
for i := range out {
out[i] = 0
}
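The refactor replaces the single polyInput allocation with streaming writes,
but the authenticated message is unchanged: associated data and ciphertext,
each zero-padded to a 16-byte boundary, followed by both lengths as
little-endian uint64 values (RFC 8439, section 2.8). A standalone sketch of
that layout (poly1305Input is illustrative):

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// poly1305Input models the bytes the MAC sees: padded AD, padded
	// ciphertext, then the two lengths as little-endian uint64s.
	func poly1305Input(ad, ct []byte) []byte {
		pad := func(msg, b []byte) []byte {
			msg = append(msg, b...)
			if rem := len(b) % 16; rem != 0 {
				msg = append(msg, make([]byte, 16-rem)...)
			}
			return msg
		}
		var msg []byte
		msg = pad(msg, ad)
		msg = pad(msg, ct)
		var lenBuf [8]byte
		binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(ad)))
		msg = append(msg, lenBuf[:]...)
		binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(ct)))
		msg = append(msg, lenBuf[:]...)
		return msg
	}

	func main() {
		in := poly1305Input(make([]byte, 5), make([]byte, 20))
		fmt.Println(len(in)) // 16 + 32 + 16 = 64
	}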
diff --git a/src/vendor/golang.org/x/crypto/cryptobyte/asn1.go b/src/vendor/golang.org/x/crypto/cryptobyte/asn1.go
index f930f7e526..b26376aeca 100644
--- a/src/vendor/golang.org/x/crypto/cryptobyte/asn1.go
+++ b/src/vendor/golang.org/x/crypto/cryptobyte/asn1.go
@@ -81,7 +81,7 @@ func (b *Builder) AddASN1BigInt(n *big.Int) {
for i := range bytes {
bytes[i] ^= 0xff
}
- if bytes[0]&0x80 == 0 {
+ if len(bytes) == 0 || bytes[0]&0x80 == 0 {
c.add(0xff)
}
c.add(bytes...)
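The added len(bytes) == 0 guard covers n = -1, whose magnitude is empty after
the two's-complement transform, so the old code panicked indexing bytes[0].
A sketch of the now-working case:

	package main

	import (
		"fmt"
		"math/big"

		"golang.org/x/crypto/cryptobyte"
	)

	func main() {
		var b cryptobyte.Builder
		// -1 now encodes as a single 0xff content byte.
		b.AddASN1BigInt(big.NewInt(-1))
		out, err := b.Bytes()
		fmt.Printf("%x %v\n", out, err) // 0201ff <nil>
	}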
diff --git a/src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go b/src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
index b0c2cd0561..d118f30ed5 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
@@ -2,10 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!ppc64le gccgo purego
+// +build !amd64,!ppc64le,!s390x gccgo purego
package poly1305
type mac struct{ macGeneric }
-
-func newMAC(key *[32]byte) mac { return mac{newMACGeneric(key)} }
diff --git a/src/vendor/golang.org/x/crypto/poly1305/poly1305.go b/src/vendor/golang.org/x/crypto/poly1305/poly1305.go
index 066159b797..9d7a6af09f 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/poly1305.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/poly1305.go
@@ -26,7 +26,9 @@ const TagSize = 16
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
- sum(out, m, key)
+ h := New(key)
+ h.Write(m)
+ h.Sum(out[:0])
}
// Verify returns true if mac is a valid authenticator for m with the given key.
@@ -46,10 +48,9 @@ func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
// two different messages with the same key allows an attacker
// to forge messages at will.
func New(key *[32]byte) *MAC {
- return &MAC{
- mac: newMAC(key),
- finalized: false,
- }
+ m := &MAC{}
+ initialize(key, &m.macState)
+ return m
}
// MAC is an io.Writer computing an authentication tag
@@ -58,7 +59,7 @@ func New(key *[32]byte) *MAC {
// MAC cannot be used like common hash.Hash implementations,
// because using a poly1305 key twice breaks its security.
// Therefore writing data to a running MAC after calling
-// Sum causes it to panic.
+// Sum or Verify causes it to panic.
type MAC struct {
mac // platform-dependent implementation
@@ -71,10 +72,10 @@ func (h *MAC) Size() int { return TagSize }
// Write adds more data to the running message authentication code.
// It never returns an error.
//
-// It must not be called after the first call of Sum.
+// It must not be called after the first call of Sum or Verify.
func (h *MAC) Write(p []byte) (n int, err error) {
if h.finalized {
- panic("poly1305: write to MAC after Sum")
+ panic("poly1305: write to MAC after Sum or Verify")
}
return h.mac.Write(p)
}
@@ -87,3 +88,12 @@ func (h *MAC) Sum(b []byte) []byte {
h.finalized = true
return append(b, mac[:]...)
}
+
+// Verify returns whether the authenticator of all data written to
+// the message authentication code matches the expected value.
+func (h *MAC) Verify(expected []byte) bool {
+ var mac [TagSize]byte
+ h.mac.Sum(&mac)
+ h.finalized = true
+ return subtle.ConstantTimeCompare(expected, mac[:]) == 1
+}
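Typical use of the new constant-time Verify method, which finalizes the MAC the
same way Sum does (a sketch; in real use the key must be unique per message):

	package main

	import (
		"fmt"

		"golang.org/x/crypto/poly1305"
	)

	func main() {
		var key [32]byte // must be unique per message in real use
		msg := []byte("hello")

		m := poly1305.New(&key)
		m.Write(msg)
		tag := m.Sum(nil)

		// Verify finalizes like Sum: writing afterwards panics.
		v := poly1305.New(&key)
		v.Write(msg)
		fmt.Println(v.Verify(tag)) // true
	}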
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
index 35b9e38c90..99e5a1d50e 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
@@ -9,17 +9,6 @@ package poly1305
//go:noescape
func update(state *macState, msg []byte)
-func sum(out *[16]byte, m []byte, key *[32]byte) {
- h := newMAC(key)
- h.Write(m)
- h.Sum(out)
-}
-
-func newMAC(key *[32]byte) (h mac) {
- initialize(key, &h.r, &h.s)
- return
-}
-
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_generic.go b/src/vendor/golang.org/x/crypto/poly1305/sum_generic.go
index 1187eab78f..c942a65904 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_generic.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_generic.go
@@ -31,16 +31,18 @@ func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
h.Sum(out)
}
-func newMACGeneric(key *[32]byte) (h macGeneric) {
- initialize(key, &h.r, &h.s)
- return
+func newMACGeneric(key *[32]byte) macGeneric {
+ m := macGeneric{}
+ initialize(key, &m.macState)
+ return m
}
// macState holds numbers in saturated 64-bit little-endian limbs. That is,
// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
type macState struct {
// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
- // can grow larger during and after rounds.
+ // can grow larger during and after rounds. It must, however, remain below
+ // 2 * (2¹³⁰ - 5).
h [3]uint64
// r and s are the private key components.
r [2]uint64
@@ -97,11 +99,12 @@ const (
rMask1 = 0x0FFFFFFC0FFFFFFC
)
-func initialize(key *[32]byte, r, s *[2]uint64) {
- r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
- r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
- s[0] = binary.LittleEndian.Uint64(key[16:24])
- s[1] = binary.LittleEndian.Uint64(key[24:32])
+// initialize loads the 256-bit key into the two 128-bit secret values r and s.
+func initialize(key *[32]byte, m *macState) {
+ m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
+ m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
+ m.s[0] = binary.LittleEndian.Uint64(key[16:24])
+ m.s[1] = binary.LittleEndian.Uint64(key[24:32])
}
// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
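The rMask constants implement RFC 8439 "clamping": the top four bits of bytes
3, 7, 11 and 15 of r and the bottom two bits of bytes 4, 8 and 12 are cleared.
In the two 64-bit limbs that is rMask0 = 0x0FFFFFFC0FFFFFFF (cut off in the
context above) and rMask1 = 0x0FFFFFFC0FFFFFFC. A sketch of what initialize
computes:

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	func main() {
		const (
			rMask0 = 0x0FFFFFFC0FFFFFFF
			rMask1 = 0x0FFFFFFC0FFFFFFC
		)
		key := make([]byte, 32)
		for i := range key {
			key[i] = 0xff
		}
		// r is clamped from key[0:16]; s is taken verbatim from key[16:32].
		r0 := binary.LittleEndian.Uint64(key[0:8]) & rMask0
		r1 := binary.LittleEndian.Uint64(key[8:16]) & rMask1
		fmt.Printf("%x %x\n", r0, r1) // ffffffc0fffffff ffffffc0ffffffc
	}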
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_noasm.go b/src/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
deleted file mode 100644
index 2e3ae34c7d..0000000000
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,!go1.11 !amd64,!s390x,!ppc64le gccgo purego
-
-package poly1305
-
-func sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
- h := newMAC(key)
- h.Write(msg)
- h.Sum(out)
-}
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go b/src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
index 92597bb8c2..2e7a120b19 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
@@ -9,17 +9,6 @@ package poly1305
//go:noescape
func update(state *macState, msg []byte)
-func sum(out *[16]byte, m []byte, key *[32]byte) {
- h := newMAC(key)
- h.Write(m)
- h.Sum(out)
-}
-
-func newMAC(key *[32]byte) (h mac) {
- initialize(key, &h.r, &h.s)
- return
-}
-
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go b/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
index 5f91ff84a9..958fedc079 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build go1.11,!gccgo,!purego
+// +build !gccgo,!purego
package poly1305
@@ -10,30 +10,66 @@ import (
"golang.org/x/sys/cpu"
)
-// poly1305vx is an assembly implementation of Poly1305 that uses vector
+// updateVX is an assembly implementation of Poly1305 that uses vector
// instructions. It must only be called if the vector facility (vx) is
// available.
//go:noescape
-func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+func updateVX(state *macState, msg []byte)
-// poly1305vmsl is an assembly implementation of Poly1305 that uses vector
-// instructions, including VMSL. It must only be called if the vector facility (vx) is
-// available and if VMSL is supported.
-//go:noescape
-func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+// mac is a replacement for macGeneric that uses a larger buffer and redirects
+// calls that would have gone to updateGeneric to updateVX if the vector
+// facility is installed.
+//
+// A larger buffer is required for good performance because the vector
+// implementation has a higher fixed cost per call than the generic
+// implementation.
+type mac struct {
+ macState
+
+ buffer [16 * TagSize]byte // size must be a multiple of block size (16)
+ offset int
+}
-func sum(out *[16]byte, m []byte, key *[32]byte) {
- if cpu.S390X.HasVX {
- var mPtr *byte
- if len(m) > 0 {
- mPtr = &m[0]
+func (h *mac) Write(p []byte) (int, error) {
+ nn := len(p)
+ if h.offset > 0 {
+ n := copy(h.buffer[h.offset:], p)
+ if h.offset+n < len(h.buffer) {
+ h.offset += n
+ return nn, nil
}
- if cpu.S390X.HasVXE && len(m) > 256 {
- poly1305vmsl(out, mPtr, uint64(len(m)), key)
+ p = p[n:]
+ h.offset = 0
+ if cpu.S390X.HasVX {
+ updateVX(&h.macState, h.buffer[:])
} else {
- poly1305vx(out, mPtr, uint64(len(m)), key)
+ updateGeneric(&h.macState, h.buffer[:])
}
- } else {
- sumGeneric(out, m, key)
}
+
+ tail := len(p) % len(h.buffer) // number of bytes to copy into buffer
+ body := len(p) - tail // number of bytes to process now
+ if body > 0 {
+ if cpu.S390X.HasVX {
+ updateVX(&h.macState, p[:body])
+ } else {
+ updateGeneric(&h.macState, p[:body])
+ }
+ }
+ h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0
+ return nn, nil
+}
+
+func (h *mac) Sum(out *[TagSize]byte) {
+ state := h.macState
+ remainder := h.buffer[:h.offset]
+
+ // Use the generic implementation if we have 2 or fewer blocks left
+ // to sum. The vector implementation has a higher startup time.
+ if cpu.S390X.HasVX && len(remainder) > 2*TagSize {
+ updateVX(&state, remainder)
+ } else if len(remainder) > 0 {
+ updateGeneric(&state, remainder)
+ }
+ finalize(out, &state.h, &state.s)
}
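The Write logic above is the standard buffered-writer pattern: top up and flush
the internal buffer, hand the bulk of the input to the block function directly,
and stash the tail for the next call. A self-contained sketch of the same
control flow (bufWriter and its sizes are illustrative stand-ins):

	package main

	import "fmt"

	type bufWriter struct {
		buffer [64]byte     // stand-in for the 16 * TagSize buffer
		offset int          // buffered byte count
		flush  func([]byte) // stand-in for updateVX/updateGeneric
	}

	func (w *bufWriter) Write(p []byte) (int, error) {
		nn := len(p)
		if w.offset > 0 {
			n := copy(w.buffer[w.offset:], p)
			if w.offset+n < len(w.buffer) {
				w.offset += n
				return nn, nil // buffer not yet full
			}
			p = p[n:]
			w.offset = 0
			w.flush(w.buffer[:])
		}
		tail := len(p) % len(w.buffer) // bytes to keep for next call
		if body := len(p) - tail; body > 0 {
			w.flush(p[:body]) // process the bulk directly
		}
		w.offset = copy(w.buffer[:], p[len(p)-tail:])
		return nn, nil
	}

	func main() {
		w := &bufWriter{flush: func(b []byte) { fmt.Println("flush", len(b)) }}
		w.Write(make([]byte, 100)) // flush 64, keep 36 buffered
		fmt.Println("buffered:", w.offset)
	}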
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s b/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
index 806d1694b0..0fa9ee6e0b 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
@@ -2,115 +2,187 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build go1.11,!gccgo,!purego
+// +build !gccgo,!purego
#include "textflag.h"
-// Implementation of Poly1305 using the vector facility (vx).
-
-// constants
-#define MOD26 V0
-#define EX0 V1
-#define EX1 V2
-#define EX2 V3
-
-// temporaries
-#define T_0 V4
-#define T_1 V5
-#define T_2 V6
-#define T_3 V7
-#define T_4 V8
-
-// key (r)
-#define R_0 V9
-#define R_1 V10
-#define R_2 V11
-#define R_3 V12
-#define R_4 V13
-#define R5_1 V14
-#define R5_2 V15
-#define R5_3 V16
-#define R5_4 V17
-#define RSAVE_0 R5
-#define RSAVE_1 R6
-#define RSAVE_2 R7
-#define RSAVE_3 R8
-#define RSAVE_4 R9
-#define R5SAVE_1 V28
-#define R5SAVE_2 V29
-#define R5SAVE_3 V30
-#define R5SAVE_4 V31
-
-// message block
-#define F_0 V18
-#define F_1 V19
-#define F_2 V20
-#define F_3 V21
-#define F_4 V22
-
-// accumulator
-#define H_0 V23
-#define H_1 V24
-#define H_2 V25
-#define H_3 V26
-#define H_4 V27
-
-GLOBL ·keyMask<>(SB), RODATA, $16
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
-
-GLOBL ·bswapMask<>(SB), RODATA, $16
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
-
-GLOBL ·constants<>(SB), RODATA, $64
-// MOD26
-DATA ·constants<>+0(SB)/8, $0x3ffffff
-DATA ·constants<>+8(SB)/8, $0x3ffffff
+// This implementation of Poly1305 uses the vector facility (vx)
+// to process up to 2 blocks (32 bytes) per iteration using an
+// algorithm based on the one described in:
+//
+// NEON crypto, Daniel J. Bernstein & Peter Schwabe
+// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+//
+// This algorithm uses 5 26-bit limbs to represent a 130-bit
+// value. These limbs are, for the most part, zero extended and
+// placed into 64-bit vector register elements. Each vector
+// register is 128-bits wide and so holds 2 of these elements.
+// Using 26-bit limbs allows us plenty of headroom to accommodate
+// accumulations before and after multiplication without
+// overflowing either 32-bits (before multiplication) or 64-bits
+// (after multiplication).
+//
+// In order to parallelise the operations required to calculate
+// the sum we use two separate accumulators and then sum those
+// in an extra final step. For compatibility with the generic
+// implementation we perform this summation at the end of every
+// updateVX call.
+//
+// To use two accumulators we must multiply the message blocks
+// by r² rather than r. Only the final message block should be
+// multiplied by r.
+//
+// Example:
+//
+// We want to calculate the sum (h) for a 64 byte message (m):
+//
+// h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r
+//
+// To do this we split the calculation into the even indices
+// and odd indices of the message. These form our SIMD 'lanes':
+//
+// h = m[ 0:16]r⁴ + m[32:48]r² + <- lane 0
+// m[16:32]r³ + m[48:64]r <- lane 1
+//
+// To calculate this iteratively we refactor so that both lanes
+// are written in terms of r² and r:
+//
+// h = (m[ 0:16]r² + m[32:48])r² + <- lane 0
+// (m[16:32]r² + m[48:64])r <- lane 1
+// ^ ^
+// | coefficients for second iteration
+// coefficients for first iteration
+//
+// So in this case we would have two iterations. In the first
+// both lanes are multiplied by r². In the second only the
+// first lane is multiplied by r² and the second lane is
+// instead multiplied by r. This gives us the odd and even
+// powers of r that we need from the original equation.
+//
+// Notation:
+//
+// h - accumulator
+// r - key
+// m - message
+//
+// [a, b] - SIMD register holding two 64-bit values
+// [a, b, c, d] - SIMD register holding four 32-bit values
+// xᵢ[n] - limb n of variable x with bit width i
+//
+// Limbs are expressed in little endian order, so for 26-bit
+// limbs x₂₆[4] will be the most significant limb and x₂₆[0]
+// will be the least significant limb.
+
+// masking constants
+#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits
+#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits
+
+// expansion constants (see EXPAND macro)
+#define EX0 V2
+#define EX1 V3
+#define EX2 V4
+
+// key (r², r or 1 depending on context)
+#define R_0 V5
+#define R_1 V6
+#define R_2 V7
+#define R_3 V8
+#define R_4 V9
+
+// precalculated coefficients (5r², 5r or 0 depending on context)
+#define R5_1 V10
+#define R5_2 V11
+#define R5_3 V12
+#define R5_4 V13
+
+// message block (m)
+#define M_0 V14
+#define M_1 V15
+#define M_2 V16
+#define M_3 V17
+#define M_4 V18
+
+// accumulator (h)
+#define H_0 V19
+#define H_1 V20
+#define H_2 V21
+#define H_3 V22
+#define H_4 V23
+
+// temporary registers (for short-lived values)
+#define T_0 V24
+#define T_1 V25
+#define T_2 V26
+#define T_3 V27
+#define T_4 V28
+
+GLOBL ·constants<>(SB), RODATA, $0x30
// EX0
-DATA ·constants<>+16(SB)/8, $0x0006050403020100
-DATA ·constants<>+24(SB)/8, $0x1016151413121110
+DATA ·constants<>+0x00(SB)/8, $0x0006050403020100
+DATA ·constants<>+0x08(SB)/8, $0x1016151413121110
// EX1
-DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
-DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
+DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706
+DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716
// EX2
-DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
-DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
-
-// h = (f*g) % (2**130-5) [partial reduction]
+DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d
+DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d
+
+// MULTIPLY multiplies each lane of f and g, partially reduced
+// modulo 2¹³⁰ - 5. The result, h, consists of partial products
+// in each lane that need to be reduced further to produce the
+// final result.
+//
+// h₁₃₀ = (f₁₃₀g₁₃₀) % 2¹³⁰ + (5f₁₃₀g₁₃₀) / 2¹³⁰
+//
+// Note that the multiplication by 5 of the high bits is
+// achieved by precalculating the multiplication of four of the
+// g coefficients by 5. These are g51-g54.
#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
VMLOF f0, g0, h0 \
- VMLOF f0, g1, h1 \
- VMLOF f0, g2, h2 \
VMLOF f0, g3, h3 \
+ VMLOF f0, g1, h1 \
VMLOF f0, g4, h4 \
+ VMLOF f0, g2, h2 \
VMLOF f1, g54, T_0 \
- VMLOF f1, g0, T_1 \
- VMLOF f1, g1, T_2 \
VMLOF f1, g2, T_3 \
+ VMLOF f1, g0, T_1 \
VMLOF f1, g3, T_4 \
+ VMLOF f1, g1, T_2 \
VMALOF f2, g53, h0, h0 \
- VMALOF f2, g54, h1, h1 \
- VMALOF f2, g0, h2, h2 \
VMALOF f2, g1, h3, h3 \
+ VMALOF f2, g54, h1, h1 \
VMALOF f2, g2, h4, h4 \
+ VMALOF f2, g0, h2, h2 \
VMALOF f3, g52, T_0, T_0 \
- VMALOF f3, g53, T_1, T_1 \
- VMALOF f3, g54, T_2, T_2 \
VMALOF f3, g0, T_3, T_3 \
+ VMALOF f3, g53, T_1, T_1 \
VMALOF f3, g1, T_4, T_4 \
+ VMALOF f3, g54, T_2, T_2 \
VMALOF f4, g51, h0, h0 \
- VMALOF f4, g52, h1, h1 \
- VMALOF f4, g53, h2, h2 \
VMALOF f4, g54, h3, h3 \
+ VMALOF f4, g52, h1, h1 \
VMALOF f4, g0, h4, h4 \
+ VMALOF f4, g53, h2, h2 \
VAG T_0, h0, h0 \
- VAG T_1, h1, h1 \
- VAG T_2, h2, h2 \
VAG T_3, h3, h3 \
- VAG T_4, h4, h4
-
-// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
+ VAG T_1, h1, h1 \
+ VAG T_4, h4, h4 \
+ VAG T_2, h2, h2
+
+// REDUCE performs the following carry operations in four
+// stages, as specified in Bernstein & Schwabe:
+//
+// 1: h₂₆[0]->h₂₆[1] h₂₆[3]->h₂₆[4]
+// 2: h₂₆[1]->h₂₆[2] h₂₆[4]->h₂₆[0]
+// 3: h₂₆[0]->h₂₆[1] h₂₆[2]->h₂₆[3]
+// 4: h₂₆[3]->h₂₆[4]
+//
+// The result is that all of the limbs are limited to 26-bits
+// except for h₂₆[1] and h₂₆[4] which are limited to 27-bits.
+//
+// Note that although each limb is aligned at 26-bit intervals
+// they may contain values that exceed 2²⁶ - 1, hence the need
+// to carry the excess bits in each limb.
#define REDUCE(h0, h1, h2, h3, h4) \
VESRLG $26, h0, T_0 \
VESRLG $26, h3, T_1 \
@@ -136,144 +208,155 @@ DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
VN MOD26, h3, h3 \
VAG T_2, h4, h4
-// expand in0 into d[0] and in1 into d[1]
+// EXPAND splits the 128-bit little-endian values in0 and in1
+// into 26-bit big-endian limbs and places the results into
+// the first and second lane of d₂₆[0:4] respectively.
+//
+// The EX0, EX1 and EX2 constants are arrays of byte indices
+// for permutation. The permutation both reverses the bytes
+// in the input and ensures the bytes are copied into the
+// destination limb ready to be shifted into their final
+// position.
#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
- VGBM $0x0707, d1 \ // d1=tmp
- VPERM in0, in1, EX2, d4 \
VPERM in0, in1, EX0, d0 \
VPERM in0, in1, EX1, d2 \
- VN d1, d4, d4 \
+ VPERM in0, in1, EX2, d4 \
VESRLG $26, d0, d1 \
VESRLG $30, d2, d3 \
VESRLG $4, d2, d2 \
- VN MOD26, d0, d0 \
- VN MOD26, d1, d1 \
- VN MOD26, d2, d2 \
- VN MOD26, d3, d3
-
-// pack h4:h0 into h1:h0 (no carry)
-#define PACK(h0, h1, h2, h3, h4) \
- VESLG $26, h1, h1 \
- VESLG $26, h3, h3 \
- VO h0, h1, h0 \
- VO h2, h3, h2 \
- VESLG $4, h2, h2 \
- VLEIB $7, $48, h1 \
- VSLB h1, h2, h2 \
- VO h0, h2, h0 \
- VLEIB $7, $104, h1 \
- VSLB h1, h4, h3 \
- VO h3, h0, h0 \
- VLEIB $7, $24, h1 \
- VSRLB h1, h4, h1
-
-// if h > 2**130-5 then h -= 2**130-5
-#define MOD(h0, h1, t0, t1, t2) \
- VZERO t0 \
- VLEIG $1, $5, t0 \
- VACCQ h0, t0, t1 \
- VAQ h0, t0, t0 \
- VONE t2 \
- VLEIG $1, $-4, t2 \
- VAQ t2, t1, t1 \
- VACCQ h1, t1, t1 \
- VONE t2 \
- VAQ t2, t1, t1 \
- VN h0, t1, t2 \
- VNC t0, t1, t1 \
- VO t1, t2, h0
-
-// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]key)
-TEXT ·poly1305vx(SB), $0-32
- // This code processes up to 2 blocks (32 bytes) per iteration
- // using the algorithm described in:
- // NEON crypto, Daniel J. Bernstein & Peter Schwabe
- // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
- LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
-
- // load MOD26, EX0, EX1 and EX2
+ VN MOD26, d0, d0 \ // [in0₂₆[0], in1₂₆[0]]
+ VN MOD26, d3, d3 \ // [in0₂₆[3], in1₂₆[3]]
+ VN MOD26, d1, d1 \ // [in0₂₆[1], in1₂₆[1]]
+ VN MOD24, d4, d4 \ // [in0₂₆[4], in1₂₆[4]]
+ VN MOD26, d2, d2 // [in0₂₆[2], in1₂₆[2]]
+
+// func updateVX(state *macState, msg []byte)
+TEXT ·updateVX(SB), NOSPLIT, $0
+ MOVD state+0(FP), R1
+ LMG msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len
+
+ // load EX0, EX1 and EX2
MOVD $·constants<>(SB), R5
- VLM (R5), MOD26, EX2
-
- // setup r
- VL (R4), T_0
- MOVD $·keyMask<>(SB), R6
- VL (R6), T_1
- VN T_0, T_1, T_0
- EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
-
- // setup r*5
- VLEIG $0, $5, T_0
- VLEIG $1, $5, T_0
-
- // store r (for final block)
- VMLOF T_0, R_1, R5SAVE_1
- VMLOF T_0, R_2, R5SAVE_2
- VMLOF T_0, R_3, R5SAVE_3
- VMLOF T_0, R_4, R5SAVE_4
- VLGVG $0, R_0, RSAVE_0
- VLGVG $0, R_1, RSAVE_1
- VLGVG $0, R_2, RSAVE_2
- VLGVG $0, R_3, RSAVE_3
- VLGVG $0, R_4, RSAVE_4
-
- // skip r**2 calculation
+ VLM (R5), EX0, EX2
+
+ // generate masks
+ VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff]
+ VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff]
+
+ // load h (accumulator) and r (key) from state
+ VZERO T_1 // [0, 0]
+ VL 0(R1), T_0 // [h₆₄[0], h₆₄[1]]
+ VLEG $0, 16(R1), T_1 // [h₆₄[2], 0]
+ VL 24(R1), T_2 // [r₆₄[0], r₆₄[1]]
+ VPDI $0, T_0, T_2, T_3 // [h₆₄[0], r₆₄[0]]
+ VPDI $5, T_0, T_2, T_4 // [h₆₄[1], r₆₄[1]]
+
+ // unpack h and r into 26-bit limbs
+ // note: h₆₄[2] may have the low 3 bits set, so h₂₆[4] is a 27-bit value
+ VN MOD26, T_3, H_0 // [h₂₆[0], r₂₆[0]]
+ VZERO H_1 // [0, 0]
+ VZERO H_3 // [0, 0]
+ VGMG $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out
+ VESLG $24, T_1, T_1 // [h₆₄[2]<<24, 0]
+ VERIMG $-26&63, T_3, MOD26, H_1 // [h₂₆[1], r₂₆[1]]
+ VESRLG $+52&63, T_3, H_2 // [h₂₆[2], r₂₆[2]] - low 12 bits only
+ VERIMG $-14&63, T_4, MOD26, H_3 // [h₂₆[3], r₂₆[3]]
+ VESRLG $40, T_4, H_4 // [h₂₆[4], r₂₆[4]] - low 24 bits only
+ VERIMG $+12&63, T_4, T_0, H_2 // [h₂₆[2], r₂₆[2]] - complete
+ VO T_1, H_4, H_4 // [h₂₆[4], r₂₆[4]] - complete
+
+ // replicate r across all 4 vector elements
+ VREPF $3, H_0, R_0 // [r₂₆[0], r₂₆[0], r₂₆[0], r₂₆[0]]
+ VREPF $3, H_1, R_1 // [r₂₆[1], r₂₆[1], r₂₆[1], r₂₆[1]]
+ VREPF $3, H_2, R_2 // [r₂₆[2], r₂₆[2], r₂₆[2], r₂₆[2]]
+ VREPF $3, H_3, R_3 // [r₂₆[3], r₂₆[3], r₂₆[3], r₂₆[3]]
+ VREPF $3, H_4, R_4 // [r₂₆[4], r₂₆[4], r₂₆[4], r₂₆[4]]
+
+ // zero out lane 1 of h
+ VLEIG $1, $0, H_0 // [h₂₆[0], 0]
+ VLEIG $1, $0, H_1 // [h₂₆[1], 0]
+ VLEIG $1, $0, H_2 // [h₂₆[2], 0]
+ VLEIG $1, $0, H_3 // [h₂₆[3], 0]
+ VLEIG $1, $0, H_4 // [h₂₆[4], 0]
+
+ // calculate 5r (ignore least significant limb)
+ VREPIF $5, T_0
+ VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r₂₆[1], 5r₂₆[1], 5r₂₆[1]]
+ VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r₂₆[2], 5r₂₆[2], 5r₂₆[2]]
+ VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r₂₆[3], 5r₂₆[3], 5r₂₆[3]]
+ VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r₂₆[4], 5r₂₆[4], 5r₂₆[4]]
+
+ // skip r² calculation if we are only calculating one block
CMPBLE R3, $16, skip
- // calculate r**2
- MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
- REDUCE(H_0, H_1, H_2, H_3, H_4)
- VLEIG $0, $5, T_0
- VLEIG $1, $5, T_0
- VMLOF T_0, H_1, R5_1
- VMLOF T_0, H_2, R5_2
- VMLOF T_0, H_3, R5_3
- VMLOF T_0, H_4, R5_4
- VLR H_0, R_0
- VLR H_1, R_1
- VLR H_2, R_2
- VLR H_3, R_3
- VLR H_4, R_4
-
- // initialize h
- VZERO H_0
- VZERO H_1
- VZERO H_2
- VZERO H_3
- VZERO H_4
+ // calculate r²
+ MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4)
+ REDUCE(M_0, M_1, M_2, M_3, M_4)
+ VGBM $0x0f0f, T_0
+ VERIMG $0, M_0, T_0, R_0 // [r₂₆[0], r²₂₆[0], r₂₆[0], r²₂₆[0]]
+ VERIMG $0, M_1, T_0, R_1 // [r₂₆[1], r²₂₆[1], r₂₆[1], r²₂₆[1]]
+ VERIMG $0, M_2, T_0, R_2 // [r₂₆[2], r²₂₆[2], r₂₆[2], r²₂₆[2]]
+ VERIMG $0, M_3, T_0, R_3 // [r₂₆[3], r²₂₆[3], r₂₆[3], r²₂₆[3]]
+ VERIMG $0, M_4, T_0, R_4 // [r₂₆[4], r²₂₆[4], r₂₆[4], r²₂₆[4]]
+
+ // calculate 5r² (ignore least significant limb)
+ VREPIF $5, T_0
+ VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r²₂₆[1], 5r₂₆[1], 5r²₂₆[1]]
+ VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r²₂₆[2], 5r₂₆[2], 5r²₂₆[2]]
+ VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r²₂₆[3], 5r₂₆[3], 5r²₂₆[3]]
+ VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r²₂₆[4], 5r₂₆[4], 5r²₂₆[4]]
loop:
- CMPBLE R3, $32, b2
- VLM (R2), T_0, T_1
- SUB $32, R3
- MOVD $32(R2), R2
- EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
- VLEIB $4, $1, F_4
- VLEIB $12, $1, F_4
+ CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients
+
+ // load next 2 blocks from message
+ VLM (R2), T_0, T_1
+
+ // update message slice
+ SUB $32, R3
+ MOVD $32(R2), R2
+
+ // unpack message blocks into 26-bit big-endian limbs
+ EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
+
+ // add 2¹²⁸ to each message block value
+ VLEIB $4, $1, M_4
+ VLEIB $12, $1, M_4
multiply:
- VAG H_0, F_0, F_0
- VAG H_1, F_1, F_1
- VAG H_2, F_2, F_2
- VAG H_3, F_3, F_3
- VAG H_4, F_4, F_4
- MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
+ // accumulate the incoming message
+ VAG H_0, M_0, M_0
+ VAG H_3, M_3, M_3
+ VAG H_1, M_1, M_1
+ VAG H_4, M_4, M_4
+ VAG H_2, M_2, M_2
+
+ // multiply the accumulator by the key coefficient
+ MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
+
+ // carry and partially reduce the partial products
REDUCE(H_0, H_1, H_2, H_3, H_4)
+
CMPBNE R3, $0, loop
finish:
- // sum vectors
+ // sum lane 0 and lane 1 and put the result in lane 1
VZERO T_0
VSUMQG H_0, T_0, H_0
- VSUMQG H_1, T_0, H_1
- VSUMQG H_2, T_0, H_2
VSUMQG H_3, T_0, H_3
+ VSUMQG H_1, T_0, H_1
VSUMQG H_4, T_0, H_4
+ VSUMQG H_2, T_0, H_2
- // h may be >= 2*(2**130-5) so we need to reduce it again
+ // reduce again after summation
+ // TODO(mundaym): there might be a more efficient way to do this
+ // now that we only have 1 active lane. For example, we could
+ // simultaneously pack the values as we reduce them.
REDUCE(H_0, H_1, H_2, H_3, H_4)
- // carry h1->h4
+ // carry h[1] through to h[4] so that only h[4] can exceed 2²⁶ - 1
+ // TODO(mundaym): in testing this final carry was unnecessary.
+ // Needs a proof before it can be removed though.
VESRLG $26, H_1, T_1
VN MOD26, H_1, H_1
VAQ T_1, H_2, H_2
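The carry chain REDUCE implements is easier to follow in scalar form: each
limb's excess bits move up one limb, and the bits carried out of the top limb
wrap around multiplied by 5, since 2¹³⁰ ≡ 5 (mod 2¹³⁰ - 5). A serial Go sketch
of one pass (the vector code interleaves the stages differently):

	package main

	import "fmt"

	// carry is a scalar model of one REDUCE pass over five 26-bit
	// limbs: excess bits move into the next limb, and bits carried
	// out of limb 4 wrap around multiplied by 5.
	func carry(h *[5]uint64) {
		const mask26 = 1<<26 - 1
		for i := 0; i < 4; i++ {
			h[i+1] += h[i] >> 26
			h[i] &= mask26
		}
		h[0] += 5 * (h[4] >> 26)
		h[4] &= mask26
	}

	func main() {
		h := [5]uint64{1 << 30, 0, 0, 0, 1 << 28}
		carry(&h)
		fmt.Println(h) // [20 16 0 0 0]
	}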
@@ -284,95 +367,137 @@ finish:
VN MOD26, H_3, H_3
VAQ T_3, H_4, H_4
- // h is now < 2*(2**130-5)
- // pack h into h1 (hi) and h0 (lo)
- PACK(H_0, H_1, H_2, H_3, H_4)
-
- // if h > 2**130-5 then h -= 2**130-5
- MOD(H_0, H_1, T_0, T_1, T_2)
-
- // h += s
- MOVD $·bswapMask<>(SB), R5
- VL (R5), T_1
- VL 16(R4), T_0
- VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
- VAQ T_0, H_0, H_0
- VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little)
- VST H_0, (R1)
-
+ // h is now < 2(2¹³⁰ - 5)
+ // Pack each lane in h₂₆[0:4] into h₁₂₈[0:1].
+ VESLG $26, H_1, H_1
+ VESLG $26, H_3, H_3
+ VO H_0, H_1, H_0
+ VO H_2, H_3, H_2
+ VESLG $4, H_2, H_2
+ VLEIB $7, $48, H_1
+ VSLB H_1, H_2, H_2
+ VO H_0, H_2, H_0
+ VLEIB $7, $104, H_1
+ VSLB H_1, H_4, H_3
+ VO H_3, H_0, H_0
+ VLEIB $7, $24, H_1
+ VSRLB H_1, H_4, H_1
+
+ // update state
+ VSTEG $1, H_0, 0(R1)
+ VSTEG $0, H_0, 8(R1)
+ VSTEG $1, H_1, 16(R1)
RET
-b2:
+b2: // 2 or fewer blocks remaining
CMPBLE R3, $16, b1
- // 2 blocks remaining
- SUB $17, R3
- VL (R2), T_0
- VLL R3, 16(R2), T_1
- ADD $1, R3
+ // Load the 2 remaining blocks (17-32 bytes remaining).
+ MOVD $-17(R3), R0 // index of final byte to load modulo 16
+ VL (R2), T_0 // load full 16 byte block
+ VLL R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes
+
+ // The Poly1305 algorithm requires that a 1 bit be appended to
+ // each message block. If the final block is less than 16 bytes
+ // long then it is easiest to insert the 1 before the message
+ // block is split into 26-bit limbs. If, on the other hand, the
+ // final message block is 16 bytes long then we append the 1 bit
+ // after expansion as normal.
MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, T_1
- EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
+ MOVD $-16(R3), R3 // index of byte in last block to insert 1 at (could be 16)
+ CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long
+ VLVGB R3, R0, T_1 // insert 1 into the byte at index R3
+
+ // Split both blocks into 26-bit limbs in the appropriate lanes.
+ EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
+
+ // Append a 1 byte to the end of the second to last block.
+ VLEIB $4, $1, M_4
+
+ // Append a 1 byte to the end of the last block only if it is a
+ // full 16 byte block.
CMPBNE R3, $16, 2(PC)
- VLEIB $12, $1, F_4
- VLEIB $4, $1, F_4
-
- // setup [r²,r]
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, RSAVE_3, R_3
- VLVGG $1, RSAVE_4, R_4
- VPDI $0, R5_1, R5SAVE_1, R5_1
- VPDI $0, R5_2, R5SAVE_2, R5_2
- VPDI $0, R5_3, R5SAVE_3, R5_3
- VPDI $0, R5_4, R5SAVE_4, R5_4
+ VLEIB $12, $1, M_4
+
+ // Finally, set up the coefficients for the final multiplication.
+ // We have previously saved r and 5r in the 32-bit even indexes
+ // of the R_[0-4] and R5_[1-4] coefficient registers.
+ //
+ // We want lane 0 to be multiplied by r² so that it can be kept the
+ // same. We want lane 1 to be multiplied by r so we need to move
+ // the saved r value into the 32-bit odd index in lane 1 by
+ // rotating the 64-bit lane by 32.
+ VGBM $0x00ff, T_0 // [0, 0xffffffffffffffff] - mask lane 1 only
+ VERIMG $32, R_0, T_0, R_0 // [_, r²₂₆[0], _, r₂₆[0]]
+ VERIMG $32, R_1, T_0, R_1 // [_, r²₂₆[1], _, r₂₆[1]]
+ VERIMG $32, R_2, T_0, R_2 // [_, r²₂₆[2], _, r₂₆[2]]
+ VERIMG $32, R_3, T_0, R_3 // [_, r²₂₆[3], _, r₂₆[3]]
+ VERIMG $32, R_4, T_0, R_4 // [_, r²₂₆[4], _, r₂₆[4]]
+ VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²₂₆[1], _, 5r₂₆[1]]
+ VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²₂₆[2], _, 5r₂₆[2]]
+ VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²₂₆[3], _, 5r₂₆[3]]
+ VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²₂₆[4], _, 5r₂₆[4]]
MOVD $0, R3
BR multiply
skip:
- VZERO H_0
- VZERO H_1
- VZERO H_2
- VZERO H_3
- VZERO H_4
-
CMPBEQ R3, $0, finish
-b1:
- // 1 block remaining
- SUB $1, R3
- VLL R3, (R2), T_0
- ADD $1, R3
+b1: // 1 block remaining
+
+ // Load the final block (1-16 bytes). This will be placed into
+ // lane 0.
+ MOVD $-1(R3), R0
+ VLL R0, (R2), T_0 // pad to 16 bytes with zeros
+
+ // The Poly1305 algorithm requires that a 1 bit be appended to
+ // each message block. If the final block is less than 16 bytes
+ // long then it is easiest to insert the 1 before the message
+ // block is split into 26-bit limbs. If, on the other hand, the
+ // final message block is 16 bytes long then we append the 1 bit
+ // after expansion as normal.
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, T_0
- VZERO T_1
- EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
+
+ // Set the message block in lane 1 to the value 0 so that it
+ // can be accumulated without affecting the final result.
+ VZERO T_1
+
+ // Split the final message block into 26-bit limbs in lane 0.
+ // Lane 1 will contain 0.
+ EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
+
+ // Append a 1 byte to the end of the last block only if it is a
+ // full 16 byte block.
CMPBNE R3, $16, 2(PC)
- VLEIB $4, $1, F_4
- VLEIG $1, $1, R_0
- VZERO R_1
- VZERO R_2
- VZERO R_3
- VZERO R_4
- VZERO R5_1
- VZERO R5_2
- VZERO R5_3
- VZERO R5_4
-
- // setup [r, 1]
- VLVGG $0, RSAVE_0, R_0
- VLVGG $0, RSAVE_1, R_1
- VLVGG $0, RSAVE_2, R_2
- VLVGG $0, RSAVE_3, R_3
- VLVGG $0, RSAVE_4, R_4
- VPDI $0, R5SAVE_1, R5_1, R5_1
- VPDI $0, R5SAVE_2, R5_2, R5_2
- VPDI $0, R5SAVE_3, R5_3, R5_3
- VPDI $0, R5SAVE_4, R5_4, R5_4
+ VLEIB $4, $1, M_4
+
+ // We have previously saved r and 5r in the 32-bit even indexes
+ // of the R_[0-4] and R5_[1-4] coefficient registers.
+ //
+ // We want lane 0 to be multiplied by r so we need to move the
+ // saved r value into the 32-bit odd index in lane 0. We want
+ // lane 1 to be set to the value 1. This makes multiplication
+ // a no-op. We do this by setting lane 1 in every register to 0
+ // and then just setting the 32-bit index 3 in R_0 to 1.
+ VZERO T_0
+ MOVD $0, R0
+ MOVD $0x10111213, R12
+ VLVGP R12, R0, T_1 // [_, 0x10111213, _, 0x00000000]
+ VPERM T_0, R_0, T_1, R_0 // [_, r₂₆[0], _, 0]
+ VPERM T_0, R_1, T_1, R_1 // [_, r₂₆[1], _, 0]
+ VPERM T_0, R_2, T_1, R_2 // [_, r₂₆[2], _, 0]
+ VPERM T_0, R_3, T_1, R_3 // [_, r₂₆[3], _, 0]
+ VPERM T_0, R_4, T_1, R_4 // [_, r₂₆[4], _, 0]
+ VPERM T_0, R5_1, T_1, R5_1 // [_, 5r₂₆[1], _, 0]
+ VPERM T_0, R5_2, T_1, R5_2 // [_, 5r₂₆[2], _, 0]
+ VPERM T_0, R5_3, T_1, R5_3 // [_, 5r₂₆[3], _, 0]
+ VPERM T_0, R5_4, T_1, R5_4 // [_, 5r₂₆[4], _, 0]
+
+ // Set the value of lane 1 to be 1.
+ VLEIF $3, $1, R_0 // [_, r₂₆[0], _, 1]
MOVD $0, R3
BR multiply
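Both the b2 and b1 paths are inserting the "1" byte that Poly1305 appends to
every message block, so each block contributes block + 2^(8·len) to the
accumulator before the multiply. A math/big reference of the per-block step
(update and reverse are illustrative helpers, not part of the package):

	package main

	import (
		"fmt"
		"math/big"
	)

	// update models the per-block step the assembly implements:
	// interpret the block as a little-endian number, set the bit
	// just past its end (the appended 1), add it to the
	// accumulator, and multiply by r modulo 2¹³⁰ - 5.
	func update(h, r *big.Int, block []byte) *big.Int {
		p := new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 130), big.NewInt(5))
		m := new(big.Int).SetBytes(reverse(block))
		m.SetBit(m, 8*len(block), 1) // the appended 1 bit
		h = new(big.Int).Add(h, m)
		return h.Mod(h.Mul(h, r), p)
	}

	func reverse(b []byte) []byte {
		out := make([]byte, len(b))
		for i, v := range b {
			out[len(b)-1-i] = v
		}
		return out
	}

	func main() {
		h, r := big.NewInt(0), big.NewInt(99) // r is an arbitrary stand-in
		h = update(h, r, []byte("0123456789abcdef")) // full block: adds 2¹²⁸
		h = update(h, r, []byte("tail"))             // short final block
		fmt.Println(h.BitLen() <= 130)               // true
	}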
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s b/src/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
deleted file mode 100644
index b439af9369..0000000000
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
+++ /dev/null
@@ -1,909 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.11,!gccgo,!purego
-
-#include "textflag.h"
-
-// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
-
-// constants
-#define EX0 V1
-#define EX1 V2
-#define EX2 V3
-
-// temporaries
-#define T_0 V4
-#define T_1 V5
-#define T_2 V6
-#define T_3 V7
-#define T_4 V8
-#define T_5 V9
-#define T_6 V10
-#define T_7 V11
-#define T_8 V12
-#define T_9 V13
-#define T_10 V14
-
-// r**2 & r**4
-#define R_0 V15
-#define R_1 V16
-#define R_2 V17
-#define R5_1 V18
-#define R5_2 V19
-// key (r)
-#define RSAVE_0 R7
-#define RSAVE_1 R8
-#define RSAVE_2 R9
-#define R5SAVE_1 R10
-#define R5SAVE_2 R11
-
-// message block
-#define M0 V20
-#define M1 V21
-#define M2 V22
-#define M3 V23
-#define M4 V24
-#define M5 V25
-
-// accumulator
-#define H0_0 V26
-#define H1_0 V27
-#define H2_0 V28
-#define H0_1 V29
-#define H1_1 V30
-#define H2_1 V31
-
-GLOBL ·keyMask<>(SB), RODATA, $16
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
-
-GLOBL ·bswapMask<>(SB), RODATA, $16
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
-
-GLOBL ·constants<>(SB), RODATA, $48
-// EX0
-DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+8(SB)/8, $0x0000050403020100
-// EX1
-DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+24(SB)/8, $0x00000a0908070605
-// EX2
-DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b
-
-GLOBL ·c<>(SB), RODATA, $48
-// EX0
-DATA ·c<>+0(SB)/8, $0x0000050403020100
-DATA ·c<>+8(SB)/8, $0x0000151413121110
-// EX1
-DATA ·c<>+16(SB)/8, $0x00000a0908070605
-DATA ·c<>+24(SB)/8, $0x00001a1918171615
-// EX2
-DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
-DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b
-
-GLOBL ·reduce<>(SB), RODATA, $32
-// 44 bit
-DATA ·reduce<>+0(SB)/8, $0x0
-DATA ·reduce<>+8(SB)/8, $0xfffffffffff
-// 42 bit
-DATA ·reduce<>+16(SB)/8, $0x0
-DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
-
-// h = (f*g) % (2**130-5) [partial reduction]
-// uses T_0...T_9 temporary registers
-// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
-// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
-#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
- \ // Eliminate the dependency for the last 2 VMSLs
- VMSLG m02_0, r_2, m4_2, m4_2 \
- VMSLG m13_0, r_2, m5_2, m5_2 \ // 8 VMSLs pipelined
- VMSLG m02_0, r_0, m4_0, m4_0 \
- VMSLG m02_1, r5_2, V0, T_0 \
- VMSLG m02_0, r_1, m4_1, m4_1 \
- VMSLG m02_1, r_0, V0, T_1 \
- VMSLG m02_1, r_1, V0, T_2 \
- VMSLG m02_2, r5_1, V0, T_3 \
- VMSLG m02_2, r5_2, V0, T_4 \
- VMSLG m13_0, r_0, m5_0, m5_0 \
- VMSLG m13_1, r5_2, V0, T_5 \
- VMSLG m13_0, r_1, m5_1, m5_1 \
- VMSLG m13_1, r_0, V0, T_6 \
- VMSLG m13_1, r_1, V0, T_7 \
- VMSLG m13_2, r5_1, V0, T_8 \
- VMSLG m13_2, r5_2, V0, T_9 \
- VMSLG m02_2, r_0, m4_2, m4_2 \
- VMSLG m13_2, r_0, m5_2, m5_2 \
- VAQ m4_0, T_0, m02_0 \
- VAQ m4_1, T_1, m02_1 \
- VAQ m5_0, T_5, m13_0 \
- VAQ m5_1, T_6, m13_1 \
- VAQ m02_0, T_3, m02_0 \
- VAQ m02_1, T_4, m02_1 \
- VAQ m13_0, T_8, m13_0 \
- VAQ m13_1, T_9, m13_1 \
- VAQ m4_2, T_2, m02_2 \
- VAQ m5_2, T_7, m13_2 \
-
-// SQUARE uses three limbs of r and r_2*5 to output square of r
-// uses T_1, T_5 and T_7 temporary registers
-// input: r_0, r_1, r_2, r5_2
-// temp: TEMP0, TEMP1, TEMP2
-// output: p0, p1, p2
-#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
- VMSLG r_0, r_0, p0, p0 \
- VMSLG r_1, r5_2, V0, TEMP0 \
- VMSLG r_2, r5_2, p1, p1 \
- VMSLG r_0, r_1, V0, TEMP1 \
- VMSLG r_1, r_1, p2, p2 \
- VMSLG r_0, r_2, V0, TEMP2 \
- VAQ TEMP0, p0, p0 \
- VAQ TEMP1, p1, p1 \
- VAQ TEMP2, p2, p2 \
- VAQ TEMP0, p0, p0 \
- VAQ TEMP1, p1, p1 \
- VAQ TEMP2, p2, p2 \
-
-// carry h0->h1->h2->h0 || h3->h4->h5->h3
-// uses T_2, T_4, T_5, T_7, T_8, T_9
-// t6, t7, t8, t9, t10, t11
-// input: h0, h1, h2, h3, h4, h5
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
-// output: h0, h1, h2, h3, h4, h5
-#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
- VLM (R12), t6, t7 \ // 44 and 42 bit clear mask
- VLEIB $7, $0x28, t10 \ // 5 byte shift mask
- VREPIB $4, t8 \ // 4 bit shift mask
- VREPIB $2, t11 \ // 2 bit shift mask
- VSRLB t10, h0, t0 \ // h0 byte shift
- VSRLB t10, h1, t1 \ // h1 byte shift
- VSRLB t10, h2, t2 \ // h2 byte shift
- VSRLB t10, h3, t3 \ // h3 byte shift
- VSRLB t10, h4, t4 \ // h4 byte shift
- VSRLB t10, h5, t5 \ // h5 byte shift
- VSRL t8, t0, t0 \ // h0 bit shift
- VSRL t8, t1, t1 \ // h2 bit shift
- VSRL t11, t2, t2 \ // h2 bit shift
- VSRL t8, t3, t3 \ // h3 bit shift
- VSRL t8, t4, t4 \ // h4 bit shift
- VESLG $2, t2, t9 \ // h2 carry x5
- VSRL t11, t5, t5 \ // h5 bit shift
- VN t6, h0, h0 \ // h0 clear carry
- VAQ t2, t9, t2 \ // h2 carry x5
- VESLG $2, t5, t9 \ // h5 carry x5
- VN t6, h1, h1 \ // h1 clear carry
- VN t7, h2, h2 \ // h2 clear carry
- VAQ t5, t9, t5 \ // h5 carry x5
- VN t6, h3, h3 \ // h3 clear carry
- VN t6, h4, h4 \ // h4 clear carry
- VN t7, h5, h5 \ // h5 clear carry
- VAQ t0, h1, h1 \ // h0->h1
- VAQ t3, h4, h4 \ // h3->h4
- VAQ t1, h2, h2 \ // h1->h2
- VAQ t4, h5, h5 \ // h4->h5
- VAQ t2, h0, h0 \ // h2->h0
- VAQ t5, h3, h3 \ // h5->h3
- VREPG $1, t6, t6 \ // 44 and 42 bit masks across both halves
- VREPG $1, t7, t7 \
- VSLDB $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
- VSLDB $8, h1, h1, h1 \
- VSLDB $8, h2, h2, h2 \
- VO h0, h3, h3 \
- VO h1, h4, h4 \
- VO h2, h5, h5 \
- VESRLG $44, h3, t0 \ // 44 bit shift right
- VESRLG $44, h4, t1 \
- VESRLG $42, h5, t2 \
- VN t6, h3, h3 \ // clear carry bits
- VN t6, h4, h4 \
- VN t7, h5, h5 \
- VESLG $2, t2, t9 \ // multiply carry by 5
- VAQ t9, t2, t2 \
- VAQ t0, h4, h4 \
- VAQ t1, h5, h5 \
- VAQ t2, h3, h3 \
-
-// carry h0->h1->h2->h0
-// input: h0, h1, h2
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
-// output: h0, h1, h2
-#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
- VLEIB $7, $0x28, t3 \ // 5 byte shift mask
- VREPIB $4, t4 \ // 4 bit shift mask
- VREPIB $2, t7 \ // 2 bit shift mask
- VGBM $0x003F, t5 \ // mask to clear carry bits
- VSRLB t3, h0, t0 \
- VSRLB t3, h1, t1 \
- VSRLB t3, h2, t2 \
- VESRLG $4, t5, t5 \ // 44 bit clear mask
- VSRL t4, t0, t0 \
- VSRL t4, t1, t1 \
- VSRL t7, t2, t2 \
- VESRLG $2, t5, t6 \ // 42 bit clear mask
- VESLG $2, t2, t8 \
- VAQ t8, t2, t2 \
- VN t5, h0, h0 \
- VN t5, h1, h1 \
- VN t6, h2, h2 \
- VAQ t0, h1, h1 \
- VAQ t1, h2, h2 \
- VAQ t2, h0, h0 \
- VSRLB t3, h0, t0 \
- VSRLB t3, h1, t1 \
- VSRLB t3, h2, t2 \
- VSRL t4, t0, t0 \
- VSRL t4, t1, t1 \
- VSRL t7, t2, t2 \
- VN t5, h0, h0 \
- VN t5, h1, h1 \
- VESLG $2, t2, t8 \
- VN t6, h2, h2 \
- VAQ t0, h1, h1 \
- VAQ t8, t2, t2 \
- VAQ t1, h2, h2 \
- VAQ t2, h0, h0 \
-
-// expands two message blocks into the lower halfs of the d registers
-// moves the contents of the d registers into upper halfs
-// input: in1, in2, d0, d1, d2, d3, d4, d5
-// temp: TEMP0, TEMP1, TEMP2, TEMP3
-// output: d0, d1, d2, d3, d4, d5
-#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
- VGBM $0xff3f, TEMP0 \
- VGBM $0xff1f, TEMP1 \
- VESLG $4, d1, TEMP2 \
- VESLG $4, d4, TEMP3 \
- VESRLG $4, TEMP0, TEMP0 \
- VPERM in1, d0, EX0, d0 \
- VPERM in2, d3, EX0, d3 \
- VPERM in1, d2, EX2, d2 \
- VPERM in2, d5, EX2, d5 \
- VPERM in1, TEMP2, EX1, d1 \
- VPERM in2, TEMP3, EX1, d4 \
- VN TEMP0, d0, d0 \
- VN TEMP0, d3, d3 \
- VESRLG $4, d1, d1 \
- VESRLG $4, d4, d4 \
- VN TEMP1, d2, d2 \
- VN TEMP1, d5, d5 \
- VN TEMP0, d1, d1 \
- VN TEMP0, d4, d4 \
-
-// expands one message block into the lower halfs of the d registers
-// moves the contents of the d registers into upper halfs
-// input: in, d0, d1, d2
-// temp: TEMP0, TEMP1, TEMP2
-// output: d0, d1, d2
-#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
- VGBM $0xff3f, TEMP0 \
- VESLG $4, d1, TEMP2 \
- VGBM $0xff1f, TEMP1 \
- VPERM in, d0, EX0, d0 \
- VESRLG $4, TEMP0, TEMP0 \
- VPERM in, d2, EX2, d2 \
- VPERM in, TEMP2, EX1, d1 \
- VN TEMP0, d0, d0 \
- VN TEMP1, d2, d2 \
- VESRLG $4, d1, d1 \
- VN TEMP0, d1, d1 \
-
-// pack h2:h0 into h1:h0 (no carry)
-// input: h0, h1, h2
-// output: h0, h1, h2
-#define PACK(h0, h1, h2) \
- VMRLG h1, h2, h2 \ // copy h1 to upper half h2
- VESLG $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
- VO h0, h1, h0 \ // combine h0 with 20 bits from limb 1
- VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
- VLEIG $1, $0, h1 \ // clear h2 stuff from lower half of h1
- VO h0, h1, h0 \ // h0 now has 88 bits (limb 0 and 1)
- VLEIG $0, $0, h2 \ // clear upper half of h2
- VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
- VLEIB $7, $88, h1 \ // for byte shift (11 bytes)
- VSLB h1, h2, h2 \ // shift h2 11 bytes to the left
- VO h0, h2, h0 \ // combine h0 with the top bits of limb 2
- VLEIG $0, $0, h1 \ // clear upper half of h1
-
-// if h >= 2**130-5 then h -= 2**130-5
-// input: h0, h1
-// temp: t0, t1, t2
-// output: h0
-#define MOD(h0, h1, t0, t1, t2) \
- VZERO t0 \
- VLEIG $1, $5, t0 \
- VACCQ h0, t0, t1 \
- VAQ h0, t0, t0 \
- VONE t2 \
- VLEIG $1, $-4, t2 \
- VAQ t2, t1, t1 \
- VACCQ h1, t1, t1 \
- VONE t2 \
- VAQ t2, t1, t1 \
- VN h0, t1, t2 \
- VNC t0, t1, t1 \
- VO t1, t2, h0 \
-
-// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
-TEXT ·poly1305vmsl(SB), $0-32
- // This code processes 6 + up to 4 blocks (16 bytes each) per iteration
- // using the algorithm described in:
- // NEON crypto, Daniel J. Bernstein & Peter Schwabe
- // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
- // and as modified for VMSL, as described in
- // Accelerating Poly1305 Cryptographic Message Authentication on the z14
- // O'Farrell et al., CASCON 2017, pp. 48-55
- // https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
-
- LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
- VZERO V0 // c
-
- // load EX0, EX1 and EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), EX0, EX2 // c
-
- // setup r
- VL (R4), T_0
- MOVD $·keyMask<>(SB), R6
- VL (R6), T_1
- VN T_0, T_1, T_0
- VZERO T_2 // limbs for r
- VZERO T_3
- VZERO T_4
- EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)
-
- // T_2, T_3, T_4: [0, r]
-
- // setup r*20
- VLEIG $0, $0, T_0
- VLEIG $1, $20, T_0 // T_0: [0, 20]
- VZERO T_5
- VZERO T_6
- VMSLG T_0, T_3, T_5, T_5
- VMSLG T_0, T_4, T_6, T_6
-
- // store r for final block in GR
- VLGVG $1, T_2, RSAVE_0 // c
- VLGVG $1, T_3, RSAVE_1 // c
- VLGVG $1, T_4, RSAVE_2 // c
- VLGVG $1, T_5, R5SAVE_1 // c
- VLGVG $1, T_6, R5SAVE_2 // c
-
- // initialize h
- VZERO H0_0
- VZERO H1_0
- VZERO H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- // initialize pointer for reduce constants
- MOVD $·reduce<>(SB), R12
-
- // calculate r**2 and 20*(r**2)
- VZERO R_0
- VZERO R_1
- VZERO R_2
- SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
- REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
- VZERO R5_1
- VZERO R5_2
- VMSLG T_0, R_1, R5_1, R5_1
- VMSLG T_0, R_2, R5_2, R5_2
-
- // skip r**4 calculation if 3 blocks or less
- CMPBLE R3, $48, b4
-
- // calculate r**4 and 20*(r**4)
- VZERO T_8
- VZERO T_9
- VZERO T_10
- SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
- REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
- VZERO T_2
- VZERO T_3
- VMSLG T_0, T_9, T_2, T_2
- VMSLG T_0, T_10, T_3, T_3
-
- // put r**2 to the right and r**4 to the left of R_0, R_1, R_2
- VSLDB $8, T_8, T_8, T_8
- VSLDB $8, T_9, T_9, T_9
- VSLDB $8, T_10, T_10, T_10
- VSLDB $8, T_2, T_2, T_2
- VSLDB $8, T_3, T_3, T_3
-
- VO T_8, R_0, R_0
- VO T_9, R_1, R_1
- VO T_10, R_2, R_2
- VO T_2, R5_1, R5_1
- VO T_3, R5_2, R5_2
-
- CMPBLE R3, $80, load // less than or equal to 5 blocks in message
-
- // 6 (or 5+1) blocks
- SUB $81, R3
- VLM (R2), M0, M4
- VLL R3, 80(R2), M5
- ADD $1, R3
- MOVBZ $1, R0
- CMPBGE R3, $16, 2(PC)
- VLVGB R3, R0, M5
- MOVD $96(R2), R2
- EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
- EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
- VLEIB $2, $1, H2_0
- VLEIB $2, $1, H2_1
- VLEIB $10, $1, H2_0
- VLEIB $10, $1, H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO T_4
- VZERO T_10
- EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
- VLR T_4, M4
- VLEIB $10, $1, M2
- CMPBLT R3, $16, 2(PC)
- VLEIB $10, $1, T_10
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
-
- SUB $16, R3
- CMPBLE R3, $0, square
-
-load:
- // load EX0, EX1 and EX2
- MOVD $·c<>(SB), R5
- VLM (R5), EX0, EX2
-
-loop:
- CMPBLE R3, $64, add // b4 // last 4 or fewer blocks left
-
- // next 4 full blocks
- VLM (R2), M2, M5
- SUB $64, R3
- MOVD $64(R2), R2
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)
-
- // EXPACC inlined to create the [m2, m3] limbs
- VGBM $0x3f3f, T_0 // 44 bit clear mask
- VGBM $0x1f1f, T_1 // 40 bit clear mask
- VPERM M2, M3, EX0, T_3
- VESRLG $4, T_0, T_0 // 44 bit clear mask ready
- VPERM M2, M3, EX1, T_4
- VPERM M2, M3, EX2, T_5
- VN T_0, T_3, T_3
- VESRLG $4, T_4, T_4
- VN T_1, T_5, T_5
- VN T_0, T_4, T_4
- VMRHG H0_1, T_3, H0_0
- VMRHG H1_1, T_4, H1_0
- VMRHG H2_1, T_5, H2_0
- VMRLG H0_1, T_3, H0_1
- VMRLG H1_1, T_4, H1_1
- VMRLG H2_1, T_5, H2_1
- VLEIB $10, $1, H2_0
- VLEIB $10, $1, H2_1
- VPERM M4, M5, EX0, T_3
- VPERM M4, M5, EX1, T_4
- VPERM M4, M5, EX2, T_5
- VN T_0, T_3, T_3
- VESRLG $4, T_4, T_4
- VN T_1, T_5, T_5
- VN T_0, T_4, T_4
- VMRHG V0, T_3, M0
- VMRHG V0, T_4, M1
- VMRHG V0, T_5, M2
- VMRLG V0, T_3, M3
- VMRLG V0, T_4, M4
- VMRLG V0, T_5, M5
- VLEIB $10, $1, M2
- VLEIB $10, $1, M5
-
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- CMPBNE R3, $0, loop
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
-
- // load EX0, EX1, EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), EX0, EX2
-
- // sum vectors
- VAQ H0_0, H0_1, H0_0
- VAQ H1_0, H1_1, H1_0
- VAQ H2_0, H2_1, H2_0
-
- // h may be >= 2*(2**130-5) so we need to reduce it again
- // M0...M4 are used as temps here
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
-next: // carry h1->h2
- VLEIB $7, $0x28, T_1
- VREPIB $4, T_2
- VGBM $0x003F, T_3
- VESRLG $4, T_3
-
- // byte shift
- VSRLB T_1, H1_0, T_4
-
- // bit shift
- VSRL T_2, T_4, T_4
-
- // clear h1 carry bits
- VN T_3, H1_0, H1_0
-
- // add carry
- VAQ T_4, H2_0, H2_0
-
- // h is now < 2*(2**130-5)
- // pack h into h1 (hi) and h0 (lo)
- PACK(H0_0, H1_0, H2_0)
-
- // if h >= 2**130-5 then h -= 2**130-5
- MOD(H0_0, H1_0, T_0, T_1, T_2)
-
- // h += s
- MOVD $·bswapMask<>(SB), R5
- VL (R5), T_1
- VL 16(R4), T_0
- VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big endian)
- VAQ T_0, H0_0, H0_0
- VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little endian)
- VST H0_0, (R1)
- RET
-
-add:
- // load EX0, EX1, EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), EX0, EX2
-
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
- CMPBLE R3, $64, b4
-
-b4:
- CMPBLE R3, $48, b3 // 3 blocks or less
-
- // 4 (3+1) blocks remaining
- SUB $49, R3
- VLM (R2), M0, M2
- VLL R3, 48(R2), M3
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M3
- MOVD $64(R2), R2
- EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
- VLEIB $10, $1, H2_0
- VLEIB $10, $1, H2_1
- VZERO M0
- VZERO M1
- VZERO M4
- VZERO M5
- VZERO T_4
- VZERO T_10
- EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
- VLR T_4, M2
- VLEIB $10, $1, M4
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, T_10
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
- SUB $16, R3
- CMPBLE R3, $0, square // this condition must always hold true!
-
-b3:
- CMPBLE R3, $32, b2
-
- // 3 blocks remaining
-
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // H*[r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)
-
- SUB $33, R3
- VLM (R2), M0, M1
- VLL R3, 32(R2), M2
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M2
-
- // H += m0
- VZERO T_1
- VZERO T_2
- VZERO T_3
- EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
- VLEIB $10, $1, T_3
- VAG H0_0, T_1, H0_0
- VAG H1_0, T_2, H1_0
- VAG H2_0, T_3, H2_0
-
- VZERO M0
- VZERO M3
- VZERO M4
- VZERO M5
- VZERO T_10
-
- // (H+m0)*r
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)
-
- // H += m1
- VZERO V0
- VZERO T_1
- VZERO T_2
- VZERO T_3
- EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
- VLEIB $10, $1, T_3
- VAQ H0_0, T_1, H0_0
- VAQ H1_0, T_2, H1_0
- VAQ H2_0, T_3, H2_0
- REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
-
- // [H, m2] * [r**2, r]
- EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, H2_0
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
- SUB $16, R3
- CMPBLE R3, $0, next // this condition must always hold true!
-
-b2:
- CMPBLE R3, $16, b1
-
- // 2 blocks remaining
-
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // H*[r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
-
- // move h into the left halves, leaving zeros on the right
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
-
- // get message blocks and append the 0x01 padding byte after the data
- SUB $17, R3
- VL (R2), M0
- VLL R3, 16(R2), M1
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M1
- VZERO T_6
- VZERO T_7
- VZERO T_8
- EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
- EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
- VLEIB $2, $1, T_8
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, T_8
-
- // add [m0, m1] to h
- VAG H0_0, T_6, H0_0
- VAG H1_0, T_7, H1_0
- VAG H2_0, T_8, H2_0
-
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
- VZERO T_10
- VZERO M0
-
- // at this point R_0 .. R5_2 look like [r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
- SUB $16, R3, R3
- CMPBLE R3, $0, next
-
-b1:
- CMPBLE R3, $0, next
-
- // 1 block remaining
-
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // H*[r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
- // set up [0, m0] limbs
- SUB $1, R3
- VLL R3, (R2), M0
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M0
- VZERO T_1
- VZERO T_2
- VZERO T_3
- EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6) // limbs: [0, m]
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, T_3
-
- // h+m0
- VAQ H0_0, T_1, H0_0
- VAQ H1_0, T_2, H1_0
- VAQ H2_0, T_3, H2_0
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
- BR next
-
-square:
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // (h0*r**2) + (h1*r)
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
- BR next
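
The REDUCE/REDUCE2 macros above fold carries back into the low limb multiplied
by 5 (since 2**130 is congruent to 5 mod 2**130-5), and MOD then performs the
final "subtract the prime if h >= 2**130-5" step without branching, selecting
between h and h-(2**130-5) with masks. Below is a minimal scalar Go sketch of
that branch-free final step, using three 64-bit limbs in place of the vector
code's 44/44/42-bit layout; the names are illustrative, not from the package.

	package main

	import (
		"fmt"
		"math/bits"
	)

	// reduceFinal is the constant-time "if h >= 2^130-5 then h -= 2^130-5"
	// step that the MOD macro implements with VACCQ/VAQ/VN/VNC/VO. h is
	// little-endian in three 64-bit limbs, with h2 holding bits 128..130,
	// and is assumed < 2*(2^130-5), as the carry pass above guarantees.
	// Only the low 128 bits are returned: the tag is (h+s) mod 2^128.
	func reduceFinal(h0, h1, h2 uint64) (uint64, uint64) {
		// t = h + 5. The carry into bit 130 (bit 2 of t2) is set
		// exactly when h >= 2^130-5, i.e. exactly when t >= 2^130.
		t0, c := bits.Add64(h0, 5, 0)
		t1, c := bits.Add64(h1, 0, c)
		t2, _ := bits.Add64(h2, 0, c)

		// Branch-free select: mask is all ones when the subtraction
		// is needed and all zeros otherwise.
		mask := -(t2 >> 2)
		h0 = (h0 &^ mask) | (t0 & mask)
		h1 = (h1 &^ mask) | (t1 & mask)
		return h0, h1
	}

	func main() {
		// h = 2**130-4, one more than the prime, so it reduces to 1.
		r0, r1 := reduceFinal(0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 3)
		fmt.Println(r0, r1) // 1 0
	}
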
diff --git a/src/vendor/golang.org/x/text/unicode/bidi/core.go b/src/vendor/golang.org/x/text/unicode/bidi/core.go
index 48d144008a..50deb6600a 100644
--- a/src/vendor/golang.org/x/text/unicode/bidi/core.go
+++ b/src/vendor/golang.org/x/text/unicode/bidi/core.go
@@ -480,15 +480,15 @@ func (s *isolatingRunSequence) resolveWeakTypes() {
// Rule W1.
// Changes all NSMs.
- preceedingCharacterType := s.sos
+ precedingCharacterType := s.sos
for i, t := range s.types {
if t == NSM {
- s.types[i] = preceedingCharacterType
+ s.types[i] = precedingCharacterType
} else {
if t.in(LRI, RLI, FSI, PDI) {
- preceedingCharacterType = ON
+ precedingCharacterType = ON
}
- preceedingCharacterType = t
+ precedingCharacterType = t
}
}
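
The hunk above only renames the misspelled variable; the logic is UAX #9 rule
W1, under which each nonspacing mark takes on the class of the character
before it, seeded with the sequence's sos class. A self-contained Go sketch of
the isolate-free case follows; Class and its constants are illustrative
stand-ins for the package's unexported types, not its actual API.

	package main

	import "fmt"

	// Class stands in for the bidi package's character class type.
	type Class int

	const (
		L   Class = iota // left-to-right
		R                // right-to-left
		NSM              // nonspacing mark
	)

	// ruleW1 mirrors the loop above for input without isolate controls:
	// every NSM inherits the class of the preceding character, or sos at
	// the start of the sequence.
	func ruleW1(types []Class, sos Class) {
		preceding := sos
		for i, t := range types {
			if t == NSM {
				types[i] = preceding
			} else {
				preceding = t
			}
		}
	}

	func main() {
		types := []Class{R, NSM, NSM, L, NSM}
		ruleW1(types, L)
		fmt.Println(types) // [1 1 1 0 0], i.e. [R R R L L]
	}
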
diff --git a/src/vendor/modules.txt b/src/vendor/modules.txt
index 37fda889ec..7c42df8348 100644
--- a/src/vendor/modules.txt
+++ b/src/vendor/modules.txt
@@ -1,4 +1,4 @@
-# golang.org/x/crypto v0.0.0-20200414155820-4f8f47aa7992
+# golang.org/x/crypto v0.0.0-20200429183012-4b2356b1ed79
## explicit
golang.org/x/crypto/chacha20
golang.org/x/crypto/chacha20poly1305
@@ -18,9 +18,10 @@ golang.org/x/net/idna
golang.org/x/net/lif
golang.org/x/net/nettest
golang.org/x/net/route
-# golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd
+# golang.org/x/sys v0.0.0-20200501145240-bc7a7d42d5c3
+## explicit
golang.org/x/sys/cpu
-# golang.org/x/text v0.3.3-0.20191031172631-4b67af870c6f
+# golang.org/x/text v0.3.3-0.20200430171850-afb9336c4530
## explicit
golang.org/x/text/secure/bidirule
golang.org/x/text/transform