author    Dmitri Shuralyov <dmitshur@golang.org>  2020-05-01 18:58:41 -0400
committer Dmitri Shuralyov <dmitshur@golang.org>  2020-05-04 22:52:07 +0000
commit    b5f7ff4aa9c1fef6437f350595caae4ee4b5708d (patch)
tree      aab04441ac1615260b34983999bd5ba923d4be4b /src/vendor
parent    4c003f6b780b471afbf032438eb6c7519458855b (diff)
all: update vendored dependencies for Go 1.15 release
The Go 1.15 code freeze has just started. This is the time to update all
golang.org/x/... module versions that contribute packages to the std and cmd
modules in the standard library to latest master versions.

Those versions have already gone through code review, and now they will
undergo additional testing during the freeze period. If new issues are
discovered in these dependencies, we have the freeze period to deal with
them. By the end of the freeze period, we will have confidence that the
Go 1.15 release and the dependency versions it has selected are robust.

If one of the Go 1.15.x minor releases requires changing code in one of the
vendored packages, we'll be able to do so on top of the versions that are
selected here, and not be forced to use versions that came from different
time periods, or try to jump across multiple untested versions in a minor
release.

The dependency versions that are selected in this commit are:

	github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3
	github.com/ianlancetaylor/demangle v0.0.0-20200414190113-039b1ae3a340
	golang.org/x/arch v0.0.0-20200312215426-ff8b605520f4
	golang.org/x/crypto v0.0.0-20200429183012-4b2356b1ed79
	golang.org/x/mod v0.2.1-0.20200429172858-859b3ef565e2
	golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5
	golang.org/x/sys v0.0.0-20200501145240-bc7a7d42d5c3
	golang.org/x/text v0.3.3-0.20200430171850-afb9336c4530
	golang.org/x/tools v0.0.0-20200504152539-33427f1b0364
	golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543

github.com/ianlancetaylor/demangle is considered in scope and was updated.
github.com/google/pprof is out of scope and was not updated.

For #36905.

Change-Id: Icb6996eb0df11f16edd9a42e04434012c0336354
Reviewed-on: https://go-review.googlesource.com/c/go/+/231657
Reviewed-by: Bryan C. Mills <bcmills@google.com>
Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/vendor')
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go                   | 119
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20/xor.go                               |  17
-rw-r--r--  src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go |  56
-rw-r--r--  src/vendor/golang.org/x/crypto/cryptobyte/asn1.go                            |   2
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go                         |   4
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/poly1305.go                          |  26
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go                         |  11
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_generic.go                       |  21
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_noasm.go                         |  13
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go                       |  11
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go                         |  72
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s                          | 667
-rw-r--r--  src/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s                     | 909
-rw-r--r--  src/vendor/golang.org/x/text/unicode/bidi/core.go                            |   8
-rw-r--r--  src/vendor/modules.txt                                                       |   7
15 files changed, 594 insertions, 1349 deletions
diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
index 7c498e90d9..a2ecf5c325 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
+++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
@@ -42,10 +42,14 @@ type Cipher struct {
// The last len bytes of buf are leftover key stream bytes from the previous
// XORKeyStream invocation. The size of buf depends on how many blocks are
- // computed at a time.
+ // computed at a time by xorKeyStreamBlocks.
buf [bufSize]byte
len int
+ // overflow is set when the counter overflowed, no more blocks can be
+ // generated, and the next XORKeyStream call should panic.
+ overflow bool
+
// The counter-independent results of the first round are cached after they
// are computed the first time.
precompDone bool
@@ -89,6 +93,7 @@ func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) {
return nil, errors.New("chacha20: wrong nonce size")
}
+ key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint
c.key = [8]uint32{
binary.LittleEndian.Uint32(key[0:4]),
binary.LittleEndian.Uint32(key[4:8]),
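The added re-slice is a bounds check elimination hint: once key and nonce have
their exact lengths, the compiler can prove the fixed-index loads below are in
range. A minimal sketch of the same pattern (sum16 is illustrative, not part of
the package):

	package main

	import "encoding/binary"

	// sum16 reads four little-endian words from b. The re-slice is
	// intended to let the compiler keep one length check instead of
	// checking every fixed-index load.
	func sum16(b []byte) uint32 {
		b = b[:16] // bounds check elimination hint
		return binary.LittleEndian.Uint32(b[0:4]) +
			binary.LittleEndian.Uint32(b[4:8]) +
			binary.LittleEndian.Uint32(b[8:12]) +
			binary.LittleEndian.Uint32(b[12:16])
	}

	func main() { _ = sum16(make([]byte, 32)) }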
@@ -139,15 +144,18 @@ func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
// SetCounter sets the Cipher counter. The next invocation of XORKeyStream will
// behave as if (64 * counter) bytes had been encrypted so far.
//
-// To prevent accidental counter reuse, SetCounter panics if counter is
-// less than the current value.
+// To prevent accidental counter reuse, SetCounter panics if counter is less
+// than the current value.
+//
+// Note that the execution time of XORKeyStream is not independent of the
+// counter value.
func (s *Cipher) SetCounter(counter uint32) {
// Internally, s may buffer multiple blocks, which complicates this
// implementation slightly. When checking whether the counter has rolled
// back, we must use both s.counter and s.len to determine how many blocks
// we have already output.
outputCounter := s.counter - uint32(s.len)/blockSize
- if counter < outputCounter {
+ if s.overflow || counter < outputCounter {
panic("chacha20: SetCounter attempted to rollback counter")
}
@@ -196,34 +204,52 @@ func (s *Cipher) XORKeyStream(dst, src []byte) {
dst[i] = src[i] ^ b
}
s.len -= len(keyStream)
- src = src[len(keyStream):]
- dst = dst[len(keyStream):]
+ dst, src = dst[len(keyStream):], src[len(keyStream):]
+ }
+ if len(src) == 0 {
+ return
}
- const blocksPerBuf = bufSize / blockSize
- numBufs := (uint64(len(src)) + bufSize - 1) / bufSize
- if uint64(s.counter)+numBufs*blocksPerBuf >= 1<<32 {
+ // If we'd need to let the counter overflow and keep generating output,
+ // panic immediately. If instead we'd only reach the last block, remember
+ // not to generate any more output after the buffer is drained.
+ numBlocks := (uint64(len(src)) + blockSize - 1) / blockSize
+ if s.overflow || uint64(s.counter)+numBlocks > 1<<32 {
panic("chacha20: counter overflow")
+ } else if uint64(s.counter)+numBlocks == 1<<32 {
+ s.overflow = true
}
// xorKeyStreamBlocks implementations expect input lengths that are a
// multiple of bufSize. Platform-specific ones process multiple blocks at a
// time, so have bufSizes that are a multiple of blockSize.
- rem := len(src) % bufSize
- full := len(src) - rem
-
+ full := len(src) - len(src)%bufSize
if full > 0 {
s.xorKeyStreamBlocks(dst[:full], src[:full])
}
+ dst, src = dst[full:], src[full:]
+
+ // If using a multi-block xorKeyStreamBlocks would overflow, use the generic
+ // one that does one block at a time.
+ const blocksPerBuf = bufSize / blockSize
+ if uint64(s.counter)+blocksPerBuf > 1<<32 {
+ s.buf = [bufSize]byte{}
+ numBlocks := (len(src) + blockSize - 1) / blockSize
+ buf := s.buf[bufSize-numBlocks*blockSize:]
+ copy(buf, src)
+ s.xorKeyStreamBlocksGeneric(buf, buf)
+ s.len = len(buf) - copy(dst, buf)
+ return
+ }
// If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and
// keep the leftover keystream for the next XORKeyStream invocation.
- if rem > 0 {
+ if len(src) > 0 {
s.buf = [bufSize]byte{}
- copy(s.buf[:], src[full:])
+ copy(s.buf[:], src)
s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
- s.len = bufSize - copy(dst[full:], s.buf[:])
+ s.len = bufSize - copy(dst, s.buf[:])
}
}
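The effect of the new overflow flag is that the final counter value (block
2³² - 1) can be consumed exactly once, and any further output panics instead of
silently reusing key stream. A sketch of the observable behavior, using only
the package's exported API:

	package main

	import (
		"fmt"

		"golang.org/x/crypto/chacha20"
	)

	func main() {
		key := make([]byte, chacha20.KeySize)
		nonce := make([]byte, chacha20.NonceSize)
		c, _ := chacha20.NewUnauthenticatedCipher(key, nonce)

		// Position the cipher at the very last 64-byte block.
		c.SetCounter(1<<32 - 1)
		buf := make([]byte, 64)
		c.XORKeyStream(buf, buf) // final block: allowed, sets overflow

		defer func() { fmt.Println("recovered:", recover()) }()
		c.XORKeyStream(buf, buf) // panics: chacha20: counter overflow
	}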
@@ -260,7 +286,9 @@ func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
s.precompDone = true
}
- for i := 0; i < len(src); i += blockSize {
+ // A condition of len(src) > 0 would be sufficient, but this also
+ // acts as a bounds check elimination hint.
+ for len(src) >= 64 && len(dst) >= 64 {
// The remainder of the first column round.
fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
@@ -285,49 +313,28 @@ func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
}
- // Finally, add back the initial state to generate the key stream.
- x0 += c0
- x1 += c1
- x2 += c2
- x3 += c3
- x4 += c4
- x5 += c5
- x6 += c6
- x7 += c7
- x8 += c8
- x9 += c9
- x10 += c10
- x11 += c11
- x12 += s.counter
- x13 += c13
- x14 += c14
- x15 += c15
+ // Add back the initial state to generate the key stream, then
+ // XOR the key stream with the source and write out the result.
+ addXor(dst[0:4], src[0:4], x0, c0)
+ addXor(dst[4:8], src[4:8], x1, c1)
+ addXor(dst[8:12], src[8:12], x2, c2)
+ addXor(dst[12:16], src[12:16], x3, c3)
+ addXor(dst[16:20], src[16:20], x4, c4)
+ addXor(dst[20:24], src[20:24], x5, c5)
+ addXor(dst[24:28], src[24:28], x6, c6)
+ addXor(dst[28:32], src[28:32], x7, c7)
+ addXor(dst[32:36], src[32:36], x8, c8)
+ addXor(dst[36:40], src[36:40], x9, c9)
+ addXor(dst[40:44], src[40:44], x10, c10)
+ addXor(dst[44:48], src[44:48], x11, c11)
+ addXor(dst[48:52], src[48:52], x12, s.counter)
+ addXor(dst[52:56], src[52:56], x13, c13)
+ addXor(dst[56:60], src[56:60], x14, c14)
+ addXor(dst[60:64], src[60:64], x15, c15)
s.counter += 1
- if s.counter == 0 {
- panic("chacha20: internal error: counter overflow")
- }
- in, out := src[i:], dst[i:]
- in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint
-
- // XOR the key stream with the source and write out the result.
- xor(out[0:], in[0:], x0)
- xor(out[4:], in[4:], x1)
- xor(out[8:], in[8:], x2)
- xor(out[12:], in[12:], x3)
- xor(out[16:], in[16:], x4)
- xor(out[20:], in[20:], x5)
- xor(out[24:], in[24:], x6)
- xor(out[28:], in[28:], x7)
- xor(out[32:], in[32:], x8)
- xor(out[36:], in[36:], x9)
- xor(out[40:], in[40:], x10)
- xor(out[44:], in[44:], x11)
- xor(out[48:], in[48:], x12)
- xor(out[52:], in[52:], x13)
- xor(out[56:], in[56:], x14)
- xor(out[60:], in[60:], x15)
+ src, dst = src[blockSize:], dst[blockSize:]
}
}
diff --git a/src/vendor/golang.org/x/crypto/chacha20/xor.go b/src/vendor/golang.org/x/crypto/chacha20/xor.go
index 0110c9865a..c2d04851e0 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/xor.go
+++ b/src/vendor/golang.org/x/crypto/chacha20/xor.go
@@ -13,10 +13,10 @@ const unaligned = runtime.GOARCH == "386" ||
runtime.GOARCH == "ppc64le" ||
runtime.GOARCH == "s390x"
-// xor reads a little endian uint32 from src, XORs it with u and
+// addXor reads a little endian uint32 from src, XORs it with (a + b) and
// places the result in little endian byte order in dst.
-func xor(dst, src []byte, u uint32) {
- _, _ = src[3], dst[3] // eliminate bounds checks
+func addXor(dst, src []byte, a, b uint32) {
+ _, _ = src[3], dst[3] // bounds check elimination hint
if unaligned {
// The compiler should optimize this code into
// 32-bit unaligned little endian loads and stores.
@@ -27,15 +27,16 @@ func xor(dst, src []byte, u uint32) {
v |= uint32(src[1]) << 8
v |= uint32(src[2]) << 16
v |= uint32(src[3]) << 24
- v ^= u
+ v ^= a + b
dst[0] = byte(v)
dst[1] = byte(v >> 8)
dst[2] = byte(v >> 16)
dst[3] = byte(v >> 24)
} else {
- dst[0] = src[0] ^ byte(u)
- dst[1] = src[1] ^ byte(u>>8)
- dst[2] = src[2] ^ byte(u>>16)
- dst[3] = src[3] ^ byte(u>>24)
+ a += b
+ dst[0] = src[0] ^ byte(a)
+ dst[1] = src[1] ^ byte(a>>8)
+ dst[2] = src[2] ^ byte(a>>16)
+ dst[3] = src[3] ^ byte(a>>24)
}
}
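A reference model of the renamed helper: addXor folds the "add back the initial
state" step of ChaCha20 into the XOR, so the caller never materializes the key
stream word separately. The addXorRef name below is illustrative:

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// addXorRef reads a little-endian uint32 from src, XORs it with
	// a+b, and stores the result little-endian in dst, matching the
	// behavior of the unaligned and byte-by-byte paths above.
	func addXorRef(dst, src []byte, a, b uint32) {
		v := binary.LittleEndian.Uint32(src[:4])
		binary.LittleEndian.PutUint32(dst[:4], v^(a+b))
	}

	func main() {
		src := []byte{0xde, 0xad, 0xbe, 0xef}
		dst := make([]byte, 4)
		addXorRef(dst, src, 0x01020304, 0x05060708)
		fmt.Printf("%x\n", dst)
	}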
diff --git a/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go b/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
index 91b38568ce..fe191d395d 100644
--- a/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
+++ b/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
@@ -12,56 +12,64 @@ import (
"golang.org/x/crypto/poly1305"
)
-func roundTo16(n int) int {
- return 16 * ((n + 15) / 16)
+func writeWithPadding(p *poly1305.MAC, b []byte) {
+ p.Write(b)
+ if rem := len(b) % 16; rem != 0 {
+ var buf [16]byte
+ padLen := 16 - rem
+ p.Write(buf[:padLen])
+ }
+}
+
+func writeUint64(p *poly1305.MAC, n int) {
+ var buf [8]byte
+ binary.LittleEndian.PutUint64(buf[:], uint64(n))
+ p.Write(buf[:])
}
func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte {
ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize)
+ ciphertext, tag := out[:len(plaintext)], out[len(plaintext):]
if subtle.InexactOverlap(out, plaintext) {
panic("chacha20poly1305: invalid buffer overlap")
}
- var polyKey, discardBuf [32]byte
+ var polyKey [32]byte
s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
s.XORKeyStream(polyKey[:], polyKey[:])
- s.XORKeyStream(discardBuf[:], discardBuf[:]) // skip the next 32 bytes
- s.XORKeyStream(out, plaintext)
-
- polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(plaintext))+8+8)
- copy(polyInput, additionalData)
- copy(polyInput[roundTo16(len(additionalData)):], out[:len(plaintext)])
- binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
- binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(plaintext)))
+ s.SetCounter(1) // set the counter to 1, skipping 32 bytes
+ s.XORKeyStream(ciphertext, plaintext)
- var tag [poly1305.TagSize]byte
- poly1305.Sum(&tag, polyInput, &polyKey)
- copy(out[len(plaintext):], tag[:])
+ p := poly1305.New(&polyKey)
+ writeWithPadding(p, additionalData)
+ writeWithPadding(p, ciphertext)
+ writeUint64(p, len(additionalData))
+ writeUint64(p, len(plaintext))
+ p.Sum(tag[:0])
return ret
}
func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
- var tag [poly1305.TagSize]byte
- copy(tag[:], ciphertext[len(ciphertext)-16:])
+ tag := ciphertext[len(ciphertext)-16:]
ciphertext = ciphertext[:len(ciphertext)-16]
- var polyKey, discardBuf [32]byte
+ var polyKey [32]byte
s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
s.XORKeyStream(polyKey[:], polyKey[:])
- s.XORKeyStream(discardBuf[:], discardBuf[:]) // skip the next 32 bytes
+ s.SetCounter(1) // set the counter to 1, skipping 32 bytes
- polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(ciphertext))+8+8)
- copy(polyInput, additionalData)
- copy(polyInput[roundTo16(len(additionalData)):], ciphertext)
- binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
- binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(ciphertext)))
+ p := poly1305.New(&polyKey)
+ writeWithPadding(p, additionalData)
+ writeWithPadding(p, ciphertext)
+ writeUint64(p, len(additionalData))
+ writeUint64(p, len(ciphertext))
ret, out := sliceForAppend(dst, len(ciphertext))
if subtle.InexactOverlap(out, ciphertext) {
panic("chacha20poly1305: invalid buffer overlap")
}
- if !poly1305.Verify(&tag, polyInput, &polyKey) {
+ if !p.Verify(tag) {
for i := range out {
out[i] = 0
}
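The refactor replaces the single polyInput allocation with streaming writes,
but the authenticated message is unchanged: associated data and ciphertext,
each zero-padded to a 16-byte boundary, followed by both lengths as
little-endian uint64 values (RFC 8439, section 2.8). A standalone sketch of
that layout (poly1305Input is illustrative):

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// poly1305Input models the bytes the MAC sees: padded AD, padded
	// ciphertext, then the two lengths as little-endian uint64s.
	func poly1305Input(ad, ct []byte) []byte {
		pad := func(msg, b []byte) []byte {
			msg = append(msg, b...)
			if rem := len(b) % 16; rem != 0 {
				msg = append(msg, make([]byte, 16-rem)...)
			}
			return msg
		}
		var msg []byte
		msg = pad(msg, ad)
		msg = pad(msg, ct)
		var lenBuf [8]byte
		binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(ad)))
		msg = append(msg, lenBuf[:]...)
		binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(ct)))
		msg = append(msg, lenBuf[:]...)
		return msg
	}

	func main() {
		in := poly1305Input(make([]byte, 5), make([]byte, 20))
		fmt.Println(len(in)) // 16 + 32 + 16 = 64
	}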
diff --git a/src/vendor/golang.org/x/crypto/cryptobyte/asn1.go b/src/vendor/golang.org/x/crypto/cryptobyte/asn1.go
index f930f7e526..b26376aeca 100644
--- a/src/vendor/golang.org/x/crypto/cryptobyte/asn1.go
+++ b/src/vendor/golang.org/x/crypto/cryptobyte/asn1.go
@@ -81,7 +81,7 @@ func (b *Builder) AddASN1BigInt(n *big.Int) {
for i := range bytes {
bytes[i] ^= 0xff
}
- if bytes[0]&0x80 == 0 {
+ if len(bytes) == 0 || bytes[0]&0x80 == 0 {
c.add(0xff)
}
c.add(bytes...)
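The added len(bytes) == 0 guard covers n = -1, whose magnitude is empty after
the two's-complement transform, so the old code panicked indexing bytes[0].
A sketch of the now-working case:

	package main

	import (
		"fmt"
		"math/big"

		"golang.org/x/crypto/cryptobyte"
	)

	func main() {
		var b cryptobyte.Builder
		// -1 now encodes as a single 0xff content byte.
		b.AddASN1BigInt(big.NewInt(-1))
		out, err := b.Bytes()
		fmt.Printf("%x %v\n", out, err) // 0201ff <nil>
	}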
diff --git a/src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go b/src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
index b0c2cd0561..d118f30ed5 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
@@ -2,10 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!ppc64le gccgo purego
+// +build !amd64,!ppc64le,!s390x gccgo purego
package poly1305
type mac struct{ macGeneric }
-
-func newMAC(key *[32]byte) mac { return mac{newMACGeneric(key)} }
diff --git a/src/vendor/golang.org/x/crypto/poly1305/poly1305.go b/src/vendor/golang.org/x/crypto/poly1305/poly1305.go
index 066159b797..9d7a6af09f 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/poly1305.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/poly1305.go
@@ -26,7 +26,9 @@ const TagSize = 16
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
- sum(out, m, key)
+ h := New(key)
+ h.Write(m)
+ h.Sum(out[:0])
}
// Verify returns true if mac is a valid authenticator for m with the given key.
@@ -46,10 +48,9 @@ func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
// two different messages with the same key allows an attacker
// to forge messages at will.
func New(key *[32]byte) *MAC {
- return &MAC{
- mac: newMAC(key),
- finalized: false,
- }
+ m := &MAC{}
+ initialize(key, &m.macState)
+ return m
}
// MAC is an io.Writer computing an authentication tag
@@ -58,7 +59,7 @@ func New(key *[32]byte) *MAC {
// MAC cannot be used like common hash.Hash implementations,
// because using a poly1305 key twice breaks its security.
// Therefore writing data to a running MAC after calling
-// Sum causes it to panic.
+// Sum or Verify causes it to panic.
type MAC struct {
mac // platform-dependent implementation
@@ -71,10 +72,10 @@ func (h *MAC) Size() int { return TagSize }
// Write adds more data to the running message authentication code.
// It never returns an error.
//
-// It must not be called after the first call of Sum.
+// It must not be called after the first call of Sum or Verify.
func (h *MAC) Write(p []byte) (n int, err error) {
if h.finalized {
- panic("poly1305: write to MAC after Sum")
+ panic("poly1305: write to MAC after Sum or Verify")
}
return h.mac.Write(p)
}
@@ -87,3 +88,12 @@ func (h *MAC) Sum(b []byte) []byte {
h.finalized = true
return append(b, mac[:]...)
}
+
+// Verify returns whether the authenticator of all data written to
+// the message authentication code matches the expected value.
+func (h *MAC) Verify(expected []byte) bool {
+ var mac [TagSize]byte
+ h.mac.Sum(&mac)
+ h.finalized = true
+ return subtle.ConstantTimeCompare(expected, mac[:]) == 1
+}
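Typical use of the new constant-time Verify method, which finalizes the MAC the
same way Sum does (a sketch; in real use the key must be unique per message):

	package main

	import (
		"fmt"

		"golang.org/x/crypto/poly1305"
	)

	func main() {
		var key [32]byte // must be unique per message in real use
		msg := []byte("hello")

		m := poly1305.New(&key)
		m.Write(msg)
		tag := m.Sum(nil)

		// Verify finalizes like Sum: writing afterwards panics.
		v := poly1305.New(&key)
		v.Write(msg)
		fmt.Println(v.Verify(tag)) // true
	}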
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
index 35b9e38c90..99e5a1d50e 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
@@ -9,17 +9,6 @@ package poly1305
//go:noescape
func update(state *macState, msg []byte)
-func sum(out *[16]byte, m []byte, key *[32]byte) {
- h := newMAC(key)
- h.Write(m)
- h.Sum(out)
-}
-
-func newMAC(key *[32]byte) (h mac) {
- initialize(key, &h.r, &h.s)
- return
-}
-
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_generic.go b/src/vendor/golang.org/x/crypto/poly1305/sum_generic.go
index 1187eab78f..c942a65904 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_generic.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_generic.go
@@ -31,16 +31,18 @@ func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
h.Sum(out)
}
-func newMACGeneric(key *[32]byte) (h macGeneric) {
- initialize(key, &h.r, &h.s)
- return
+func newMACGeneric(key *[32]byte) macGeneric {
+ m := macGeneric{}
+ initialize(key, &m.macState)
+ return m
}
// macState holds numbers in saturated 64-bit little-endian limbs. That is,
// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
type macState struct {
// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
- // can grow larger during and after rounds.
+ // can grow larger during and after rounds. It must, however, remain below
+ // 2 * (2¹³⁰ - 5).
h [3]uint64
// r and s are the private key components.
r [2]uint64
@@ -97,11 +99,12 @@ const (
rMask1 = 0x0FFFFFFC0FFFFFFC
)
-func initialize(key *[32]byte, r, s *[2]uint64) {
- r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
- r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
- s[0] = binary.LittleEndian.Uint64(key[16:24])
- s[1] = binary.LittleEndian.Uint64(key[24:32])
+// initialize loads the 256-bit key into the two 128-bit secret values r and s.
+func initialize(key *[32]byte, m *macState) {
+ m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
+ m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
+ m.s[0] = binary.LittleEndian.Uint64(key[16:24])
+ m.s[1] = binary.LittleEndian.Uint64(key[24:32])
}
// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
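The rMask constants implement RFC 8439 "clamping": the top four bits of bytes
3, 7, 11 and 15 of r and the bottom two bits of bytes 4, 8 and 12 are cleared.
In the two 64-bit limbs that is rMask0 = 0x0FFFFFFC0FFFFFFF (cut off in the
context above) and rMask1 = 0x0FFFFFFC0FFFFFFC. A sketch of what initialize
computes:

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	func main() {
		const (
			rMask0 = 0x0FFFFFFC0FFFFFFF
			rMask1 = 0x0FFFFFFC0FFFFFFC
		)
		key := make([]byte, 32)
		for i := range key {
			key[i] = 0xff
		}
		// r is clamped from key[0:16]; s is taken verbatim from key[16:32].
		r0 := binary.LittleEndian.Uint64(key[0:8]) & rMask0
		r1 := binary.LittleEndian.Uint64(key[8:16]) & rMask1
		fmt.Printf("%x %x\n", r0, r1) // ffffffc0fffffff ffffffc0ffffffc
	}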
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_noasm.go b/src/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
deleted file mode 100644
index 2e3ae34c7d..0000000000
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x,!go1.11 !amd64,!s390x,!ppc64le gccgo purego
-
-package poly1305
-
-func sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
- h := newMAC(key)
- h.Write(msg)
- h.Sum(out)
-}
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go b/src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
index 92597bb8c2..2e7a120b19 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
@@ -9,17 +9,6 @@ package poly1305
//go:noescape
func update(state *macState, msg []byte)
-func sum(out *[16]byte, m []byte, key *[32]byte) {
- h := newMAC(key)
- h.Write(m)
- h.Sum(out)
-}
-
-func newMAC(key *[32]byte) (h mac) {
- initialize(key, &h.r, &h.s)
- return
-}
-
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go b/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
index 5f91ff84a9..958fedc079 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build go1.11,!gccgo,!purego
+// +build !gccgo,!purego
package poly1305
@@ -10,30 +10,66 @@ import (
"golang.org/x/sys/cpu"
)
-// poly1305vx is an assembly implementation of Poly1305 that uses vector
+// updateVX is an assembly implementation of Poly1305 that uses vector
// instructions. It must only be called if the vector facility (vx) is
// available.
//go:noescape
-func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+func updateVX(state *macState, msg []byte)
-// poly1305vmsl is an assembly implementation of Poly1305 that uses vector
-// instructions, including VMSL. It must only be called if the vector facility (vx) is
-// available and if VMSL is supported.
-//go:noescape
-func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+// mac is a replacement for macGeneric that uses a larger buffer and redirects
+// calls that would have gone to updateGeneric to updateVX if the vector
+// facility is installed.
+//
+// A larger buffer is required for good performance because the vector
+// implementation has a higher fixed cost per call than the generic
+// implementation.
+type mac struct {
+ macState
+
+ buffer [16 * TagSize]byte // size must be a multiple of block size (16)
+ offset int
+}
-func sum(out *[16]byte, m []byte, key *[32]byte) {
- if cpu.S390X.HasVX {
- var mPtr *byte
- if len(m) > 0 {
- mPtr = &m[0]
+func (h *mac) Write(p []byte) (int, error) {
+ nn := len(p)
+ if h.offset > 0 {
+ n := copy(h.buffer[h.offset:], p)
+ if h.offset+n < len(h.buffer) {
+ h.offset += n
+ return nn, nil
}
- if cpu.S390X.HasVXE && len(m) > 256 {
- poly1305vmsl(out, mPtr, uint64(len(m)), key)
+ p = p[n:]
+ h.offset = 0
+ if cpu.S390X.HasVX {
+ updateVX(&h.macState, h.buffer[:])
} else {
- poly1305vx(out, mPtr, uint64(len(m)), key)
+ updateGeneric(&h.macState, h.buffer[:])
}
- } else {
- sumGeneric(out, m, key)
}
+
+ tail := len(p) % len(h.buffer) // number of bytes to copy into buffer
+ body := len(p) - tail // number of bytes to process now
+ if body > 0 {
+ if cpu.S390X.HasVX {
+ updateVX(&h.macState, p[:body])
+ } else {
+ updateGeneric(&h.macState, p[:body])
+ }
+ }
+ h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0
+ return nn, nil
+}
+
+func (h *mac) Sum(out *[TagSize]byte) {
+ state := h.macState
+ remainder := h.buffer[:h.offset]
+
+ // Use the generic implementation if we have 2 or fewer blocks left
+ // to sum. The vector implementation has a higher startup time.
+ if cpu.S390X.HasVX && len(remainder) > 2*TagSize {
+ updateVX(&state, remainder)
+ } else if len(remainder) > 0 {
+ updateGeneric(&state, remainder)
+ }
+ finalize(out, &state.h, &state.s)
}
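The Write logic above is the standard buffered-writer pattern: top up and flush
the internal buffer, hand the bulk of the input to the block function directly,
and stash the tail for the next call. A self-contained sketch of the same
control flow (bufWriter and its sizes are illustrative stand-ins):

	package main

	import "fmt"

	type bufWriter struct {
		buffer [64]byte     // stand-in for the 16 * TagSize buffer
		offset int          // buffered byte count
		flush  func([]byte) // stand-in for updateVX/updateGeneric
	}

	func (w *bufWriter) Write(p []byte) (int, error) {
		nn := len(p)
		if w.offset > 0 {
			n := copy(w.buffer[w.offset:], p)
			if w.offset+n < len(w.buffer) {
				w.offset += n
				return nn, nil // buffer not yet full
			}
			p = p[n:]
			w.offset = 0
			w.flush(w.buffer[:])
		}
		tail := len(p) % len(w.buffer) // bytes to keep for next call
		if body := len(p) - tail; body > 0 {
			w.flush(p[:body]) // process the bulk directly
		}
		w.offset = copy(w.buffer[:], p[len(p)-tail:])
		return nn, nil
	}

	func main() {
		w := &bufWriter{flush: func(b []byte) { fmt.Println("flush", len(b)) }}
		w.Write(make([]byte, 100)) // flush 64, keep 36 buffered
		fmt.Println("buffered:", w.offset)
	}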
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s b/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
index 806d1694b0..0fa9ee6e0b 100644
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
+++ b/src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
@@ -2,115 +2,187 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build go1.11,!gccgo,!purego
+// +build !gccgo,!purego
#include "textflag.h"
-// Implementation of Poly1305 using the vector facility (vx).
-
-// constants
-#define MOD26 V0
-#define EX0 V1
-#define EX1 V2
-#define EX2 V3
-
-// temporaries
-#define T_0 V4
-#define T_1 V5
-#define T_2 V6
-#define T_3 V7
-#define T_4 V8
-
-// key (r)
-#define R_0 V9
-#define R_1 V10
-#define R_2 V11
-#define R_3 V12
-#define R_4 V13
-#define R5_1 V14
-#define R5_2 V15
-#define R5_3 V16
-#define R5_4 V17
-#define RSAVE_0 R5
-#define RSAVE_1 R6
-#define RSAVE_2 R7
-#define RSAVE_3 R8
-#define RSAVE_4 R9
-#define R5SAVE_1 V28
-#define R5SAVE_2 V29
-#define R5SAVE_3 V30
-#define R5SAVE_4 V31
-
-// message block
-#define F_0 V18
-#define F_1 V19
-#define F_2 V20
-#define F_3 V21
-#define F_4 V22
-
-// accumulator
-#define H_0 V23
-#define H_1 V24
-#define H_2 V25
-#define H_3 V26
-#define H_4 V27
-
-GLOBL ·keyMask<>(SB), RODATA, $16
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
-
-GLOBL ·bswapMask<>(SB), RODATA, $16
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
-
-GLOBL ·constants<>(SB), RODATA, $64
-// MOD26
-DATA ·constants<>+0(SB)/8, $0x3ffffff
-DATA ·constants<>+8(SB)/8, $0x3ffffff
+// This implementation of Poly1305 uses the vector facility (vx)
+// to process up to 2 blocks (32 bytes) per iteration using an
+// algorithm based on the one described in:
+//
+// NEON crypto, Daniel J. Bernstein & Peter Schwabe
+// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+//
+// This algorithm uses 5 26-bit limbs to represent a 130-bit
+// value. These limbs are, for the most part, zero extended and
+// placed into 64-bit vector register elements. Each vector
+// register is 128-bits wide and so holds 2 of these elements.
+// Using 26-bit limbs allows us plenty of headroom to accommodate
+// accumulations before and after multiplication without
+// overflowing either 32-bits (before multiplication) or 64-bits
+// (after multiplication).
+//
+// In order to parallelise the operations required to calculate
+// the sum we use two separate accumulators and then sum those
+// in an extra final step. For compatibility with the generic
+// implementation we perform this summation at the end of every
+// updateVX call.
+//
+// To use two accumulators we must multiply the message blocks
+// by r² rather than r. Only the final message block should be
+// multiplied by r.
+//
+// Example:
+//
+// We want to calculate the sum (h) for a 64 byte message (m):
+//
+// h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r
+//
+// To do this we split the calculation into the even indices
+// and odd indices of the message. These form our SIMD 'lanes':
+//
+// h = m[ 0:16]r⁴ + m[32:48]r² + <- lane 0
+// m[16:32]r³ + m[48:64]r <- lane 1
+//
+// To calculate this iteratively we refactor so that both lanes
+// are written in terms of r² and r:
+//
+// h = (m[ 0:16]r² + m[32:48])r² + <- lane 0
+// (m[16:32]r² + m[48:64])r <- lane 1
+// ^ ^
+// | coefficients for second iteration
+// coefficients for first iteration
+//
+// So in this case we would have two iterations. In the first
+// both lanes are multiplied by r². In the second only the
+// first lane is multiplied by r² and the second lane is
+// instead multiplied by r. This gives us the odd and even
+// powers of r that we need from the original equation.
+//
+// Notation:
+//
+// h - accumulator
+// r - key
+// m - message
+//
+// [a, b] - SIMD register holding two 64-bit values
+// [a, b, c, d] - SIMD register holding four 32-bit values
+// xᵢ[n] - limb n of variable x with bit width i
+//
+// Limbs are expressed in little endian order, so for 26-bit
+// limbs x₂₆[4] will be the most significant limb and x₂₆[0]
+// will be the least significant limb.
+
+// masking constants
+#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits
+#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits
+
+// expansion constants (see EXPAND macro)
+#define EX0 V2
+#define EX1 V3
+#define EX2 V4
+
+// key (r², r or 1 depending on context)
+#define R_0 V5
+#define R_1 V6
+#define R_2 V7
+#define R_3 V8
+#define R_4 V9
+
+// precalculated coefficients (5r², 5r or 0 depending on context)
+#define R5_1 V10
+#define R5_2 V11
+#define R5_3 V12
+#define R5_4 V13
+
+// message block (m)
+#define M_0 V14
+#define M_1 V15
+#define M_2 V16
+#define M_3 V17
+#define M_4 V18
+
+// accumulator (h)
+#define H_0 V19
+#define H_1 V20
+#define H_2 V21
+#define H_3 V22
+#define H_4 V23
+
+// temporary registers (for short-lived values)
+#define T_0 V24
+#define T_1 V25
+#define T_2 V26
+#define T_3 V27
+#define T_4 V28
+
+GLOBL ·constants<>(SB), RODATA, $0x30
// EX0
-DATA ·constants<>+16(SB)/8, $0x0006050403020100
-DATA ·constants<>+24(SB)/8, $0x1016151413121110
+DATA ·constants<>+0x00(SB)/8, $0x0006050403020100
+DATA ·constants<>+0x08(SB)/8, $0x1016151413121110
// EX1
-DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
-DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
+DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706
+DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716
// EX2
-DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
-DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
-
-// h = (f*g) % (2**130-5) [partial reduction]
+DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d
+DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d
+
+// MULTIPLY multiplies each lane of f and g, partially reduced
+// modulo 2¹³⁰ - 5. The result, h, consists of partial products
+// in each lane that need to be reduced further to produce the
+// final result.
+//
+// h₁₃₀ = (f₁₃₀g₁₃₀) % 2¹³⁰ + (5f₁₃₀g₁₃₀) / 2¹³⁰
+//
+// Note that the multiplication by 5 of the high bits is
+// achieved by precalculating the multiplication of four of the
+// g coefficients by 5. These are g51-g54.
#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
VMLOF f0, g0, h0 \
- VMLOF f0, g1, h1 \
- VMLOF f0, g2, h2 \
VMLOF f0, g3, h3 \
+ VMLOF f0, g1, h1 \
VMLOF f0, g4, h4 \
+ VMLOF f0, g2, h2 \
VMLOF f1, g54, T_0 \
- VMLOF f1, g0, T_1 \
- VMLOF f1, g1, T_2 \
VMLOF f1, g2, T_3 \
+ VMLOF f1, g0, T_1 \
VMLOF f1, g3, T_4 \
+ VMLOF f1, g1, T_2 \
VMALOF f2, g53, h0, h0 \
- VMALOF f2, g54, h1, h1 \
- VMALOF f2, g0, h2, h2 \
VMALOF f2, g1, h3, h3 \
+ VMALOF f2, g54, h1, h1 \
VMALOF f2, g2, h4, h4 \
+ VMALOF f2, g0, h2, h2 \
VMALOF f3, g52, T_0, T_0 \
- VMALOF f3, g53, T_1, T_1 \
- VMALOF f3, g54, T_2, T_2 \
VMALOF f3, g0, T_3, T_3 \
+ VMALOF f3, g53, T_1, T_1 \
VMALOF f3, g1, T_4, T_4 \
+ VMALOF f3, g54, T_2, T_2 \
VMALOF f4, g51, h0, h0 \
- VMALOF f4, g52, h1, h1 \
- VMALOF f4, g53, h2, h2 \
VMALOF f4, g54, h3, h3 \
+ VMALOF f4, g52, h1, h1 \
VMALOF f4, g0, h4, h4 \
+ VMALOF f4, g53, h2, h2 \
VAG T_0, h0, h0 \
- VAG T_1, h1, h1 \
- VAG T_2, h2, h2 \
VAG T_3, h3, h3 \
- VAG T_4, h4, h4
-
-// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
+ VAG T_1, h1, h1 \
+ VAG T_4, h4, h4 \
+ VAG T_2, h2, h2
+
+// REDUCE performs the following carry operations in four
+// stages, as specified in Bernstein & Schwabe:
+//
+// 1: h₂₆[0]->h₂₆[1] h₂₆[3]->h₂₆[4]
+// 2: h₂₆[1]->h₂₆[2] h₂₆[4]->h₂₆[0]
+// 3: h₂₆[0]->h₂₆[1] h₂₆[2]->h₂₆[3]
+// 4: h₂₆[3]->h₂₆[4]
+//
+// The result is that all of the limbs are limited to 26-bits
+// except for h₂₆[1] and h₂₆[4] which are limited to 27-bits.
+//
+// Note that although each limb is aligned at 26-bit intervals
+// they may contain values that exceed 2²⁶ - 1, hence the need
+// to carry the excess bits in each limb.
#define REDUCE(h0, h1, h2, h3, h4) \
VESRLG $26, h0, T_0 \
VESRLG $26, h3, T_1 \
@@ -136,144 +208,155 @@ DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
VN MOD26, h3, h3 \
VAG T_2, h4, h4
-// expand in0 into d[0] and in1 into d[1]
+// EXPAND splits the 128-bit little-endian values in0 and in1
+// into 26-bit big-endian limbs and places the results into
+// the first and second lane of d₂₆[0:4] respectively.
+//
+// The EX0, EX1 and EX2 constants are arrays of byte indices
+// for permutation. The permutation both reverses the bytes
+// in the input and ensures the bytes are copied into the
+// destination limb ready to be shifted into their final
+// position.
#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
- VGBM $0x0707, d1 \ // d1=tmp
- VPERM in0, in1, EX2, d4 \
VPERM in0, in1, EX0, d0 \
VPERM in0, in1, EX1, d2 \
- VN d1, d4, d4 \
+ VPERM in0, in1, EX2, d4 \
VESRLG $26, d0, d1 \
VESRLG $30, d2, d3 \
VESRLG $4, d2, d2 \
- VN MOD26, d0, d0 \
- VN MOD26, d1, d1 \
- VN MOD26, d2, d2 \
- VN MOD26, d3, d3
-
-// pack h4:h0 into h1:h0 (no carry)
-#define PACK(h0, h1, h2, h3, h4) \
- VESLG $26, h1, h1 \
- VESLG $26, h3, h3 \
- VO h0, h1, h0 \
- VO h2, h3, h2 \
- VESLG $4, h2, h2 \
- VLEIB $7, $48, h1 \
- VSLB h1, h2, h2 \
- VO h0, h2, h0 \
- VLEIB $7, $104, h1 \
- VSLB h1, h4, h3 \
- VO h3, h0, h0 \
- VLEIB $7, $24, h1 \
- VSRLB h1, h4, h1
-
-// if h > 2**130-5 then h -= 2**130-5
-#define MOD(h0, h1, t0, t1, t2) \
- VZERO t0 \
- VLEIG $1, $5, t0 \
- VACCQ h0, t0, t1 \
- VAQ h0, t0, t0 \
- VONE t2 \
- VLEIG $1, $-4, t2 \
- VAQ t2, t1, t1 \
- VACCQ h1, t1, t1 \
- VONE t2 \
- VAQ t2, t1, t1 \
- VN h0, t1, t2 \
- VNC t0, t1, t1 \
- VO t1, t2, h0
-
-// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]key)
-TEXT ·poly1305vx(SB), $0-32
- // This code processes up to 2 blocks (32 bytes) per iteration
- // using the algorithm described in:
- // NEON crypto, Daniel J. Bernstein & Peter Schwabe
- // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
- LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
-
- // load MOD26, EX0, EX1 and EX2
+ VN MOD26, d0, d0 \ // [in0₂₆[0], in1₂₆[0]]
+ VN MOD26, d3, d3 \ // [in0₂₆[3], in1₂₆[3]]
+ VN MOD26, d1, d1 \ // [in0₂₆[1], in1₂₆[1]]
+ VN MOD24, d4, d4 \ // [in0₂₆[4], in1₂₆[4]]
+ VN MOD26, d2, d2 // [in0₂₆[2], in1₂₆[2]]
+
+// func updateVX(state *macState, msg []byte)
+TEXT ·updateVX(SB), NOSPLIT, $0
+ MOVD state+0(FP), R1
+ LMG msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len
+
+ // load EX0, EX1 and EX2
MOVD $·constants<>(SB), R5
- VLM (R5), MOD26, EX2
-
- // setup r
- VL (R4), T_0
- MOVD $·keyMask<>(SB), R6
- VL (R6), T_1
- VN T_0, T_1, T_0
- EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
-
- // setup r*5
- VLEIG $0, $5, T_0
- VLEIG $1, $5, T_0
-
- // store r (for final block)
- VMLOF T_0, R_1, R5SAVE_1
- VMLOF T_0, R_2, R5SAVE_2
- VMLOF T_0, R_3, R5SAVE_3
- VMLOF T_0, R_4, R5SAVE_4
- VLGVG $0, R_0, RSAVE_0
- VLGVG $0, R_1, RSAVE_1
- VLGVG $0, R_2, RSAVE_2
- VLGVG $0, R_3, RSAVE_3
- VLGVG $0, R_4, RSAVE_4
-
- // skip r**2 calculation
+ VLM (R5), EX0, EX2
+
+ // generate masks
+ VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff]
+ VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff]
+
+ // load h (accumulator) and r (key) from state
+ VZERO T_1 // [0, 0]
+ VL 0(R1), T_0 // [h₆₄[0], h₆₄[1]]
+ VLEG $0, 16(R1), T_1 // [h₆₄[2], 0]
+ VL 24(R1), T_2 // [r₆₄[0], r₆₄[1]]
+ VPDI $0, T_0, T_2, T_3 // [h₆₄[0], r₆₄[0]]
+ VPDI $5, T_0, T_2, T_4 // [h₆₄[1], r₆₄[1]]
+
+ // unpack h and r into 26-bit limbs
+ // note: h₆₄[2] may have the low 3 bits set, so h₂₆[4] is a 27-bit value
+ VN MOD26, T_3, H_0 // [h₂₆[0], r₂₆[0]]
+ VZERO H_1 // [0, 0]
+ VZERO H_3 // [0, 0]
+ VGMG $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out
+ VESLG $24, T_1, T_1 // [h₆₄[2]<<24, 0]
+ VERIMG $-26&63, T_3, MOD26, H_1 // [h₂₆[1], r₂₆[1]]
+ VESRLG $+52&63, T_3, H_2 // [h₂₆[2], r₂₆[2]] - low 12 bits only
+ VERIMG $-14&63, T_4, MOD26, H_3 // [h₂₆[3], r₂₆[3]]
+ VESRLG $40, T_4, H_4 // [h₂₆[4], r₂₆[4]] - low 24 bits only
+ VERIMG $+12&63, T_4, T_0, H_2 // [h₂₆[2], r₂₆[2]] - complete
+ VO T_1, H_4, H_4 // [h₂₆[4], r₂₆[4]] - complete
+
+ // replicate r across all 4 vector elements
+ VREPF $3, H_0, R_0 // [r₂₆[0], r₂₆[0], r₂₆[0], r₂₆[0]]
+ VREPF $3, H_1, R_1 // [r₂₆[1], r₂₆[1], r₂₆[1], r₂₆[1]]
+ VREPF $3, H_2, R_2 // [r₂₆[2], r₂₆[2], r₂₆[2], r₂₆[2]]
+ VREPF $3, H_3, R_3 // [r₂₆[3], r₂₆[3], r₂₆[3], r₂₆[3]]
+ VREPF $3, H_4, R_4 // [r₂₆[4], r₂₆[4], r₂₆[4], r₂₆[4]]
+
+ // zero out lane 1 of h
+ VLEIG $1, $0, H_0 // [h₂₆[0], 0]
+ VLEIG $1, $0, H_1 // [h₂₆[1], 0]
+ VLEIG $1, $0, H_2 // [h₂₆[2], 0]
+ VLEIG $1, $0, H_3 // [h₂₆[3], 0]
+ VLEIG $1, $0, H_4 // [h₂₆[4], 0]
+
+ // calculate 5r (ignore least significant limb)
+ VREPIF $5, T_0
+ VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r₂₆[1], 5r₂₆[1], 5r₂₆[1]]
+ VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r₂₆[2], 5r₂₆[2], 5r₂₆[2]]
+ VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r₂₆[3], 5r₂₆[3], 5r₂₆[3]]
+ VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r₂₆[4], 5r₂₆[4], 5r₂₆[4]]
+
+ // skip r² calculation if we are only calculating one block
CMPBLE R3, $16, skip
- // calculate r**2
- MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
- REDUCE(H_0, H_1, H_2, H_3, H_4)
- VLEIG $0, $5, T_0
- VLEIG $1, $5, T_0
- VMLOF T_0, H_1, R5_1
- VMLOF T_0, H_2, R5_2
- VMLOF T_0, H_3, R5_3
- VMLOF T_0, H_4, R5_4
- VLR H_0, R_0
- VLR H_1, R_1
- VLR H_2, R_2
- VLR H_3, R_3
- VLR H_4, R_4
-
- // initialize h
- VZERO H_0
- VZERO H_1
- VZERO H_2
- VZERO H_3
- VZERO H_4
+ // calculate r²
+ MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4)
+ REDUCE(M_0, M_1, M_2, M_3, M_4)
+ VGBM $0x0f0f, T_0
+ VERIMG $0, M_0, T_0, R_0 // [r₂₆[0], r²₂₆[0], r₂₆[0], r²₂₆[0]]
+ VERIMG $0, M_1, T_0, R_1 // [r₂₆[1], r²₂₆[1], r₂₆[1], r²₂₆[1]]
+ VERIMG $0, M_2, T_0, R_2 // [r₂₆[2], r²₂₆[2], r₂₆[2], r²₂₆[2]]
+ VERIMG $0, M_3, T_0, R_3 // [r₂₆[3], r²₂₆[3], r₂₆[3], r²₂₆[3]]
+ VERIMG $0, M_4, T_0, R_4 // [r₂₆[4], r²₂₆[4], r₂₆[4], r²₂₆[4]]
+
+ // calculate 5r² (ignore least significant limb)
+ VREPIF $5, T_0
+ VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r²₂₆[1], 5r₂₆[1], 5r²₂₆[1]]
+ VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r²₂₆[2], 5r₂₆[2], 5r²₂₆[2]]
+ VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r²₂₆[3], 5r₂₆[3], 5r²₂₆[3]]
+ VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r²₂₆[4], 5r₂₆[4], 5r²₂₆[4]]
loop:
- CMPBLE R3, $32, b2
- VLM (R2), T_0, T_1
- SUB $32, R3
- MOVD $32(R2), R2
- EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
- VLEIB $4, $1, F_4
- VLEIB $12, $1, F_4
+ CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients
+
+ // load next 2 blocks from message
+ VLM (R2), T_0, T_1
+
+ // update message slice
+ SUB $32, R3
+ MOVD $32(R2), R2
+
+ // unpack message blocks into 26-bit big-endian limbs
+ EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
+
+ // add 2¹²⁸ to each message block value
+ VLEIB $4, $1, M_4
+ VLEIB $12, $1, M_4
multiply:
- VAG H_0, F_0, F_0
- VAG H_1, F_1, F_1
- VAG H_2, F_2, F_2
- VAG H_3, F_3, F_3
- VAG H_4, F_4, F_4
- MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
+ // accumulate the incoming message
+ VAG H_0, M_0, M_0
+ VAG H_3, M_3, M_3
+ VAG H_1, M_1, M_1
+ VAG H_4, M_4, M_4
+ VAG H_2, M_2, M_2
+
+ // multiply the accumulator by the key coefficient
+ MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
+
+ // carry and partially reduce the partial products
REDUCE(H_0, H_1, H_2, H_3, H_4)
+
CMPBNE R3, $0, loop
finish:
- // sum vectors
+ // sum lane 0 and lane 1 and put the result in lane 1
VZERO T_0
VSUMQG H_0, T_0, H_0
- VSUMQG H_1, T_0, H_1
- VSUMQG H_2, T_0, H_2
VSUMQG H_3, T_0, H_3
+ VSUMQG H_1, T_0, H_1
VSUMQG H_4, T_0, H_4
+ VSUMQG H_2, T_0, H_2
- // h may be >= 2*(2**130-5) so we need to reduce it again
+ // reduce again after summation
+ // TODO(mundaym): there might be a more efficient way to do this
+ // now that we only have 1 active lane. For example, we could
+ // simultaneously pack the values as we reduce them.
REDUCE(H_0, H_1, H_2, H_3, H_4)
- // carry h1->h4
+ // carry h[1] through to h[4] so that only h[4] can exceed 2²⁶ - 1
+ // TODO(mundaym): in testing this final carry was unnecessary.
+ // Needs a proof before it can be removed though.
VESRLG $26, H_1, T_1
VN MOD26, H_1, H_1
VAQ T_1, H_2, H_2
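The carry chain REDUCE implements is easier to follow in scalar form: each
limb's excess bits move up one limb, and the bits carried out of the top limb
wrap around multiplied by 5, since 2¹³⁰ ≡ 5 (mod 2¹³⁰ - 5). A serial Go sketch
of one pass (the vector code interleaves the stages differently):

	package main

	import "fmt"

	// carry is a scalar model of one REDUCE pass over five 26-bit
	// limbs: excess bits move into the next limb, and bits carried
	// out of limb 4 wrap around multiplied by 5.
	func carry(h *[5]uint64) {
		const mask26 = 1<<26 - 1
		for i := 0; i < 4; i++ {
			h[i+1] += h[i] >> 26
			h[i] &= mask26
		}
		h[0] += 5 * (h[4] >> 26)
		h[4] &= mask26
	}

	func main() {
		h := [5]uint64{1 << 30, 0, 0, 0, 1 << 28}
		carry(&h)
		fmt.Println(h) // [20 16 0 0 0]
	}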
@@ -284,95 +367,137 @@ finish:
VN MOD26, H_3, H_3
VAQ T_3, H_4, H_4
- // h is now < 2*(2**130-5)
- // pack h into h1 (hi) and h0 (lo)
- PACK(H_0, H_1, H_2, H_3, H_4)
-
- // if h > 2**130-5 then h -= 2**130-5
- MOD(H_0, H_1, T_0, T_1, T_2)
-
- // h += s
- MOVD $·bswapMask<>(SB), R5
- VL (R5), T_1
- VL 16(R4), T_0
- VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
- VAQ T_0, H_0, H_0
- VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little)
- VST H_0, (R1)
-
+ // h is now < 2(2¹³⁰ - 5)
+ // Pack each lane in h₂₆[0:4] into h₁₂₈[0:1].
+ VESLG $26, H_1, H_1
+ VESLG $26, H_3, H_3
+ VO H_0, H_1, H_0
+ VO H_2, H_3, H_2
+ VESLG $4, H_2, H_2
+ VLEIB $7, $48, H_1
+ VSLB H_1, H_2, H_2
+ VO H_0, H_2, H_0
+ VLEIB $7, $104, H_1
+ VSLB H_1, H_4, H_3
+ VO H_3, H_0, H_0
+ VLEIB $7, $24, H_1
+ VSRLB H_1, H_4, H_1
+
+ // update state
+ VSTEG $1, H_0, 0(R1)
+ VSTEG $0, H_0, 8(R1)
+ VSTEG $1, H_1, 16(R1)
RET
-b2:
+b2: // 2 or fewer blocks remaining
CMPBLE R3, $16, b1
- // 2 blocks remaining
- SUB $17, R3
- VL (R2), T_0
- VLL R3, 16(R2), T_1
- ADD $1, R3
+ // Load the 2 remaining blocks (17-32 bytes remaining).
+ MOVD $-17(R3), R0 // index of final byte to load modulo 16
+ VL (R2), T_0 // load full 16 byte block
+ VLL R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes
+
+ // The Poly1305 algorithm requires that a 1 bit be appended to
+ // each message block. If the final block is less than 16 bytes
+ // long then it is easiest to insert the 1 before the message
+ // block is split into 26-bit limbs. If, on the other hand, the
+ // final message block is 16 bytes long then we append the 1 bit
+ // after expansion as normal.
MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, T_1
- EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
+ MOVD $-16(R3), R3 // index of byte in last block to insert 1 at (could be 16)
+ CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long
+ VLVGB R3, R0, T_1 // insert 1 into the byte at index R3
+
+ // Split both blocks into 26-bit limbs in the appropriate lanes.
+ EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
+
+ // Append a 1 byte to the end of the second to last block.
+ VLEIB $4, $1, M_4
+
+ // Append a 1 byte to the end of the last block only if it is a
+ // full 16 byte block.
CMPBNE R3, $16, 2(PC)
- VLEIB $12, $1, F_4
- VLEIB $4, $1, F_4
-
- // setup [r²,r]
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, RSAVE_3, R_3
- VLVGG $1, RSAVE_4, R_4
- VPDI $0, R5_1, R5SAVE_1, R5_1
- VPDI $0, R5_2, R5SAVE_2, R5_2
- VPDI $0, R5_3, R5SAVE_3, R5_3
- VPDI $0, R5_4, R5SAVE_4, R5_4
+ VLEIB $12, $1, M_4
+
+ // Finally, set up the coefficients for the final multiplication.
+ // We have previously saved r and 5r in the 32-bit even indexes
+ // of the R_[0-4] and R5_[1-4] coefficient registers.
+ //
+ // We want lane 0 to be multiplied by r² so that it can be kept the
+ // same. We want lane 1 to be multiplied by r so we need to move
+ // the saved r value into the 32-bit odd index in lane 1 by
+ // rotating the 64-bit lane by 32.
+ VGBM $0x00ff, T_0 // [0, 0xffffffffffffffff] - mask lane 1 only
+ VERIMG $32, R_0, T_0, R_0 // [_, r²₂₆[0], _, r₂₆[0]]
+ VERIMG $32, R_1, T_0, R_1 // [_, r²₂₆[1], _, r₂₆[1]]
+ VERIMG $32, R_2, T_0, R_2 // [_, r²₂₆[2], _, r₂₆[2]]
+ VERIMG $32, R_3, T_0, R_3 // [_, r²₂₆[3], _, r₂₆[3]]
+ VERIMG $32, R_4, T_0, R_4 // [_, r²₂₆[4], _, r₂₆[4]]
+ VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²₂₆[1], _, 5r₂₆[1]]
+ VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²₂₆[2], _, 5r₂₆[2]]
+ VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²₂₆[3], _, 5r₂₆[3]]
+ VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²₂₆[4], _, 5r₂₆[4]]
MOVD $0, R3
BR multiply
skip:
- VZERO H_0
- VZERO H_1
- VZERO H_2
- VZERO H_3
- VZERO H_4
-
CMPBEQ R3, $0, finish
-b1:
- // 1 block remaining
- SUB $1, R3
- VLL R3, (R2), T_0
- ADD $1, R3
+b1: // 1 block remaining
+
+ // Load the final block (1-16 bytes). This will be placed into
+ // lane 0.
+ MOVD $-1(R3), R0
+ VLL R0, (R2), T_0 // pad to 16 bytes with zeros
+
+ // The Poly1305 algorithm requires that a 1 bit be appended to
+ // each message block. If the final block is less than 16 bytes
+ // long then it is easiest to insert the 1 before the message
+ // block is split into 26-bit limbs. If, on the other hand, the
+ // final message block is 16 bytes long then we append the 1 bit
+ // after expansion as normal.
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, T_0
- VZERO T_1
- EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
+
+ // Set the message block in lane 1 to the value 0 so that it
+ // can be accumulated without affecting the final result.
+ VZERO T_1
+
+ // Split the final message block into 26-bit limbs in lane 0.
+ // Lane 1 will contain 0.
+ EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
+
+ // Append a 1 byte to the end of the last block only if it is a
+ // full 16 byte block.
CMPBNE R3, $16, 2(PC)
- VLEIB $4, $1, F_4
- VLEIG $1, $1, R_0
- VZERO R_1
- VZERO R_2
- VZERO R_3
- VZERO R_4
- VZERO R5_1
- VZERO R5_2
- VZERO R5_3
- VZERO R5_4
-
- // setup [r, 1]
- VLVGG $0, RSAVE_0, R_0
- VLVGG $0, RSAVE_1, R_1
- VLVGG $0, RSAVE_2, R_2
- VLVGG $0, RSAVE_3, R_3
- VLVGG $0, RSAVE_4, R_4
- VPDI $0, R5SAVE_1, R5_1, R5_1
- VPDI $0, R5SAVE_2, R5_2, R5_2
- VPDI $0, R5SAVE_3, R5_3, R5_3
- VPDI $0, R5SAVE_4, R5_4, R5_4
+ VLEIB $4, $1, M_4
+
+ // We have previously saved r and 5r in the 32-bit even indexes
+ // of the R_[0-4] and R5_[1-4] coefficient registers.
+ //
+ // We want lane 0 to be multiplied by r so we need to move the
+ // saved r value into the 32-bit odd index in lane 0. We want
+ // lane 1 to be set to the value 1. This makes multiplication
+ // a no-op. We do this by setting lane 1 in every register to 0
+ // and then just setting the 32-bit index 3 in R_0 to 1.
+ VZERO T_0
+ MOVD $0, R0
+ MOVD $0x10111213, R12
+ VLVGP R12, R0, T_1 // [_, 0x10111213, _, 0x00000000]
+ VPERM T_0, R_0, T_1, R_0 // [_, r₂₆[0], _, 0]
+ VPERM T_0, R_1, T_1, R_1 // [_, r₂₆[1], _, 0]
+ VPERM T_0, R_2, T_1, R_2 // [_, r₂₆[2], _, 0]
+ VPERM T_0, R_3, T_1, R_3 // [_, r₂₆[3], _, 0]
+ VPERM T_0, R_4, T_1, R_4 // [_, r₂₆[4], _, 0]
+ VPERM T_0, R5_1, T_1, R5_1 // [_, 5r₂₆[1], _, 0]
+ VPERM T_0, R5_2, T_1, R5_2 // [_, 5r₂₆[2], _, 0]
+ VPERM T_0, R5_3, T_1, R5_3 // [_, 5r₂₆[3], _, 0]
+ VPERM T_0, R5_4, T_1, R5_4 // [_, 5r₂₆[4], _, 0]
+
+ // Set the value of lane 1 to be 1.
+ VLEIF $3, $1, R_0 // [_, r₂₆[0], _, 1]
MOVD $0, R3
BR multiply
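Both the b2 and b1 paths are inserting the "1" byte that Poly1305 appends to
every message block, so each block contributes block + 2^(8·len) to the
accumulator before the multiply. A math/big reference of the per-block step
(update and reverse are illustrative helpers, not part of the package):

	package main

	import (
		"fmt"
		"math/big"
	)

	// update models the per-block step the assembly implements:
	// interpret the block as a little-endian number, set the bit
	// just past its end (the appended 1), add it to the
	// accumulator, and multiply by r modulo 2¹³⁰ - 5.
	func update(h, r *big.Int, block []byte) *big.Int {
		p := new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 130), big.NewInt(5))
		m := new(big.Int).SetBytes(reverse(block))
		m.SetBit(m, 8*len(block), 1) // the appended 1 bit
		h = new(big.Int).Add(h, m)
		return h.Mod(h.Mul(h, r), p)
	}

	func reverse(b []byte) []byte {
		out := make([]byte, len(b))
		for i, v := range b {
			out[len(b)-1-i] = v
		}
		return out
	}

	func main() {
		h, r := big.NewInt(0), big.NewInt(99) // r is an arbitrary stand-in
		h = update(h, r, []byte("0123456789abcdef")) // full block: adds 2¹²⁸
		h = update(h, r, []byte("tail"))             // short final block
		fmt.Println(h.BitLen() <= 130)               // true
	}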
diff --git a/src/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s b/src/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
deleted file mode 100644
index b439af9369..0000000000
--- a/src/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
+++ /dev/null
@@ -1,909 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.11,!gccgo,!purego
-
-#include "textflag.h"
-
-// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
-
-// constants
-#define EX0 V1
-#define EX1 V2
-#define EX2 V3
-
-// temporaries
-#define T_0 V4
-#define T_1 V5
-#define T_2 V6
-#define T_3 V7
-#define T_4 V8
-#define T_5 V9
-#define T_6 V10
-#define T_7 V11
-#define T_8 V12
-#define T_9 V13
-#define T_10 V14
-
-// r**2 & r**4
-#define R_0 V15
-#define R_1 V16
-#define R_2 V17
-#define R5_1 V18
-#define R5_2 V19
-// key (r)
-#define RSAVE_0 R7
-#define RSAVE_1 R8
-#define RSAVE_2 R9
-#define R5SAVE_1 R10
-#define R5SAVE_2 R11
-
-// message block
-#define M0 V20
-#define M1 V21
-#define M2 V22
-#define M3 V23
-#define M4 V24
-#define M5 V25
-
-// accumulator
-#define H0_0 V26
-#define H1_0 V27
-#define H2_0 V28
-#define H0_1 V29
-#define H1_1 V30
-#define H2_1 V31
-
-GLOBL ·keyMask<>(SB), RODATA, $16
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
-
-GLOBL ·bswapMask<>(SB), RODATA, $16
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
-
-GLOBL ·constants<>(SB), RODATA, $48
-// EX0
-DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+8(SB)/8, $0x0000050403020100
-// EX1
-DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+24(SB)/8, $0x00000a0908070605
-// EX2
-DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b
-
-GLOBL ·c<>(SB), RODATA, $48
-// EX0
-DATA ·c<>+0(SB)/8, $0x0000050403020100
-DATA ·c<>+8(SB)/8, $0x0000151413121110
-// EX1
-DATA ·c<>+16(SB)/8, $0x00000a0908070605
-DATA ·c<>+24(SB)/8, $0x00001a1918171615
-// EX2
-DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
-DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b
-
-GLOBL ·reduce<>(SB), RODATA, $32
-// 44 bit
-DATA ·reduce<>+0(SB)/8, $0x0
-DATA ·reduce<>+8(SB)/8, $0xfffffffffff
-// 42 bit
-DATA ·reduce<>+16(SB)/8, $0x0
-DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
-
-// h = (f*g) % (2**130-5) [partial reduction]
-// uses T_0...T_9 temporary registers
-// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
-// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
-#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
- \ // Eliminate the dependency for the last 2 VMSLs
- VMSLG m02_0, r_2, m4_2, m4_2 \
- VMSLG m13_0, r_2, m5_2, m5_2 \ // 8 VMSLs pipelined
- VMSLG m02_0, r_0, m4_0, m4_0 \
- VMSLG m02_1, r5_2, V0, T_0 \
- VMSLG m02_0, r_1, m4_1, m4_1 \
- VMSLG m02_1, r_0, V0, T_1 \
- VMSLG m02_1, r_1, V0, T_2 \
- VMSLG m02_2, r5_1, V0, T_3 \
- VMSLG m02_2, r5_2, V0, T_4 \
- VMSLG m13_0, r_0, m5_0, m5_0 \
- VMSLG m13_1, r5_2, V0, T_5 \
- VMSLG m13_0, r_1, m5_1, m5_1 \
- VMSLG m13_1, r_0, V0, T_6 \
- VMSLG m13_1, r_1, V0, T_7 \
- VMSLG m13_2, r5_1, V0, T_8 \
- VMSLG m13_2, r5_2, V0, T_9 \
- VMSLG m02_2, r_0, m4_2, m4_2 \
- VMSLG m13_2, r_0, m5_2, m5_2 \
- VAQ m4_0, T_0, m02_0 \
- VAQ m4_1, T_1, m02_1 \
- VAQ m5_0, T_5, m13_0 \
- VAQ m5_1, T_6, m13_1 \
- VAQ m02_0, T_3, m02_0 \
- VAQ m02_1, T_4, m02_1 \
- VAQ m13_0, T_8, m13_0 \
- VAQ m13_1, T_9, m13_1 \
- VAQ m4_2, T_2, m02_2 \
- VAQ m5_2, T_7, m13_2 \
-
-// SQUARE uses three limbs of r and r_2*5 to output square of r
-// uses T_1, T_5 and T_7 temporary registers
-// input: r_0, r_1, r_2, r5_2
-// temp: TEMP0, TEMP1, TEMP2
-// output: p0, p1, p2
-#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
- VMSLG r_0, r_0, p0, p0 \
- VMSLG r_1, r5_2, V0, TEMP0 \
- VMSLG r_2, r5_2, p1, p1 \
- VMSLG r_0, r_1, V0, TEMP1 \
- VMSLG r_1, r_1, p2, p2 \
- VMSLG r_0, r_2, V0, TEMP2 \
- VAQ TEMP0, p0, p0 \
- VAQ TEMP1, p1, p1 \
- VAQ TEMP2, p2, p2 \
- VAQ TEMP0, p0, p0 \
- VAQ TEMP1, p1, p1 \
- VAQ TEMP2, p2, p2 \
-
-// carry h0->h1->h2->h0 || h3->h4->h5->h3
-// uses T_2, T_4, T_5, T_7, T_8, T_9
-// t6, t7, t8, t9, t10, t11
-// input: h0, h1, h2, h3, h4, h5
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
-// output: h0, h1, h2, h3, h4, h5
-#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
- VLM (R12), t6, t7 \ // 44 and 42 bit clear mask
- VLEIB $7, $0x28, t10 \ // 5 byte shift mask
- VREPIB $4, t8 \ // 4 bit shift mask
- VREPIB $2, t11 \ // 2 bit shift mask
- VSRLB t10, h0, t0 \ // h0 byte shift
- VSRLB t10, h1, t1 \ // h1 byte shift
- VSRLB t10, h2, t2 \ // h2 byte shift
- VSRLB t10, h3, t3 \ // h3 byte shift
- VSRLB t10, h4, t4 \ // h4 byte shift
- VSRLB t10, h5, t5 \ // h5 byte shift
- VSRL t8, t0, t0 \ // h0 bit shift
- VSRL t8, t1, t1 \ // h2 bit shift
- VSRL t11, t2, t2 \ // h2 bit shift
- VSRL t8, t3, t3 \ // h3 bit shift
- VSRL t8, t4, t4 \ // h4 bit shift
- VESLG $2, t2, t9 \ // h2 carry x5
- VSRL t11, t5, t5 \ // h5 bit shift
- VN t6, h0, h0 \ // h0 clear carry
- VAQ t2, t9, t2 \ // h2 carry x5
- VESLG $2, t5, t9 \ // h5 carry x5
- VN t6, h1, h1 \ // h1 clear carry
- VN t7, h2, h2 \ // h2 clear carry
- VAQ t5, t9, t5 \ // h5 carry x5
- VN t6, h3, h3 \ // h3 clear carry
- VN t6, h4, h4 \ // h4 clear carry
- VN t7, h5, h5 \ // h5 clear carry
- VAQ t0, h1, h1 \ // h0->h1
- VAQ t3, h4, h4 \ // h3->h4
- VAQ t1, h2, h2 \ // h1->h2
- VAQ t4, h5, h5 \ // h4->h5
- VAQ t2, h0, h0 \ // h2->h0
- VAQ t5, h3, h3 \ // h5->h3
- VREPG $1, t6, t6 \ // 44 and 42 bit masks across both halves
- VREPG $1, t7, t7 \
- VSLDB $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
- VSLDB $8, h1, h1, h1 \
- VSLDB $8, h2, h2, h2 \
- VO h0, h3, h3 \
- VO h1, h4, h4 \
- VO h2, h5, h5 \
- VESRLG $44, h3, t0 \ // 44 bit shift right
- VESRLG $44, h4, t1 \
- VESRLG $42, h5, t2 \
- VN t6, h3, h3 \ // clear carry bits
- VN t6, h4, h4 \
- VN t7, h5, h5 \
- VESLG $2, t2, t9 \ // multiply carry by 5
- VAQ t9, t2, t2 \
- VAQ t0, h4, h4 \
- VAQ t1, h5, h5 \
- VAQ t2, h3, h3 \
-
-// carry h0->h1->h2->h0
-// input: h0, h1, h2
-// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
-// output: h0, h1, h2
-#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
- VLEIB $7, $0x28, t3 \ // 5 byte shift mask
- VREPIB $4, t4 \ // 4 bit shift mask
- VREPIB $2, t7 \ // 2 bit shift mask
- VGBM $0x003F, t5 \ // mask to clear carry bits
- VSRLB t3, h0, t0 \
- VSRLB t3, h1, t1 \
- VSRLB t3, h2, t2 \
- VESRLG $4, t5, t5 \ // 44 bit clear mask
- VSRL t4, t0, t0 \
- VSRL t4, t1, t1 \
- VSRL t7, t2, t2 \
- VESRLG $2, t5, t6 \ // 42 bit clear mask
- VESLG $2, t2, t8 \
- VAQ t8, t2, t2 \
- VN t5, h0, h0 \
- VN t5, h1, h1 \
- VN t6, h2, h2 \
- VAQ t0, h1, h1 \
- VAQ t1, h2, h2 \
- VAQ t2, h0, h0 \
- VSRLB t3, h0, t0 \
- VSRLB t3, h1, t1 \
- VSRLB t3, h2, t2 \
- VSRL t4, t0, t0 \
- VSRL t4, t1, t1 \
- VSRL t7, t2, t2 \
- VN t5, h0, h0 \
- VN t5, h1, h1 \
- VESLG $2, t2, t8 \
- VN t6, h2, h2 \
- VAQ t0, h1, h1 \
- VAQ t8, t2, t2 \
- VAQ t1, h2, h2 \
- VAQ t2, h0, h0 \
-
-// expands two message blocks into the lower halfs of the d registers
-// moves the contents of the d registers into upper halfs
-// input: in1, in2, d0, d1, d2, d3, d4, d5
-// temp: TEMP0, TEMP1, TEMP2, TEMP3
-// output: d0, d1, d2, d3, d4, d5
-#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
- VGBM $0xff3f, TEMP0 \
- VGBM $0xff1f, TEMP1 \
- VESLG $4, d1, TEMP2 \
- VESLG $4, d4, TEMP3 \
- VESRLG $4, TEMP0, TEMP0 \
- VPERM in1, d0, EX0, d0 \
- VPERM in2, d3, EX0, d3 \
- VPERM in1, d2, EX2, d2 \
- VPERM in2, d5, EX2, d5 \
- VPERM in1, TEMP2, EX1, d1 \
- VPERM in2, TEMP3, EX1, d4 \
- VN TEMP0, d0, d0 \
- VN TEMP0, d3, d3 \
- VESRLG $4, d1, d1 \
- VESRLG $4, d4, d4 \
- VN TEMP1, d2, d2 \
- VN TEMP1, d5, d5 \
- VN TEMP0, d1, d1 \
- VN TEMP0, d4, d4 \
-
-// expands one message block into the lower halfs of the d registers
-// moves the contents of the d registers into upper halfs
-// input: in, d0, d1, d2
-// temp: TEMP0, TEMP1, TEMP2
-// output: d0, d1, d2
-#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
- VGBM $0xff3f, TEMP0 \
- VESLG $4, d1, TEMP2 \
- VGBM $0xff1f, TEMP1 \
- VPERM in, d0, EX0, d0 \
- VESRLG $4, TEMP0, TEMP0 \
- VPERM in, d2, EX2, d2 \
- VPERM in, TEMP2, EX1, d1 \
- VN TEMP0, d0, d0 \
- VN TEMP1, d2, d2 \
- VESRLG $4, d1, d1 \
- VN TEMP0, d1, d1 \
-
-// pack h2:h0 into h1:h0 (no carry)
-// input: h0, h1, h2
-// output: h0, h1, h2
-#define PACK(h0, h1, h2) \
- VMRLG h1, h2, h2 \ // copy h1 to upper half h2
- VESLG $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
- VO h0, h1, h0 \ // combine h0 with 20 bits from limb 1
- VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
- VLEIG $1, $0, h1 \ // clear h2 stuff from lower half of h1
- VO h0, h1, h0 \ // h0 now has 88 bits (limb 0 and 1)
- VLEIG $0, $0, h2 \ // clear upper half of h2
- VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
- VLEIB $7, $88, h1 \ // for byte shift (11 bytes)
- VSLB h1, h2, h2 \ // shift h2 11 bytes to the left
- VO h0, h2, h0 \ // combine h0 with the top bits of limb 2
- VLEIG $0, $0, h1 \ // clear upper half of h1
-
-// if h >= 2**130-5 then h -= 2**130-5
-// input: h0, h1
-// temp: t0, t1, t2
-// output: h0
-#define MOD(h0, h1, t0, t1, t2) \
- VZERO t0 \
- VLEIG $1, $5, t0 \
- VACCQ h0, t0, t1 \
- VAQ h0, t0, t0 \
- VONE t2 \
- VLEIG $1, $-4, t2 \
- VAQ t2, t1, t1 \
- VACCQ h1, t1, t1 \
- VONE t2 \
- VAQ t2, t1, t1 \
- VN h0, t1, t2 \
- VNC t0, t1, t1 \
- VO t1, t2, h0 \
-
-// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
-TEXT ·poly1305vmsl(SB), $0-32
- // This code processes 6 + up to 4 blocks (16 bytes each) per iteration
- // using the algorithm described in:
- // NEON crypto, Daniel J. Bernstein & Peter Schwabe
- // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
- // and as modified for VMSL, as described in
- // Accelerating Poly1305 Cryptographic Message Authentication on the z14
- // O'Farrell et al., CASCON 2017, pp. 48-55
- // https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
-
- LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
- VZERO V0 // c
-
- // load EX0, EX1 and EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), EX0, EX2 // c
-
- // setup r
- VL (R4), T_0
- MOVD $·keyMask<>(SB), R6
- VL (R6), T_1
- VN T_0, T_1, T_0
- VZERO T_2 // limbs for r
- VZERO T_3
- VZERO T_4
- EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)
-
- // T_2, T_3, T_4: [0, r]
-
- // setup r*20
- VLEIG $0, $0, T_0
- VLEIG $1, $20, T_0 // T_0: [0, 20]
- VZERO T_5
- VZERO T_6
- VMSLG T_0, T_3, T_5, T_5
- VMSLG T_0, T_4, T_6, T_6
-
- // store r for final block in GR
- VLGVG $1, T_2, RSAVE_0 // c
- VLGVG $1, T_3, RSAVE_1 // c
- VLGVG $1, T_4, RSAVE_2 // c
- VLGVG $1, T_5, R5SAVE_1 // c
- VLGVG $1, T_6, R5SAVE_2 // c
-
- // initialize h
- VZERO H0_0
- VZERO H1_0
- VZERO H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- // initialize pointer for reduce constants
- MOVD $·reduce<>(SB), R12
-
- // calculate r**2 and 20*(r**2)
- VZERO R_0
- VZERO R_1
- VZERO R_2
- SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
- REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
- VZERO R5_1
- VZERO R5_2
- VMSLG T_0, R_1, R5_1, R5_1
- VMSLG T_0, R_2, R5_2, R5_2
-
- // skip r**4 calculation if 3 blocks or less
- CMPBLE R3, $48, b4
-
- // calculate r**4 and 20*(r**4)
- VZERO T_8
- VZERO T_9
- VZERO T_10
- SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
- REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
- VZERO T_2
- VZERO T_3
- VMSLG T_0, T_9, T_2, T_2
- VMSLG T_0, T_10, T_3, T_3
-
- // put r**2 to the right and r**4 to the left of R_0, R_1, R_2
- VSLDB $8, T_8, T_8, T_8
- VSLDB $8, T_9, T_9, T_9
- VSLDB $8, T_10, T_10, T_10
- VSLDB $8, T_2, T_2, T_2
- VSLDB $8, T_3, T_3, T_3
-
- VO T_8, R_0, R_0
- VO T_9, R_1, R_1
- VO T_10, R_2, R_2
- VO T_2, R5_1, R5_1
- VO T_3, R5_2, R5_2
-
- CMPBLE R3, $80, load // less than or equal to 5 blocks in message
-
- // 6 (or 5+1) blocks
- SUB $81, R3
- VLM (R2), M0, M4
- VLL R3, 80(R2), M5
- ADD $1, R3
- MOVBZ $1, R0
- CMPBGE R3, $16, 2(PC)
- VLVGB R3, R0, M5
- MOVD $96(R2), R2
- EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
- EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
- VLEIB $2, $1, H2_0
- VLEIB $2, $1, H2_1
- VLEIB $10, $1, H2_0
- VLEIB $10, $1, H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO T_4
- VZERO T_10
- EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
- VLR T_4, M4
- VLEIB $10, $1, M2
- CMPBLT R3, $16, 2(PC)
- VLEIB $10, $1, T_10
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
-
- SUB $16, R3
- CMPBLE R3, $0, square
-
-load:
- // load EX0, EX1 and EX2
- MOVD $·c<>(SB), R5
- VLM (R5), EX0, EX2
-
-loop:
- CMPBLE R3, $64, add // b4 // last 4 or fewer blocks left
-
- // next 4 full blocks
- VLM (R2), M2, M5
- SUB $64, R3
- MOVD $64(R2), R2
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)
-
- // EXPACC inlined to create the [m2, m3] limbs
- VGBM $0x3f3f, T_0 // 44 bit clear mask
- VGBM $0x1f1f, T_1 // 40 bit clear mask
- VPERM M2, M3, EX0, T_3
- VESRLG $4, T_0, T_0 // 44 bit clear mask ready
- VPERM M2, M3, EX1, T_4
- VPERM M2, M3, EX2, T_5
- VN T_0, T_3, T_3
- VESRLG $4, T_4, T_4
- VN T_1, T_5, T_5
- VN T_0, T_4, T_4
- VMRHG H0_1, T_3, H0_0
- VMRHG H1_1, T_4, H1_0
- VMRHG H2_1, T_5, H2_0
- VMRLG H0_1, T_3, H0_1
- VMRLG H1_1, T_4, H1_1
- VMRLG H2_1, T_5, H2_1
- VLEIB $10, $1, H2_0
- VLEIB $10, $1, H2_1
- VPERM M4, M5, EX0, T_3
- VPERM M4, M5, EX1, T_4
- VPERM M4, M5, EX2, T_5
- VN T_0, T_3, T_3
- VESRLG $4, T_4, T_4
- VN T_1, T_5, T_5
- VN T_0, T_4, T_4
- VMRHG V0, T_3, M0
- VMRHG V0, T_4, M1
- VMRHG V0, T_5, M2
- VMRLG V0, T_3, M3
- VMRLG V0, T_4, M4
- VMRLG V0, T_5, M5
- VLEIB $10, $1, M2
- VLEIB $10, $1, M5
-
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- CMPBNE R3, $0, loop
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
-
- // load EX0, EX1, EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), EX0, EX2
-
- // sum vectors
- VAQ H0_0, H0_1, H0_0
- VAQ H1_0, H1_1, H1_0
- VAQ H2_0, H2_1, H2_0
-
- // h may be >= 2*(2**130-5) so we need to reduce it again
- // M0...M4 are used as temps here
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
-next: // carry h1->h2
- VLEIB $7, $0x28, T_1
- VREPIB $4, T_2
- VGBM $0x003F, T_3
- VESRLG $4, T_3
-
- // byte shift
- VSRLB T_1, H1_0, T_4
-
- // bit shift
- VSRL T_2, T_4, T_4
-
- // clear h1 carry bits
- VN T_3, H1_0, H1_0
-
- // add carry
- VAQ T_4, H2_0, H2_0
-
- // h is now < 2*(2**130-5)
- // pack h into h1 (hi) and h0 (lo)
- PACK(H0_0, H1_0, H2_0)
-
- // if h >= 2**130-5 then h -= 2**130-5
- MOD(H0_0, H1_0, T_0, T_1, T_2)
-
- // h += s
- MOVD $·bswapMask<>(SB), R5
- VL (R5), T_1
- VL 16(R4), T_0
- VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big endian)
- VAQ T_0, H0_0, H0_0
- VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little endian)
- VST H0_0, (R1)
- RET
-
-add:
- // load EX0, EX1, EX2
- MOVD $·constants<>(SB), R5
- VLM (R5), EX0, EX2
-
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
- CMPBLE R3, $64, b4
-
-b4:
- CMPBLE R3, $48, b3 // 3 blocks or less
-
- // 4 (3+1) blocks remaining
- SUB $49, R3
- VLM (R2), M0, M2
- VLL R3, 48(R2), M3
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M3
- MOVD $64(R2), R2
- EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
- VLEIB $10, $1, H2_0
- VLEIB $10, $1, H2_1
- VZERO M0
- VZERO M1
- VZERO M4
- VZERO M5
- VZERO T_4
- VZERO T_10
- EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
- VLR T_4, M2
- VLEIB $10, $1, M4
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, T_10
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
- SUB $16, R3
- CMPBLE R3, $0, square // this condition must always hold true!
-
-b3:
- CMPBLE R3, $32, b2
-
- // 3 blocks remaining
-
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // H*[r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)
-
- SUB $33, R3
- VLM (R2), M0, M1
- VLL R3, 32(R2), M2
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M2
-
- // H += m0
- VZERO T_1
- VZERO T_2
- VZERO T_3
- EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
- VLEIB $10, $1, T_3
- VAG H0_0, T_1, H0_0
- VAG H1_0, T_2, H1_0
- VAG H2_0, T_3, H2_0
-
- VZERO M0
- VZERO M3
- VZERO M4
- VZERO M5
- VZERO T_10
-
- // (H+m0)*r
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)
-
- // H += m1
- VZERO V0
- VZERO T_1
- VZERO T_2
- VZERO T_3
- EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
- VLEIB $10, $1, T_3
- VAQ H0_0, T_1, H0_0
- VAQ H1_0, T_2, H1_0
- VAQ H2_0, T_3, H2_0
- REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
-
- // [H, m2] * [r**2, r]
- EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, H2_0
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
- SUB $16, R3
- CMPBLE R3, $0, next // this condition must always hold true!
-
-b2:
- CMPBLE R3, $16, b1
-
- // 2 blocks remaining
-
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // H*[r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
- VMRHG V0, H0_1, H0_0
- VMRHG V0, H1_1, H1_0
- VMRHG V0, H2_1, H2_0
- VMRLG V0, H0_1, H0_1
- VMRLG V0, H1_1, H1_1
- VMRLG V0, H2_1, H2_1
-
- // move h into the left halves, leaving zeros on the right
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
-
- // get message blocks and append the 0x01 padding byte after the data
- SUB $17, R3
- VL (R2), M0
- VLL R3, 16(R2), M1
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M1
- VZERO T_6
- VZERO T_7
- VZERO T_8
- EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
- EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
- VLEIB $2, $1, T_8
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, T_8
-
- // add [m0, m1] to h
- VAG H0_0, T_6, H0_0
- VAG H1_0, T_7, H1_0
- VAG H2_0, T_8, H2_0
-
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
- VZERO T_10
- VZERO M0
-
- // at this point R_0 .. R5_2 look like [r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
- SUB $16, R3, R3
- CMPBLE R3, $0, next
-
-b1:
- CMPBLE R3, $0, next
-
- // 1 block remaining
-
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // H*[r**2, r]
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
- // set up [0, m0] limbs
- SUB $1, R3
- VLL R3, (R2), M0
- ADD $1, R3
- MOVBZ $1, R0
- CMPBEQ R3, $16, 2(PC)
- VLVGB R3, R0, M0
- VZERO T_1
- VZERO T_2
- VZERO T_3
- EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6) // limbs: [0, m]
- CMPBNE R3, $16, 2(PC)
- VLEIB $10, $1, T_3
-
- // h+m0
- VAQ H0_0, T_1, H0_0
- VAQ H1_0, T_2, H1_0
- VAQ H2_0, T_3, H2_0
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
-
- BR next
-
-square:
- // setup [r²,r]
- VSLDB $8, R_0, R_0, R_0
- VSLDB $8, R_1, R_1, R_1
- VSLDB $8, R_2, R_2, R_2
- VSLDB $8, R5_1, R5_1, R5_1
- VSLDB $8, R5_2, R5_2, R5_2
-
- VLVGG $1, RSAVE_0, R_0
- VLVGG $1, RSAVE_1, R_1
- VLVGG $1, RSAVE_2, R_2
- VLVGG $1, R5SAVE_1, R5_1
- VLVGG $1, R5SAVE_2, R5_2
-
- // setup [h0, h1]
- VSLDB $8, H0_0, H0_0, H0_0
- VSLDB $8, H1_0, H1_0, H1_0
- VSLDB $8, H2_0, H2_0, H2_0
- VO H0_1, H0_0, H0_0
- VO H1_1, H1_0, H1_0
- VO H2_1, H2_0, H2_0
- VZERO H0_1
- VZERO H1_1
- VZERO H2_1
-
- VZERO M0
- VZERO M1
- VZERO M2
- VZERO M3
- VZERO M4
- VZERO M5
-
- // (h0*r**2) + (h1*r)
- MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
- REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
- BR next
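
The REDUCE/REDUCE2 macros above fold carries back into the low limb multiplied
by 5 (since 2**130 is congruent to 5 mod 2**130-5), and MOD then performs the
final "subtract the prime if h >= 2**130-5" step without branching, selecting
between h and h-(2**130-5) with masks. Below is a minimal scalar Go sketch of
that branch-free final step, using three 64-bit limbs in place of the vector
code's 44/44/42-bit layout; the names are illustrative, not from the package.

	package main

	import (
		"fmt"
		"math/bits"
	)

	// reduceFinal is the constant-time "if h >= 2^130-5 then h -= 2^130-5"
	// step that the MOD macro implements with VACCQ/VAQ/VN/VNC/VO. h is
	// little-endian in three 64-bit limbs, with h2 holding bits 128..130,
	// and is assumed < 2*(2^130-5), as the carry pass above guarantees.
	// Only the low 128 bits are returned: the tag is (h+s) mod 2^128.
	func reduceFinal(h0, h1, h2 uint64) (uint64, uint64) {
		// t = h + 5. The carry into bit 130 (bit 2 of t2) is set
		// exactly when h >= 2^130-5, i.e. exactly when t >= 2^130.
		t0, c := bits.Add64(h0, 5, 0)
		t1, c := bits.Add64(h1, 0, c)
		t2, _ := bits.Add64(h2, 0, c)

		// Branch-free select: mask is all ones when the subtraction
		// is needed and all zeros otherwise.
		mask := -(t2 >> 2)
		h0 = (h0 &^ mask) | (t0 & mask)
		h1 = (h1 &^ mask) | (t1 & mask)
		return h0, h1
	}

	func main() {
		// h = 2**130-4, one more than the prime, so it reduces to 1.
		r0, r1 := reduceFinal(0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 3)
		fmt.Println(r0, r1) // 1 0
	}
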
diff --git a/src/vendor/golang.org/x/text/unicode/bidi/core.go b/src/vendor/golang.org/x/text/unicode/bidi/core.go
index 48d144008a..50deb6600a 100644
--- a/src/vendor/golang.org/x/text/unicode/bidi/core.go
+++ b/src/vendor/golang.org/x/text/unicode/bidi/core.go
@@ -480,15 +480,15 @@ func (s *isolatingRunSequence) resolveWeakTypes() {
// Rule W1.
// Changes all NSMs.
- preceedingCharacterType := s.sos
+ precedingCharacterType := s.sos
for i, t := range s.types {
if t == NSM {
- s.types[i] = preceedingCharacterType
+ s.types[i] = precedingCharacterType
} else {
if t.in(LRI, RLI, FSI, PDI) {
- preceedingCharacterType = ON
+ precedingCharacterType = ON
}
- preceedingCharacterType = t
+ precedingCharacterType = t
}
}
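
The hunk above only renames the misspelled variable; the logic is UAX #9 rule
W1, under which each nonspacing mark takes on the class of the character
before it, seeded with the sequence's sos class. A self-contained Go sketch of
the isolate-free case follows; Class and its constants are illustrative
stand-ins for the package's unexported types, not its actual API.

	package main

	import "fmt"

	// Class stands in for the bidi package's character class type.
	type Class int

	const (
		L   Class = iota // left-to-right
		R                // right-to-left
		NSM              // nonspacing mark
	)

	// ruleW1 mirrors the loop above for input without isolate controls:
	// every NSM inherits the class of the preceding character, or sos at
	// the start of the sequence.
	func ruleW1(types []Class, sos Class) {
		preceding := sos
		for i, t := range types {
			if t == NSM {
				types[i] = preceding
			} else {
				preceding = t
			}
		}
	}

	func main() {
		types := []Class{R, NSM, NSM, L, NSM}
		ruleW1(types, L)
		fmt.Println(types) // [1 1 1 0 0], i.e. [R R R L L]
	}
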
diff --git a/src/vendor/modules.txt b/src/vendor/modules.txt
index 37fda889ec..7c42df8348 100644
--- a/src/vendor/modules.txt
+++ b/src/vendor/modules.txt
@@ -1,4 +1,4 @@
-# golang.org/x/crypto v0.0.0-20200414155820-4f8f47aa7992
+# golang.org/x/crypto v0.0.0-20200429183012-4b2356b1ed79
## explicit
golang.org/x/crypto/chacha20
golang.org/x/crypto/chacha20poly1305
@@ -18,9 +18,10 @@ golang.org/x/net/idna
golang.org/x/net/lif
golang.org/x/net/nettest
golang.org/x/net/route
-# golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd
+# golang.org/x/sys v0.0.0-20200501145240-bc7a7d42d5c3
+## explicit
golang.org/x/sys/cpu
-# golang.org/x/text v0.3.3-0.20191031172631-4b67af870c6f
+# golang.org/x/text v0.3.3-0.20200430171850-afb9336c4530
## explicit
golang.org/x/text/secure/bidirule
golang.org/x/text/transform