From f8e3dea19012ccf05965d10255789eec33c2ebcf Mon Sep 17 00:00:00 2001 From: Niall Sheridan Date: Thu, 23 Aug 2018 22:51:21 +0100 Subject: Update deps --- .../x/crypto/internal/chacha20/asm_s390x.s | 283 --------------------- .../x/crypto/internal/chacha20/chacha_generic.go | 112 +++++--- .../x/crypto/internal/chacha20/chacha_s390x.s | 283 +++++++++++++++++++++ 3 files changed, 353 insertions(+), 325 deletions(-) delete mode 100644 vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s create mode 100644 vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.s (limited to 'vendor/golang.org/x/crypto/internal/chacha20') diff --git a/vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s b/vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s deleted file mode 100644 index 98427c5..0000000 --- a/vendor/golang.org/x/crypto/internal/chacha20/asm_s390x.s +++ /dev/null @@ -1,283 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build s390x,!gccgo,!appengine - -#include "go_asm.h" -#include "textflag.h" - -// This is an implementation of the ChaCha20 encryption algorithm as -// specified in RFC 7539. It uses vector instructions to compute -// 4 keystream blocks in parallel (256 bytes) which are then XORed -// with the bytes in the input slice. - -GLOBL ·constants<>(SB), RODATA|NOPTR, $32 -// BSWAP: swap bytes in each 4-byte element -DATA ·constants<>+0x00(SB)/4, $0x03020100 -DATA ·constants<>+0x04(SB)/4, $0x07060504 -DATA ·constants<>+0x08(SB)/4, $0x0b0a0908 -DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c -// J0: [j0, j1, j2, j3] -DATA ·constants<>+0x10(SB)/4, $0x61707865 -DATA ·constants<>+0x14(SB)/4, $0x3320646e -DATA ·constants<>+0x18(SB)/4, $0x79622d32 -DATA ·constants<>+0x1c(SB)/4, $0x6b206574 - -// EXRL targets: -TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0 - MVC $1, (R1), (R8) - RET - -TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0 - MVC $1, (R8), (R9) - RET - -#define BSWAP V5 -#define J0 V6 -#define KEY0 V7 -#define KEY1 V8 -#define NONCE V9 -#define CTR V10 -#define M0 V11 -#define M1 V12 -#define M2 V13 -#define M3 V14 -#define INC V15 -#define X0 V16 -#define X1 V17 -#define X2 V18 -#define X3 V19 -#define X4 V20 -#define X5 V21 -#define X6 V22 -#define X7 V23 -#define X8 V24 -#define X9 V25 -#define X10 V26 -#define X11 V27 -#define X12 V28 -#define X13 V29 -#define X14 V30 -#define X15 V31 - -#define NUM_ROUNDS 20 - -#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \ - VAF a1, a0, a0 \ - VAF b1, b0, b0 \ - VAF c1, c0, c0 \ - VAF d1, d0, d0 \ - VX a0, a2, a2 \ - VX b0, b2, b2 \ - VX c0, c2, c2 \ - VX d0, d2, d2 \ - VERLLF $16, a2, a2 \ - VERLLF $16, b2, b2 \ - VERLLF $16, c2, c2 \ - VERLLF $16, d2, d2 \ - VAF a2, a3, a3 \ - VAF b2, b3, b3 \ - VAF c2, c3, c3 \ - VAF d2, d3, d3 \ - VX a3, a1, a1 \ - VX b3, b1, b1 \ - VX c3, c1, c1 \ - VX d3, d1, d1 \ - VERLLF $12, a1, a1 \ - VERLLF $12, b1, b1 \ - VERLLF $12, c1, c1 \ - VERLLF $12, d1, d1 \ - VAF a1, a0, a0 \ - VAF b1, b0, b0 \ - VAF c1, c0, c0 \ - VAF d1, d0, d0 \ - VX a0, a2, a2 \ - VX b0, b2, b2 \ - VX c0, c2, c2 \ - VX d0, d2, d2 \ - VERLLF $8, a2, a2 \ - VERLLF $8, b2, b2 \ - VERLLF $8, c2, c2 \ - VERLLF $8, d2, d2 \ - VAF a2, a3, a3 \ - VAF b2, b3, b3 \ - VAF c2, c3, c3 \ - VAF d2, d3, d3 \ - VX a3, a1, a1 \ - VX b3, b1, b1 \ - VX c3, c1, c1 \ - VX d3, d1, d1 \ - VERLLF $7, a1, a1 \ - VERLLF $7, b1, b1 \ - VERLLF $7, c1, c1 \ - VERLLF $7, d1, d1 - -#define PERMUTE(mask, v0, v1, v2, v3) \ - VPERM v0, v0, mask, v0 \ - VPERM v1, v1, mask, v1 \ - VPERM v2, v2, mask, v2 \ - VPERM v3, v3, mask, v3 - -#define ADDV(x, v0, v1, v2, v3) \ - VAF x, v0, v0 \ - VAF x, v1, v1 \ - VAF x, v2, v2 \ - VAF x, v3, v3 - -#define XORV(off, dst, src, v0, v1, v2, v3) \ - VLM off(src), M0, M3 \ - PERMUTE(BSWAP, v0, v1, v2, v3) \ - VX v0, M0, M0 \ - VX v1, M1, M1 \ - VX v2, M2, M2 \ - VX v3, M3, M3 \ - VSTM M0, M3, off(dst) - -#define SHUFFLE(a, b, c, d, t, u, v, w) \ - VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]} - VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]} - VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]} - VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]} - VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]} - VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]} - VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]} - VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]} - -// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int) -TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 - MOVD $·constants<>(SB), R1 - MOVD dst+0(FP), R2 // R2=&dst[0] - LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src) - MOVD key+48(FP), R5 // R5=key - MOVD nonce+56(FP), R6 // R6=nonce - MOVD counter+64(FP), R7 // R7=counter - MOVD buf+72(FP), R8 // R8=buf - MOVD len+80(FP), R9 // R9=len - - // load BSWAP and J0 - VLM (R1), BSWAP, J0 - - // set up tail buffer - ADD $-1, R4, R12 - MOVBZ R12, R12 - CMPUBEQ R12, $255, aligned - MOVD R4, R1 - AND $~255, R1 - MOVD $(R3)(R1*1), R1 - EXRL $·mvcSrcToBuf(SB), R12 - MOVD $255, R0 - SUB R12, R0 - MOVD R0, (R9) // update len - -aligned: - // setup - MOVD $95, R0 - VLM (R5), KEY0, KEY1 - VLL R0, (R6), NONCE - VZERO M0 - VLEIB $7, $32, M0 - VSRLB M0, NONCE, NONCE - - // initialize counter values - VLREPF (R7), CTR - VZERO INC - VLEIF $1, $1, INC - VLEIF $2, $2, INC - VLEIF $3, $3, INC - VAF INC, CTR, CTR - VREPIF $4, INC - -chacha: - VREPF $0, J0, X0 - VREPF $1, J0, X1 - VREPF $2, J0, X2 - VREPF $3, J0, X3 - VREPF $0, KEY0, X4 - VREPF $1, KEY0, X5 - VREPF $2, KEY0, X6 - VREPF $3, KEY0, X7 - VREPF $0, KEY1, X8 - VREPF $1, KEY1, X9 - VREPF $2, KEY1, X10 - VREPF $3, KEY1, X11 - VLR CTR, X12 - VREPF $1, NONCE, X13 - VREPF $2, NONCE, X14 - VREPF $3, NONCE, X15 - - MOVD $(NUM_ROUNDS/2), R1 - -loop: - ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11) - ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9) - - ADD $-1, R1 - BNE loop - - // decrement length - ADD $-256, R4 - BLT tail - -continue: - // rearrange vectors - SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3) - ADDV(J0, X0, X1, X2, X3) - SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3) - ADDV(KEY0, X4, X5, X6, X7) - SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3) - ADDV(KEY1, X8, X9, X10, X11) - VAF CTR, X12, X12 - SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3) - ADDV(NONCE, X12, X13, X14, X15) - - // increment counters - VAF INC, CTR, CTR - - // xor keystream with plaintext - XORV(0*64, R2, R3, X0, X4, X8, X12) - XORV(1*64, R2, R3, X1, X5, X9, X13) - XORV(2*64, R2, R3, X2, X6, X10, X14) - XORV(3*64, R2, R3, X3, X7, X11, X15) - - // increment pointers - MOVD $256(R2), R2 - MOVD $256(R3), R3 - - CMPBNE R4, $0, chacha - CMPUBEQ R12, $255, return - EXRL $·mvcBufToDst(SB), R12 // len was updated during setup - -return: - VSTEF $0, CTR, (R7) - RET - -tail: - MOVD R2, R9 - MOVD R8, R2 - MOVD R8, R3 - MOVD $0, R4 - JMP continue - -// func hasVectorFacility() bool -TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 - MOVD $x-24(SP), R1 - XC $24, 0(R1), 0(R1) // clear the storage - MOVD $2, R0 // R0 is the number of double words stored -1 - WORD $0xB2B01000 // STFLE 0(R1) - XOR R0, R0 // reset the value of R0 - MOVBZ z-8(SP), R1 - AND $0x40, R1 - BEQ novector - -vectorinstalled: - // check if the vector instruction has been enabled - VLEIB $0, $0xF, V16 - VLGVB $0, V16, R1 - CMPBNE R1, $0xF, novector - MOVB $1, ret+0(FP) // have vx - RET - -novector: - MOVB $0, ret+0(FP) // no vx - RET diff --git a/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go b/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go index 523751f..6570847 100644 --- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go +++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go @@ -32,6 +32,30 @@ func New(key [8]uint32, nonce [3]uint32) *Cipher { return &Cipher{key: key, nonce: nonce} } +// ChaCha20 constants spelling "expand 32-byte k" +const ( + j0 uint32 = 0x61707865 + j1 uint32 = 0x3320646e + j2 uint32 = 0x79622d32 + j3 uint32 = 0x6b206574 +) + +func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) { + a += b + d ^= a + d = (d << 16) | (d >> 16) + c += d + b ^= c + b = (b << 12) | (b >> 20) + a += b + d ^= a + d = (d << 8) | (d >> 24) + c += d + b ^= c + b = (b << 7) | (b >> 25) + return a, b, c, d +} + // XORKeyStream XORs each byte in the given slice with a byte from the // cipher's key stream. Dst and src must overlap entirely or not at all. // @@ -73,6 +97,9 @@ func (s *Cipher) XORKeyStream(dst, src []byte) { return } if haveAsm { + if uint64(len(src))+uint64(s.counter)*64 > (1<<38)-64 { + panic("chacha20: counter overflow") + } s.xorKeyStreamAsm(dst, src) return } @@ -85,59 +112,34 @@ func (s *Cipher) XORKeyStream(dst, src []byte) { copy(s.buf[len(s.buf)-64:], src[fin:]) } - // qr calculates a quarter round - qr := func(a, b, c, d uint32) (uint32, uint32, uint32, uint32) { - a += b - d ^= a - d = (d << 16) | (d >> 16) - c += d - b ^= c - b = (b << 12) | (b >> 20) - a += b - d ^= a - d = (d << 8) | (d >> 24) - c += d - b ^= c - b = (b << 7) | (b >> 25) - return a, b, c, d - } - - // ChaCha20 constants - const ( - j0 = 0x61707865 - j1 = 0x3320646e - j2 = 0x79622d32 - j3 = 0x6b206574 - ) - // pre-calculate most of the first round - s1, s5, s9, s13 := qr(j1, s.key[1], s.key[5], s.nonce[0]) - s2, s6, s10, s14 := qr(j2, s.key[2], s.key[6], s.nonce[1]) - s3, s7, s11, s15 := qr(j3, s.key[3], s.key[7], s.nonce[2]) + s1, s5, s9, s13 := quarterRound(j1, s.key[1], s.key[5], s.nonce[0]) + s2, s6, s10, s14 := quarterRound(j2, s.key[2], s.key[6], s.nonce[1]) + s3, s7, s11, s15 := quarterRound(j3, s.key[3], s.key[7], s.nonce[2]) n := len(src) src, dst = src[:n:n], dst[:n:n] // BCE hint for i := 0; i < n; i += 64 { // calculate the remainder of the first round - s0, s4, s8, s12 := qr(j0, s.key[0], s.key[4], s.counter) + s0, s4, s8, s12 := quarterRound(j0, s.key[0], s.key[4], s.counter) // execute the second round - x0, x5, x10, x15 := qr(s0, s5, s10, s15) - x1, x6, x11, x12 := qr(s1, s6, s11, s12) - x2, x7, x8, x13 := qr(s2, s7, s8, s13) - x3, x4, x9, x14 := qr(s3, s4, s9, s14) + x0, x5, x10, x15 := quarterRound(s0, s5, s10, s15) + x1, x6, x11, x12 := quarterRound(s1, s6, s11, s12) + x2, x7, x8, x13 := quarterRound(s2, s7, s8, s13) + x3, x4, x9, x14 := quarterRound(s3, s4, s9, s14) // execute the remaining 18 rounds for i := 0; i < 9; i++ { - x0, x4, x8, x12 = qr(x0, x4, x8, x12) - x1, x5, x9, x13 = qr(x1, x5, x9, x13) - x2, x6, x10, x14 = qr(x2, x6, x10, x14) - x3, x7, x11, x15 = qr(x3, x7, x11, x15) - - x0, x5, x10, x15 = qr(x0, x5, x10, x15) - x1, x6, x11, x12 = qr(x1, x6, x11, x12) - x2, x7, x8, x13 = qr(x2, x7, x8, x13) - x3, x4, x9, x14 = qr(x3, x4, x9, x14) + x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12) + x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13) + x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14) + x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15) + + x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15) + x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12) + x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13) + x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14) } x0 += j0 @@ -234,3 +236,29 @@ func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) { } s.XORKeyStream(out, in) } + +// HChaCha20 uses the ChaCha20 core to generate a derived key from a key and a +// nonce. It should only be used as part of the XChaCha20 construction. +func HChaCha20(key *[8]uint32, nonce *[4]uint32) [8]uint32 { + x0, x1, x2, x3 := j0, j1, j2, j3 + x4, x5, x6, x7 := key[0], key[1], key[2], key[3] + x8, x9, x10, x11 := key[4], key[5], key[6], key[7] + x12, x13, x14, x15 := nonce[0], nonce[1], nonce[2], nonce[3] + + for i := 0; i < 10; i++ { + x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12) + x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13) + x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14) + x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15) + + x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15) + x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12) + x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13) + x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14) + } + + var out [8]uint32 + out[0], out[1], out[2], out[3] = x0, x1, x2, x3 + out[4], out[5], out[6], out[7] = x12, x13, x14, x15 + return out +} diff --git a/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.s b/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.s new file mode 100644 index 0000000..98427c5 --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.s @@ -0,0 +1,283 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build s390x,!gccgo,!appengine + +#include "go_asm.h" +#include "textflag.h" + +// This is an implementation of the ChaCha20 encryption algorithm as +// specified in RFC 7539. It uses vector instructions to compute +// 4 keystream blocks in parallel (256 bytes) which are then XORed +// with the bytes in the input slice. + +GLOBL ·constants<>(SB), RODATA|NOPTR, $32 +// BSWAP: swap bytes in each 4-byte element +DATA ·constants<>+0x00(SB)/4, $0x03020100 +DATA ·constants<>+0x04(SB)/4, $0x07060504 +DATA ·constants<>+0x08(SB)/4, $0x0b0a0908 +DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c +// J0: [j0, j1, j2, j3] +DATA ·constants<>+0x10(SB)/4, $0x61707865 +DATA ·constants<>+0x14(SB)/4, $0x3320646e +DATA ·constants<>+0x18(SB)/4, $0x79622d32 +DATA ·constants<>+0x1c(SB)/4, $0x6b206574 + +// EXRL targets: +TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0 + MVC $1, (R1), (R8) + RET + +TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0 + MVC $1, (R8), (R9) + RET + +#define BSWAP V5 +#define J0 V6 +#define KEY0 V7 +#define KEY1 V8 +#define NONCE V9 +#define CTR V10 +#define M0 V11 +#define M1 V12 +#define M2 V13 +#define M3 V14 +#define INC V15 +#define X0 V16 +#define X1 V17 +#define X2 V18 +#define X3 V19 +#define X4 V20 +#define X5 V21 +#define X6 V22 +#define X7 V23 +#define X8 V24 +#define X9 V25 +#define X10 V26 +#define X11 V27 +#define X12 V28 +#define X13 V29 +#define X14 V30 +#define X15 V31 + +#define NUM_ROUNDS 20 + +#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $16, a2, a2 \ + VERLLF $16, b2, b2 \ + VERLLF $16, c2, c2 \ + VERLLF $16, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $12, a1, a1 \ + VERLLF $12, b1, b1 \ + VERLLF $12, c1, c1 \ + VERLLF $12, d1, d1 \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $8, a2, a2 \ + VERLLF $8, b2, b2 \ + VERLLF $8, c2, c2 \ + VERLLF $8, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $7, a1, a1 \ + VERLLF $7, b1, b1 \ + VERLLF $7, c1, c1 \ + VERLLF $7, d1, d1 + +#define PERMUTE(mask, v0, v1, v2, v3) \ + VPERM v0, v0, mask, v0 \ + VPERM v1, v1, mask, v1 \ + VPERM v2, v2, mask, v2 \ + VPERM v3, v3, mask, v3 + +#define ADDV(x, v0, v1, v2, v3) \ + VAF x, v0, v0 \ + VAF x, v1, v1 \ + VAF x, v2, v2 \ + VAF x, v3, v3 + +#define XORV(off, dst, src, v0, v1, v2, v3) \ + VLM off(src), M0, M3 \ + PERMUTE(BSWAP, v0, v1, v2, v3) \ + VX v0, M0, M0 \ + VX v1, M1, M1 \ + VX v2, M2, M2 \ + VX v3, M3, M3 \ + VSTM M0, M3, off(dst) + +#define SHUFFLE(a, b, c, d, t, u, v, w) \ + VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]} + VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]} + VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]} + VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]} + VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]} + VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]} + VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]} + VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]} + +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int) +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 + MOVD $·constants<>(SB), R1 + MOVD dst+0(FP), R2 // R2=&dst[0] + LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src) + MOVD key+48(FP), R5 // R5=key + MOVD nonce+56(FP), R6 // R6=nonce + MOVD counter+64(FP), R7 // R7=counter + MOVD buf+72(FP), R8 // R8=buf + MOVD len+80(FP), R9 // R9=len + + // load BSWAP and J0 + VLM (R1), BSWAP, J0 + + // set up tail buffer + ADD $-1, R4, R12 + MOVBZ R12, R12 + CMPUBEQ R12, $255, aligned + MOVD R4, R1 + AND $~255, R1 + MOVD $(R3)(R1*1), R1 + EXRL $·mvcSrcToBuf(SB), R12 + MOVD $255, R0 + SUB R12, R0 + MOVD R0, (R9) // update len + +aligned: + // setup + MOVD $95, R0 + VLM (R5), KEY0, KEY1 + VLL R0, (R6), NONCE + VZERO M0 + VLEIB $7, $32, M0 + VSRLB M0, NONCE, NONCE + + // initialize counter values + VLREPF (R7), CTR + VZERO INC + VLEIF $1, $1, INC + VLEIF $2, $2, INC + VLEIF $3, $3, INC + VAF INC, CTR, CTR + VREPIF $4, INC + +chacha: + VREPF $0, J0, X0 + VREPF $1, J0, X1 + VREPF $2, J0, X2 + VREPF $3, J0, X3 + VREPF $0, KEY0, X4 + VREPF $1, KEY0, X5 + VREPF $2, KEY0, X6 + VREPF $3, KEY0, X7 + VREPF $0, KEY1, X8 + VREPF $1, KEY1, X9 + VREPF $2, KEY1, X10 + VREPF $3, KEY1, X11 + VLR CTR, X12 + VREPF $1, NONCE, X13 + VREPF $2, NONCE, X14 + VREPF $3, NONCE, X15 + + MOVD $(NUM_ROUNDS/2), R1 + +loop: + ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11) + ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9) + + ADD $-1, R1 + BNE loop + + // decrement length + ADD $-256, R4 + BLT tail + +continue: + // rearrange vectors + SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3) + ADDV(J0, X0, X1, X2, X3) + SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3) + ADDV(KEY0, X4, X5, X6, X7) + SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3) + ADDV(KEY1, X8, X9, X10, X11) + VAF CTR, X12, X12 + SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3) + ADDV(NONCE, X12, X13, X14, X15) + + // increment counters + VAF INC, CTR, CTR + + // xor keystream with plaintext + XORV(0*64, R2, R3, X0, X4, X8, X12) + XORV(1*64, R2, R3, X1, X5, X9, X13) + XORV(2*64, R2, R3, X2, X6, X10, X14) + XORV(3*64, R2, R3, X3, X7, X11, X15) + + // increment pointers + MOVD $256(R2), R2 + MOVD $256(R3), R3 + + CMPBNE R4, $0, chacha + CMPUBEQ R12, $255, return + EXRL $·mvcBufToDst(SB), R12 // len was updated during setup + +return: + VSTEF $0, CTR, (R7) + RET + +tail: + MOVD R2, R9 + MOVD R8, R2 + MOVD R8, R3 + MOVD $0, R4 + JMP continue + +// func hasVectorFacility() bool +TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 + MOVD $x-24(SP), R1 + XC $24, 0(R1), 0(R1) // clear the storage + MOVD $2, R0 // R0 is the number of double words stored -1 + WORD $0xB2B01000 // STFLE 0(R1) + XOR R0, R0 // reset the value of R0 + MOVBZ z-8(SP), R1 + AND $0x40, R1 + BEQ novector + +vectorinstalled: + // check if the vector instruction has been enabled + VLEIB $0, $0xF, V16 + VLGVB $0, V16, R1 + CMPBNE R1, $0xF, novector + MOVB $1, ret+0(FP) // have vx + RET + +novector: + MOVB $0, ret+0(FP) // no vx + RET -- cgit v1.2.3