// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Original code can be found at the link below:
// https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl

// Some function names were changed to be consistent with Go function
// names. For instance, the functions aes_p8_set_{en,de}crypt_key became
// set{En,De}cryptKeyAsm. setEncryptKeyAsm was also split in two parts,
// and a new function (doEncryptKeyAsm) was created. This was necessary to
// avoid overwriting the arguments when setDecryptKeyAsm calls setEncryptKeyAsm.
// Other modifications were made as well, but the functionality is unchanged.
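
// For reference, the Go side declares these routines as assembly stubs.
// A hedged sketch of what those declarations look like (the exact form
// lives in the package's Go files; //go:noescape is an assumption here):
//
//	//go:noescape
//	func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
//
//	//go:noescape
//	func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
//	//go:noescape
//	func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)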

#include "textflag.h"

// For expandKeyAsm
#define INP     R3
#define BITS    R4
#define OUTENC  R5 // Pointer to next expanded encrypt key
#define PTR     R6
#define CNT     R7
#define ROUNDS  R8
#define OUTDEC  R9  // Pointer to next expanded decrypt key
#define TEMP    R19
#define ZERO    V0
#define IN0     V1
#define IN1     V2
#define KEY     V3
#define RCON    V4
#define MASK    V5
#define TMP     V6
#define STAGE   V7
#define OUTPERM V8
#define OUTMASK V9
#define OUTHEAD V10
#define OUTTAIL V11

// For P9 instruction emulation
#define ESPERM  V21  // Endian swapping permute into BE
#define TMP2    V22  // Temporary for P8_STXVB16X

// For {en,de}cryptBlockAsm
#define BLK_INP    R3
#define BLK_OUT    R4
#define BLK_KEY    R5
#define BLK_ROUNDS R6
#define BLK_IDX    R7

DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
DATA ·rcon+0x08(SB)/8, $0x0706050403020100
DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x40(SB)/8, $0x0000000000000000
DATA ·rcon+0x48(SB)/8, $0x0000000000000000
GLOBL ·rcon(SB), RODATA, $80
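
// The RCON words above follow the AES round-constant recurrence: each
// constant is the previous one doubled in GF(2^8), reduced by the AES
// polynomial 0x11b on overflow, which is where the 0x1b entries come
// from. A minimal Go sketch of the recurrence (illustration only, not
// part of this file):
//
//	func nextRcon(rc uint32) uint32 {
//		rc <<= 1 // double
//		if rc > 0xff {
//			rc ^= 0x11b // reduce by x^8+x^4+x^3+x+1
//		}
//		return rc
//	}
//
// Starting from 0x01 this yields 0x02, 0x04, ..., 0x80, 0x1b, 0x36,
// matching the in-register doubling done with VADDUWM below and the
// 0x1b table entries used for the later rounds.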

#ifdef GOARCH_ppc64le
#  ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT)  LXVB16X	(RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X	VS, (RA+RB)
#define XXBRD_ON_LE(VA,VT)    XXBRD	VA, VT
#  else
// On POWER8/ppc64le, emulate the POWER9 instructions by loading unaligned
// doublewords and byte-swapping each doubleword to emulate BE load/stores.
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X	(RA+RB), VT \
	VPERM	VT, VT, ESPERM, VT

#define P8_STXVB16X(VS,RA,RB) \
	VPERM	VS, VS, ESPERM, TMP2 \
	STXVD2X	TMP2, (RA+RB)

#define XXBRD_ON_LE(VA,VT) \
	VPERM	VA, VA, ESPERM, VT

#  endif // defined(GOPPC64_power9)
#else
#define P8_LXVB16X(RA,RB,VT)  LXVD2X	(RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVD2X	VS, (RA+RB)
#define XXBRD_ON_LE(VA, VT)
#endif // defined(GOARCH_ppc64le)
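
// In scalar terms, P8_LXVB16X amounts to reading 16 unaligned bytes in
// big-endian order regardless of host endianness. A hedged Go sketch of
// the equivalent load (for exposition only):
//
//	import "encoding/binary"
//
//	func loadBE16(p []byte) (hi, lo uint64) {
//		return binary.BigEndian.Uint64(p[0:8]), binary.BigEndian.Uint64(p[8:16])
//	}
//
// On big-endian hosts LXVD2X already produces this layout; on
// little-endian POWER8 the extra VPERM with ESPERM performs the byte
// reversal, and on POWER9 LXVB16X does it in a single instruction.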

// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
	// Load the arguments inside the registers
	MOVD	nr+0(FP), ROUNDS
	MOVD	key+8(FP), INP
	MOVD	enc+16(FP), OUTENC
	MOVD	dec+24(FP), OUTDEC

#ifdef NEEDS_ESPERM
	MOVD	$·rcon(SB), PTR // PTR points to rcon addr
	LVX	(PTR), ESPERM
	ADD	$0x10, PTR
#else
	MOVD	$·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif

	// Get key from memory and write aligned into VR
	P8_LXVB16X(INP, R0, IN0)
	ADD	$0x10, INP, INP
	MOVD	$0x20, TEMP

	CMPW	ROUNDS, $12
	LVX	(PTR)(R0), RCON    // lvx   4,0,6      Load first 16 bytes into RCON
	LVX	(PTR)(TEMP), MASK
	ADD	$0x10, PTR, PTR    // addi  6,6,0x10   PTR to next 16 bytes of RCON
	MOVD	$8, CNT            // li    7,8        CNT = 8
	VXOR	ZERO, ZERO, ZERO   // vxor  0,0,0      Zero to be zero :)
	MOVD	CNT, CTR           // mtctr 7          Set the counter to 8 (rounds)

	// The expanded decrypt key is the expanded encrypt key stored in reverse order.
	// Move OUTDEC to the last key location, and store in descending order.
	ADD	$160, OUTDEC, OUTDEC
	BLT	loop128
	ADD	$32, OUTDEC, OUTDEC
	BEQ	l192
	ADD	$32, OUTDEC, OUTDEC
	JMP	l256
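
	// Offset arithmetic: AES uses nr+1 16-byte round keys, so the last
	// key sits at byte offset 16*nr: 160 for nr=10 (AES-128), 160+32=192
	// for nr=12 (AES-192), and 192+32=224 for nr=14 (AES-256), matching
	// the three adjustments above.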

loop128:
	// Key schedule (Round 1 to 8)
	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5         Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0       // vxor 1,1,3
	BC	0x10, 0, loop128    // bdnz .Loop128

	LVX	(PTR)(R0), RCON // lvx 4,0,6     Last two round keys

	// Key schedule (Round 9)
	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	// Key schedule (Round 10)
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0       // vxor 1,1,3

	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	// Key schedule (Round 11)
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)

	RET

l192:
	LXSDX	(INP+R0), IN1                    // Load next 8 bytes into upper half of VSR.
	XXBRD_ON_LE(IN1, IN1)                    // and convert to BE ordering on LE hosts.
	MOVD	$4, CNT                          // li 7,4
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	VSPLTISB	$8, KEY                  // vspltisb 3,8
	MOVD	CNT, CTR                         // mtctr 7
	VSUBUBM	MASK, KEY, MASK                  // vsububm 5,5,3

loop192:
	VPERM	IN1, IN1, MASK, KEY // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	VCIPHERLAST	KEY, RCON, KEY      // vcipherlast 3,3,4

	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6

	VSLDOI	$8, ZERO, IN1, STAGE  // vsldoi 7,0,2,8
	VSPLTW	$3, IN0, TMP          // vspltw 6,1,3
	VXOR	TMP, IN1, TMP         // vxor 6,6,2
	VSLDOI	$12, ZERO, IN1, IN1   // vsldoi 2,0,2,12
	VADDUWM	RCON, RCON, RCON      // vadduwm 4,4,4
	VXOR	IN1, TMP, IN1         // vxor 2,2,6
	VXOR	IN0, KEY, IN0         // vxor 1,1,3
	VXOR	IN1, KEY, IN1         // vxor 2,2,3
	VSLDOI	$8, STAGE, IN0, STAGE // vsldoi 7,7,1,8

	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	STAGE, (R0+OUTENC)
	STXVD2X	STAGE, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VSLDOI	$8, IN0, IN1, STAGE              // vsldoi 7,1,2,8
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	STXVD2X	STAGE, (R0+OUTENC)
	STXVD2X	STAGE, (R0+OUTDEC)
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VSPLTW	$3, IN0, TMP                     // vspltw 6,1,3
	VXOR	TMP, IN1, TMP                    // vxor 6,6,2
	VSLDOI	$12, ZERO, IN1, IN1              // vsldoi 2,0,2,12
	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
	VXOR	IN1, TMP, IN1                    // vxor 2,2,6
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	VXOR	IN1, KEY, IN1                    // vxor 2,2,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	BC	0x10, 0, loop192                 // bdnz .Loop192
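	// The loop above ran 4 times, storing 3 round keys per iteration;
	// with the key stored before the loop that is 13 = nr+1 round keys
	// for AES-192.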

	RET

l256:
	P8_LXVB16X(INP, R0, IN1)
	MOVD	$7, CNT                          // li 7,7
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	MOVD	CNT, CTR                         // mtctr 7

loop256:
	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN1, (R0+OUTENC)
	STXVD2X	IN1, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	BC	0x12, 0, done                    // bdz .Ldone

	VSPLTW	$3, IN0, KEY        // vspltw 3,1,3
	VSLDOI	$12, ZERO, IN1, TMP // vsldoi 6,0,2,12
	VSBOX	KEY, KEY            // vsbox 3,3

	VXOR	IN1, TMP, IN1       // vxor 2,2,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN1, TMP, IN1       // vxor 2,2,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN1, TMP, IN1       // vxor 2,2,6

	VXOR	IN1, KEY, IN1 // vxor 2,2,3
	JMP	loop256       // b .Loop256

done:
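	// Key-count check: the 256-bit path stores one round key before
	// loop256 and two per iteration over 7 iterations, so 1 + 7*2 = 15
	// = nr+1 round keys for AES-256.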
	RET

// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD	nr+0(FP), R6   // Round count/Key size
	MOVD	xk+8(FP), R5   // Key pointer
	MOVD	dst+16(FP), R3 // Dest pointer
	MOVD	src+24(FP), R4 // Src pointer
#ifdef NEEDS_ESPERM
	MOVD	$·rcon(SB), R7
	LVX	(R7), ESPERM   // Permute value for P8_ macros.
#endif

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU	R6, $10, CR1
	CMPU	R6, $12, CR2
	CMPU	R6, $14, CR3

	MOVD	$16, R6
	MOVD	$32, R7
	MOVD	$48, R8
	MOVD	$64, R9
	MOVD	$80, R10
	MOVD	$96, R11
	MOVD	$112, R12

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the ciphertext.
	// Load xk[0:3] and xor with text
	LXVD2X	(R0+R5), V1
	VXOR	V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X	(R6+R5), V1
	LXVD2X	(R7+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X	(R8+R5), V1
	LXVD2X	(R9+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X	(R10+R5), V1
	LXVD2X	(R11+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD	$112, R5

	// Load xk[28:35] and cipher
	LXVD2X	(R0+R5), V1
	LXVD2X	(R6+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X	(R7+R5), V1
	LXVD2X	(R8+R5), V2
	BEQ	CR1, Lenc_tail // Key size 10?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X	(R9+R5), V1
	LXVD2X	(R10+R5), V2
	BEQ	CR2, Lenc_tail // Key size 12?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X	(R11+R5), V1
	LXVD2X	(R12+R5), V2
	BNE	CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Lenc_tail:
	// Cipher last two keys such that key information is
	// cleared from V1 and V2.
	VCIPHER		V0, V1, V1
	VCIPHERLAST	V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Crash with a segfault; this should never happen. Only 3 key sizes are created/used.
	MOVD	R0, 0(R0)
	RET
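
// This routine is reached through the standard library's cipher.Block
// interface; a minimal, hedged Go usage sketch (that aes.NewCipher
// dispatches to encryptBlockAsm here is an assumption about the package
// wiring on ppc64x):
//
//	import "crypto/aes"
//
//	func encryptOneBlock(key, dst, src []byte) error {
//		b, err := aes.NewCipher(key) // 16/24/32-byte keys give 10/12/14 rounds
//		if err != nil {
//			return err
//		}
//		b.Encrypt(dst[:16], src[:16]) // exactly one block
//		return nil
//	}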

// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD	nr+0(FP), R6   // Round count/Key size
	MOVD	xk+8(FP), R5   // Key pointer
	MOVD	dst+16(FP), R3 // Dest pointer
	MOVD	src+24(FP), R4 // Src pointer
#ifdef NEEDS_ESPERM
	MOVD	$·rcon(SB), R7
	LVX	(R7), ESPERM   // Permute value for P8_ macros.
#endif

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU	R6, $10, CR1
	CMPU	R6, $12, CR2
	CMPU	R6, $14, CR3

	MOVD	$16, R6
	MOVD	$32, R7
	MOVD	$48, R8
	MOVD	$64, R9
	MOVD	$80, R10
	MOVD	$96, R11
	MOVD	$112, R12

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the plaintext.
	// Load xk[0:3] and xor with ciphertext
	LXVD2X	(R0+R5), V1
	VXOR	V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X	(R6+R5), V1
	LXVD2X	(R7+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X	(R8+R5), V1
	LXVD2X	(R9+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X	(R10+R5), V1
	LXVD2X	(R11+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD	$112, R5

	// Load xk[28:35] and cipher
	LXVD2X	(R0+R5), V1
	LXVD2X	(R6+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X	(R7+R5), V1
	LXVD2X	(R8+R5), V2
	BEQ	CR1, Ldec_tail // Key size 10?
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X	(R9+R5), V1
	LXVD2X	(R10+R5), V2
	BEQ	CR2, Ldec_tail // Key size 12?
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X	(R11+R5), V1
	LXVD2X	(R12+R5), V2
	BNE	CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Ldec_tail:
	// Cipher last two keys such that key information is
	// cleared from V1 and V2.
	VNCIPHER	V0, V1, V1
	VNCIPHERLAST	V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Crash with a segfault; this should never happen. Only 3 key sizes are created/used.
	MOVD	R0, 0(R0)
	RET

// Undefine the names above so they can be redefined below.
#undef INP
#undef OUTENC
#undef ROUNDS
#undef KEY
#undef TMP

#define INP R3
#define OUTP R4
#define LEN R5
#define KEYP R6
#define ROUNDS R7
#define IVP R8
#define ENC R9

#define INOUT V2
#define TMP V3
#define IVEC V4

// Load the expanded key into VSRs.
//
// The expanded key is stored and loaded using
// STXVD2X/LXVD2X. The in-memory byte ordering
// depends on the endianness of the machine. The
// expanded keys are generated by expandKeyAsm above.
//
// Rkeyp holds the key pointer. It is clobbered. Once
// the expanded keys are loaded, it is not needed.
//
// R12,R14-R21 are scratch registers.
// For a 10-round key, V6, V11-V20 hold the expanded key.
// For a 12-round key, V6, V9-V20 hold the expanded key.
// For a 14-round key, V6, V7-V20 hold the expanded key.
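// In each case that is nr+1 vectors, one per round key: 11 when nr=10,
// 13 when nr=12, and 15 when nr=14.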
#define LOAD_KEY(Rkeyp) \
	MOVD	$16, R12 \
	MOVD	$32, R14 \
	MOVD	$48, R15 \
	MOVD	$64, R16 \
	MOVD	$80, R17 \
	MOVD	$96, R18 \
	MOVD	$112, R19 \
	MOVD	$128, R20 \
	MOVD	$144, R21 \
	LXVD2X	(R0+Rkeyp), V6 \
	ADD	$16, Rkeyp \
	BEQ	CR1, L_start10 \
	BEQ	CR2, L_start12 \
	LXVD2X	(R0+Rkeyp), V7 \
	LXVD2X	(R12+Rkeyp), V8 \
	ADD	$32, Rkeyp \
	L_start12: \
	LXVD2X	(R0+Rkeyp), V9 \
	LXVD2X	(R12+Rkeyp), V10 \
	ADD	$32, Rkeyp \
	L_start10: \
	LXVD2X	(R0+Rkeyp), V11 \
	LXVD2X	(R12+Rkeyp), V12 \
	LXVD2X	(R14+Rkeyp), V13 \
	LXVD2X	(R15+Rkeyp), V14 \
	LXVD2X	(R16+Rkeyp), V15 \
	LXVD2X	(R17+Rkeyp), V16 \
	LXVD2X	(R18+Rkeyp), V17 \
	LXVD2X	(R19+Rkeyp), V18 \
	LXVD2X	(R20+Rkeyp), V19 \
	LXVD2X	(R21+Rkeyp), V20

// Perform the AES cipher operation for key sizes 10/12/14 using the
// keys loaded by LOAD_KEY and the key size information held in
// CR1EQ/CR2EQ.
//
// Vxor is ideally V6 (Key[0-3]), but for slightly better encryption
// performance V6 and IVEC can be swapped during encryption (XOR is
// both associative and commutative), turning:
//
//	VXOR INOUT, IVEC, INOUT
//	VXOR INOUT, V6, INOUT
//
//	into
//
//	VXOR INOUT, V6, INOUT
//	VXOR INOUT, IVEC, INOUT
//
#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
	VXOR	Vin, Vxor, Vout \
	BEQ	CR1, label10 \
	BEQ	CR2, label12 \
	vcipher	Vout, V7, Vout \
	vcipher	Vout, V8, Vout \
	label12: \
	vcipher	Vout, V9, Vout \
	vcipher	Vout, V10, Vout \
	label10: \
	vcipher	Vout, V11, Vout \
	vcipher	Vout, V12, Vout \
	vcipher	Vout, V13, Vout \
	vcipher	Vout, V14, Vout \
	vcipher	Vout, V15, Vout \
	vcipher	Vout, V16, Vout \
	vcipher	Vout, V17, Vout \
	vcipher	Vout, V18, Vout \
	vcipher	Vout, V19, Vout \
	vciphel	Vout, V20, Vout

#define CLEAR_KEYS() \
	VXOR	V6, V6, V6 \
	VXOR	V7, V7, V7 \
	VXOR	V8, V8, V8 \
	VXOR	V9, V9, V9 \
	VXOR	V10, V10, V10 \
	VXOR	V11, V11, V11 \
	VXOR	V12, V12, V12 \
	VXOR	V13, V13, V13 \
	VXOR	V14, V14, V14 \
	VXOR	V15, V15, V15 \
	VXOR	V16, V16, V16 \
	VXOR	V17, V17, V17 \
	VXOR	V18, V18, V18 \
	VXOR	V19, V19, V19 \
	VXOR	V20, V20, V20

// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
	MOVD	src+0(FP), INP
	MOVD	dst+8(FP), OUTP
	MOVD	length+16(FP), LEN
	MOVD	key+24(FP), KEYP
	MOVD	iv+32(FP), IVP
	MOVD	enc+40(FP), ENC
	MOVD	nr+48(FP), ROUNDS

#ifdef NEEDS_ESPERM
	MOVD	$·rcon(SB), R11
	LVX	(R11), ESPERM   // Permute value for P8_ macros.
#endif

	// Assume len > 0 && len % blockSize == 0.
	CMPW	ENC, $0
	P8_LXVB16X(IVP, R0, IVEC)
	CMPU	ROUNDS, $10, CR1
	CMPU	ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.

	// Setup key in VSRs, and set loop count in CTR.
	LOAD_KEY(KEYP)
	SRD	$4, LEN
	MOVD	LEN, CTR

	BEQ	Lcbc_dec

	PCALIGN $16
Lcbc_enc:
	P8_LXVB16X(INP, R0, INOUT)
	ADD	$16, INP
	VXOR	INOUT, V6, INOUT
	CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
	VOR	INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD	$16, OUTP
	BDNZ	Lcbc_enc

	P8_STXVB16X(INOUT, IVP, R0)
	CLEAR_KEYS()
	RET

	PCALIGN $16
Lcbc_dec:
	P8_LXVB16X(INP, R0, TMP)
	ADD	$16, INP
	CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
	VXOR	INOUT, IVEC, INOUT
	VOR	TMP, TMP, IVEC // TMP is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD	$16, OUTP
	BDNZ	Lcbc_dec

	P8_STXVB16X(IVEC, IVP, R0)
	CLEAR_KEYS()
	RET
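
// The CBC routine above is reached through crypto/cipher's CBC modes; a
// minimal, hedged Go usage sketch (that CryptBlocks lands in
// cryptBlocksChain is an assumption about the package wiring on ppc64x):
//
//	package main
//
//	import (
//		"crypto/aes"
//		"crypto/cipher"
//	)
//
//	func main() {
//		key := make([]byte, 16)           // AES-128; 24/32 bytes select 12/14 rounds
//		iv := make([]byte, aes.BlockSize) // 16-byte IV
//		src := make([]byte, 64)           // length must be a multiple of the block size
//		dst := make([]byte, len(src))
//		b, _ := aes.NewCipher(key)
//		cipher.NewCBCEncrypter(b, iv).CryptBlocks(dst, src)
//	}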