#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by the POWER8 processor.
# The module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. Data alignment in parallelizable modes is
# handled with VSX loads and stores, which implies that the MSR.VSX flag
# must be set. It should also be noted that the ISA specification doesn't
# prohibit alignment exceptions for these instructions on page boundaries.
# Initially alignment was handled in a pure AltiVec/VMX way [data is
# aligned programmatically, which in turn guarantees exception-free
# execution], but that turned out to hamper performance when vcipher
# instructions are interleaved. It's reckoned that eventual misalignment
# penalties at page boundaries are on average lower than the additional
# overhead of the pure AltiVec approach.
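#
# Typical invocation (a sketch; the exact flavour and output file names
# are build-system specific):
#
#	perl aesp8-ppc.pl linux64le aesp8-ppc.s	# 64-bit little-endian
#	perl aesp8-ppc.pl linux64   aesp8-ppc.s	# 64-bit big-endian
#
# The first argument selects word size, ABI and endianness; the generated
# code is piped through ppc-xlate.pl into the named output file.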

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;
$prefix="aes_p8";

$sp="r1";
$vrsave="r12";
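# SPR 256 is VRSAVE: every entry point below saves it, installs a mask
# covering the vector registers it uses, and restores it on exit.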

#########################################################################
{{{	# Key setup procedures						#
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));

$code.=<<___;
.machine	"any"

.text

.align	7
rcon:
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
.long	0,0,0,0						?asis
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$ptr	 # $ptr = ., which is 0x48 bytes past rcon
	addi	$ptr,$ptr,-0x48
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"

.globl	.${prefix}_set_encrypt_key
Lset_encrypt_key:
	mflr		r11
	$PUSH		r11,$LRSAVE($sp)

	li		$ptr,-1
	${UCMP}i	$inp,0
	beq-		Lenc_key_abort		# if ($inp==0) return -1;
	${UCMP}i	$out,0
	beq-		Lenc_key_abort		# if ($out==0) return -1;
	li		$ptr,-2
	cmpwi		$bits,128
	blt-		Lenc_key_abort
	cmpwi		$bits,256
	bgt-		Lenc_key_abort
	andi.		r0,$bits,0x3f
	bne-		Lenc_key_abort

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	bl		Lconsts
	mtlr		r11

	neg		r9,$inp
	lvx		$in0,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	lvsr		$key,0,r9		# borrow $key
	li		r8,0x20
	cmpwi		$bits,192
	lvx		$in1,0,$inp
	le?vspltisb	$mask,0x0f		# borrow $mask
	lvx		$rcon,0,$ptr
	le?vxor		$key,$key,$mask		# adjust for byte swap
	lvx		$mask,r8,$ptr
	addi		$ptr,$ptr,0x10
	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
	li		$cnt,8
	vxor		$zero,$zero,$zero
	mtctr		$cnt

	?lvsr		$outperm,0,$out
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$zero,$outmask,$outperm

	blt		Loop128
	addi		$inp,$inp,8
	beq		L192
	addi		$inp,$inp,8
	b		L256

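# Each Loop128 iteration derives one round key. vperm with $mask performs
# the rotate-and-splat (RotWord), and vcipherlast against $rcon supplies
# SubBytes plus the rcon XOR; ShiftRows is a no-op on a splatted state,
# so the pair computes SubWord(RotWord(w))^rcon exactly.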
.align	4
Loop128:
	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	bdnz		Loop128

	lvx		$rcon,0,$ptr		# last two round keys

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out

	addi		$inp,$out,15		# 15 is not typo
	addi		$out,$out,0x50

	li		$rounds,10
	b		Ldone

.align	4
L192:
	lvx		$tmp,0,$inp
	li		$cnt,4
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	vspltisb	$key,8			# borrow $key
	mtctr		$cnt
	vsububm		$mask,$mask,$key	# adjust the mask

Loop192:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vcipherlast	$key,$key,$rcon

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp

	 vsldoi		$stage,$zero,$in1,8
	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vsldoi		$stage,$stage,$in0,8

	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	 vsldoi		$stage,$in0,$in1,8
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdnz		Loop192

	li		$rounds,12
	addi		$out,$out,0x20
	b		Ldone

.align	4
L256:
	lvx		$tmp,0,$inp
	li		$cnt,7
	li		$rounds,14
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	mtctr		$cnt

Loop256:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in1,$in1,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdz		Ldone

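	# odd 256-bit steps apply SubWord without RotWord and without
	# rcon, hence a plain splat followed by vsbox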
	vspltw		$key,$in0,3		# just splat
	vsldoi		$tmp,$zero,$in1,12	# >>32
	vsbox		$key,$key

	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp

	vxor		$in1,$in1,$key
	b		Loop256

.align	4
Ldone:
	lvx		$in1,0,$inp		# redundant in aligned case
	vsel		$in1,$outhead,$in1,$outmask
	stvx		$in1,0,$inp
	li		$ptr,0
	mtspr		256,$vrsave
	stw		$rounds,0($out)

Lenc_key_abort:
	mr		r3,$ptr
	blr
	.long		0
	.byte		0,12,0x14,1,0,0,3,0
	.long		0
.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key

.globl	.${prefix}_set_decrypt_key
	$STU		$sp,-$FRAME($sp)
	mflr		r10
	$PUSH		r10,$FRAME+$LRSAVE($sp)
	bl		Lset_encrypt_key
	mtlr		r10

	cmpwi		r3,0
	bne-		Ldec_key_abort

	slwi		$cnt,$rounds,4
	subi		$inp,$out,240		# first round key
	srwi		$rounds,$rounds,1
	add		$out,$inp,$cnt		# last round key
	mtctr		$rounds

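	# reverse the order of the round keys in place, swapping 16-byte
	# entries from both ends ($rounds/2 swaps in total)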
Ldeckey:
	lwz		r0, 0($inp)
	lwz		r6, 4($inp)
	lwz		r7, 8($inp)
	lwz		r8, 12($inp)
	addi		$inp,$inp,16
	lwz		r9, 0($out)
	lwz		r10,4($out)
	lwz		r11,8($out)
	lwz		r12,12($out)
	stw		r0, 0($out)
	stw		r6, 4($out)
	stw		r7, 8($out)
	stw		r8, 12($out)
	subi		$out,$out,16
	stw		r9, -16($inp)
	stw		r10,-12($inp)
	stw		r11,-8($inp)
	stw		r12,-4($inp)
	bdnz		Ldeckey

	xor		r3,r3,r3		# return value
Ldec_key_abort:
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,0,3,0
	.long		0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
#########################################################################
{{{	# Single block en- and decrypt procedures			#
sub gen_block () {
my $dir = shift;
my $n   = $dir eq "de" ? "n" : "";
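# $n selects the instruction family: vcipher/vcipherlast for encryption,
# vncipher/vncipherlast for decryption.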
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
.globl	.${prefix}_${dir}crypt
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds

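# $rounds was halved and decremented above: each loop iteration applies
# two rounds, and the final cipher/cipherlast pair follows the loop.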
Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
#########################################################################
{{{	# CBC en- and decrypt procedures				#
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
						map("v$_",(4..10));
$code.=<<___;
.globl	.${prefix}_cbc_encrypt
	${UCMP}i	$len,16
	bltlr-

	cmpwi		$enc,0			# test direction
	lis		r0,0xffe0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1
	beq		Lcbc_dec

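# CBC encryption is inherently serial and so proceeds one block at a
# time; decryption of 128 bytes or more is diverted to the 8x procedure.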
Lcbc_enc:
	vmr		$inout,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$inout,$inout,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	vxor		$inout,$inout,$ivec

Loop_cbc_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_enc

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipherlast	$ivec,$inout,$rndkey0
	${UCMP}i	$len,16

	vperm		$tmp,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_enc

	b		Lcbc_done

.align	4
Lcbc_dec:
	${UCMP}i	$len,128
	bge		_aesp8_cbc_decrypt8x
	vmr		$tmp,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$tmp,$tmp,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$tmp,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16

Loop_cbc_dec:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipher	$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_dec

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipherlast	$inout,$inout,$rndkey0
	${UCMP}i	$len,16

	vxor		$inout,$inout,$ivec
	vmr		$ivec,$tmp
	vperm		$tmp,$inout,$inout,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_dec

Lcbc_done:
	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	neg		$enc,$ivp		# write [unaligned] iv
	li		$idx,15			# 15 is not typo
	vxor		$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	le?vspltisb	$tmp,0x0f
	?lvsl		$outperm,0,$enc
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp
	lvx		$outhead,0,$ivp
	vperm		$ivec,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$ivec,$outmask
	lvx		$inptail,$idx,$ivp
	stvx		$inout,0,$ivp
	vsel		$inout,$ivec,$inptail,$outmask
	stvx		$inout,$idx,$ivp

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
#########################################################################
{{	# Optimized CBC decrypt procedure				#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment

$code.=<<___;
.align	5
_aesp8_cbc_decrypt8x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total
	subi		$len,$len,128		# bias

	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds

Load_cbc_dec_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_cbc_dec_key

	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]

	#lvx		$inptail,0,$inp		# "caller" already did this
	#addi		$inp,$inp,15		# 15 is not typo
	subi		$inp,$inp,15		# undo "caller"

	 le?li		$idx,8
	lvx_u		$in0,$x00,$inp		# load first 8 "words"
	 le?lvsl	$inpperm,0,$idx
	 le?vspltisb	$tmp,0x0f
	lvx_u		$in1,$x10,$inp
	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	lvx_u		$in2,$x20,$inp
	 le?vperm	$in0,$in0,$in0,$inpperm
	lvx_u		$in3,$x30,$inp
	 le?vperm	$in1,$in1,$in1,$inpperm
	lvx_u		$in4,$x40,$inp
	 le?vperm	$in2,$in2,$in2,$inpperm
	vxor		$out0,$in0,$rndkey0
	lvx_u		$in5,$x50,$inp
	 le?vperm	$in3,$in3,$in3,$inpperm
	vxor		$out1,$in1,$rndkey0
	lvx_u		$in6,$x60,$inp
	 le?vperm	$in4,$in4,$in4,$inpperm
	vxor		$out2,$in2,$rndkey0
	lvx_u		$in7,$x70,$inp
	addi		$inp,$inp,0x80
	 le?vperm	$in5,$in5,$in5,$inpperm
	vxor		$out3,$in3,$rndkey0
	 le?vperm	$in6,$in6,$in6,$inpperm
	vxor		$out4,$in4,$rndkey0
	 le?vperm	$in7,$in7,$in7,$inpperm
	vxor		$out5,$in5,$rndkey0
	vxor		$out6,$in6,$rndkey0
	vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	b		Loop_cbc_dec8x
.align	5
Loop_cbc_dec8x:
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x

	subic		$len,$len,128		# $len-=128
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	subfe.		r0,r0,r0		# borrow?-1:0
	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	and		r0,r0,$len
	vncipher	$out0,$out0,v26
	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	add		$inp,$inp,r0		# $inp is adjusted in such
						# way that at exit from the
						# loop inX-in7 are loaded
						# with last "words"
	vncipher	$out0,$out0,v27
	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vncipher	$out0,$out0,v28
	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	vncipher	$out0,$out0,v29
	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vncipher	$out0,$out0,v30
	 vxor		$ivec,$ivec,v31		# xor with last round key
	vncipher	$out1,$out1,v30
	 vxor		$in0,$in0,v31
	vncipher	$out2,$out2,v30
	 vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor		$in6,$in6,v31

	vncipherlast	$out0,$out0,$ivec
	vncipherlast	$out1,$out1,$in0
	 lvx_u		$in0,$x00,$inp		# load next input block
	vncipherlast	$out2,$out2,$in1
	 lvx_u		$in1,$x10,$inp
	vncipherlast	$out3,$out3,$in2
	 le?vperm	$in0,$in0,$in0,$inpperm
	 lvx_u		$in2,$x20,$inp
	vncipherlast	$out4,$out4,$in3
	 le?vperm	$in1,$in1,$in1,$inpperm
	 lvx_u		$in3,$x30,$inp
	vncipherlast	$out5,$out5,$in4
	 le?vperm	$in2,$in2,$in2,$inpperm
	 lvx_u		$in4,$x40,$inp
	vncipherlast	$out6,$out6,$in5
	 le?vperm	$in3,$in3,$in3,$inpperm
	 lvx_u		$in5,$x50,$inp
	vncipherlast	$out7,$out7,$in6
	 le?vperm	$in4,$in4,$in4,$inpperm
	 lvx_u		$in6,$x60,$inp
	vmr		$ivec,$in7
	 le?vperm	$in5,$in5,$in5,$inpperm
	 lvx_u		$in7,$x70,$inp
	 addi		$inp,$inp,0x80

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	 le?vperm	$in6,$in6,$in6,$inpperm
	 vxor		$out0,$in0,$rndkey0
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	 le?vperm	$in7,$in7,$in7,$inpperm
	 vxor		$out1,$in1,$rndkey0
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	 vxor		$out2,$in2,$rndkey0
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	 vxor		$out3,$in3,$rndkey0
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	 vxor		$out4,$in4,$rndkey0
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	 vxor		$out5,$in5,$rndkey0
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	 vxor		$out6,$in6,$rndkey0
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	 vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	beq		Loop_cbc_dec8x		# did $len-=128 borrow?

	addic.		$len,$len,128
	beq		Lcbc_dec8x_done
	nop
	nop

Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x_tail

	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28

	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29

	vncipher	$out1,$out1,v30
	 vxor		$ivec,$ivec,v31		# last round key
	vncipher	$out2,$out2,v30
	 vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor		$in6,$in6,v31

	cmplwi		$len,32			# switch($len)
	blt		Lcbc_dec8x_one
	nop
	beq		Lcbc_dec8x_two
	cmplwi		$len,64
	blt		Lcbc_dec8x_three
	nop
	beq		Lcbc_dec8x_four
	cmplwi		$len,96
	blt		Lcbc_dec8x_five
	nop
	beq		Lcbc_dec8x_six

Lcbc_dec8x_seven:
	vncipherlast	$out1,$out1,$ivec
	vncipherlast	$out2,$out2,$in1
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out1,$out1,$out1,$inpperm
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x00,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x10,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x20,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x30,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x40,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x50,$out
	stvx_u		$out7,$x60,$out
	addi		$out,$out,0x70
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_six:
	vncipherlast	$out2,$out2,$ivec
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out2,$out2,$out2,$inpperm
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x00,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x10,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x20,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x30,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x40,$out
	stvx_u		$out7,$x50,$out
	addi		$out,$out,0x60
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_five:
	vncipherlast	$out3,$out3,$ivec
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out3,$out3,$out3,$inpperm
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x00,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x10,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x20,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x30,$out
	stvx_u		$out7,$x40,$out
	addi		$out,$out,0x50
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_four:
	vncipherlast	$out4,$out4,$ivec
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out4,$out4,$out4,$inpperm
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x00,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x10,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x20,$out
	stvx_u		$out7,$x30,$out
	addi		$out,$out,0x40
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_three:
	vncipherlast	$out5,$out5,$ivec
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out5,$out5,$out5,$inpperm
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x00,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x10,$out
	stvx_u		$out7,$x20,$out
	addi		$out,$out,0x30
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_two:
	vncipherlast	$out6,$out6,$ivec
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out6,$out6,$out6,$inpperm
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x00,$out
	stvx_u		$out7,$x10,$out
	addi		$out,$out,0x20
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_one:
	vncipherlast	$out7,$out7,$ivec
	vmr		$ivec,$in7

	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out7,0,$out
	addi		$out,$out,0x10

Lcbc_dec8x_done:
	le?vperm	$ivec,$ivec,$ivec,$inpperm
	stvx_u		$ivec,0,$ivp		# write [unaligned] iv

	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x14,0,0x80,6,6,0
	.long		0
.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
}}	}}}

#########################################################################
{{{	# CTR procedure[s]						#
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
						map("v$_",(4..11));
my $dat=$tmp;

$code.=<<___;
.globl	.${prefix}_ctr32_encrypt_blocks
	${UCMP}i	$len,1
	bltlr-

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	 vspltisb	$one,1
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm
	 vsldoi		$one,$rndkey0,$one,1
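	# $one now holds the 128-bit constant 1; vadduwm below increments
	# only the low 32-bit word of the counter, hence "ctr32"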

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1

	${UCMP}i	$len,8
	bge		_aesp8_ctr32_encrypt8x

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	lvx		$rndkey0,0,$key
	mtctr		$rounds
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$ivec,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	b		Loop_ctr32_enc

.align	5
Loop_ctr32_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_ctr32_enc

	vadduwm		$ivec,$ivec,$one
	 vmr		$dat,$inptail
	 lvx		$inptail,0,$inp
	 addi		$inp,$inp,16
	 subic.		$len,$len,1		# blocks--

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	 vperm		$dat,$dat,$inptail,$inpperm
	 li		$idx,16
	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
	 lvx		$rndkey0,0,$key
	vxor		$dat,$dat,$rndkey1	# last round key
	vcipherlast	$inout,$inout,$dat

	 lvx		$rndkey1,$idx,$key
	 addi		$idx,$idx,16
	vperm		$inout,$inout,$inout,$outperm
	vsel		$dat,$outhead,$inout,$outmask
	 mtctr		$rounds
	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vmr		$outhead,$inout
	 vxor		$inout,$ivec,$rndkey0
	 lvx		$rndkey0,$idx,$key
	 addi		$idx,$idx,16
	stvx		$dat,0,$out
	addi		$out,$out,16
	bne		Loop_ctr32_enc

	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
#########################################################################
{{	# Optimized CTR procedure					#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
my ($two,$three,$four)=($outhead,$outperm,$outmask);

$code.=<<___;
.align	5
_aesp8_ctr32_encrypt8x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total

	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds

Load_ctr32_enc_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_ctr32_enc_key

	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]

	vadduqm		$two,$one,$one
	subi		$inp,$inp,15		# undo "caller"
	$SHL		$len,$len,4
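	# $len arrives as a block count; convert it to bytes for the tail
	# accounting below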

	vadduqm		$out1,$ivec,$one	# counter values ...
	vadduqm		$out2,$ivec,$two
	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
	 le?li		$idx,8
	vadduqm		$out3,$out1,$two
	vxor		$out1,$out1,$rndkey0
	 le?lvsl	$inpperm,0,$idx
	vadduqm		$out4,$out2,$two
	vxor		$out2,$out2,$rndkey0
	 le?vspltisb	$tmp,0x0f
	vadduqm		$out5,$out3,$two
	vxor		$out3,$out3,$rndkey0
	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	vadduqm		$out6,$out4,$two
	vxor		$out4,$out4,$rndkey0
	vadduqm		$out7,$out5,$two
	vxor		$out5,$out5,$rndkey0
	vadduqm		$ivec,$out6,$two	# next counter value
	vxor		$out6,$out6,$rndkey0
	vxor		$out7,$out7,$rndkey0

	mtctr		$rounds
	b		Loop_ctr32_enc8x
.align	5
Loop_ctr32_enc8x:
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	vcipher		$out4,$out4,v24
	vcipher		$out5,$out5,v24
	vcipher		$out6,$out6,v24
	vcipher		$out7,$out7,v24
Loop_ctr32_enc8x_middle:
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	vcipher		$out4,$out4,v25
	vcipher		$out5,$out5,v25
	vcipher		$out6,$out6,v25
	vcipher		$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_ctr32_enc8x

	subic		r11,$len,256		# $len-256, borrow $key_
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	vcipher		$out4,$out4,v24
	vcipher		$out5,$out5,v24
	vcipher		$out6,$out6,v24
	vcipher		$out7,$out7,v24

	subfe		r0,r0,r0		# borrow?-1:0
	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	vcipher		$out4,$out4,v25
	vcipher		$out5,$out5,v25
	vcipher		$out6,$out6,v25
	vcipher		$out7,$out7,v25

	and		r0,r0,r11
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vcipher		$out0,$out0,v26
	vcipher		$out1,$out1,v26
	vcipher		$out2,$out2,v26
	vcipher		$out3,$out3,v26
	vcipher		$out4,$out4,v26
	vcipher		$out5,$out5,v26
	vcipher		$out6,$out6,v26
	vcipher		$out7,$out7,v26
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	subic		$len,$len,129		# $len-=129
	vcipher		$out0,$out0,v27
	addi		$len,$len,1		# $len-=128 really
	vcipher		$out1,$out1,v27
	vcipher		$out2,$out2,v27
	vcipher		$out3,$out3,v27
	vcipher		$out4,$out4,v27
	vcipher		$out5,$out5,v27
	vcipher		$out6,$out6,v27
	vcipher		$out7,$out7,v27
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vcipher		$out0,$out0,v28
	 lvx_u		$in0,$x00,$inp		# load input
	vcipher		$out1,$out1,v28
	 lvx_u		$in1,$x10,$inp
	vcipher		$out2,$out2,v28
	 lvx_u		$in2,$x20,$inp
	vcipher		$out3,$out3,v28
	 lvx_u		$in3,$x30,$inp
	vcipher		$out4,$out4,v28
	 lvx_u		$in4,$x40,$inp
	vcipher		$out5,$out5,v28
	 lvx_u		$in5,$x50,$inp
	vcipher		$out6,$out6,v28
	 lvx_u		$in6,$x60,$inp
	vcipher		$out7,$out7,v28
	 lvx_u		$in7,$x70,$inp
	 addi		$inp,$inp,0x80

	vcipher		$out0,$out0,v29
	 le?vperm	$in0,$in0,$in0,$inpperm
	vcipher		$out1,$out1,v29
	 le?vperm	$in1,$in1,$in1,$inpperm
	vcipher		$out2,$out2,v29
	 le?vperm	$in2,$in2,$in2,$inpperm
	vcipher		$out3,$out3,v29
	 le?vperm	$in3,$in3,$in3,$inpperm
	vcipher		$out4,$out4,v29
	 le?vperm	$in4,$in4,$in4,$inpperm
	vcipher		$out5,$out5,v29
	 le?vperm	$in5,$in5,$in5,$inpperm
	vcipher		$out6,$out6,v29
	 le?vperm	$in6,$in6,$in6,$inpperm
	vcipher		$out7,$out7,v29
	 le?vperm	$in7,$in7,$in7,$inpperm

	add		$inp,$inp,r0		# $inp is adjusted in such
						# way that at exit from the
						# loop inX-in7 are loaded
						# with last "words"
	subfe.		r0,r0,r0		# borrow?-1:0
	vcipher		$out0,$out0,v30
	 vxor		$in0,$in0,v31		# xor with last round key
	vcipher		$out1,$out1,v30
	 vxor		$in1,$in1,v31
	vcipher		$out2,$out2,v30
	 vxor		$in2,$in2,v31
	vcipher		$out3,$out3,v30
	 vxor		$in3,$in3,v31
	vcipher		$out4,$out4,v30
	 vxor		$in4,$in4,v31
	vcipher		$out5,$out5,v30
	 vxor		$in5,$in5,v31
	vcipher		$out6,$out6,v30
	 vxor		$in6,$in6,v31
	vcipher		$out7,$out7,v30
	 vxor		$in7,$in7,v31

	bne		Lctr32_enc8x_break	# did $len-129 borrow?

	vcipherlast	$in0,$out0,$in0
	vcipherlast	$in1,$out1,$in1
	 vadduqm	$out1,$ivec,$one	# counter values ...
	vcipherlast	$in2,$out2,$in2
	 vadduqm	$out2,$ivec,$two
	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
	vcipherlast	$in3,$out3,$in3
	 vadduqm	$out3,$out1,$two
	 vxor		$out1,$out1,$rndkey0
	vcipherlast	$in4,$out4,$in4
	 vadduqm	$out4,$out2,$two
	 vxor		$out2,$out2,$rndkey0
	vcipherlast	$in5,$out5,$in5
	 vadduqm	$out5,$out3,$two
	 vxor		$out3,$out3,$rndkey0
	vcipherlast	$in6,$out6,$in6
	 vadduqm	$out6,$out4,$two
	 vxor		$out4,$out4,$rndkey0
	vcipherlast	$in7,$out7,$in7
	 vadduqm	$out7,$out5,$two
	 vxor		$out5,$out5,$rndkey0
	le?vperm	$in0,$in0,$in0,$inpperm
	 vadduqm	$ivec,$out6,$two	# next counter value
	 vxor		$out6,$out6,$rndkey0
	le?vperm	$in1,$in1,$in1,$inpperm
	 vxor		$out7,$out7,$rndkey0
	mtctr		$rounds

	 vcipher	$out0,$out0,v24
	stvx_u		$in0,$x00,$out
	le?vperm	$in2,$in2,$in2,$inpperm
	 vcipher	$out1,$out1,v24
	stvx_u		$in1,$x10,$out
	le?vperm	$in3,$in3,$in3,$inpperm
	 vcipher	$out2,$out2,v24
	stvx_u		$in2,$x20,$out
	le?vperm	$in4,$in4,$in4,$inpperm
	 vcipher	$out3,$out3,v24
	stvx_u		$in3,$x30,$out
	le?vperm	$in5,$in5,$in5,$inpperm
	 vcipher	$out4,$out4,v24
	stvx_u		$in4,$x40,$out
	le?vperm	$in6,$in6,$in6,$inpperm
	 vcipher	$out5,$out5,v24
	stvx_u		$in5,$x50,$out
	le?vperm	$in7,$in7,$in7,$inpperm
	 vcipher	$out6,$out6,v24
	stvx_u		$in6,$x60,$out
	 vcipher	$out7,$out7,v24
	stvx_u		$in7,$x70,$out
	addi		$out,$out,0x80

	b		Loop_ctr32_enc8x_middle

.align	5
Lctr32_enc8x_break:
	cmpwi		$len,-0x60
	blt		Lctr32_enc8x_one
	nop
	beq		Lctr32_enc8x_two
	cmpwi		$len,-0x40
	blt		Lctr32_enc8x_three
	nop
	beq		Lctr32_enc8x_four
	cmpwi		$len,-0x20
	blt		Lctr32_enc8x_five
	nop
	beq		Lctr32_enc8x_six
	cmpwi		$len,0x00
	blt		Lctr32_enc8x_seven

Lctr32_enc8x_eight:
	vcipherlast	$out0,$out0,$in0
	vcipherlast	$out1,$out1,$in1
	vcipherlast	$out2,$out2,$in2
	vcipherlast	$out3,$out3,$in3
	vcipherlast	$out4,$out4,$in4
	vcipherlast	$out5,$out5,$in5
	vcipherlast	$out6,$out6,$in6
	vcipherlast	$out7,$out7,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_seven:
	vcipherlast	$out0,$out0,$in1
	vcipherlast	$out1,$out1,$in2
	vcipherlast	$out2,$out2,$in3
	vcipherlast	$out3,$out3,$in4
	vcipherlast	$out4,$out4,$in5
	vcipherlast	$out5,$out5,$in6
	vcipherlast	$out6,$out6,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	stvx_u		$out6,$x60,$out
	addi		$out,$out,0x70
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_six:
	vcipherlast	$out0,$out0,$in2
	vcipherlast	$out1,$out1,$in3
	vcipherlast	$out2,$out2,$in4
	vcipherlast	$out3,$out3,$in5
	vcipherlast	$out4,$out4,$in6
	vcipherlast	$out5,$out5,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	stvx_u		$out5,$x50,$out
	addi		$out,$out,0x60
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_five:
	vcipherlast	$out0,$out0,$in3
	vcipherlast	$out1,$out1,$in4
	vcipherlast	$out2,$out2,$in5
	vcipherlast	$out3,$out3,$in6
	vcipherlast	$out4,$out4,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	stvx_u		$out4,$x40,$out
	addi		$out,$out,0x50
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_four:
	vcipherlast	$out0,$out0,$in4
	vcipherlast	$out1,$out1,$in5
	vcipherlast	$out2,$out2,$in6
	vcipherlast	$out3,$out3,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	stvx_u		$out3,$x30,$out
	addi		$out,$out,0x40
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_three:
	vcipherlast	$out0,$out0,$in5
	vcipherlast	$out1,$out1,$in6
	vcipherlast	$out2,$out2,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	stvx_u		$out2,$x20,$out
	addi		$out,$out,0x30
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_two:
	vcipherlast	$out0,$out0,$in6
	vcipherlast	$out1,$out1,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	stvx_u		$out1,$x10,$out
	addi		$out,$out,0x20
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_one:
	vcipherlast	$out0,$out0,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	stvx_u		$out0,0,$out
	addi		$out,$out,0x10

Lctr32_enc8x_done:
	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x14,0,0x80,6,6,0
	.long		0
.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
___
}}	}}}

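# Post-processing: emit the constants table as endian-corrected .byte
# strings, then resolve the '?'-prefixed endian-specific mnemonics.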
my $consts=1;
foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$3;
	    my @bytes=();

	    # convert to endian-agnostic format
	    if ($1 eq "long") {
	      foreach (split(/,\s*/,$2)) {
		my $l = /^0/?oct:int;
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
	      }
	    } else {
		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv)  {
		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
		}
	    }

	    # emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o	or
	    s/\?lvsr/lvsl/o	or
	    s/\?lvsl/lvsr/o	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/le\?/#le#/o	or
	    s/be\?//o		or
	    s/\?([a-z]+)/$1/o;
	}
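	# e.g. on little-endian "?lvsr" assembles as lvsl and the two
	# middle operands of "?vperm" are swapped; on big-endian the
	# '?' prefix is simply dropped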

        print $_,"\n";
}

close STDOUT;