xref: /openbmc/linux/drivers/crypto/vmx/aesp8-ppc.pl (revision 82e6fdd6)
1#! /usr/bin/env perl
2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for AES instructions as per PowerISA
18# specification version 2.07, first implemented by POWER8 processor.
19# The module is endian-agnostic in sense that it supports both big-
20# and little-endian cases. Data alignment in parallelizable modes is
21# handled with VSX loads and stores, which implies MSR.VSX flag being
22# set. It should also be noted that ISA specification doesn't prohibit
23# alignment exceptions for these instructions on page boundaries.
24# Initially alignment was handled in pure AltiVec/VMX way [when data
25# is aligned programmatically, which in turn guarantees exception-
26# free execution], but it turned to hamper performance when vcipher
27# instructions are interleaved. It's reckoned that eventual
28# misalignment penalties at page boundaries are in average lower
29# than additional overhead in pure AltiVec approach.
30#
31# May 2016
32#
33# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
34# systems were measured.
35#
36######################################################################
37# Current large-block performance in cycles per byte processed with
38# 128-bit key (less is better).
39#
40#		CBC en-/decrypt	CTR	XTS
41# POWER8[le]	3.96/0.72	0.74	1.1
42# POWER8[be]	3.75/0.65	0.66	1.0
43
44$flavour = shift;
45
46if ($flavour =~ /64/) {
47	$SIZE_T	=8;
48	$LRSAVE	=2*$SIZE_T;
49	$STU	="stdu";
50	$POP	="ld";
51	$PUSH	="std";
52	$UCMP	="cmpld";
53	$SHL	="sldi";
54} elsif ($flavour =~ /32/) {
55	$SIZE_T	=4;
56	$LRSAVE	=$SIZE_T;
57	$STU	="stwu";
58	$POP	="lwz";
59	$PUSH	="stw";
60	$UCMP	="cmplw";
61	$SHL	="slwi";
62} else { die "nonsense $flavour"; }
63
64$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
65
66$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
67( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
68( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
69die "can't locate ppc-xlate.pl";
70
71open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
72
73$FRAME=8*$SIZE_T;
74$prefix="aes_p8";
75
76$sp="r1";
77$vrsave="r12";
78
79#########################################################################
80{{{	# Key setup procedures						#
81my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
82my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
83my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
84
85$code.=<<___;
86.machine	"any"
87
88.text
89
90.align	7
91rcon:
92.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
93.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
94.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
95.long	0,0,0,0						?asis
96Lconsts:
97	mflr	r0
98	bcl	20,31,\$+4
99	mflr	$ptr	 #vvvvv "distance between . and rcon
100	addi	$ptr,$ptr,-0x48
101	mtlr	r0
102	blr
103	.long	0
104	.byte	0,12,0x14,0,0,0,0,0
105.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
106
107.globl	.${prefix}_set_encrypt_key
108Lset_encrypt_key:
109	mflr		r11
110	$PUSH		r11,$LRSAVE($sp)
111
112	li		$ptr,-1
113	${UCMP}i	$inp,0
114	beq-		Lenc_key_abort		# if ($inp==0) return -1;
115	${UCMP}i	$out,0
116	beq-		Lenc_key_abort		# if ($out==0) return -1;
117	li		$ptr,-2
118	cmpwi		$bits,128
119	blt-		Lenc_key_abort
120	cmpwi		$bits,256
121	bgt-		Lenc_key_abort
122	andi.		r0,$bits,0x3f
123	bne-		Lenc_key_abort
124
125	lis		r0,0xfff0
126	mfspr		$vrsave,256
127	mtspr		256,r0
128
129	bl		Lconsts
130	mtlr		r11
131
132	neg		r9,$inp
133	lvx		$in0,0,$inp
134	addi		$inp,$inp,15		# 15 is not typo
135	lvsr		$key,0,r9		# borrow $key
136	li		r8,0x20
137	cmpwi		$bits,192
138	lvx		$in1,0,$inp
139	le?vspltisb	$mask,0x0f		# borrow $mask
140	lvx		$rcon,0,$ptr
141	le?vxor		$key,$key,$mask		# adjust for byte swap
142	lvx		$mask,r8,$ptr
143	addi		$ptr,$ptr,0x10
144	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
145	li		$cnt,8
146	vxor		$zero,$zero,$zero
147	mtctr		$cnt
148
149	?lvsr		$outperm,0,$out
150	vspltisb	$outmask,-1
151	lvx		$outhead,0,$out
152	?vperm		$outmask,$zero,$outmask,$outperm
153
154	blt		Loop128
155	addi		$inp,$inp,8
156	beq		L192
157	addi		$inp,$inp,8
158	b		L256
159
160.align	4
161Loop128:
162	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
163	vsldoi		$tmp,$zero,$in0,12	# >>32
164	 vperm		$outtail,$in0,$in0,$outperm	# rotate
165	 vsel		$stage,$outhead,$outtail,$outmask
166	 vmr		$outhead,$outtail
167	vcipherlast	$key,$key,$rcon
168	 stvx		$stage,0,$out
169	 addi		$out,$out,16
170
171	vxor		$in0,$in0,$tmp
172	vsldoi		$tmp,$zero,$tmp,12	# >>32
173	vxor		$in0,$in0,$tmp
174	vsldoi		$tmp,$zero,$tmp,12	# >>32
175	vxor		$in0,$in0,$tmp
176	 vadduwm	$rcon,$rcon,$rcon
177	vxor		$in0,$in0,$key
178	bdnz		Loop128
179
180	lvx		$rcon,0,$ptr		# last two round keys
181
182	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
183	vsldoi		$tmp,$zero,$in0,12	# >>32
184	 vperm		$outtail,$in0,$in0,$outperm	# rotate
185	 vsel		$stage,$outhead,$outtail,$outmask
186	 vmr		$outhead,$outtail
187	vcipherlast	$key,$key,$rcon
188	 stvx		$stage,0,$out
189	 addi		$out,$out,16
190
191	vxor		$in0,$in0,$tmp
192	vsldoi		$tmp,$zero,$tmp,12	# >>32
193	vxor		$in0,$in0,$tmp
194	vsldoi		$tmp,$zero,$tmp,12	# >>32
195	vxor		$in0,$in0,$tmp
196	 vadduwm	$rcon,$rcon,$rcon
197	vxor		$in0,$in0,$key
198
199	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
200	vsldoi		$tmp,$zero,$in0,12	# >>32
201	 vperm		$outtail,$in0,$in0,$outperm	# rotate
202	 vsel		$stage,$outhead,$outtail,$outmask
203	 vmr		$outhead,$outtail
204	vcipherlast	$key,$key,$rcon
205	 stvx		$stage,0,$out
206	 addi		$out,$out,16
207
208	vxor		$in0,$in0,$tmp
209	vsldoi		$tmp,$zero,$tmp,12	# >>32
210	vxor		$in0,$in0,$tmp
211	vsldoi		$tmp,$zero,$tmp,12	# >>32
212	vxor		$in0,$in0,$tmp
213	vxor		$in0,$in0,$key
214	 vperm		$outtail,$in0,$in0,$outperm	# rotate
215	 vsel		$stage,$outhead,$outtail,$outmask
216	 vmr		$outhead,$outtail
217	 stvx		$stage,0,$out
218
219	addi		$inp,$out,15		# 15 is not typo
220	addi		$out,$out,0x50
221
222	li		$rounds,10
223	b		Ldone
224
225.align	4
226L192:
227	lvx		$tmp,0,$inp
228	li		$cnt,4
229	 vperm		$outtail,$in0,$in0,$outperm	# rotate
230	 vsel		$stage,$outhead,$outtail,$outmask
231	 vmr		$outhead,$outtail
232	 stvx		$stage,0,$out
233	 addi		$out,$out,16
234	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
235	vspltisb	$key,8			# borrow $key
236	mtctr		$cnt
237	vsububm		$mask,$mask,$key	# adjust the mask
238
239Loop192:
240	vperm		$key,$in1,$in1,$mask	# roate-n-splat
241	vsldoi		$tmp,$zero,$in0,12	# >>32
242	vcipherlast	$key,$key,$rcon
243
244	vxor		$in0,$in0,$tmp
245	vsldoi		$tmp,$zero,$tmp,12	# >>32
246	vxor		$in0,$in0,$tmp
247	vsldoi		$tmp,$zero,$tmp,12	# >>32
248	vxor		$in0,$in0,$tmp
249
250	 vsldoi		$stage,$zero,$in1,8
251	vspltw		$tmp,$in0,3
252	vxor		$tmp,$tmp,$in1
253	vsldoi		$in1,$zero,$in1,12	# >>32
254	 vadduwm	$rcon,$rcon,$rcon
255	vxor		$in1,$in1,$tmp
256	vxor		$in0,$in0,$key
257	vxor		$in1,$in1,$key
258	 vsldoi		$stage,$stage,$in0,8
259
260	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
261	vsldoi		$tmp,$zero,$in0,12	# >>32
262	 vperm		$outtail,$stage,$stage,$outperm	# rotate
263	 vsel		$stage,$outhead,$outtail,$outmask
264	 vmr		$outhead,$outtail
265	vcipherlast	$key,$key,$rcon
266	 stvx		$stage,0,$out
267	 addi		$out,$out,16
268
269	 vsldoi		$stage,$in0,$in1,8
270	vxor		$in0,$in0,$tmp
271	vsldoi		$tmp,$zero,$tmp,12	# >>32
272	 vperm		$outtail,$stage,$stage,$outperm	# rotate
273	 vsel		$stage,$outhead,$outtail,$outmask
274	 vmr		$outhead,$outtail
275	vxor		$in0,$in0,$tmp
276	vsldoi		$tmp,$zero,$tmp,12	# >>32
277	vxor		$in0,$in0,$tmp
278	 stvx		$stage,0,$out
279	 addi		$out,$out,16
280
281	vspltw		$tmp,$in0,3
282	vxor		$tmp,$tmp,$in1
283	vsldoi		$in1,$zero,$in1,12	# >>32
284	 vadduwm	$rcon,$rcon,$rcon
285	vxor		$in1,$in1,$tmp
286	vxor		$in0,$in0,$key
287	vxor		$in1,$in1,$key
288	 vperm		$outtail,$in0,$in0,$outperm	# rotate
289	 vsel		$stage,$outhead,$outtail,$outmask
290	 vmr		$outhead,$outtail
291	 stvx		$stage,0,$out
292	 addi		$inp,$out,15		# 15 is not typo
293	 addi		$out,$out,16
294	bdnz		Loop192
295
296	li		$rounds,12
297	addi		$out,$out,0x20
298	b		Ldone
299
300.align	4
301L256:
302	lvx		$tmp,0,$inp
303	li		$cnt,7
304	li		$rounds,14
305	 vperm		$outtail,$in0,$in0,$outperm	# rotate
306	 vsel		$stage,$outhead,$outtail,$outmask
307	 vmr		$outhead,$outtail
308	 stvx		$stage,0,$out
309	 addi		$out,$out,16
310	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
311	mtctr		$cnt
312
313Loop256:
314	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
315	vsldoi		$tmp,$zero,$in0,12	# >>32
316	 vperm		$outtail,$in1,$in1,$outperm	# rotate
317	 vsel		$stage,$outhead,$outtail,$outmask
318	 vmr		$outhead,$outtail
319	vcipherlast	$key,$key,$rcon
320	 stvx		$stage,0,$out
321	 addi		$out,$out,16
322
323	vxor		$in0,$in0,$tmp
324	vsldoi		$tmp,$zero,$tmp,12	# >>32
325	vxor		$in0,$in0,$tmp
326	vsldoi		$tmp,$zero,$tmp,12	# >>32
327	vxor		$in0,$in0,$tmp
328	 vadduwm	$rcon,$rcon,$rcon
329	vxor		$in0,$in0,$key
330	 vperm		$outtail,$in0,$in0,$outperm	# rotate
331	 vsel		$stage,$outhead,$outtail,$outmask
332	 vmr		$outhead,$outtail
333	 stvx		$stage,0,$out
334	 addi		$inp,$out,15		# 15 is not typo
335	 addi		$out,$out,16
336	bdz		Ldone
337
338	vspltw		$key,$in0,3		# just splat
339	vsldoi		$tmp,$zero,$in1,12	# >>32
340	vsbox		$key,$key
341
342	vxor		$in1,$in1,$tmp
343	vsldoi		$tmp,$zero,$tmp,12	# >>32
344	vxor		$in1,$in1,$tmp
345	vsldoi		$tmp,$zero,$tmp,12	# >>32
346	vxor		$in1,$in1,$tmp
347
348	vxor		$in1,$in1,$key
349	b		Loop256
350
351.align	4
352Ldone:
353	lvx		$in1,0,$inp		# redundant in aligned case
354	vsel		$in1,$outhead,$in1,$outmask
355	stvx		$in1,0,$inp
356	li		$ptr,0
357	mtspr		256,$vrsave
358	stw		$rounds,0($out)
359
360Lenc_key_abort:
361	mr		r3,$ptr
362	blr
363	.long		0
364	.byte		0,12,0x14,1,0,0,3,0
365	.long		0
366.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
367
368.globl	.${prefix}_set_decrypt_key
369	$STU		$sp,-$FRAME($sp)
370	mflr		r10
371	$PUSH		r10,$FRAME+$LRSAVE($sp)
372	bl		Lset_encrypt_key
373	mtlr		r10
374
375	cmpwi		r3,0
376	bne-		Ldec_key_abort
377
378	slwi		$cnt,$rounds,4
379	subi		$inp,$out,240		# first round key
380	srwi		$rounds,$rounds,1
381	add		$out,$inp,$cnt		# last round key
382	mtctr		$rounds
383
384Ldeckey:
385	lwz		r0, 0($inp)
386	lwz		r6, 4($inp)
387	lwz		r7, 8($inp)
388	lwz		r8, 12($inp)
389	addi		$inp,$inp,16
390	lwz		r9, 0($out)
391	lwz		r10,4($out)
392	lwz		r11,8($out)
393	lwz		r12,12($out)
394	stw		r0, 0($out)
395	stw		r6, 4($out)
396	stw		r7, 8($out)
397	stw		r8, 12($out)
398	subi		$out,$out,16
399	stw		r9, -16($inp)
400	stw		r10,-12($inp)
401	stw		r11,-8($inp)
402	stw		r12,-4($inp)
403	bdnz		Ldeckey
404
405	xor		r3,r3,r3		# return value
406Ldec_key_abort:
407	addi		$sp,$sp,$FRAME
408	blr
409	.long		0
410	.byte		0,12,4,1,0x80,0,3,0
411	.long		0
412.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
413___
414}}}
415#########################################################################
416{{{	# Single block en- and decrypt procedures			#
417sub gen_block () {
418my $dir = shift;
419my $n   = $dir eq "de" ? "n" : "";
420my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
421
422$code.=<<___;
423.globl	.${prefix}_${dir}crypt
424	lwz		$rounds,240($key)
425	lis		r0,0xfc00
426	mfspr		$vrsave,256
427	li		$idx,15			# 15 is not typo
428	mtspr		256,r0
429
430	lvx		v0,0,$inp
431	neg		r11,$out
432	lvx		v1,$idx,$inp
433	lvsl		v2,0,$inp		# inpperm
434	le?vspltisb	v4,0x0f
435	?lvsl		v3,0,r11		# outperm
436	le?vxor		v2,v2,v4
437	li		$idx,16
438	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
439	lvx		v1,0,$key
440	?lvsl		v5,0,$key		# keyperm
441	srwi		$rounds,$rounds,1
442	lvx		v2,$idx,$key
443	addi		$idx,$idx,16
444	subi		$rounds,$rounds,1
445	?vperm		v1,v1,v2,v5		# align round key
446
447	vxor		v0,v0,v1
448	lvx		v1,$idx,$key
449	addi		$idx,$idx,16
450	mtctr		$rounds
451
452Loop_${dir}c:
453	?vperm		v2,v2,v1,v5
454	v${n}cipher	v0,v0,v2
455	lvx		v2,$idx,$key
456	addi		$idx,$idx,16
457	?vperm		v1,v1,v2,v5
458	v${n}cipher	v0,v0,v1
459	lvx		v1,$idx,$key
460	addi		$idx,$idx,16
461	bdnz		Loop_${dir}c
462
463	?vperm		v2,v2,v1,v5
464	v${n}cipher	v0,v0,v2
465	lvx		v2,$idx,$key
466	?vperm		v1,v1,v2,v5
467	v${n}cipherlast	v0,v0,v1
468
469	vspltisb	v2,-1
470	vxor		v1,v1,v1
471	li		$idx,15			# 15 is not typo
472	?vperm		v2,v1,v2,v3		# outmask
473	le?vxor		v3,v3,v4
474	lvx		v1,0,$out		# outhead
475	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
476	vsel		v1,v1,v0,v2
477	lvx		v4,$idx,$out
478	stvx		v1,0,$out
479	vsel		v0,v0,v4,v2
480	stvx		v0,$idx,$out
481
482	mtspr		256,$vrsave
483	blr
484	.long		0
485	.byte		0,12,0x14,0,0,0,3,0
486	.long		0
487.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
488___
489}
490&gen_block("en");
491&gen_block("de");
492}}}
493#########################################################################
494{{{	# CBC en- and decrypt procedures				#
495my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
496my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
497my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
498						map("v$_",(4..10));
499$code.=<<___;
500.globl	.${prefix}_cbc_encrypt
501	${UCMP}i	$len,16
502	bltlr-
503
504	cmpwi		$enc,0			# test direction
505	lis		r0,0xffe0
506	mfspr		$vrsave,256
507	mtspr		256,r0
508
509	li		$idx,15
510	vxor		$rndkey0,$rndkey0,$rndkey0
511	le?vspltisb	$tmp,0x0f
512
513	lvx		$ivec,0,$ivp		# load [unaligned] iv
514	lvsl		$inpperm,0,$ivp
515	lvx		$inptail,$idx,$ivp
516	le?vxor		$inpperm,$inpperm,$tmp
517	vperm		$ivec,$ivec,$inptail,$inpperm
518
519	neg		r11,$inp
520	?lvsl		$keyperm,0,$key		# prepare for unaligned key
521	lwz		$rounds,240($key)
522
523	lvsr		$inpperm,0,r11		# prepare for unaligned load
524	lvx		$inptail,0,$inp
525	addi		$inp,$inp,15		# 15 is not typo
526	le?vxor		$inpperm,$inpperm,$tmp
527
528	?lvsr		$outperm,0,$out		# prepare for unaligned store
529	vspltisb	$outmask,-1
530	lvx		$outhead,0,$out
531	?vperm		$outmask,$rndkey0,$outmask,$outperm
532	le?vxor		$outperm,$outperm,$tmp
533
534	srwi		$rounds,$rounds,1
535	li		$idx,16
536	subi		$rounds,$rounds,1
537	beq		Lcbc_dec
538
539Lcbc_enc:
540	vmr		$inout,$inptail
541	lvx		$inptail,0,$inp
542	addi		$inp,$inp,16
543	mtctr		$rounds
544	subi		$len,$len,16		# len-=16
545
546	lvx		$rndkey0,0,$key
547	 vperm		$inout,$inout,$inptail,$inpperm
548	lvx		$rndkey1,$idx,$key
549	addi		$idx,$idx,16
550	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
551	vxor		$inout,$inout,$rndkey0
552	lvx		$rndkey0,$idx,$key
553	addi		$idx,$idx,16
554	vxor		$inout,$inout,$ivec
555
556Loop_cbc_enc:
557	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
558	vcipher		$inout,$inout,$rndkey1
559	lvx		$rndkey1,$idx,$key
560	addi		$idx,$idx,16
561	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
562	vcipher		$inout,$inout,$rndkey0
563	lvx		$rndkey0,$idx,$key
564	addi		$idx,$idx,16
565	bdnz		Loop_cbc_enc
566
567	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
568	vcipher		$inout,$inout,$rndkey1
569	lvx		$rndkey1,$idx,$key
570	li		$idx,16
571	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
572	vcipherlast	$ivec,$inout,$rndkey0
573	${UCMP}i	$len,16
574
575	vperm		$tmp,$ivec,$ivec,$outperm
576	vsel		$inout,$outhead,$tmp,$outmask
577	vmr		$outhead,$tmp
578	stvx		$inout,0,$out
579	addi		$out,$out,16
580	bge		Lcbc_enc
581
582	b		Lcbc_done
583
584.align	4
585Lcbc_dec:
586	${UCMP}i	$len,128
587	bge		_aesp8_cbc_decrypt8x
588	vmr		$tmp,$inptail
589	lvx		$inptail,0,$inp
590	addi		$inp,$inp,16
591	mtctr		$rounds
592	subi		$len,$len,16		# len-=16
593
594	lvx		$rndkey0,0,$key
595	 vperm		$tmp,$tmp,$inptail,$inpperm
596	lvx		$rndkey1,$idx,$key
597	addi		$idx,$idx,16
598	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
599	vxor		$inout,$tmp,$rndkey0
600	lvx		$rndkey0,$idx,$key
601	addi		$idx,$idx,16
602
603Loop_cbc_dec:
604	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
605	vncipher	$inout,$inout,$rndkey1
606	lvx		$rndkey1,$idx,$key
607	addi		$idx,$idx,16
608	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
609	vncipher	$inout,$inout,$rndkey0
610	lvx		$rndkey0,$idx,$key
611	addi		$idx,$idx,16
612	bdnz		Loop_cbc_dec
613
614	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
615	vncipher	$inout,$inout,$rndkey1
616	lvx		$rndkey1,$idx,$key
617	li		$idx,16
618	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
619	vncipherlast	$inout,$inout,$rndkey0
620	${UCMP}i	$len,16
621
622	vxor		$inout,$inout,$ivec
623	vmr		$ivec,$tmp
624	vperm		$tmp,$inout,$inout,$outperm
625	vsel		$inout,$outhead,$tmp,$outmask
626	vmr		$outhead,$tmp
627	stvx		$inout,0,$out
628	addi		$out,$out,16
629	bge		Lcbc_dec
630
631Lcbc_done:
632	addi		$out,$out,-1
633	lvx		$inout,0,$out		# redundant in aligned case
634	vsel		$inout,$outhead,$inout,$outmask
635	stvx		$inout,0,$out
636
637	neg		$enc,$ivp		# write [unaligned] iv
638	li		$idx,15			# 15 is not typo
639	vxor		$rndkey0,$rndkey0,$rndkey0
640	vspltisb	$outmask,-1
641	le?vspltisb	$tmp,0x0f
642	?lvsl		$outperm,0,$enc
643	?vperm		$outmask,$rndkey0,$outmask,$outperm
644	le?vxor		$outperm,$outperm,$tmp
645	lvx		$outhead,0,$ivp
646	vperm		$ivec,$ivec,$ivec,$outperm
647	vsel		$inout,$outhead,$ivec,$outmask
648	lvx		$inptail,$idx,$ivp
649	stvx		$inout,0,$ivp
650	vsel		$inout,$ivec,$inptail,$outmask
651	stvx		$inout,$idx,$ivp
652
653	mtspr		256,$vrsave
654	blr
655	.long		0
656	.byte		0,12,0x14,0,0,0,6,0
657	.long		0
658___
659#########################################################################
660{{	# Optimized CBC decrypt procedure				#
661my $key_="r11";
662my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
663my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
664my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
665my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
666			# v26-v31 last 6 round keys
667my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
668
669$code.=<<___;
670.align	5
671_aesp8_cbc_decrypt8x:
672	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
673	li		r10,`$FRAME+8*16+15`
674	li		r11,`$FRAME+8*16+31`
675	stvx		v20,r10,$sp		# ABI says so
676	addi		r10,r10,32
677	stvx		v21,r11,$sp
678	addi		r11,r11,32
679	stvx		v22,r10,$sp
680	addi		r10,r10,32
681	stvx		v23,r11,$sp
682	addi		r11,r11,32
683	stvx		v24,r10,$sp
684	addi		r10,r10,32
685	stvx		v25,r11,$sp
686	addi		r11,r11,32
687	stvx		v26,r10,$sp
688	addi		r10,r10,32
689	stvx		v27,r11,$sp
690	addi		r11,r11,32
691	stvx		v28,r10,$sp
692	addi		r10,r10,32
693	stvx		v29,r11,$sp
694	addi		r11,r11,32
695	stvx		v30,r10,$sp
696	stvx		v31,r11,$sp
697	li		r0,-1
698	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
699	li		$x10,0x10
700	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
701	li		$x20,0x20
702	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
703	li		$x30,0x30
704	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
705	li		$x40,0x40
706	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
707	li		$x50,0x50
708	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
709	li		$x60,0x60
710	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
711	li		$x70,0x70
712	mtspr		256,r0
713
714	subi		$rounds,$rounds,3	# -4 in total
715	subi		$len,$len,128		# bias
716
717	lvx		$rndkey0,$x00,$key	# load key schedule
718	lvx		v30,$x10,$key
719	addi		$key,$key,0x20
720	lvx		v31,$x00,$key
721	?vperm		$rndkey0,$rndkey0,v30,$keyperm
722	addi		$key_,$sp,$FRAME+15
723	mtctr		$rounds
724
725Load_cbc_dec_key:
726	?vperm		v24,v30,v31,$keyperm
727	lvx		v30,$x10,$key
728	addi		$key,$key,0x20
729	stvx		v24,$x00,$key_		# off-load round[1]
730	?vperm		v25,v31,v30,$keyperm
731	lvx		v31,$x00,$key
732	stvx		v25,$x10,$key_		# off-load round[2]
733	addi		$key_,$key_,0x20
734	bdnz		Load_cbc_dec_key
735
736	lvx		v26,$x10,$key
737	?vperm		v24,v30,v31,$keyperm
738	lvx		v27,$x20,$key
739	stvx		v24,$x00,$key_		# off-load round[3]
740	?vperm		v25,v31,v26,$keyperm
741	lvx		v28,$x30,$key
742	stvx		v25,$x10,$key_		# off-load round[4]
743	addi		$key_,$sp,$FRAME+15	# rewind $key_
744	?vperm		v26,v26,v27,$keyperm
745	lvx		v29,$x40,$key
746	?vperm		v27,v27,v28,$keyperm
747	lvx		v30,$x50,$key
748	?vperm		v28,v28,v29,$keyperm
749	lvx		v31,$x60,$key
750	?vperm		v29,v29,v30,$keyperm
751	lvx		$out0,$x70,$key		# borrow $out0
752	?vperm		v30,v30,v31,$keyperm
753	lvx		v24,$x00,$key_		# pre-load round[1]
754	?vperm		v31,v31,$out0,$keyperm
755	lvx		v25,$x10,$key_		# pre-load round[2]
756
757	#lvx		$inptail,0,$inp		# "caller" already did this
758	#addi		$inp,$inp,15		# 15 is not typo
759	subi		$inp,$inp,15		# undo "caller"
760
761	 le?li		$idx,8
762	lvx_u		$in0,$x00,$inp		# load first 8 "words"
763	 le?lvsl	$inpperm,0,$idx
764	 le?vspltisb	$tmp,0x0f
765	lvx_u		$in1,$x10,$inp
766	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
767	lvx_u		$in2,$x20,$inp
768	 le?vperm	$in0,$in0,$in0,$inpperm
769	lvx_u		$in3,$x30,$inp
770	 le?vperm	$in1,$in1,$in1,$inpperm
771	lvx_u		$in4,$x40,$inp
772	 le?vperm	$in2,$in2,$in2,$inpperm
773	vxor		$out0,$in0,$rndkey0
774	lvx_u		$in5,$x50,$inp
775	 le?vperm	$in3,$in3,$in3,$inpperm
776	vxor		$out1,$in1,$rndkey0
777	lvx_u		$in6,$x60,$inp
778	 le?vperm	$in4,$in4,$in4,$inpperm
779	vxor		$out2,$in2,$rndkey0
780	lvx_u		$in7,$x70,$inp
781	addi		$inp,$inp,0x80
782	 le?vperm	$in5,$in5,$in5,$inpperm
783	vxor		$out3,$in3,$rndkey0
784	 le?vperm	$in6,$in6,$in6,$inpperm
785	vxor		$out4,$in4,$rndkey0
786	 le?vperm	$in7,$in7,$in7,$inpperm
787	vxor		$out5,$in5,$rndkey0
788	vxor		$out6,$in6,$rndkey0
789	vxor		$out7,$in7,$rndkey0
790
791	mtctr		$rounds
792	b		Loop_cbc_dec8x
793.align	5
794Loop_cbc_dec8x:
795	vncipher	$out0,$out0,v24
796	vncipher	$out1,$out1,v24
797	vncipher	$out2,$out2,v24
798	vncipher	$out3,$out3,v24
799	vncipher	$out4,$out4,v24
800	vncipher	$out5,$out5,v24
801	vncipher	$out6,$out6,v24
802	vncipher	$out7,$out7,v24
803	lvx		v24,$x20,$key_		# round[3]
804	addi		$key_,$key_,0x20
805
806	vncipher	$out0,$out0,v25
807	vncipher	$out1,$out1,v25
808	vncipher	$out2,$out2,v25
809	vncipher	$out3,$out3,v25
810	vncipher	$out4,$out4,v25
811	vncipher	$out5,$out5,v25
812	vncipher	$out6,$out6,v25
813	vncipher	$out7,$out7,v25
814	lvx		v25,$x10,$key_		# round[4]
815	bdnz		Loop_cbc_dec8x
816
817	subic		$len,$len,128		# $len-=128
818	vncipher	$out0,$out0,v24
819	vncipher	$out1,$out1,v24
820	vncipher	$out2,$out2,v24
821	vncipher	$out3,$out3,v24
822	vncipher	$out4,$out4,v24
823	vncipher	$out5,$out5,v24
824	vncipher	$out6,$out6,v24
825	vncipher	$out7,$out7,v24
826
827	subfe.		r0,r0,r0		# borrow?-1:0
828	vncipher	$out0,$out0,v25
829	vncipher	$out1,$out1,v25
830	vncipher	$out2,$out2,v25
831	vncipher	$out3,$out3,v25
832	vncipher	$out4,$out4,v25
833	vncipher	$out5,$out5,v25
834	vncipher	$out6,$out6,v25
835	vncipher	$out7,$out7,v25
836
837	and		r0,r0,$len
838	vncipher	$out0,$out0,v26
839	vncipher	$out1,$out1,v26
840	vncipher	$out2,$out2,v26
841	vncipher	$out3,$out3,v26
842	vncipher	$out4,$out4,v26
843	vncipher	$out5,$out5,v26
844	vncipher	$out6,$out6,v26
845	vncipher	$out7,$out7,v26
846
847	add		$inp,$inp,r0		# $inp is adjusted in such
848						# way that at exit from the
849						# loop inX-in7 are loaded
850						# with last "words"
851	vncipher	$out0,$out0,v27
852	vncipher	$out1,$out1,v27
853	vncipher	$out2,$out2,v27
854	vncipher	$out3,$out3,v27
855	vncipher	$out4,$out4,v27
856	vncipher	$out5,$out5,v27
857	vncipher	$out6,$out6,v27
858	vncipher	$out7,$out7,v27
859
860	addi		$key_,$sp,$FRAME+15	# rewind $key_
861	vncipher	$out0,$out0,v28
862	vncipher	$out1,$out1,v28
863	vncipher	$out2,$out2,v28
864	vncipher	$out3,$out3,v28
865	vncipher	$out4,$out4,v28
866	vncipher	$out5,$out5,v28
867	vncipher	$out6,$out6,v28
868	vncipher	$out7,$out7,v28
869	lvx		v24,$x00,$key_		# re-pre-load round[1]
870
871	vncipher	$out0,$out0,v29
872	vncipher	$out1,$out1,v29
873	vncipher	$out2,$out2,v29
874	vncipher	$out3,$out3,v29
875	vncipher	$out4,$out4,v29
876	vncipher	$out5,$out5,v29
877	vncipher	$out6,$out6,v29
878	vncipher	$out7,$out7,v29
879	lvx		v25,$x10,$key_		# re-pre-load round[2]
880
881	vncipher	$out0,$out0,v30
882	 vxor		$ivec,$ivec,v31		# xor with last round key
883	vncipher	$out1,$out1,v30
884	 vxor		$in0,$in0,v31
885	vncipher	$out2,$out2,v30
886	 vxor		$in1,$in1,v31
887	vncipher	$out3,$out3,v30
888	 vxor		$in2,$in2,v31
889	vncipher	$out4,$out4,v30
890	 vxor		$in3,$in3,v31
891	vncipher	$out5,$out5,v30
892	 vxor		$in4,$in4,v31
893	vncipher	$out6,$out6,v30
894	 vxor		$in5,$in5,v31
895	vncipher	$out7,$out7,v30
896	 vxor		$in6,$in6,v31
897
898	vncipherlast	$out0,$out0,$ivec
899	vncipherlast	$out1,$out1,$in0
900	 lvx_u		$in0,$x00,$inp		# load next input block
901	vncipherlast	$out2,$out2,$in1
902	 lvx_u		$in1,$x10,$inp
903	vncipherlast	$out3,$out3,$in2
904	 le?vperm	$in0,$in0,$in0,$inpperm
905	 lvx_u		$in2,$x20,$inp
906	vncipherlast	$out4,$out4,$in3
907	 le?vperm	$in1,$in1,$in1,$inpperm
908	 lvx_u		$in3,$x30,$inp
909	vncipherlast	$out5,$out5,$in4
910	 le?vperm	$in2,$in2,$in2,$inpperm
911	 lvx_u		$in4,$x40,$inp
912	vncipherlast	$out6,$out6,$in5
913	 le?vperm	$in3,$in3,$in3,$inpperm
914	 lvx_u		$in5,$x50,$inp
915	vncipherlast	$out7,$out7,$in6
916	 le?vperm	$in4,$in4,$in4,$inpperm
917	 lvx_u		$in6,$x60,$inp
918	vmr		$ivec,$in7
919	 le?vperm	$in5,$in5,$in5,$inpperm
920	 lvx_u		$in7,$x70,$inp
921	 addi		$inp,$inp,0x80
922
923	le?vperm	$out0,$out0,$out0,$inpperm
924	le?vperm	$out1,$out1,$out1,$inpperm
925	stvx_u		$out0,$x00,$out
926	 le?vperm	$in6,$in6,$in6,$inpperm
927	 vxor		$out0,$in0,$rndkey0
928	le?vperm	$out2,$out2,$out2,$inpperm
929	stvx_u		$out1,$x10,$out
930	 le?vperm	$in7,$in7,$in7,$inpperm
931	 vxor		$out1,$in1,$rndkey0
932	le?vperm	$out3,$out3,$out3,$inpperm
933	stvx_u		$out2,$x20,$out
934	 vxor		$out2,$in2,$rndkey0
935	le?vperm	$out4,$out4,$out4,$inpperm
936	stvx_u		$out3,$x30,$out
937	 vxor		$out3,$in3,$rndkey0
938	le?vperm	$out5,$out5,$out5,$inpperm
939	stvx_u		$out4,$x40,$out
940	 vxor		$out4,$in4,$rndkey0
941	le?vperm	$out6,$out6,$out6,$inpperm
942	stvx_u		$out5,$x50,$out
943	 vxor		$out5,$in5,$rndkey0
944	le?vperm	$out7,$out7,$out7,$inpperm
945	stvx_u		$out6,$x60,$out
946	 vxor		$out6,$in6,$rndkey0
947	stvx_u		$out7,$x70,$out
948	addi		$out,$out,0x80
949	 vxor		$out7,$in7,$rndkey0
950
951	mtctr		$rounds
952	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
953
954	addic.		$len,$len,128
955	beq		Lcbc_dec8x_done
956	nop
957	nop
958
959Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
960	vncipher	$out1,$out1,v24
961	vncipher	$out2,$out2,v24
962	vncipher	$out3,$out3,v24
963	vncipher	$out4,$out4,v24
964	vncipher	$out5,$out5,v24
965	vncipher	$out6,$out6,v24
966	vncipher	$out7,$out7,v24
967	lvx		v24,$x20,$key_		# round[3]
968	addi		$key_,$key_,0x20
969
970	vncipher	$out1,$out1,v25
971	vncipher	$out2,$out2,v25
972	vncipher	$out3,$out3,v25
973	vncipher	$out4,$out4,v25
974	vncipher	$out5,$out5,v25
975	vncipher	$out6,$out6,v25
976	vncipher	$out7,$out7,v25
977	lvx		v25,$x10,$key_		# round[4]
978	bdnz		Loop_cbc_dec8x_tail
979
980	vncipher	$out1,$out1,v24
981	vncipher	$out2,$out2,v24
982	vncipher	$out3,$out3,v24
983	vncipher	$out4,$out4,v24
984	vncipher	$out5,$out5,v24
985	vncipher	$out6,$out6,v24
986	vncipher	$out7,$out7,v24
987
988	vncipher	$out1,$out1,v25
989	vncipher	$out2,$out2,v25
990	vncipher	$out3,$out3,v25
991	vncipher	$out4,$out4,v25
992	vncipher	$out5,$out5,v25
993	vncipher	$out6,$out6,v25
994	vncipher	$out7,$out7,v25
995
996	vncipher	$out1,$out1,v26
997	vncipher	$out2,$out2,v26
998	vncipher	$out3,$out3,v26
999	vncipher	$out4,$out4,v26
1000	vncipher	$out5,$out5,v26
1001	vncipher	$out6,$out6,v26
1002	vncipher	$out7,$out7,v26
1003
1004	vncipher	$out1,$out1,v27
1005	vncipher	$out2,$out2,v27
1006	vncipher	$out3,$out3,v27
1007	vncipher	$out4,$out4,v27
1008	vncipher	$out5,$out5,v27
1009	vncipher	$out6,$out6,v27
1010	vncipher	$out7,$out7,v27
1011
1012	vncipher	$out1,$out1,v28
1013	vncipher	$out2,$out2,v28
1014	vncipher	$out3,$out3,v28
1015	vncipher	$out4,$out4,v28
1016	vncipher	$out5,$out5,v28
1017	vncipher	$out6,$out6,v28
1018	vncipher	$out7,$out7,v28
1019
1020	vncipher	$out1,$out1,v29
1021	vncipher	$out2,$out2,v29
1022	vncipher	$out3,$out3,v29
1023	vncipher	$out4,$out4,v29
1024	vncipher	$out5,$out5,v29
1025	vncipher	$out6,$out6,v29
1026	vncipher	$out7,$out7,v29
1027
1028	vncipher	$out1,$out1,v30
1029	 vxor		$ivec,$ivec,v31		# last round key
1030	vncipher	$out2,$out2,v30
1031	 vxor		$in1,$in1,v31
1032	vncipher	$out3,$out3,v30
1033	 vxor		$in2,$in2,v31
1034	vncipher	$out4,$out4,v30
1035	 vxor		$in3,$in3,v31
1036	vncipher	$out5,$out5,v30
1037	 vxor		$in4,$in4,v31
1038	vncipher	$out6,$out6,v30
1039	 vxor		$in5,$in5,v31
1040	vncipher	$out7,$out7,v30
1041	 vxor		$in6,$in6,v31
1042
1043	cmplwi		$len,32			# switch($len)
1044	blt		Lcbc_dec8x_one
1045	nop
1046	beq		Lcbc_dec8x_two
1047	cmplwi		$len,64
1048	blt		Lcbc_dec8x_three
1049	nop
1050	beq		Lcbc_dec8x_four
1051	cmplwi		$len,96
1052	blt		Lcbc_dec8x_five
1053	nop
1054	beq		Lcbc_dec8x_six
1055
1056Lcbc_dec8x_seven:
1057	vncipherlast	$out1,$out1,$ivec
1058	vncipherlast	$out2,$out2,$in1
1059	vncipherlast	$out3,$out3,$in2
1060	vncipherlast	$out4,$out4,$in3
1061	vncipherlast	$out5,$out5,$in4
1062	vncipherlast	$out6,$out6,$in5
1063	vncipherlast	$out7,$out7,$in6
1064	vmr		$ivec,$in7
1065
1066	le?vperm	$out1,$out1,$out1,$inpperm
1067	le?vperm	$out2,$out2,$out2,$inpperm
1068	stvx_u		$out1,$x00,$out
1069	le?vperm	$out3,$out3,$out3,$inpperm
1070	stvx_u		$out2,$x10,$out
1071	le?vperm	$out4,$out4,$out4,$inpperm
1072	stvx_u		$out3,$x20,$out
1073	le?vperm	$out5,$out5,$out5,$inpperm
1074	stvx_u		$out4,$x30,$out
1075	le?vperm	$out6,$out6,$out6,$inpperm
1076	stvx_u		$out5,$x40,$out
1077	le?vperm	$out7,$out7,$out7,$inpperm
1078	stvx_u		$out6,$x50,$out
1079	stvx_u		$out7,$x60,$out
1080	addi		$out,$out,0x70
1081	b		Lcbc_dec8x_done
1082
1083.align	5
1084Lcbc_dec8x_six:
1085	vncipherlast	$out2,$out2,$ivec
1086	vncipherlast	$out3,$out3,$in2
1087	vncipherlast	$out4,$out4,$in3
1088	vncipherlast	$out5,$out5,$in4
1089	vncipherlast	$out6,$out6,$in5
1090	vncipherlast	$out7,$out7,$in6
1091	vmr		$ivec,$in7
1092
1093	le?vperm	$out2,$out2,$out2,$inpperm
1094	le?vperm	$out3,$out3,$out3,$inpperm
1095	stvx_u		$out2,$x00,$out
1096	le?vperm	$out4,$out4,$out4,$inpperm
1097	stvx_u		$out3,$x10,$out
1098	le?vperm	$out5,$out5,$out5,$inpperm
1099	stvx_u		$out4,$x20,$out
1100	le?vperm	$out6,$out6,$out6,$inpperm
1101	stvx_u		$out5,$x30,$out
1102	le?vperm	$out7,$out7,$out7,$inpperm
1103	stvx_u		$out6,$x40,$out
1104	stvx_u		$out7,$x50,$out
1105	addi		$out,$out,0x60
1106	b		Lcbc_dec8x_done
1107
1108.align	5
1109Lcbc_dec8x_five:
1110	vncipherlast	$out3,$out3,$ivec
1111	vncipherlast	$out4,$out4,$in3
1112	vncipherlast	$out5,$out5,$in4
1113	vncipherlast	$out6,$out6,$in5
1114	vncipherlast	$out7,$out7,$in6
1115	vmr		$ivec,$in7
1116
1117	le?vperm	$out3,$out3,$out3,$inpperm
1118	le?vperm	$out4,$out4,$out4,$inpperm
1119	stvx_u		$out3,$x00,$out
1120	le?vperm	$out5,$out5,$out5,$inpperm
1121	stvx_u		$out4,$x10,$out
1122	le?vperm	$out6,$out6,$out6,$inpperm
1123	stvx_u		$out5,$x20,$out
1124	le?vperm	$out7,$out7,$out7,$inpperm
1125	stvx_u		$out6,$x30,$out
1126	stvx_u		$out7,$x40,$out
1127	addi		$out,$out,0x50
1128	b		Lcbc_dec8x_done
1129
1130.align	5
1131Lcbc_dec8x_four:
1132	vncipherlast	$out4,$out4,$ivec
1133	vncipherlast	$out5,$out5,$in4
1134	vncipherlast	$out6,$out6,$in5
1135	vncipherlast	$out7,$out7,$in6
1136	vmr		$ivec,$in7
1137
1138	le?vperm	$out4,$out4,$out4,$inpperm
1139	le?vperm	$out5,$out5,$out5,$inpperm
1140	stvx_u		$out4,$x00,$out
1141	le?vperm	$out6,$out6,$out6,$inpperm
1142	stvx_u		$out5,$x10,$out
1143	le?vperm	$out7,$out7,$out7,$inpperm
1144	stvx_u		$out6,$x20,$out
1145	stvx_u		$out7,$x30,$out
1146	addi		$out,$out,0x40
1147	b		Lcbc_dec8x_done
1148
1149.align	5
1150Lcbc_dec8x_three:
1151	vncipherlast	$out5,$out5,$ivec
1152	vncipherlast	$out6,$out6,$in5
1153	vncipherlast	$out7,$out7,$in6
1154	vmr		$ivec,$in7
1155
1156	le?vperm	$out5,$out5,$out5,$inpperm
1157	le?vperm	$out6,$out6,$out6,$inpperm
1158	stvx_u		$out5,$x00,$out
1159	le?vperm	$out7,$out7,$out7,$inpperm
1160	stvx_u		$out6,$x10,$out
1161	stvx_u		$out7,$x20,$out
1162	addi		$out,$out,0x30
1163	b		Lcbc_dec8x_done
1164
1165.align	5
1166Lcbc_dec8x_two:
1167	vncipherlast	$out6,$out6,$ivec
1168	vncipherlast	$out7,$out7,$in6
1169	vmr		$ivec,$in7
1170
1171	le?vperm	$out6,$out6,$out6,$inpperm
1172	le?vperm	$out7,$out7,$out7,$inpperm
1173	stvx_u		$out6,$x00,$out
1174	stvx_u		$out7,$x10,$out
1175	addi		$out,$out,0x20
1176	b		Lcbc_dec8x_done
1177
1178.align	5
1179Lcbc_dec8x_one:
1180	vncipherlast	$out7,$out7,$ivec
1181	vmr		$ivec,$in7
1182
1183	le?vperm	$out7,$out7,$out7,$inpperm
1184	stvx_u		$out7,0,$out
1185	addi		$out,$out,0x10
1186
1187Lcbc_dec8x_done:
1188	le?vperm	$ivec,$ivec,$ivec,$inpperm
1189	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
1190
1191	li		r10,`$FRAME+15`
1192	li		r11,`$FRAME+31`
1193	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1194	addi		r10,r10,32
1195	stvx		$inpperm,r11,$sp
1196	addi		r11,r11,32
1197	stvx		$inpperm,r10,$sp
1198	addi		r10,r10,32
1199	stvx		$inpperm,r11,$sp
1200	addi		r11,r11,32
1201	stvx		$inpperm,r10,$sp
1202	addi		r10,r10,32
1203	stvx		$inpperm,r11,$sp
1204	addi		r11,r11,32
1205	stvx		$inpperm,r10,$sp
1206	addi		r10,r10,32
1207	stvx		$inpperm,r11,$sp
1208	addi		r11,r11,32
1209
1210	mtspr		256,$vrsave
1211	lvx		v20,r10,$sp		# ABI says so
1212	addi		r10,r10,32
1213	lvx		v21,r11,$sp
1214	addi		r11,r11,32
1215	lvx		v22,r10,$sp
1216	addi		r10,r10,32
1217	lvx		v23,r11,$sp
1218	addi		r11,r11,32
1219	lvx		v24,r10,$sp
1220	addi		r10,r10,32
1221	lvx		v25,r11,$sp
1222	addi		r11,r11,32
1223	lvx		v26,r10,$sp
1224	addi		r10,r10,32
1225	lvx		v27,r11,$sp
1226	addi		r11,r11,32
1227	lvx		v28,r10,$sp
1228	addi		r10,r10,32
1229	lvx		v29,r11,$sp
1230	addi		r11,r11,32
1231	lvx		v30,r10,$sp
1232	lvx		v31,r11,$sp
1233	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1234	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1235	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1236	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1237	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1238	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1239	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1240	blr
1241	.long		0
1242	.byte		0,12,0x14,0,0x80,6,6,0
1243	.long		0
1244.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1245___
1246}}	}}}
1247
1248#########################################################################
1249{{{	# CTR procedure[s]						#
1250my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1251my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
1252my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1253						map("v$_",(4..11));
1254my $dat=$tmp;
1255
1256$code.=<<___;
1257.globl	.${prefix}_ctr32_encrypt_blocks
1258	${UCMP}i	$len,1
1259	bltlr-
1260
1261	lis		r0,0xfff0
1262	mfspr		$vrsave,256
1263	mtspr		256,r0
1264
1265	li		$idx,15
1266	vxor		$rndkey0,$rndkey0,$rndkey0
1267	le?vspltisb	$tmp,0x0f
1268
1269	lvx		$ivec,0,$ivp		# load [unaligned] iv
1270	lvsl		$inpperm,0,$ivp
1271	lvx		$inptail,$idx,$ivp
1272	 vspltisb	$one,1
1273	le?vxor		$inpperm,$inpperm,$tmp
1274	vperm		$ivec,$ivec,$inptail,$inpperm
1275	 vsldoi		$one,$rndkey0,$one,1
1276
1277	neg		r11,$inp
1278	?lvsl		$keyperm,0,$key		# prepare for unaligned key
1279	lwz		$rounds,240($key)
1280
1281	lvsr		$inpperm,0,r11		# prepare for unaligned load
1282	lvx		$inptail,0,$inp
1283	addi		$inp,$inp,15		# 15 is not typo
1284	le?vxor		$inpperm,$inpperm,$tmp
1285
1286	srwi		$rounds,$rounds,1
1287	li		$idx,16
1288	subi		$rounds,$rounds,1
1289
1290	${UCMP}i	$len,8
1291	bge		_aesp8_ctr32_encrypt8x
1292
1293	?lvsr		$outperm,0,$out		# prepare for unaligned store
1294	vspltisb	$outmask,-1
1295	lvx		$outhead,0,$out
1296	?vperm		$outmask,$rndkey0,$outmask,$outperm
1297	le?vxor		$outperm,$outperm,$tmp
1298
1299	lvx		$rndkey0,0,$key
1300	mtctr		$rounds
1301	lvx		$rndkey1,$idx,$key
1302	addi		$idx,$idx,16
1303	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1304	vxor		$inout,$ivec,$rndkey0
1305	lvx		$rndkey0,$idx,$key
1306	addi		$idx,$idx,16
1307	b		Loop_ctr32_enc
1308
1309.align	5
1310Loop_ctr32_enc:
1311	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1312	vcipher		$inout,$inout,$rndkey1
1313	lvx		$rndkey1,$idx,$key
1314	addi		$idx,$idx,16
1315	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1316	vcipher		$inout,$inout,$rndkey0
1317	lvx		$rndkey0,$idx,$key
1318	addi		$idx,$idx,16
1319	bdnz		Loop_ctr32_enc
1320
1321	vadduwm		$ivec,$ivec,$one
1322	 vmr		$dat,$inptail
1323	 lvx		$inptail,0,$inp
1324	 addi		$inp,$inp,16
1325	 subic.		$len,$len,1		# blocks--
1326
1327	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1328	vcipher		$inout,$inout,$rndkey1
1329	lvx		$rndkey1,$idx,$key
1330	 vperm		$dat,$dat,$inptail,$inpperm
1331	 li		$idx,16
1332	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
1333	 lvx		$rndkey0,0,$key
1334	vxor		$dat,$dat,$rndkey1	# last round key
1335	vcipherlast	$inout,$inout,$dat
1336
1337	 lvx		$rndkey1,$idx,$key
1338	 addi		$idx,$idx,16
1339	vperm		$inout,$inout,$inout,$outperm
1340	vsel		$dat,$outhead,$inout,$outmask
1341	 mtctr		$rounds
1342	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1343	vmr		$outhead,$inout
1344	 vxor		$inout,$ivec,$rndkey0
1345	 lvx		$rndkey0,$idx,$key
1346	 addi		$idx,$idx,16
1347	stvx		$dat,0,$out
1348	addi		$out,$out,16
1349	bne		Loop_ctr32_enc
1350
1351	addi		$out,$out,-1
1352	lvx		$inout,0,$out		# redundant in aligned case
1353	vsel		$inout,$outhead,$inout,$outmask
1354	stvx		$inout,0,$out
1355
1356	mtspr		256,$vrsave
1357	blr
1358	.long		0
1359	.byte		0,12,0x14,0,0,0,6,0
1360	.long		0
1361___
1362#########################################################################
1363{{	# Optimized CTR procedure					#
1364my $key_="r11";
1365my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1366my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1367my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1368my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
1369			# v26-v31 last 6 round keys
1370my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
1371my ($two,$three,$four)=($outhead,$outperm,$outmask);
1372
1373$code.=<<___;
1374.align	5
1375_aesp8_ctr32_encrypt8x:
1376	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1377	li		r10,`$FRAME+8*16+15`
1378	li		r11,`$FRAME+8*16+31`
1379	stvx		v20,r10,$sp		# ABI says so
1380	addi		r10,r10,32
1381	stvx		v21,r11,$sp
1382	addi		r11,r11,32
1383	stvx		v22,r10,$sp
1384	addi		r10,r10,32
1385	stvx		v23,r11,$sp
1386	addi		r11,r11,32
1387	stvx		v24,r10,$sp
1388	addi		r10,r10,32
1389	stvx		v25,r11,$sp
1390	addi		r11,r11,32
1391	stvx		v26,r10,$sp
1392	addi		r10,r10,32
1393	stvx		v27,r11,$sp
1394	addi		r11,r11,32
1395	stvx		v28,r10,$sp
1396	addi		r10,r10,32
1397	stvx		v29,r11,$sp
1398	addi		r11,r11,32
1399	stvx		v30,r10,$sp
1400	stvx		v31,r11,$sp
1401	li		r0,-1
1402	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
1403	li		$x10,0x10
1404	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1405	li		$x20,0x20
1406	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1407	li		$x30,0x30
1408	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1409	li		$x40,0x40
1410	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1411	li		$x50,0x50
1412	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1413	li		$x60,0x60
1414	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1415	li		$x70,0x70
1416	mtspr		256,r0
1417
1418	subi		$rounds,$rounds,3	# -4 in total
1419
1420	lvx		$rndkey0,$x00,$key	# load key schedule
1421	lvx		v30,$x10,$key
1422	addi		$key,$key,0x20
1423	lvx		v31,$x00,$key
1424	?vperm		$rndkey0,$rndkey0,v30,$keyperm
1425	addi		$key_,$sp,$FRAME+15
1426	mtctr		$rounds
1427
1428Load_ctr32_enc_key:
1429	?vperm		v24,v30,v31,$keyperm
1430	lvx		v30,$x10,$key
1431	addi		$key,$key,0x20
1432	stvx		v24,$x00,$key_		# off-load round[1]
1433	?vperm		v25,v31,v30,$keyperm
1434	lvx		v31,$x00,$key
1435	stvx		v25,$x10,$key_		# off-load round[2]
1436	addi		$key_,$key_,0x20
1437	bdnz		Load_ctr32_enc_key
1438
1439	lvx		v26,$x10,$key
1440	?vperm		v24,v30,v31,$keyperm
1441	lvx		v27,$x20,$key
1442	stvx		v24,$x00,$key_		# off-load round[3]
1443	?vperm		v25,v31,v26,$keyperm
1444	lvx		v28,$x30,$key
1445	stvx		v25,$x10,$key_		# off-load round[4]
1446	addi		$key_,$sp,$FRAME+15	# rewind $key_
1447	?vperm		v26,v26,v27,$keyperm
1448	lvx		v29,$x40,$key
1449	?vperm		v27,v27,v28,$keyperm
1450	lvx		v30,$x50,$key
1451	?vperm		v28,v28,v29,$keyperm
1452	lvx		v31,$x60,$key
1453	?vperm		v29,v29,v30,$keyperm
1454	lvx		$out0,$x70,$key		# borrow $out0
1455	?vperm		v30,v30,v31,$keyperm
1456	lvx		v24,$x00,$key_		# pre-load round[1]
1457	?vperm		v31,v31,$out0,$keyperm
1458	lvx		v25,$x10,$key_		# pre-load round[2]
1459
1460	vadduqm		$two,$one,$one
1461	subi		$inp,$inp,15		# undo "caller"
1462	$SHL		$len,$len,4
1463
1464	vadduqm		$out1,$ivec,$one	# counter values ...
1465	vadduqm		$out2,$ivec,$two
1466	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1467	 le?li		$idx,8
1468	vadduqm		$out3,$out1,$two
1469	vxor		$out1,$out1,$rndkey0
1470	 le?lvsl	$inpperm,0,$idx
1471	vadduqm		$out4,$out2,$two
1472	vxor		$out2,$out2,$rndkey0
1473	 le?vspltisb	$tmp,0x0f
1474	vadduqm		$out5,$out3,$two
1475	vxor		$out3,$out3,$rndkey0
1476	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
1477	vadduqm		$out6,$out4,$two
1478	vxor		$out4,$out4,$rndkey0
1479	vadduqm		$out7,$out5,$two
1480	vxor		$out5,$out5,$rndkey0
1481	vadduqm		$ivec,$out6,$two	# next counter value
1482	vxor		$out6,$out6,$rndkey0
1483	vxor		$out7,$out7,$rndkey0
1484
1485	mtctr		$rounds
1486	b		Loop_ctr32_enc8x
1487.align	5
1488Loop_ctr32_enc8x:
1489	vcipher 	$out0,$out0,v24
1490	vcipher 	$out1,$out1,v24
1491	vcipher 	$out2,$out2,v24
1492	vcipher 	$out3,$out3,v24
1493	vcipher 	$out4,$out4,v24
1494	vcipher 	$out5,$out5,v24
1495	vcipher 	$out6,$out6,v24
1496	vcipher 	$out7,$out7,v24
1497Loop_ctr32_enc8x_middle:
1498	lvx		v24,$x20,$key_		# round[3]
1499	addi		$key_,$key_,0x20
1500
1501	vcipher 	$out0,$out0,v25
1502	vcipher 	$out1,$out1,v25
1503	vcipher 	$out2,$out2,v25
1504	vcipher 	$out3,$out3,v25
1505	vcipher 	$out4,$out4,v25
1506	vcipher 	$out5,$out5,v25
1507	vcipher 	$out6,$out6,v25
1508	vcipher 	$out7,$out7,v25
1509	lvx		v25,$x10,$key_		# round[4]
1510	bdnz		Loop_ctr32_enc8x
1511
1512	subic		r11,$len,256		# $len-256, borrow $key_
1513	vcipher 	$out0,$out0,v24
1514	vcipher 	$out1,$out1,v24
1515	vcipher 	$out2,$out2,v24
1516	vcipher 	$out3,$out3,v24
1517	vcipher 	$out4,$out4,v24
1518	vcipher 	$out5,$out5,v24
1519	vcipher 	$out6,$out6,v24
1520	vcipher 	$out7,$out7,v24
1521
1522	subfe		r0,r0,r0		# borrow?-1:0
1523	vcipher 	$out0,$out0,v25
1524	vcipher 	$out1,$out1,v25
1525	vcipher 	$out2,$out2,v25
1526	vcipher 	$out3,$out3,v25
1527	vcipher 	$out4,$out4,v25
1528	vcipher		$out5,$out5,v25
1529	vcipher		$out6,$out6,v25
1530	vcipher		$out7,$out7,v25
1531
1532	and		r0,r0,r11
1533	addi		$key_,$sp,$FRAME+15	# rewind $key_
1534	vcipher		$out0,$out0,v26
1535	vcipher		$out1,$out1,v26
1536	vcipher		$out2,$out2,v26
1537	vcipher		$out3,$out3,v26
1538	vcipher		$out4,$out4,v26
1539	vcipher		$out5,$out5,v26
1540	vcipher		$out6,$out6,v26
1541	vcipher		$out7,$out7,v26
1542	lvx		v24,$x00,$key_		# re-pre-load round[1]
1543
1544	subic		$len,$len,129		# $len-=129
1545	vcipher		$out0,$out0,v27
1546	addi		$len,$len,1		# $len-=128 really
1547	vcipher		$out1,$out1,v27
1548	vcipher		$out2,$out2,v27
1549	vcipher		$out3,$out3,v27
1550	vcipher		$out4,$out4,v27
1551	vcipher		$out5,$out5,v27
1552	vcipher		$out6,$out6,v27
1553	vcipher		$out7,$out7,v27
1554	lvx		v25,$x10,$key_		# re-pre-load round[2]
1555
1556	vcipher		$out0,$out0,v28
1557	 lvx_u		$in0,$x00,$inp		# load input
1558	vcipher		$out1,$out1,v28
1559	 lvx_u		$in1,$x10,$inp
1560	vcipher		$out2,$out2,v28
1561	 lvx_u		$in2,$x20,$inp
1562	vcipher		$out3,$out3,v28
1563	 lvx_u		$in3,$x30,$inp
1564	vcipher		$out4,$out4,v28
1565	 lvx_u		$in4,$x40,$inp
1566	vcipher		$out5,$out5,v28
1567	 lvx_u		$in5,$x50,$inp
1568	vcipher		$out6,$out6,v28
1569	 lvx_u		$in6,$x60,$inp
1570	vcipher		$out7,$out7,v28
1571	 lvx_u		$in7,$x70,$inp
1572	 addi		$inp,$inp,0x80
1573
1574	vcipher		$out0,$out0,v29
1575	 le?vperm	$in0,$in0,$in0,$inpperm
1576	vcipher		$out1,$out1,v29
1577	 le?vperm	$in1,$in1,$in1,$inpperm
1578	vcipher		$out2,$out2,v29
1579	 le?vperm	$in2,$in2,$in2,$inpperm
1580	vcipher		$out3,$out3,v29
1581	 le?vperm	$in3,$in3,$in3,$inpperm
1582	vcipher		$out4,$out4,v29
1583	 le?vperm	$in4,$in4,$in4,$inpperm
1584	vcipher		$out5,$out5,v29
1585	 le?vperm	$in5,$in5,$in5,$inpperm
1586	vcipher		$out6,$out6,v29
1587	 le?vperm	$in6,$in6,$in6,$inpperm
1588	vcipher		$out7,$out7,v29
1589	 le?vperm	$in7,$in7,$in7,$inpperm
1590
1591	add		$inp,$inp,r0		# $inp is adjusted in such
1592						# way that at exit from the
1593						# loop inX-in7 are loaded
1594						# with last "words"
1595	subfe.		r0,r0,r0		# borrow?-1:0
1596	vcipher		$out0,$out0,v30
1597	 vxor		$in0,$in0,v31		# xor with last round key
1598	vcipher		$out1,$out1,v30
1599	 vxor		$in1,$in1,v31
1600	vcipher		$out2,$out2,v30
1601	 vxor		$in2,$in2,v31
1602	vcipher		$out3,$out3,v30
1603	 vxor		$in3,$in3,v31
1604	vcipher		$out4,$out4,v30
1605	 vxor		$in4,$in4,v31
1606	vcipher		$out5,$out5,v30
1607	 vxor		$in5,$in5,v31
1608	vcipher		$out6,$out6,v30
1609	 vxor		$in6,$in6,v31
1610	vcipher		$out7,$out7,v30
1611	 vxor		$in7,$in7,v31
1612
1613	bne		Lctr32_enc8x_break	# did $len-129 borrow?
1614
1615	vcipherlast	$in0,$out0,$in0
1616	vcipherlast	$in1,$out1,$in1
1617	 vadduqm	$out1,$ivec,$one	# counter values ...
1618	vcipherlast	$in2,$out2,$in2
1619	 vadduqm	$out2,$ivec,$two
1620	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1621	vcipherlast	$in3,$out3,$in3
1622	 vadduqm	$out3,$out1,$two
1623	 vxor		$out1,$out1,$rndkey0
1624	vcipherlast	$in4,$out4,$in4
1625	 vadduqm	$out4,$out2,$two
1626	 vxor		$out2,$out2,$rndkey0
1627	vcipherlast	$in5,$out5,$in5
1628	 vadduqm	$out5,$out3,$two
1629	 vxor		$out3,$out3,$rndkey0
1630	vcipherlast	$in6,$out6,$in6
1631	 vadduqm	$out6,$out4,$two
1632	 vxor		$out4,$out4,$rndkey0
1633	vcipherlast	$in7,$out7,$in7
1634	 vadduqm	$out7,$out5,$two
1635	 vxor		$out5,$out5,$rndkey0
1636	le?vperm	$in0,$in0,$in0,$inpperm
1637	 vadduqm	$ivec,$out6,$two	# next counter value
1638	 vxor		$out6,$out6,$rndkey0
1639	le?vperm	$in1,$in1,$in1,$inpperm
1640	 vxor		$out7,$out7,$rndkey0
1641	mtctr		$rounds
1642
1643	 vcipher	$out0,$out0,v24
1644	stvx_u		$in0,$x00,$out
1645	le?vperm	$in2,$in2,$in2,$inpperm
1646	 vcipher	$out1,$out1,v24
1647	stvx_u		$in1,$x10,$out
1648	le?vperm	$in3,$in3,$in3,$inpperm
1649	 vcipher	$out2,$out2,v24
1650	stvx_u		$in2,$x20,$out
1651	le?vperm	$in4,$in4,$in4,$inpperm
1652	 vcipher	$out3,$out3,v24
1653	stvx_u		$in3,$x30,$out
1654	le?vperm	$in5,$in5,$in5,$inpperm
1655	 vcipher	$out4,$out4,v24
1656	stvx_u		$in4,$x40,$out
1657	le?vperm	$in6,$in6,$in6,$inpperm
1658	 vcipher	$out5,$out5,v24
1659	stvx_u		$in5,$x50,$out
1660	le?vperm	$in7,$in7,$in7,$inpperm
1661	 vcipher	$out6,$out6,v24
1662	stvx_u		$in6,$x60,$out
1663	 vcipher	$out7,$out7,v24
1664	stvx_u		$in7,$x70,$out
1665	addi		$out,$out,0x80
1666
1667	b		Loop_ctr32_enc8x_middle
1668
1669.align	5
1670Lctr32_enc8x_break:
1671	cmpwi		$len,-0x60
1672	blt		Lctr32_enc8x_one
1673	nop
1674	beq		Lctr32_enc8x_two
1675	cmpwi		$len,-0x40
1676	blt		Lctr32_enc8x_three
1677	nop
1678	beq		Lctr32_enc8x_four
1679	cmpwi		$len,-0x20
1680	blt		Lctr32_enc8x_five
1681	nop
1682	beq		Lctr32_enc8x_six
1683	cmpwi		$len,0x00
1684	blt		Lctr32_enc8x_seven
1685
1686Lctr32_enc8x_eight:
1687	vcipherlast	$out0,$out0,$in0
1688	vcipherlast	$out1,$out1,$in1
1689	vcipherlast	$out2,$out2,$in2
1690	vcipherlast	$out3,$out3,$in3
1691	vcipherlast	$out4,$out4,$in4
1692	vcipherlast	$out5,$out5,$in5
1693	vcipherlast	$out6,$out6,$in6
1694	vcipherlast	$out7,$out7,$in7
1695
1696	le?vperm	$out0,$out0,$out0,$inpperm
1697	le?vperm	$out1,$out1,$out1,$inpperm
1698	stvx_u		$out0,$x00,$out
1699	le?vperm	$out2,$out2,$out2,$inpperm
1700	stvx_u		$out1,$x10,$out
1701	le?vperm	$out3,$out3,$out3,$inpperm
1702	stvx_u		$out2,$x20,$out
1703	le?vperm	$out4,$out4,$out4,$inpperm
1704	stvx_u		$out3,$x30,$out
1705	le?vperm	$out5,$out5,$out5,$inpperm
1706	stvx_u		$out4,$x40,$out
1707	le?vperm	$out6,$out6,$out6,$inpperm
1708	stvx_u		$out5,$x50,$out
1709	le?vperm	$out7,$out7,$out7,$inpperm
1710	stvx_u		$out6,$x60,$out
1711	stvx_u		$out7,$x70,$out
1712	addi		$out,$out,0x80
1713	b		Lctr32_enc8x_done
1714
1715.align	5
1716Lctr32_enc8x_seven:
1717	vcipherlast	$out0,$out0,$in1
1718	vcipherlast	$out1,$out1,$in2
1719	vcipherlast	$out2,$out2,$in3
1720	vcipherlast	$out3,$out3,$in4
1721	vcipherlast	$out4,$out4,$in5
1722	vcipherlast	$out5,$out5,$in6
1723	vcipherlast	$out6,$out6,$in7
1724
1725	le?vperm	$out0,$out0,$out0,$inpperm
1726	le?vperm	$out1,$out1,$out1,$inpperm
1727	stvx_u		$out0,$x00,$out
1728	le?vperm	$out2,$out2,$out2,$inpperm
1729	stvx_u		$out1,$x10,$out
1730	le?vperm	$out3,$out3,$out3,$inpperm
1731	stvx_u		$out2,$x20,$out
1732	le?vperm	$out4,$out4,$out4,$inpperm
1733	stvx_u		$out3,$x30,$out
1734	le?vperm	$out5,$out5,$out5,$inpperm
1735	stvx_u		$out4,$x40,$out
1736	le?vperm	$out6,$out6,$out6,$inpperm
1737	stvx_u		$out5,$x50,$out
1738	stvx_u		$out6,$x60,$out
1739	addi		$out,$out,0x70
1740	b		Lctr32_enc8x_done
1741
1742.align	5
1743Lctr32_enc8x_six:
1744	vcipherlast	$out0,$out0,$in2
1745	vcipherlast	$out1,$out1,$in3
1746	vcipherlast	$out2,$out2,$in4
1747	vcipherlast	$out3,$out3,$in5
1748	vcipherlast	$out4,$out4,$in6
1749	vcipherlast	$out5,$out5,$in7
1750
1751	le?vperm	$out0,$out0,$out0,$inpperm
1752	le?vperm	$out1,$out1,$out1,$inpperm
1753	stvx_u		$out0,$x00,$out
1754	le?vperm	$out2,$out2,$out2,$inpperm
1755	stvx_u		$out1,$x10,$out
1756	le?vperm	$out3,$out3,$out3,$inpperm
1757	stvx_u		$out2,$x20,$out
1758	le?vperm	$out4,$out4,$out4,$inpperm
1759	stvx_u		$out3,$x30,$out
1760	le?vperm	$out5,$out5,$out5,$inpperm
1761	stvx_u		$out4,$x40,$out
1762	stvx_u		$out5,$x50,$out
1763	addi		$out,$out,0x60
1764	b		Lctr32_enc8x_done
1765
1766.align	5
1767Lctr32_enc8x_five:
1768	vcipherlast	$out0,$out0,$in3
1769	vcipherlast	$out1,$out1,$in4
1770	vcipherlast	$out2,$out2,$in5
1771	vcipherlast	$out3,$out3,$in6
1772	vcipherlast	$out4,$out4,$in7
1773
1774	le?vperm	$out0,$out0,$out0,$inpperm
1775	le?vperm	$out1,$out1,$out1,$inpperm
1776	stvx_u		$out0,$x00,$out
1777	le?vperm	$out2,$out2,$out2,$inpperm
1778	stvx_u		$out1,$x10,$out
1779	le?vperm	$out3,$out3,$out3,$inpperm
1780	stvx_u		$out2,$x20,$out
1781	le?vperm	$out4,$out4,$out4,$inpperm
1782	stvx_u		$out3,$x30,$out
1783	stvx_u		$out4,$x40,$out
1784	addi		$out,$out,0x50
1785	b		Lctr32_enc8x_done
1786
1787.align	5
1788Lctr32_enc8x_four:
1789	vcipherlast	$out0,$out0,$in4
1790	vcipherlast	$out1,$out1,$in5
1791	vcipherlast	$out2,$out2,$in6
1792	vcipherlast	$out3,$out3,$in7
1793
1794	le?vperm	$out0,$out0,$out0,$inpperm
1795	le?vperm	$out1,$out1,$out1,$inpperm
1796	stvx_u		$out0,$x00,$out
1797	le?vperm	$out2,$out2,$out2,$inpperm
1798	stvx_u		$out1,$x10,$out
1799	le?vperm	$out3,$out3,$out3,$inpperm
1800	stvx_u		$out2,$x20,$out
1801	stvx_u		$out3,$x30,$out
1802	addi		$out,$out,0x40
1803	b		Lctr32_enc8x_done
1804
1805.align	5
1806Lctr32_enc8x_three:
1807	vcipherlast	$out0,$out0,$in5
1808	vcipherlast	$out1,$out1,$in6
1809	vcipherlast	$out2,$out2,$in7
1810
1811	le?vperm	$out0,$out0,$out0,$inpperm
1812	le?vperm	$out1,$out1,$out1,$inpperm
1813	stvx_u		$out0,$x00,$out
1814	le?vperm	$out2,$out2,$out2,$inpperm
1815	stvx_u		$out1,$x10,$out
1816	stvx_u		$out2,$x20,$out
1817	addi		$out,$out,0x30
1818	b		Lcbc_dec8x_done
1819
1820.align	5
1821Lctr32_enc8x_two:
1822	vcipherlast	$out0,$out0,$in6
1823	vcipherlast	$out1,$out1,$in7
1824
1825	le?vperm	$out0,$out0,$out0,$inpperm
1826	le?vperm	$out1,$out1,$out1,$inpperm
1827	stvx_u		$out0,$x00,$out
1828	stvx_u		$out1,$x10,$out
1829	addi		$out,$out,0x20
1830	b		Lcbc_dec8x_done
1831
1832.align	5
1833Lctr32_enc8x_one:
1834	vcipherlast	$out0,$out0,$in7
1835
1836	le?vperm	$out0,$out0,$out0,$inpperm
1837	stvx_u		$out0,0,$out
1838	addi		$out,$out,0x10
1839
1840Lctr32_enc8x_done:
1841	li		r10,`$FRAME+15`
1842	li		r11,`$FRAME+31`
1843	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1844	addi		r10,r10,32
1845	stvx		$inpperm,r11,$sp
1846	addi		r11,r11,32
1847	stvx		$inpperm,r10,$sp
1848	addi		r10,r10,32
1849	stvx		$inpperm,r11,$sp
1850	addi		r11,r11,32
1851	stvx		$inpperm,r10,$sp
1852	addi		r10,r10,32
1853	stvx		$inpperm,r11,$sp
1854	addi		r11,r11,32
1855	stvx		$inpperm,r10,$sp
1856	addi		r10,r10,32
1857	stvx		$inpperm,r11,$sp
1858	addi		r11,r11,32
1859
1860	mtspr		256,$vrsave
1861	lvx		v20,r10,$sp		# ABI says so
1862	addi		r10,r10,32
1863	lvx		v21,r11,$sp
1864	addi		r11,r11,32
1865	lvx		v22,r10,$sp
1866	addi		r10,r10,32
1867	lvx		v23,r11,$sp
1868	addi		r11,r11,32
1869	lvx		v24,r10,$sp
1870	addi		r10,r10,32
1871	lvx		v25,r11,$sp
1872	addi		r11,r11,32
1873	lvx		v26,r10,$sp
1874	addi		r10,r10,32
1875	lvx		v27,r11,$sp
1876	addi		r11,r11,32
1877	lvx		v28,r10,$sp
1878	addi		r10,r10,32
1879	lvx		v29,r11,$sp
1880	addi		r11,r11,32
1881	lvx		v30,r10,$sp
1882	lvx		v31,r11,$sp
1883	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1884	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1885	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1886	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1887	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1888	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1889	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1890	blr
1891	.long		0
1892	.byte		0,12,0x14,0,0x80,6,6,0
1893	.long		0
1894.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1895___
1896}}	}}}
1897
1898#########################################################################
1899{{{	# XTS procedures						#
1900# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
1901#                             const AES_KEY *key1, const AES_KEY *key2,	#
1902#                             [const] unsigned char iv[16]);		#
1903# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
1904# input tweak value is assumed to be encrypted already, and last tweak	#
1905# value, one suitable for consecutive call on same chunk of data, is	#
1906# written back to original buffer. In addition, in "tweak chaining"	#
1907# mode only complete input blocks are processed.			#
1908
1909my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
1910my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
1911my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
1912my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
1913my $taillen = $key2;
1914
1915   ($inp,$idx) = ($idx,$inp);				# reassign
1916
1917$code.=<<___;
1918.globl	.${prefix}_xts_encrypt
1919	mr		$inp,r3				# reassign
1920	li		r3,-1
1921	${UCMP}i	$len,16
1922	bltlr-
1923
1924	lis		r0,0xfff0
1925	mfspr		r12,256				# save vrsave
1926	li		r11,0
1927	mtspr		256,r0
1928
1929	vspltisb	$seven,0x07			# 0x070707..07
1930	le?lvsl		$leperm,r11,r11
1931	le?vspltisb	$tmp,0x0f
1932	le?vxor		$leperm,$leperm,$seven
1933
1934	li		$idx,15
1935	lvx		$tweak,0,$ivp			# load [unaligned] iv
1936	lvsl		$inpperm,0,$ivp
1937	lvx		$inptail,$idx,$ivp
1938	le?vxor		$inpperm,$inpperm,$tmp
1939	vperm		$tweak,$tweak,$inptail,$inpperm
1940
1941	neg		r11,$inp
1942	lvsr		$inpperm,0,r11			# prepare for unaligned load
1943	lvx		$inout,0,$inp
1944	addi		$inp,$inp,15			# 15 is not typo
1945	le?vxor		$inpperm,$inpperm,$tmp
1946
1947	${UCMP}i	$key2,0				# key2==NULL?
1948	beq		Lxts_enc_no_key2
1949
1950	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
1951	lwz		$rounds,240($key2)
1952	srwi		$rounds,$rounds,1
1953	subi		$rounds,$rounds,1
1954	li		$idx,16
1955
1956	lvx		$rndkey0,0,$key2
1957	lvx		$rndkey1,$idx,$key2
1958	addi		$idx,$idx,16
1959	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1960	vxor		$tweak,$tweak,$rndkey0
1961	lvx		$rndkey0,$idx,$key2
1962	addi		$idx,$idx,16
1963	mtctr		$rounds
1964
1965Ltweak_xts_enc:
1966	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1967	vcipher		$tweak,$tweak,$rndkey1
1968	lvx		$rndkey1,$idx,$key2
1969	addi		$idx,$idx,16
1970	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1971	vcipher		$tweak,$tweak,$rndkey0
1972	lvx		$rndkey0,$idx,$key2
1973	addi		$idx,$idx,16
1974	bdnz		Ltweak_xts_enc
1975
1976	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1977	vcipher		$tweak,$tweak,$rndkey1
1978	lvx		$rndkey1,$idx,$key2
1979	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1980	vcipherlast	$tweak,$tweak,$rndkey0
1981
1982	li		$ivp,0				# don't chain the tweak
1983	b		Lxts_enc
1984
1985Lxts_enc_no_key2:
1986	li		$idx,-16
1987	and		$len,$len,$idx			# in "tweak chaining"
1988							# mode only complete
1989							# blocks are processed
1990Lxts_enc:
1991	lvx		$inptail,0,$inp
1992	addi		$inp,$inp,16
1993
1994	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
1995	lwz		$rounds,240($key1)
1996	srwi		$rounds,$rounds,1
1997	subi		$rounds,$rounds,1
1998	li		$idx,16
1999
2000	vslb		$eighty7,$seven,$seven		# 0x808080..80
2001	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2002	vspltisb	$tmp,1				# 0x010101..01
2003	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2004
2005	${UCMP}i	$len,96
2006	bge		_aesp8_xts_encrypt6x
2007
2008	andi.		$taillen,$len,15
2009	subic		r0,$len,32
2010	subi		$taillen,$taillen,16
2011	subfe		r0,r0,r0
2012	and		r0,r0,$taillen
2013	add		$inp,$inp,r0
2014
2015	lvx		$rndkey0,0,$key1
2016	lvx		$rndkey1,$idx,$key1
2017	addi		$idx,$idx,16
2018	vperm		$inout,$inout,$inptail,$inpperm
2019	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2020	vxor		$inout,$inout,$tweak
2021	vxor		$inout,$inout,$rndkey0
2022	lvx		$rndkey0,$idx,$key1
2023	addi		$idx,$idx,16
2024	mtctr		$rounds
2025	b		Loop_xts_enc
2026
2027.align	5
2028Loop_xts_enc:
2029	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2030	vcipher		$inout,$inout,$rndkey1
2031	lvx		$rndkey1,$idx,$key1
2032	addi		$idx,$idx,16
2033	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2034	vcipher		$inout,$inout,$rndkey0
2035	lvx		$rndkey0,$idx,$key1
2036	addi		$idx,$idx,16
2037	bdnz		Loop_xts_enc
2038
2039	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2040	vcipher		$inout,$inout,$rndkey1
2041	lvx		$rndkey1,$idx,$key1
2042	li		$idx,16
2043	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2044	vxor		$rndkey0,$rndkey0,$tweak
2045	vcipherlast	$output,$inout,$rndkey0
2046
2047	le?vperm	$tmp,$output,$output,$leperm
2048	be?nop
2049	le?stvx_u	$tmp,0,$out
2050	be?stvx_u	$output,0,$out
2051	addi		$out,$out,16
2052
2053	subic.		$len,$len,16
2054	beq		Lxts_enc_done
2055
2056	vmr		$inout,$inptail
2057	lvx		$inptail,0,$inp
2058	addi		$inp,$inp,16
2059	lvx		$rndkey0,0,$key1
2060	lvx		$rndkey1,$idx,$key1
2061	addi		$idx,$idx,16
2062
2063	subic		r0,$len,32
2064	subfe		r0,r0,r0
2065	and		r0,r0,$taillen
2066	add		$inp,$inp,r0
2067
2068	vsrab		$tmp,$tweak,$seven		# next tweak value
2069	vaddubm		$tweak,$tweak,$tweak
2070	vsldoi		$tmp,$tmp,$tmp,15
2071	vand		$tmp,$tmp,$eighty7
2072	vxor		$tweak,$tweak,$tmp
2073
2074	vperm		$inout,$inout,$inptail,$inpperm
2075	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2076	vxor		$inout,$inout,$tweak
2077	vxor		$output,$output,$rndkey0	# just in case $len<16
2078	vxor		$inout,$inout,$rndkey0
2079	lvx		$rndkey0,$idx,$key1
2080	addi		$idx,$idx,16
2081
2082	mtctr		$rounds
2083	${UCMP}i	$len,16
2084	bge		Loop_xts_enc
2085
2086	vxor		$output,$output,$tweak
2087	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2088	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2089	vspltisb	$tmp,-1
2090	vperm		$inptail,$inptail,$tmp,$inpperm
2091	vsel		$inout,$inout,$output,$inptail
2092
2093	subi		r11,$out,17
2094	subi		$out,$out,16
2095	mtctr		$len
2096	li		$len,16
2097Loop_xts_enc_steal:
2098	lbzu		r0,1(r11)
2099	stb		r0,16(r11)
2100	bdnz		Loop_xts_enc_steal
2101
2102	mtctr		$rounds
2103	b		Loop_xts_enc			# one more time...
2104
2105Lxts_enc_done:
2106	${UCMP}i	$ivp,0
2107	beq		Lxts_enc_ret
2108
2109	vsrab		$tmp,$tweak,$seven		# next tweak value
2110	vaddubm		$tweak,$tweak,$tweak
2111	vsldoi		$tmp,$tmp,$tmp,15
2112	vand		$tmp,$tmp,$eighty7
2113	vxor		$tweak,$tweak,$tmp
2114
2115	le?vperm	$tweak,$tweak,$tweak,$leperm
2116	stvx_u		$tweak,0,$ivp
2117
2118Lxts_enc_ret:
2119	mtspr		256,r12				# restore vrsave
2120	li		r3,0
2121	blr
2122	.long		0
2123	.byte		0,12,0x04,0,0x80,6,6,0
2124	.long		0
2125.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2126
2127.globl	.${prefix}_xts_decrypt
2128	mr		$inp,r3				# reassign
2129	li		r3,-1
2130	${UCMP}i	$len,16
2131	bltlr-
2132
2133	lis		r0,0xfff8
2134	mfspr		r12,256				# save vrsave
2135	li		r11,0
2136	mtspr		256,r0
2137
2138	andi.		r0,$len,15
2139	neg		r0,r0
2140	andi.		r0,r0,16
2141	sub		$len,$len,r0
2142
2143	vspltisb	$seven,0x07			# 0x070707..07
2144	le?lvsl		$leperm,r11,r11
2145	le?vspltisb	$tmp,0x0f
2146	le?vxor		$leperm,$leperm,$seven
2147
2148	li		$idx,15
2149	lvx		$tweak,0,$ivp			# load [unaligned] iv
2150	lvsl		$inpperm,0,$ivp
2151	lvx		$inptail,$idx,$ivp
2152	le?vxor		$inpperm,$inpperm,$tmp
2153	vperm		$tweak,$tweak,$inptail,$inpperm
2154
2155	neg		r11,$inp
2156	lvsr		$inpperm,0,r11			# prepare for unaligned load
2157	lvx		$inout,0,$inp
2158	addi		$inp,$inp,15			# 15 is not typo
2159	le?vxor		$inpperm,$inpperm,$tmp
2160
2161	${UCMP}i	$key2,0				# key2==NULL?
2162	beq		Lxts_dec_no_key2
2163
2164	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2165	lwz		$rounds,240($key2)
2166	srwi		$rounds,$rounds,1
2167	subi		$rounds,$rounds,1
2168	li		$idx,16
2169
2170	lvx		$rndkey0,0,$key2
2171	lvx		$rndkey1,$idx,$key2
2172	addi		$idx,$idx,16
2173	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2174	vxor		$tweak,$tweak,$rndkey0
2175	lvx		$rndkey0,$idx,$key2
2176	addi		$idx,$idx,16
2177	mtctr		$rounds
2178
2179Ltweak_xts_dec:
2180	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2181	vcipher		$tweak,$tweak,$rndkey1
2182	lvx		$rndkey1,$idx,$key2
2183	addi		$idx,$idx,16
2184	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2185	vcipher		$tweak,$tweak,$rndkey0
2186	lvx		$rndkey0,$idx,$key2
2187	addi		$idx,$idx,16
2188	bdnz		Ltweak_xts_dec
2189
2190	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2191	vcipher		$tweak,$tweak,$rndkey1
2192	lvx		$rndkey1,$idx,$key2
2193	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2194	vcipherlast	$tweak,$tweak,$rndkey0
2195
2196	li		$ivp,0				# don't chain the tweak
2197	b		Lxts_dec
2198
2199Lxts_dec_no_key2:
2200	neg		$idx,$len
2201	andi.		$idx,$idx,15
2202	add		$len,$len,$idx			# in "tweak chaining"
2203							# mode only complete
2204							# blocks are processed
2205Lxts_dec:
2206	lvx		$inptail,0,$inp
2207	addi		$inp,$inp,16
2208
2209	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2210	lwz		$rounds,240($key1)
2211	srwi		$rounds,$rounds,1
2212	subi		$rounds,$rounds,1
2213	li		$idx,16
2214
2215	vslb		$eighty7,$seven,$seven		# 0x808080..80
2216	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2217	vspltisb	$tmp,1				# 0x010101..01
2218	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2219
2220	${UCMP}i	$len,96
2221	bge		_aesp8_xts_decrypt6x
2222
2223	lvx		$rndkey0,0,$key1
2224	lvx		$rndkey1,$idx,$key1
2225	addi		$idx,$idx,16
2226	vperm		$inout,$inout,$inptail,$inpperm
2227	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2228	vxor		$inout,$inout,$tweak
2229	vxor		$inout,$inout,$rndkey0
2230	lvx		$rndkey0,$idx,$key1
2231	addi		$idx,$idx,16
2232	mtctr		$rounds
2233
2234	${UCMP}i	$len,16
2235	blt		Ltail_xts_dec
2236	be?b		Loop_xts_dec
2237
2238.align	5
2239Loop_xts_dec:
2240	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2241	vncipher	$inout,$inout,$rndkey1
2242	lvx		$rndkey1,$idx,$key1
2243	addi		$idx,$idx,16
2244	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2245	vncipher	$inout,$inout,$rndkey0
2246	lvx		$rndkey0,$idx,$key1
2247	addi		$idx,$idx,16
2248	bdnz		Loop_xts_dec
2249
2250	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2251	vncipher	$inout,$inout,$rndkey1
2252	lvx		$rndkey1,$idx,$key1
2253	li		$idx,16
2254	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2255	vxor		$rndkey0,$rndkey0,$tweak
2256	vncipherlast	$output,$inout,$rndkey0
2257
2258	le?vperm	$tmp,$output,$output,$leperm
2259	be?nop
2260	le?stvx_u	$tmp,0,$out
2261	be?stvx_u	$output,0,$out
2262	addi		$out,$out,16
2263
2264	subic.		$len,$len,16
2265	beq		Lxts_dec_done
2266
2267	vmr		$inout,$inptail
2268	lvx		$inptail,0,$inp
2269	addi		$inp,$inp,16
2270	lvx		$rndkey0,0,$key1
2271	lvx		$rndkey1,$idx,$key1
2272	addi		$idx,$idx,16
2273
2274	vsrab		$tmp,$tweak,$seven		# next tweak value
2275	vaddubm		$tweak,$tweak,$tweak
2276	vsldoi		$tmp,$tmp,$tmp,15
2277	vand		$tmp,$tmp,$eighty7
2278	vxor		$tweak,$tweak,$tmp
2279
2280	vperm		$inout,$inout,$inptail,$inpperm
2281	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2282	vxor		$inout,$inout,$tweak
2283	vxor		$inout,$inout,$rndkey0
2284	lvx		$rndkey0,$idx,$key1
2285	addi		$idx,$idx,16
2286
2287	mtctr		$rounds
2288	${UCMP}i	$len,16
2289	bge		Loop_xts_dec
2290
2291Ltail_xts_dec:
2292	vsrab		$tmp,$tweak,$seven		# next tweak value
2293	vaddubm		$tweak1,$tweak,$tweak
2294	vsldoi		$tmp,$tmp,$tmp,15
2295	vand		$tmp,$tmp,$eighty7
2296	vxor		$tweak1,$tweak1,$tmp
2297
2298	subi		$inp,$inp,16
2299	add		$inp,$inp,$len
2300
2301	vxor		$inout,$inout,$tweak		# :-(
2302	vxor		$inout,$inout,$tweak1		# :-)
2303
2304Loop_xts_dec_short:
2305	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2306	vncipher	$inout,$inout,$rndkey1
2307	lvx		$rndkey1,$idx,$key1
2308	addi		$idx,$idx,16
2309	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2310	vncipher	$inout,$inout,$rndkey0
2311	lvx		$rndkey0,$idx,$key1
2312	addi		$idx,$idx,16
2313	bdnz		Loop_xts_dec_short
2314
2315	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2316	vncipher	$inout,$inout,$rndkey1
2317	lvx		$rndkey1,$idx,$key1
2318	li		$idx,16
2319	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2320	vxor		$rndkey0,$rndkey0,$tweak1
2321	vncipherlast	$output,$inout,$rndkey0
2322
2323	le?vperm	$tmp,$output,$output,$leperm
2324	be?nop
2325	le?stvx_u	$tmp,0,$out
2326	be?stvx_u	$output,0,$out
2327
2328	vmr		$inout,$inptail
2329	lvx		$inptail,0,$inp
2330	#addi		$inp,$inp,16
2331	lvx		$rndkey0,0,$key1
2332	lvx		$rndkey1,$idx,$key1
2333	addi		$idx,$idx,16
2334	vperm		$inout,$inout,$inptail,$inpperm
2335	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2336
2337	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2338	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2339	vspltisb	$tmp,-1
2340	vperm		$inptail,$inptail,$tmp,$inpperm
2341	vsel		$inout,$inout,$output,$inptail
2342
2343	vxor		$rndkey0,$rndkey0,$tweak
2344	vxor		$inout,$inout,$rndkey0
2345	lvx		$rndkey0,$idx,$key1
2346	addi		$idx,$idx,16
2347
2348	subi		r11,$out,1
2349	mtctr		$len
2350	li		$len,16
2351Loop_xts_dec_steal:
2352	lbzu		r0,1(r11)
2353	stb		r0,16(r11)
2354	bdnz		Loop_xts_dec_steal
2355
2356	mtctr		$rounds
2357	b		Loop_xts_dec			# one more time...
2358
2359Lxts_dec_done:
2360	${UCMP}i	$ivp,0
2361	beq		Lxts_dec_ret
2362
2363	vsrab		$tmp,$tweak,$seven		# next tweak value
2364	vaddubm		$tweak,$tweak,$tweak
2365	vsldoi		$tmp,$tmp,$tmp,15
2366	vand		$tmp,$tmp,$eighty7
2367	vxor		$tweak,$tweak,$tmp
2368
2369	le?vperm	$tweak,$tweak,$tweak,$leperm
2370	stvx_u		$tweak,0,$ivp
2371
2372Lxts_dec_ret:
2373	mtspr		256,r12				# restore vrsave
2374	li		r3,0
2375	blr
2376	.long		0
2377	.byte		0,12,0x04,0,0x80,6,6,0
2378	.long		0
2379.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2380___
2381#########################################################################
2382{{	# Optimized XTS procedures					#
2383my $key_=$key2;
2384my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2385    $x00=0 if ($flavour =~ /osx/);
2386my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
2387my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2388my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2389my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
2390			# v26-v31 last 6 round keys
2391my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
2392my $taillen=$x70;
2393
2394$code.=<<___;
2395.align	5
2396_aesp8_xts_encrypt6x:
2397	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2398	mflr		r11
2399	li		r7,`$FRAME+8*16+15`
2400	li		r3,`$FRAME+8*16+31`
2401	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2402	stvx		v20,r7,$sp		# ABI says so
2403	addi		r7,r7,32
2404	stvx		v21,r3,$sp
2405	addi		r3,r3,32
2406	stvx		v22,r7,$sp
2407	addi		r7,r7,32
2408	stvx		v23,r3,$sp
2409	addi		r3,r3,32
2410	stvx		v24,r7,$sp
2411	addi		r7,r7,32
2412	stvx		v25,r3,$sp
2413	addi		r3,r3,32
2414	stvx		v26,r7,$sp
2415	addi		r7,r7,32
2416	stvx		v27,r3,$sp
2417	addi		r3,r3,32
2418	stvx		v28,r7,$sp
2419	addi		r7,r7,32
2420	stvx		v29,r3,$sp
2421	addi		r3,r3,32
2422	stvx		v30,r7,$sp
2423	stvx		v31,r3,$sp
2424	li		r0,-1
2425	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
2426	li		$x10,0x10
2427	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2428	li		$x20,0x20
2429	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2430	li		$x30,0x30
2431	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2432	li		$x40,0x40
2433	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2434	li		$x50,0x50
2435	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2436	li		$x60,0x60
2437	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2438	li		$x70,0x70
2439	mtspr		256,r0
2440
2441	subi		$rounds,$rounds,3	# -4 in total
2442
2443	lvx		$rndkey0,$x00,$key1	# load key schedule
2444	lvx		v30,$x10,$key1
2445	addi		$key1,$key1,0x20
2446	lvx		v31,$x00,$key1
2447	?vperm		$rndkey0,$rndkey0,v30,$keyperm
2448	addi		$key_,$sp,$FRAME+15
2449	mtctr		$rounds
2450
2451Load_xts_enc_key:
2452	?vperm		v24,v30,v31,$keyperm
2453	lvx		v30,$x10,$key1
2454	addi		$key1,$key1,0x20
2455	stvx		v24,$x00,$key_		# off-load round[1]
2456	?vperm		v25,v31,v30,$keyperm
2457	lvx		v31,$x00,$key1
2458	stvx		v25,$x10,$key_		# off-load round[2]
2459	addi		$key_,$key_,0x20
2460	bdnz		Load_xts_enc_key
2461
2462	lvx		v26,$x10,$key1
2463	?vperm		v24,v30,v31,$keyperm
2464	lvx		v27,$x20,$key1
2465	stvx		v24,$x00,$key_		# off-load round[3]
2466	?vperm		v25,v31,v26,$keyperm
2467	lvx		v28,$x30,$key1
2468	stvx		v25,$x10,$key_		# off-load round[4]
2469	addi		$key_,$sp,$FRAME+15	# rewind $key_
2470	?vperm		v26,v26,v27,$keyperm
2471	lvx		v29,$x40,$key1
2472	?vperm		v27,v27,v28,$keyperm
2473	lvx		v30,$x50,$key1
2474	?vperm		v28,v28,v29,$keyperm
2475	lvx		v31,$x60,$key1
2476	?vperm		v29,v29,v30,$keyperm
2477	lvx		$twk5,$x70,$key1	# borrow $twk5
2478	?vperm		v30,v30,v31,$keyperm
2479	lvx		v24,$x00,$key_		# pre-load round[1]
2480	?vperm		v31,v31,$twk5,$keyperm
2481	lvx		v25,$x10,$key_		# pre-load round[2]
2482
2483	 vperm		$in0,$inout,$inptail,$inpperm
2484	 subi		$inp,$inp,31		# undo "caller"
2485	vxor		$twk0,$tweak,$rndkey0
2486	vsrab		$tmp,$tweak,$seven	# next tweak value
2487	vaddubm		$tweak,$tweak,$tweak
2488	vsldoi		$tmp,$tmp,$tmp,15
2489	vand		$tmp,$tmp,$eighty7
2490	 vxor		$out0,$in0,$twk0
2491	vxor		$tweak,$tweak,$tmp
2492
2493	 lvx_u		$in1,$x10,$inp
2494	vxor		$twk1,$tweak,$rndkey0
2495	vsrab		$tmp,$tweak,$seven	# next tweak value
2496	vaddubm		$tweak,$tweak,$tweak
2497	vsldoi		$tmp,$tmp,$tmp,15
2498	 le?vperm	$in1,$in1,$in1,$leperm
2499	vand		$tmp,$tmp,$eighty7
2500	 vxor		$out1,$in1,$twk1
2501	vxor		$tweak,$tweak,$tmp
2502
2503	 lvx_u		$in2,$x20,$inp
2504	 andi.		$taillen,$len,15
2505	vxor		$twk2,$tweak,$rndkey0
2506	vsrab		$tmp,$tweak,$seven	# next tweak value
2507	vaddubm		$tweak,$tweak,$tweak
2508	vsldoi		$tmp,$tmp,$tmp,15
2509	 le?vperm	$in2,$in2,$in2,$leperm
2510	vand		$tmp,$tmp,$eighty7
2511	 vxor		$out2,$in2,$twk2
2512	vxor		$tweak,$tweak,$tmp
2513
2514	 lvx_u		$in3,$x30,$inp
2515	 sub		$len,$len,$taillen
2516	vxor		$twk3,$tweak,$rndkey0
2517	vsrab		$tmp,$tweak,$seven	# next tweak value
2518	vaddubm		$tweak,$tweak,$tweak
2519	vsldoi		$tmp,$tmp,$tmp,15
2520	 le?vperm	$in3,$in3,$in3,$leperm
2521	vand		$tmp,$tmp,$eighty7
2522	 vxor		$out3,$in3,$twk3
2523	vxor		$tweak,$tweak,$tmp
2524
2525	 lvx_u		$in4,$x40,$inp
2526	 subi		$len,$len,0x60
2527	vxor		$twk4,$tweak,$rndkey0
2528	vsrab		$tmp,$tweak,$seven	# next tweak value
2529	vaddubm		$tweak,$tweak,$tweak
2530	vsldoi		$tmp,$tmp,$tmp,15
2531	 le?vperm	$in4,$in4,$in4,$leperm
2532	vand		$tmp,$tmp,$eighty7
2533	 vxor		$out4,$in4,$twk4
2534	vxor		$tweak,$tweak,$tmp
2535
2536	 lvx_u		$in5,$x50,$inp
2537	 addi		$inp,$inp,0x60
2538	vxor		$twk5,$tweak,$rndkey0
2539	vsrab		$tmp,$tweak,$seven	# next tweak value
2540	vaddubm		$tweak,$tweak,$tweak
2541	vsldoi		$tmp,$tmp,$tmp,15
2542	 le?vperm	$in5,$in5,$in5,$leperm
2543	vand		$tmp,$tmp,$eighty7
2544	 vxor		$out5,$in5,$twk5
2545	vxor		$tweak,$tweak,$tmp
2546
2547	vxor		v31,v31,$rndkey0
2548	mtctr		$rounds
2549	b		Loop_xts_enc6x
2550
2551.align	5
2552Loop_xts_enc6x:
2553	vcipher		$out0,$out0,v24
2554	vcipher		$out1,$out1,v24
2555	vcipher		$out2,$out2,v24
2556	vcipher		$out3,$out3,v24
2557	vcipher		$out4,$out4,v24
2558	vcipher		$out5,$out5,v24
2559	lvx		v24,$x20,$key_		# round[3]
2560	addi		$key_,$key_,0x20
2561
2562	vcipher		$out0,$out0,v25
2563	vcipher		$out1,$out1,v25
2564	vcipher		$out2,$out2,v25
2565	vcipher		$out3,$out3,v25
2566	vcipher		$out4,$out4,v25
2567	vcipher		$out5,$out5,v25
2568	lvx		v25,$x10,$key_		# round[4]
2569	bdnz		Loop_xts_enc6x
2570
2571	subic		$len,$len,96		# $len-=96
2572	 vxor		$in0,$twk0,v31		# xor with last round key
2573	vcipher		$out0,$out0,v24
2574	vcipher		$out1,$out1,v24
2575	 vsrab		$tmp,$tweak,$seven	# next tweak value
2576	 vxor		$twk0,$tweak,$rndkey0
2577	 vaddubm	$tweak,$tweak,$tweak
2578	vcipher		$out2,$out2,v24
2579	vcipher		$out3,$out3,v24
2580	 vsldoi		$tmp,$tmp,$tmp,15
2581	vcipher		$out4,$out4,v24
2582	vcipher		$out5,$out5,v24
2583
2584	subfe.		r0,r0,r0		# borrow?-1:0
2585	 vand		$tmp,$tmp,$eighty7
2586	vcipher		$out0,$out0,v25
2587	vcipher		$out1,$out1,v25
2588	 vxor		$tweak,$tweak,$tmp
2589	vcipher		$out2,$out2,v25
2590	vcipher		$out3,$out3,v25
2591	 vxor		$in1,$twk1,v31
2592	 vsrab		$tmp,$tweak,$seven	# next tweak value
2593	 vxor		$twk1,$tweak,$rndkey0
2594	vcipher		$out4,$out4,v25
2595	vcipher		$out5,$out5,v25
2596
2597	and		r0,r0,$len
2598	 vaddubm	$tweak,$tweak,$tweak
2599	 vsldoi		$tmp,$tmp,$tmp,15
2600	vcipher		$out0,$out0,v26
2601	vcipher		$out1,$out1,v26
2602	 vand		$tmp,$tmp,$eighty7
2603	vcipher		$out2,$out2,v26
2604	vcipher		$out3,$out3,v26
2605	 vxor		$tweak,$tweak,$tmp
2606	vcipher		$out4,$out4,v26
2607	vcipher		$out5,$out5,v26
2608
2609	add		$inp,$inp,r0		# $inp is adjusted in such
2610						# way that at exit from the
2611						# loop inX-in5 are loaded
2612						# with last "words"
2613	 vxor		$in2,$twk2,v31
2614	 vsrab		$tmp,$tweak,$seven	# next tweak value
2615	 vxor		$twk2,$tweak,$rndkey0
2616	 vaddubm	$tweak,$tweak,$tweak
2617	vcipher		$out0,$out0,v27
2618	vcipher		$out1,$out1,v27
2619	 vsldoi		$tmp,$tmp,$tmp,15
2620	vcipher		$out2,$out2,v27
2621	vcipher		$out3,$out3,v27
2622	 vand		$tmp,$tmp,$eighty7
2623	vcipher		$out4,$out4,v27
2624	vcipher		$out5,$out5,v27
2625
2626	addi		$key_,$sp,$FRAME+15	# rewind $key_
2627	 vxor		$tweak,$tweak,$tmp
2628	vcipher		$out0,$out0,v28
2629	vcipher		$out1,$out1,v28
2630	 vxor		$in3,$twk3,v31
2631	 vsrab		$tmp,$tweak,$seven	# next tweak value
2632	 vxor		$twk3,$tweak,$rndkey0
2633	vcipher		$out2,$out2,v28
2634	vcipher		$out3,$out3,v28
2635	 vaddubm	$tweak,$tweak,$tweak
2636	 vsldoi		$tmp,$tmp,$tmp,15
2637	vcipher		$out4,$out4,v28
2638	vcipher		$out5,$out5,v28
2639	lvx		v24,$x00,$key_		# re-pre-load round[1]
2640	 vand		$tmp,$tmp,$eighty7
2641
2642	vcipher		$out0,$out0,v29
2643	vcipher		$out1,$out1,v29
2644	 vxor		$tweak,$tweak,$tmp
2645	vcipher		$out2,$out2,v29
2646	vcipher		$out3,$out3,v29
2647	 vxor		$in4,$twk4,v31
2648	 vsrab		$tmp,$tweak,$seven	# next tweak value
2649	 vxor		$twk4,$tweak,$rndkey0
2650	vcipher		$out4,$out4,v29
2651	vcipher		$out5,$out5,v29
2652	lvx		v25,$x10,$key_		# re-pre-load round[2]
2653	 vaddubm	$tweak,$tweak,$tweak
2654	 vsldoi		$tmp,$tmp,$tmp,15
2655
2656	vcipher		$out0,$out0,v30
2657	vcipher		$out1,$out1,v30
2658	 vand		$tmp,$tmp,$eighty7
2659	vcipher		$out2,$out2,v30
2660	vcipher		$out3,$out3,v30
2661	 vxor		$tweak,$tweak,$tmp
2662	vcipher		$out4,$out4,v30
2663	vcipher		$out5,$out5,v30
2664	 vxor		$in5,$twk5,v31
2665	 vsrab		$tmp,$tweak,$seven	# next tweak value
2666	 vxor		$twk5,$tweak,$rndkey0
2667
2668	vcipherlast	$out0,$out0,$in0
2669	 lvx_u		$in0,$x00,$inp		# load next input block
2670	 vaddubm	$tweak,$tweak,$tweak
2671	 vsldoi		$tmp,$tmp,$tmp,15
2672	vcipherlast	$out1,$out1,$in1
2673	 lvx_u		$in1,$x10,$inp
2674	vcipherlast	$out2,$out2,$in2
2675	 le?vperm	$in0,$in0,$in0,$leperm
2676	 lvx_u		$in2,$x20,$inp
2677	 vand		$tmp,$tmp,$eighty7
2678	vcipherlast	$out3,$out3,$in3
2679	 le?vperm	$in1,$in1,$in1,$leperm
2680	 lvx_u		$in3,$x30,$inp
2681	vcipherlast	$out4,$out4,$in4
2682	 le?vperm	$in2,$in2,$in2,$leperm
2683	 lvx_u		$in4,$x40,$inp
2684	 vxor		$tweak,$tweak,$tmp
2685	vcipherlast	$tmp,$out5,$in5		# last block might be needed
2686						# in stealing mode
2687	 le?vperm	$in3,$in3,$in3,$leperm
2688	 lvx_u		$in5,$x50,$inp
2689	 addi		$inp,$inp,0x60
2690	 le?vperm	$in4,$in4,$in4,$leperm
2691	 le?vperm	$in5,$in5,$in5,$leperm
2692
2693	le?vperm	$out0,$out0,$out0,$leperm
2694	le?vperm	$out1,$out1,$out1,$leperm
2695	stvx_u		$out0,$x00,$out		# store output
2696	 vxor		$out0,$in0,$twk0
2697	le?vperm	$out2,$out2,$out2,$leperm
2698	stvx_u		$out1,$x10,$out
2699	 vxor		$out1,$in1,$twk1
2700	le?vperm	$out3,$out3,$out3,$leperm
2701	stvx_u		$out2,$x20,$out
2702	 vxor		$out2,$in2,$twk2
2703	le?vperm	$out4,$out4,$out4,$leperm
2704	stvx_u		$out3,$x30,$out
2705	 vxor		$out3,$in3,$twk3
2706	le?vperm	$out5,$tmp,$tmp,$leperm
2707	stvx_u		$out4,$x40,$out
2708	 vxor		$out4,$in4,$twk4
2709	le?stvx_u	$out5,$x50,$out
2710	be?stvx_u	$tmp, $x50,$out
2711	 vxor		$out5,$in5,$twk5
2712	addi		$out,$out,0x60
2713
2714	mtctr		$rounds
2715	beq		Loop_xts_enc6x		# did $len-=96 borrow?
2716
2717	addic.		$len,$len,0x60
2718	beq		Lxts_enc6x_zero
2719	cmpwi		$len,0x20
2720	blt		Lxts_enc6x_one
2721	nop
2722	beq		Lxts_enc6x_two
2723	cmpwi		$len,0x40
2724	blt		Lxts_enc6x_three
2725	nop
2726	beq		Lxts_enc6x_four
2727
2728Lxts_enc6x_five:
2729	vxor		$out0,$in1,$twk0
2730	vxor		$out1,$in2,$twk1
2731	vxor		$out2,$in3,$twk2
2732	vxor		$out3,$in4,$twk3
2733	vxor		$out4,$in5,$twk4
2734
2735	bl		_aesp8_xts_enc5x
2736
2737	le?vperm	$out0,$out0,$out0,$leperm
2738	vmr		$twk0,$twk5		# unused tweak
2739	le?vperm	$out1,$out1,$out1,$leperm
2740	stvx_u		$out0,$x00,$out		# store output
2741	le?vperm	$out2,$out2,$out2,$leperm
2742	stvx_u		$out1,$x10,$out
2743	le?vperm	$out3,$out3,$out3,$leperm
2744	stvx_u		$out2,$x20,$out
2745	vxor		$tmp,$out4,$twk5	# last block prep for stealing
2746	le?vperm	$out4,$out4,$out4,$leperm
2747	stvx_u		$out3,$x30,$out
2748	stvx_u		$out4,$x40,$out
2749	addi		$out,$out,0x50
2750	bne		Lxts_enc6x_steal
2751	b		Lxts_enc6x_done
2752
2753.align	4
2754Lxts_enc6x_four:
2755	vxor		$out0,$in2,$twk0
2756	vxor		$out1,$in3,$twk1
2757	vxor		$out2,$in4,$twk2
2758	vxor		$out3,$in5,$twk3
2759	vxor		$out4,$out4,$out4
2760
2761	bl		_aesp8_xts_enc5x
2762
2763	le?vperm	$out0,$out0,$out0,$leperm
2764	vmr		$twk0,$twk4		# unused tweak
2765	le?vperm	$out1,$out1,$out1,$leperm
2766	stvx_u		$out0,$x00,$out		# store output
2767	le?vperm	$out2,$out2,$out2,$leperm
2768	stvx_u		$out1,$x10,$out
2769	vxor		$tmp,$out3,$twk4	# last block prep for stealing
2770	le?vperm	$out3,$out3,$out3,$leperm
2771	stvx_u		$out2,$x20,$out
2772	stvx_u		$out3,$x30,$out
2773	addi		$out,$out,0x40
2774	bne		Lxts_enc6x_steal
2775	b		Lxts_enc6x_done
2776
2777.align	4
2778Lxts_enc6x_three:
2779	vxor		$out0,$in3,$twk0
2780	vxor		$out1,$in4,$twk1
2781	vxor		$out2,$in5,$twk2
2782	vxor		$out3,$out3,$out3
2783	vxor		$out4,$out4,$out4
2784
2785	bl		_aesp8_xts_enc5x
2786
2787	le?vperm	$out0,$out0,$out0,$leperm
2788	vmr		$twk0,$twk3		# unused tweak
2789	le?vperm	$out1,$out1,$out1,$leperm
2790	stvx_u		$out0,$x00,$out		# store output
2791	vxor		$tmp,$out2,$twk3	# last block prep for stealing
2792	le?vperm	$out2,$out2,$out2,$leperm
2793	stvx_u		$out1,$x10,$out
2794	stvx_u		$out2,$x20,$out
2795	addi		$out,$out,0x30
2796	bne		Lxts_enc6x_steal
2797	b		Lxts_enc6x_done
2798
2799.align	4
2800Lxts_enc6x_two:
2801	vxor		$out0,$in4,$twk0
2802	vxor		$out1,$in5,$twk1
2803	vxor		$out2,$out2,$out2
2804	vxor		$out3,$out3,$out3
2805	vxor		$out4,$out4,$out4
2806
2807	bl		_aesp8_xts_enc5x
2808
2809	le?vperm	$out0,$out0,$out0,$leperm
2810	vmr		$twk0,$twk2		# unused tweak
2811	vxor		$tmp,$out1,$twk2	# last block prep for stealing
2812	le?vperm	$out1,$out1,$out1,$leperm
2813	stvx_u		$out0,$x00,$out		# store output
2814	stvx_u		$out1,$x10,$out
2815	addi		$out,$out,0x20
2816	bne		Lxts_enc6x_steal
2817	b		Lxts_enc6x_done
2818
2819.align	4
2820Lxts_enc6x_one:
2821	vxor		$out0,$in5,$twk0
2822	nop
2823Loop_xts_enc1x:
2824	vcipher		$out0,$out0,v24
2825	lvx		v24,$x20,$key_		# round[3]
2826	addi		$key_,$key_,0x20
2827
2828	vcipher		$out0,$out0,v25
2829	lvx		v25,$x10,$key_		# round[4]
2830	bdnz		Loop_xts_enc1x
2831
2832	add		$inp,$inp,$taillen
2833	cmpwi		$taillen,0
2834	vcipher		$out0,$out0,v24
2835
2836	subi		$inp,$inp,16
2837	vcipher		$out0,$out0,v25
2838
2839	lvsr		$inpperm,0,$taillen
2840	vcipher		$out0,$out0,v26
2841
2842	lvx_u		$in0,0,$inp
2843	vcipher		$out0,$out0,v27
2844
2845	addi		$key_,$sp,$FRAME+15	# rewind $key_
2846	vcipher		$out0,$out0,v28
2847	lvx		v24,$x00,$key_		# re-pre-load round[1]
2848
2849	vcipher		$out0,$out0,v29
2850	lvx		v25,$x10,$key_		# re-pre-load round[2]
2851	 vxor		$twk0,$twk0,v31
2852
2853	le?vperm	$in0,$in0,$in0,$leperm
2854	vcipher		$out0,$out0,v30
2855
2856	vperm		$in0,$in0,$in0,$inpperm
2857	vcipherlast	$out0,$out0,$twk0
2858
2859	vmr		$twk0,$twk1		# unused tweak
2860	vxor		$tmp,$out0,$twk1	# last block prep for stealing
2861	le?vperm	$out0,$out0,$out0,$leperm
2862	stvx_u		$out0,$x00,$out		# store output
2863	addi		$out,$out,0x10
2864	bne		Lxts_enc6x_steal
2865	b		Lxts_enc6x_done
2866
2867.align	4
2868Lxts_enc6x_zero:
2869	cmpwi		$taillen,0
2870	beq		Lxts_enc6x_done
2871
2872	add		$inp,$inp,$taillen
2873	subi		$inp,$inp,16
2874	lvx_u		$in0,0,$inp
2875	lvsr		$inpperm,0,$taillen	# $in5 is no more
2876	le?vperm	$in0,$in0,$in0,$leperm
2877	vperm		$in0,$in0,$in0,$inpperm
2878	vxor		$tmp,$tmp,$twk0
2879Lxts_enc6x_steal:
2880	vxor		$in0,$in0,$twk0
2881	vxor		$out0,$out0,$out0
2882	vspltisb	$out1,-1
2883	vperm		$out0,$out0,$out1,$inpperm
2884	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
2885
2886	subi		r30,$out,17
2887	subi		$out,$out,16
2888	mtctr		$taillen
2889Loop_xts_enc6x_steal:
2890	lbzu		r0,1(r30)
2891	stb		r0,16(r30)
2892	bdnz		Loop_xts_enc6x_steal
2893
2894	li		$taillen,0
2895	mtctr		$rounds
2896	b		Loop_xts_enc1x		# one more time...
2897
2898.align	4
2899Lxts_enc6x_done:
2900	${UCMP}i	$ivp,0
2901	beq		Lxts_enc6x_ret
2902
2903	vxor		$tweak,$twk0,$rndkey0
2904	le?vperm	$tweak,$tweak,$tweak,$leperm
2905	stvx_u		$tweak,0,$ivp
2906
2907Lxts_enc6x_ret:
2908	mtlr		r11
2909	li		r10,`$FRAME+15`
2910	li		r11,`$FRAME+31`
2911	stvx		$seven,r10,$sp		# wipe copies of round keys
2912	addi		r10,r10,32
2913	stvx		$seven,r11,$sp
2914	addi		r11,r11,32
2915	stvx		$seven,r10,$sp
2916	addi		r10,r10,32
2917	stvx		$seven,r11,$sp
2918	addi		r11,r11,32
2919	stvx		$seven,r10,$sp
2920	addi		r10,r10,32
2921	stvx		$seven,r11,$sp
2922	addi		r11,r11,32
2923	stvx		$seven,r10,$sp
2924	addi		r10,r10,32
2925	stvx		$seven,r11,$sp
2926	addi		r11,r11,32
2927
2928	mtspr		256,$vrsave
2929	lvx		v20,r10,$sp		# ABI says so
2930	addi		r10,r10,32
2931	lvx		v21,r11,$sp
2932	addi		r11,r11,32
2933	lvx		v22,r10,$sp
2934	addi		r10,r10,32
2935	lvx		v23,r11,$sp
2936	addi		r11,r11,32
2937	lvx		v24,r10,$sp
2938	addi		r10,r10,32
2939	lvx		v25,r11,$sp
2940	addi		r11,r11,32
2941	lvx		v26,r10,$sp
2942	addi		r10,r10,32
2943	lvx		v27,r11,$sp
2944	addi		r11,r11,32
2945	lvx		v28,r10,$sp
2946	addi		r10,r10,32
2947	lvx		v29,r11,$sp
2948	addi		r11,r11,32
2949	lvx		v30,r10,$sp
2950	lvx		v31,r11,$sp
2951	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2952	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2953	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2954	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2955	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2956	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2957	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
2958	blr
2959	.long		0
2960	.byte		0,12,0x04,1,0x80,6,6,0
2961	.long		0
2962
2963.align	5
2964_aesp8_xts_enc5x:
2965	vcipher		$out0,$out0,v24
2966	vcipher		$out1,$out1,v24
2967	vcipher		$out2,$out2,v24
2968	vcipher		$out3,$out3,v24
2969	vcipher		$out4,$out4,v24
2970	lvx		v24,$x20,$key_		# round[3]
2971	addi		$key_,$key_,0x20
2972
2973	vcipher		$out0,$out0,v25
2974	vcipher		$out1,$out1,v25
2975	vcipher		$out2,$out2,v25
2976	vcipher		$out3,$out3,v25
2977	vcipher		$out4,$out4,v25
2978	lvx		v25,$x10,$key_		# round[4]
2979	bdnz		_aesp8_xts_enc5x
2980
2981	add		$inp,$inp,$taillen
2982	cmpwi		$taillen,0
2983	vcipher		$out0,$out0,v24
2984	vcipher		$out1,$out1,v24
2985	vcipher		$out2,$out2,v24
2986	vcipher		$out3,$out3,v24
2987	vcipher		$out4,$out4,v24
2988
2989	subi		$inp,$inp,16
2990	vcipher		$out0,$out0,v25
2991	vcipher		$out1,$out1,v25
2992	vcipher		$out2,$out2,v25
2993	vcipher		$out3,$out3,v25
2994	vcipher		$out4,$out4,v25
2995	 vxor		$twk0,$twk0,v31
2996
2997	vcipher		$out0,$out0,v26
2998	lvsr		$inpperm,r0,$taillen	# $in5 is no more
2999	vcipher		$out1,$out1,v26
3000	vcipher		$out2,$out2,v26
3001	vcipher		$out3,$out3,v26
3002	vcipher		$out4,$out4,v26
3003	 vxor		$in1,$twk1,v31
3004
3005	vcipher		$out0,$out0,v27
3006	lvx_u		$in0,0,$inp
3007	vcipher		$out1,$out1,v27
3008	vcipher		$out2,$out2,v27
3009	vcipher		$out3,$out3,v27
3010	vcipher		$out4,$out4,v27
3011	 vxor		$in2,$twk2,v31
3012
3013	addi		$key_,$sp,$FRAME+15	# rewind $key_
3014	vcipher		$out0,$out0,v28
3015	vcipher		$out1,$out1,v28
3016	vcipher		$out2,$out2,v28
3017	vcipher		$out3,$out3,v28
3018	vcipher		$out4,$out4,v28
3019	lvx		v24,$x00,$key_		# re-pre-load round[1]
3020	 vxor		$in3,$twk3,v31
3021
3022	vcipher		$out0,$out0,v29
3023	le?vperm	$in0,$in0,$in0,$leperm
3024	vcipher		$out1,$out1,v29
3025	vcipher		$out2,$out2,v29
3026	vcipher		$out3,$out3,v29
3027	vcipher		$out4,$out4,v29
3028	lvx		v25,$x10,$key_		# re-pre-load round[2]
3029	 vxor		$in4,$twk4,v31
3030
3031	vcipher		$out0,$out0,v30
3032	vperm		$in0,$in0,$in0,$inpperm
3033	vcipher		$out1,$out1,v30
3034	vcipher		$out2,$out2,v30
3035	vcipher		$out3,$out3,v30
3036	vcipher		$out4,$out4,v30
3037
3038	vcipherlast	$out0,$out0,$twk0
3039	vcipherlast	$out1,$out1,$in1
3040	vcipherlast	$out2,$out2,$in2
3041	vcipherlast	$out3,$out3,$in3
3042	vcipherlast	$out4,$out4,$in4
3043	blr
3044        .long   	0
3045        .byte   	0,12,0x14,0,0,0,0,0
3046
3047.align	5
3048_aesp8_xts_decrypt6x:
3049	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3050	mflr		r11
3051	li		r7,`$FRAME+8*16+15`
3052	li		r3,`$FRAME+8*16+31`
3053	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3054	stvx		v20,r7,$sp		# ABI says so
3055	addi		r7,r7,32
3056	stvx		v21,r3,$sp
3057	addi		r3,r3,32
3058	stvx		v22,r7,$sp
3059	addi		r7,r7,32
3060	stvx		v23,r3,$sp
3061	addi		r3,r3,32
3062	stvx		v24,r7,$sp
3063	addi		r7,r7,32
3064	stvx		v25,r3,$sp
3065	addi		r3,r3,32
3066	stvx		v26,r7,$sp
3067	addi		r7,r7,32
3068	stvx		v27,r3,$sp
3069	addi		r3,r3,32
3070	stvx		v28,r7,$sp
3071	addi		r7,r7,32
3072	stvx		v29,r3,$sp
3073	addi		r3,r3,32
3074	stvx		v30,r7,$sp
3075	stvx		v31,r3,$sp
3076	li		r0,-1
3077	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3078	li		$x10,0x10
3079	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3080	li		$x20,0x20
3081	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3082	li		$x30,0x30
3083	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3084	li		$x40,0x40
3085	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3086	li		$x50,0x50
3087	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3088	li		$x60,0x60
3089	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3090	li		$x70,0x70
3091	mtspr		256,r0
3092
3093	subi		$rounds,$rounds,3	# -4 in total
3094
3095	lvx		$rndkey0,$x00,$key1	# load key schedule
3096	lvx		v30,$x10,$key1
3097	addi		$key1,$key1,0x20
3098	lvx		v31,$x00,$key1
3099	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3100	addi		$key_,$sp,$FRAME+15
3101	mtctr		$rounds
3102
3103Load_xts_dec_key:
3104	?vperm		v24,v30,v31,$keyperm
3105	lvx		v30,$x10,$key1
3106	addi		$key1,$key1,0x20
3107	stvx		v24,$x00,$key_		# off-load round[1]
3108	?vperm		v25,v31,v30,$keyperm
3109	lvx		v31,$x00,$key1
3110	stvx		v25,$x10,$key_		# off-load round[2]
3111	addi		$key_,$key_,0x20
3112	bdnz		Load_xts_dec_key
3113
3114	lvx		v26,$x10,$key1
3115	?vperm		v24,v30,v31,$keyperm
3116	lvx		v27,$x20,$key1
3117	stvx		v24,$x00,$key_		# off-load round[3]
3118	?vperm		v25,v31,v26,$keyperm
3119	lvx		v28,$x30,$key1
3120	stvx		v25,$x10,$key_		# off-load round[4]
3121	addi		$key_,$sp,$FRAME+15	# rewind $key_
3122	?vperm		v26,v26,v27,$keyperm
3123	lvx		v29,$x40,$key1
3124	?vperm		v27,v27,v28,$keyperm
3125	lvx		v30,$x50,$key1
3126	?vperm		v28,v28,v29,$keyperm
3127	lvx		v31,$x60,$key1
3128	?vperm		v29,v29,v30,$keyperm
3129	lvx		$twk5,$x70,$key1	# borrow $twk5
3130	?vperm		v30,v30,v31,$keyperm
3131	lvx		v24,$x00,$key_		# pre-load round[1]
3132	?vperm		v31,v31,$twk5,$keyperm
3133	lvx		v25,$x10,$key_		# pre-load round[2]
3134
3135	 vperm		$in0,$inout,$inptail,$inpperm
3136	 subi		$inp,$inp,31		# undo "caller"
3137	vxor		$twk0,$tweak,$rndkey0
3138	vsrab		$tmp,$tweak,$seven	# next tweak value
3139	vaddubm		$tweak,$tweak,$tweak
3140	vsldoi		$tmp,$tmp,$tmp,15
3141	vand		$tmp,$tmp,$eighty7
3142	 vxor		$out0,$in0,$twk0
3143	vxor		$tweak,$tweak,$tmp
3144
3145	 lvx_u		$in1,$x10,$inp
3146	vxor		$twk1,$tweak,$rndkey0
3147	vsrab		$tmp,$tweak,$seven	# next tweak value
3148	vaddubm		$tweak,$tweak,$tweak
3149	vsldoi		$tmp,$tmp,$tmp,15
3150	 le?vperm	$in1,$in1,$in1,$leperm
3151	vand		$tmp,$tmp,$eighty7
3152	 vxor		$out1,$in1,$twk1
3153	vxor		$tweak,$tweak,$tmp
3154
3155	 lvx_u		$in2,$x20,$inp
3156	 andi.		$taillen,$len,15
3157	vxor		$twk2,$tweak,$rndkey0
3158	vsrab		$tmp,$tweak,$seven	# next tweak value
3159	vaddubm		$tweak,$tweak,$tweak
3160	vsldoi		$tmp,$tmp,$tmp,15
3161	 le?vperm	$in2,$in2,$in2,$leperm
3162	vand		$tmp,$tmp,$eighty7
3163	 vxor		$out2,$in2,$twk2
3164	vxor		$tweak,$tweak,$tmp
3165
3166	 lvx_u		$in3,$x30,$inp
3167	 sub		$len,$len,$taillen
3168	vxor		$twk3,$tweak,$rndkey0
3169	vsrab		$tmp,$tweak,$seven	# next tweak value
3170	vaddubm		$tweak,$tweak,$tweak
3171	vsldoi		$tmp,$tmp,$tmp,15
3172	 le?vperm	$in3,$in3,$in3,$leperm
3173	vand		$tmp,$tmp,$eighty7
3174	 vxor		$out3,$in3,$twk3
3175	vxor		$tweak,$tweak,$tmp
3176
3177	 lvx_u		$in4,$x40,$inp
3178	 subi		$len,$len,0x60
3179	vxor		$twk4,$tweak,$rndkey0
3180	vsrab		$tmp,$tweak,$seven	# next tweak value
3181	vaddubm		$tweak,$tweak,$tweak
3182	vsldoi		$tmp,$tmp,$tmp,15
3183	 le?vperm	$in4,$in4,$in4,$leperm
3184	vand		$tmp,$tmp,$eighty7
3185	 vxor		$out4,$in4,$twk4
3186	vxor		$tweak,$tweak,$tmp
3187
3188	 lvx_u		$in5,$x50,$inp
3189	 addi		$inp,$inp,0x60
3190	vxor		$twk5,$tweak,$rndkey0
3191	vsrab		$tmp,$tweak,$seven	# next tweak value
3192	vaddubm		$tweak,$tweak,$tweak
3193	vsldoi		$tmp,$tmp,$tmp,15
3194	 le?vperm	$in5,$in5,$in5,$leperm
3195	vand		$tmp,$tmp,$eighty7
3196	 vxor		$out5,$in5,$twk5
3197	vxor		$tweak,$tweak,$tmp
3198
3199	vxor		v31,v31,$rndkey0
3200	mtctr		$rounds
3201	b		Loop_xts_dec6x
3202
3203.align	5
3204Loop_xts_dec6x:
3205	vncipher	$out0,$out0,v24
3206	vncipher	$out1,$out1,v24
3207	vncipher	$out2,$out2,v24
3208	vncipher	$out3,$out3,v24
3209	vncipher	$out4,$out4,v24
3210	vncipher	$out5,$out5,v24
3211	lvx		v24,$x20,$key_		# round[3]
3212	addi		$key_,$key_,0x20
3213
3214	vncipher	$out0,$out0,v25
3215	vncipher	$out1,$out1,v25
3216	vncipher	$out2,$out2,v25
3217	vncipher	$out3,$out3,v25
3218	vncipher	$out4,$out4,v25
3219	vncipher	$out5,$out5,v25
3220	lvx		v25,$x10,$key_		# round[4]
3221	bdnz		Loop_xts_dec6x
3222
3223	subic		$len,$len,96		# $len-=96
3224	 vxor		$in0,$twk0,v31		# xor with last round key
3225	vncipher	$out0,$out0,v24
3226	vncipher	$out1,$out1,v24
3227	 vsrab		$tmp,$tweak,$seven	# next tweak value
3228	 vxor		$twk0,$tweak,$rndkey0
3229	 vaddubm	$tweak,$tweak,$tweak
3230	vncipher	$out2,$out2,v24
3231	vncipher	$out3,$out3,v24
3232	 vsldoi		$tmp,$tmp,$tmp,15
3233	vncipher	$out4,$out4,v24
3234	vncipher	$out5,$out5,v24
3235
3236	subfe.		r0,r0,r0		# borrow?-1:0
3237	 vand		$tmp,$tmp,$eighty7
3238	vncipher	$out0,$out0,v25
3239	vncipher	$out1,$out1,v25
3240	 vxor		$tweak,$tweak,$tmp
3241	vncipher	$out2,$out2,v25
3242	vncipher	$out3,$out3,v25
3243	 vxor		$in1,$twk1,v31
3244	 vsrab		$tmp,$tweak,$seven	# next tweak value
3245	 vxor		$twk1,$tweak,$rndkey0
3246	vncipher	$out4,$out4,v25
3247	vncipher	$out5,$out5,v25
3248
3249	and		r0,r0,$len
3250	 vaddubm	$tweak,$tweak,$tweak
3251	 vsldoi		$tmp,$tmp,$tmp,15
3252	vncipher	$out0,$out0,v26
3253	vncipher	$out1,$out1,v26
3254	 vand		$tmp,$tmp,$eighty7
3255	vncipher	$out2,$out2,v26
3256	vncipher	$out3,$out3,v26
3257	 vxor		$tweak,$tweak,$tmp
3258	vncipher	$out4,$out4,v26
3259	vncipher	$out5,$out5,v26
3260
3261	add		$inp,$inp,r0		# $inp is adjusted in such
3262						# way that at exit from the
3263						# loop inX-in5 are loaded
3264						# with last "words"
3265	 vxor		$in2,$twk2,v31
3266	 vsrab		$tmp,$tweak,$seven	# next tweak value
3267	 vxor		$twk2,$tweak,$rndkey0
3268	 vaddubm	$tweak,$tweak,$tweak
3269	vncipher	$out0,$out0,v27
3270	vncipher	$out1,$out1,v27
3271	 vsldoi		$tmp,$tmp,$tmp,15
3272	vncipher	$out2,$out2,v27
3273	vncipher	$out3,$out3,v27
3274	 vand		$tmp,$tmp,$eighty7
3275	vncipher	$out4,$out4,v27
3276	vncipher	$out5,$out5,v27
3277
3278	addi		$key_,$sp,$FRAME+15	# rewind $key_
3279	 vxor		$tweak,$tweak,$tmp
3280	vncipher	$out0,$out0,v28
3281	vncipher	$out1,$out1,v28
3282	 vxor		$in3,$twk3,v31
3283	 vsrab		$tmp,$tweak,$seven	# next tweak value
3284	 vxor		$twk3,$tweak,$rndkey0
3285	vncipher	$out2,$out2,v28
3286	vncipher	$out3,$out3,v28
3287	 vaddubm	$tweak,$tweak,$tweak
3288	 vsldoi		$tmp,$tmp,$tmp,15
3289	vncipher	$out4,$out4,v28
3290	vncipher	$out5,$out5,v28
3291	lvx		v24,$x00,$key_		# re-pre-load round[1]
3292	 vand		$tmp,$tmp,$eighty7
3293
3294	vncipher	$out0,$out0,v29
3295	vncipher	$out1,$out1,v29
3296	 vxor		$tweak,$tweak,$tmp
3297	vncipher	$out2,$out2,v29
3298	vncipher	$out3,$out3,v29
3299	 vxor		$in4,$twk4,v31
3300	 vsrab		$tmp,$tweak,$seven	# next tweak value
3301	 vxor		$twk4,$tweak,$rndkey0
3302	vncipher	$out4,$out4,v29
3303	vncipher	$out5,$out5,v29
3304	lvx		v25,$x10,$key_		# re-pre-load round[2]
3305	 vaddubm	$tweak,$tweak,$tweak
3306	 vsldoi		$tmp,$tmp,$tmp,15
3307
3308	vncipher	$out0,$out0,v30
3309	vncipher	$out1,$out1,v30
3310	 vand		$tmp,$tmp,$eighty7
3311	vncipher	$out2,$out2,v30
3312	vncipher	$out3,$out3,v30
3313	 vxor		$tweak,$tweak,$tmp
3314	vncipher	$out4,$out4,v30
3315	vncipher	$out5,$out5,v30
3316	 vxor		$in5,$twk5,v31
3317	 vsrab		$tmp,$tweak,$seven	# next tweak value
3318	 vxor		$twk5,$tweak,$rndkey0
3319
3320	vncipherlast	$out0,$out0,$in0
3321	 lvx_u		$in0,$x00,$inp		# load next input block
3322	 vaddubm	$tweak,$tweak,$tweak
3323	 vsldoi		$tmp,$tmp,$tmp,15
3324	vncipherlast	$out1,$out1,$in1
3325	 lvx_u		$in1,$x10,$inp
3326	vncipherlast	$out2,$out2,$in2
3327	 le?vperm	$in0,$in0,$in0,$leperm
3328	 lvx_u		$in2,$x20,$inp
3329	 vand		$tmp,$tmp,$eighty7
3330	vncipherlast	$out3,$out3,$in3
3331	 le?vperm	$in1,$in1,$in1,$leperm
3332	 lvx_u		$in3,$x30,$inp
3333	vncipherlast	$out4,$out4,$in4
3334	 le?vperm	$in2,$in2,$in2,$leperm
3335	 lvx_u		$in4,$x40,$inp
3336	 vxor		$tweak,$tweak,$tmp
3337	vncipherlast	$out5,$out5,$in5
3338	 le?vperm	$in3,$in3,$in3,$leperm
3339	 lvx_u		$in5,$x50,$inp
3340	 addi		$inp,$inp,0x60
3341	 le?vperm	$in4,$in4,$in4,$leperm
3342	 le?vperm	$in5,$in5,$in5,$leperm
3343
3344	le?vperm	$out0,$out0,$out0,$leperm
3345	le?vperm	$out1,$out1,$out1,$leperm
3346	stvx_u		$out0,$x00,$out		# store output
3347	 vxor		$out0,$in0,$twk0
3348	le?vperm	$out2,$out2,$out2,$leperm
3349	stvx_u		$out1,$x10,$out
3350	 vxor		$out1,$in1,$twk1
3351	le?vperm	$out3,$out3,$out3,$leperm
3352	stvx_u		$out2,$x20,$out
3353	 vxor		$out2,$in2,$twk2
3354	le?vperm	$out4,$out4,$out4,$leperm
3355	stvx_u		$out3,$x30,$out
3356	 vxor		$out3,$in3,$twk3
3357	le?vperm	$out5,$out5,$out5,$leperm
3358	stvx_u		$out4,$x40,$out
3359	 vxor		$out4,$in4,$twk4
3360	stvx_u		$out5,$x50,$out
3361	 vxor		$out5,$in5,$twk5
3362	addi		$out,$out,0x60
3363
3364	mtctr		$rounds
3365	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3366
3367	addic.		$len,$len,0x60
3368	beq		Lxts_dec6x_zero
3369	cmpwi		$len,0x20
3370	blt		Lxts_dec6x_one
3371	nop
3372	beq		Lxts_dec6x_two
3373	cmpwi		$len,0x40
3374	blt		Lxts_dec6x_three
3375	nop
3376	beq		Lxts_dec6x_four
3377
3378Lxts_dec6x_five:
3379	vxor		$out0,$in1,$twk0
3380	vxor		$out1,$in2,$twk1
3381	vxor		$out2,$in3,$twk2
3382	vxor		$out3,$in4,$twk3
3383	vxor		$out4,$in5,$twk4
3384
3385	bl		_aesp8_xts_dec5x
3386
3387	le?vperm	$out0,$out0,$out0,$leperm
3388	vmr		$twk0,$twk5		# unused tweak
3389	vxor		$twk1,$tweak,$rndkey0
3390	le?vperm	$out1,$out1,$out1,$leperm
3391	stvx_u		$out0,$x00,$out		# store output
3392	vxor		$out0,$in0,$twk1
3393	le?vperm	$out2,$out2,$out2,$leperm
3394	stvx_u		$out1,$x10,$out
3395	le?vperm	$out3,$out3,$out3,$leperm
3396	stvx_u		$out2,$x20,$out
3397	le?vperm	$out4,$out4,$out4,$leperm
3398	stvx_u		$out3,$x30,$out
3399	stvx_u		$out4,$x40,$out
3400	addi		$out,$out,0x50
3401	bne		Lxts_dec6x_steal
3402	b		Lxts_dec6x_done
3403
3404.align	4
3405Lxts_dec6x_four:
3406	vxor		$out0,$in2,$twk0
3407	vxor		$out1,$in3,$twk1
3408	vxor		$out2,$in4,$twk2
3409	vxor		$out3,$in5,$twk3
3410	vxor		$out4,$out4,$out4
3411
3412	bl		_aesp8_xts_dec5x
3413
3414	le?vperm	$out0,$out0,$out0,$leperm
3415	vmr		$twk0,$twk4		# unused tweak
3416	vmr		$twk1,$twk5
3417	le?vperm	$out1,$out1,$out1,$leperm
3418	stvx_u		$out0,$x00,$out		# store output
3419	vxor		$out0,$in0,$twk5
3420	le?vperm	$out2,$out2,$out2,$leperm
3421	stvx_u		$out1,$x10,$out
3422	le?vperm	$out3,$out3,$out3,$leperm
3423	stvx_u		$out2,$x20,$out
3424	stvx_u		$out3,$x30,$out
3425	addi		$out,$out,0x40
3426	bne		Lxts_dec6x_steal
3427	b		Lxts_dec6x_done
3428
3429.align	4
3430Lxts_dec6x_three:
3431	vxor		$out0,$in3,$twk0
3432	vxor		$out1,$in4,$twk1
3433	vxor		$out2,$in5,$twk2
3434	vxor		$out3,$out3,$out3
3435	vxor		$out4,$out4,$out4
3436
3437	bl		_aesp8_xts_dec5x
3438
3439	le?vperm	$out0,$out0,$out0,$leperm
3440	vmr		$twk0,$twk3		# unused tweak
3441	vmr		$twk1,$twk4
3442	le?vperm	$out1,$out1,$out1,$leperm
3443	stvx_u		$out0,$x00,$out		# store output
3444	vxor		$out0,$in0,$twk4
3445	le?vperm	$out2,$out2,$out2,$leperm
3446	stvx_u		$out1,$x10,$out
3447	stvx_u		$out2,$x20,$out
3448	addi		$out,$out,0x30
3449	bne		Lxts_dec6x_steal
3450	b		Lxts_dec6x_done
3451
3452.align	4
3453Lxts_dec6x_two:
3454	vxor		$out0,$in4,$twk0
3455	vxor		$out1,$in5,$twk1
3456	vxor		$out2,$out2,$out2
3457	vxor		$out3,$out3,$out3
3458	vxor		$out4,$out4,$out4
3459
3460	bl		_aesp8_xts_dec5x
3461
3462	le?vperm	$out0,$out0,$out0,$leperm
3463	vmr		$twk0,$twk2		# unused tweak
3464	vmr		$twk1,$twk3
3465	le?vperm	$out1,$out1,$out1,$leperm
3466	stvx_u		$out0,$x00,$out		# store output
3467	vxor		$out0,$in0,$twk3
3468	stvx_u		$out1,$x10,$out
3469	addi		$out,$out,0x20
3470	bne		Lxts_dec6x_steal
3471	b		Lxts_dec6x_done
3472
3473.align	4
3474Lxts_dec6x_one:
3475	vxor		$out0,$in5,$twk0
3476	nop
3477Loop_xts_dec1x:
3478	vncipher	$out0,$out0,v24
3479	lvx		v24,$x20,$key_		# round[3]
3480	addi		$key_,$key_,0x20
3481
3482	vncipher	$out0,$out0,v25
3483	lvx		v25,$x10,$key_		# round[4]
3484	bdnz		Loop_xts_dec1x
3485
3486	subi		r0,$taillen,1
3487	vncipher	$out0,$out0,v24
3488
3489	andi.		r0,r0,16
3490	cmpwi		$taillen,0
3491	vncipher	$out0,$out0,v25
3492
3493	sub		$inp,$inp,r0
3494	vncipher	$out0,$out0,v26
3495
3496	lvx_u		$in0,0,$inp
3497	vncipher	$out0,$out0,v27
3498
3499	addi		$key_,$sp,$FRAME+15	# rewind $key_
3500	vncipher	$out0,$out0,v28
3501	lvx		v24,$x00,$key_		# re-pre-load round[1]
3502
3503	vncipher	$out0,$out0,v29
3504	lvx		v25,$x10,$key_		# re-pre-load round[2]
3505	 vxor		$twk0,$twk0,v31
3506
3507	le?vperm	$in0,$in0,$in0,$leperm
3508	vncipher	$out0,$out0,v30
3509
3510	mtctr		$rounds
3511	vncipherlast	$out0,$out0,$twk0
3512
3513	vmr		$twk0,$twk1		# unused tweak
3514	vmr		$twk1,$twk2
3515	le?vperm	$out0,$out0,$out0,$leperm
3516	stvx_u		$out0,$x00,$out		# store output
3517	addi		$out,$out,0x10
3518	vxor		$out0,$in0,$twk2
3519	bne		Lxts_dec6x_steal
3520	b		Lxts_dec6x_done
3521
3522.align	4
3523Lxts_dec6x_zero:
3524	cmpwi		$taillen,0
3525	beq		Lxts_dec6x_done
3526
3527	lvx_u		$in0,0,$inp
3528	le?vperm	$in0,$in0,$in0,$leperm
3529	vxor		$out0,$in0,$twk1
3530Lxts_dec6x_steal:
3531	vncipher	$out0,$out0,v24
3532	lvx		v24,$x20,$key_		# round[3]
3533	addi		$key_,$key_,0x20
3534
3535	vncipher	$out0,$out0,v25
3536	lvx		v25,$x10,$key_		# round[4]
3537	bdnz		Lxts_dec6x_steal
3538
3539	add		$inp,$inp,$taillen
3540	vncipher	$out0,$out0,v24
3541
3542	cmpwi		$taillen,0
3543	vncipher	$out0,$out0,v25
3544
3545	lvx_u		$in0,0,$inp
3546	vncipher	$out0,$out0,v26
3547
3548	lvsr		$inpperm,0,$taillen	# $in5 is no more
3549	vncipher	$out0,$out0,v27
3550
3551	addi		$key_,$sp,$FRAME+15	# rewind $key_
3552	vncipher	$out0,$out0,v28
3553	lvx		v24,$x00,$key_		# re-pre-load round[1]
3554
3555	vncipher	$out0,$out0,v29
3556	lvx		v25,$x10,$key_		# re-pre-load round[2]
3557	 vxor		$twk1,$twk1,v31
3558
3559	le?vperm	$in0,$in0,$in0,$leperm
3560	vncipher	$out0,$out0,v30
3561
3562	vperm		$in0,$in0,$in0,$inpperm
3563	vncipherlast	$tmp,$out0,$twk1
3564
3565	le?vperm	$out0,$tmp,$tmp,$leperm
3566	le?stvx_u	$out0,0,$out
3567	be?stvx_u	$tmp,0,$out
3568
3569	vxor		$out0,$out0,$out0
3570	vspltisb	$out1,-1
3571	vperm		$out0,$out0,$out1,$inpperm
3572	vsel		$out0,$in0,$tmp,$out0
3573	vxor		$out0,$out0,$twk0
3574
3575	subi		r30,$out,1
3576	mtctr		$taillen
3577Loop_xts_dec6x_steal:
3578	lbzu		r0,1(r30)
3579	stb		r0,16(r30)
3580	bdnz		Loop_xts_dec6x_steal
3581
3582	li		$taillen,0
3583	mtctr		$rounds
3584	b		Loop_xts_dec1x		# one more time...
3585
3586.align	4
3587Lxts_dec6x_done:
3588	${UCMP}i	$ivp,0
3589	beq		Lxts_dec6x_ret
3590
3591	vxor		$tweak,$twk0,$rndkey0
3592	le?vperm	$tweak,$tweak,$tweak,$leperm
3593	stvx_u		$tweak,0,$ivp
3594
3595Lxts_dec6x_ret:
3596	mtlr		r11
3597	li		r10,`$FRAME+15`
3598	li		r11,`$FRAME+31`
3599	stvx		$seven,r10,$sp		# wipe copies of round keys
3600	addi		r10,r10,32
3601	stvx		$seven,r11,$sp
3602	addi		r11,r11,32
3603	stvx		$seven,r10,$sp
3604	addi		r10,r10,32
3605	stvx		$seven,r11,$sp
3606	addi		r11,r11,32
3607	stvx		$seven,r10,$sp
3608	addi		r10,r10,32
3609	stvx		$seven,r11,$sp
3610	addi		r11,r11,32
3611	stvx		$seven,r10,$sp
3612	addi		r10,r10,32
3613	stvx		$seven,r11,$sp
3614	addi		r11,r11,32
3615
3616	mtspr		256,$vrsave
3617	lvx		v20,r10,$sp		# ABI says so
3618	addi		r10,r10,32
3619	lvx		v21,r11,$sp
3620	addi		r11,r11,32
3621	lvx		v22,r10,$sp
3622	addi		r10,r10,32
3623	lvx		v23,r11,$sp
3624	addi		r11,r11,32
3625	lvx		v24,r10,$sp
3626	addi		r10,r10,32
3627	lvx		v25,r11,$sp
3628	addi		r11,r11,32
3629	lvx		v26,r10,$sp
3630	addi		r10,r10,32
3631	lvx		v27,r11,$sp
3632	addi		r11,r11,32
3633	lvx		v28,r10,$sp
3634	addi		r10,r10,32
3635	lvx		v29,r11,$sp
3636	addi		r11,r11,32
3637	lvx		v30,r10,$sp
3638	lvx		v31,r11,$sp
3639	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3640	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3641	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3642	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3643	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3644	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3645	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3646	blr
3647	.long		0
3648	.byte		0,12,0x04,1,0x80,6,6,0
3649	.long		0
3650
3651.align	5
3652_aesp8_xts_dec5x:
3653	vncipher	$out0,$out0,v24
3654	vncipher	$out1,$out1,v24
3655	vncipher	$out2,$out2,v24
3656	vncipher	$out3,$out3,v24
3657	vncipher	$out4,$out4,v24
3658	lvx		v24,$x20,$key_		# round[3]
3659	addi		$key_,$key_,0x20
3660
3661	vncipher	$out0,$out0,v25
3662	vncipher	$out1,$out1,v25
3663	vncipher	$out2,$out2,v25
3664	vncipher	$out3,$out3,v25
3665	vncipher	$out4,$out4,v25
3666	lvx		v25,$x10,$key_		# round[4]
3667	bdnz		_aesp8_xts_dec5x
3668
3669	subi		r0,$taillen,1
3670	vncipher	$out0,$out0,v24
3671	vncipher	$out1,$out1,v24
3672	vncipher	$out2,$out2,v24
3673	vncipher	$out3,$out3,v24
3674	vncipher	$out4,$out4,v24
3675
3676	andi.		r0,r0,16
3677	cmpwi		$taillen,0
3678	vncipher	$out0,$out0,v25
3679	vncipher	$out1,$out1,v25
3680	vncipher	$out2,$out2,v25
3681	vncipher	$out3,$out3,v25
3682	vncipher	$out4,$out4,v25
3683	 vxor		$twk0,$twk0,v31
3684
3685	sub		$inp,$inp,r0
3686	vncipher	$out0,$out0,v26
3687	vncipher	$out1,$out1,v26
3688	vncipher	$out2,$out2,v26
3689	vncipher	$out3,$out3,v26
3690	vncipher	$out4,$out4,v26
3691	 vxor		$in1,$twk1,v31
3692
3693	vncipher	$out0,$out0,v27
3694	lvx_u		$in0,0,$inp
3695	vncipher	$out1,$out1,v27
3696	vncipher	$out2,$out2,v27
3697	vncipher	$out3,$out3,v27
3698	vncipher	$out4,$out4,v27
3699	 vxor		$in2,$twk2,v31
3700
3701	addi		$key_,$sp,$FRAME+15	# rewind $key_
3702	vncipher	$out0,$out0,v28
3703	vncipher	$out1,$out1,v28
3704	vncipher	$out2,$out2,v28
3705	vncipher	$out3,$out3,v28
3706	vncipher	$out4,$out4,v28
3707	lvx		v24,$x00,$key_		# re-pre-load round[1]
3708	 vxor		$in3,$twk3,v31
3709
3710	vncipher	$out0,$out0,v29
3711	le?vperm	$in0,$in0,$in0,$leperm
3712	vncipher	$out1,$out1,v29
3713	vncipher	$out2,$out2,v29
3714	vncipher	$out3,$out3,v29
3715	vncipher	$out4,$out4,v29
3716	lvx		v25,$x10,$key_		# re-pre-load round[2]
3717	 vxor		$in4,$twk4,v31
3718
3719	vncipher	$out0,$out0,v30
3720	vncipher	$out1,$out1,v30
3721	vncipher	$out2,$out2,v30
3722	vncipher	$out3,$out3,v30
3723	vncipher	$out4,$out4,v30
3724
3725	vncipherlast	$out0,$out0,$twk0
3726	vncipherlast	$out1,$out1,$in1
3727	vncipherlast	$out2,$out2,$in2
3728	vncipherlast	$out3,$out3,$in3
3729	vncipherlast	$out4,$out4,$in4
3730	mtctr		$rounds
3731	blr
3732        .long   	0
3733        .byte   	0,12,0x14,0,0,0,0,0
3734___
3735}}	}}}
3736
3737my $consts=1;
3738foreach(split("\n",$code)) {
3739        s/\`([^\`]*)\`/eval($1)/geo;
3740
3741	# constants table endian-specific conversion
3742	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
3743	    my $conv=$3;
3744	    my @bytes=();
3745
3746	    # convert to endian-agnostic format
3747	    if ($1 eq "long") {
3748	      foreach (split(/,\s*/,$2)) {
3749		my $l = /^0/?oct:int;
3750		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
3751	      }
3752	    } else {
3753		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
3754	    }
3755
3756	    # little-endian conversion
3757	    if ($flavour =~ /le$/o) {
3758		SWITCH: for($conv)  {
3759		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
3760		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
3761		}
3762	    }
3763
3764	    #emit
3765	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
3766	    next;
3767	}
3768	$consts=0 if (m/Lconsts:/o);	# end of table
3769
3770	# instructions prefixed with '?' are endian-specific and need
3771	# to be adjusted accordingly...
3772	if ($flavour =~ /le$/o) {	# little-endian
3773	    s/le\?//o		or
3774	    s/be\?/#be#/o	or
3775	    s/\?lvsr/lvsl/o	or
3776	    s/\?lvsl/lvsr/o	or
3777	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
3778	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
3779	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
3780	} else {			# big-endian
3781	    s/le\?/#le#/o	or
3782	    s/be\?//o		or
3783	    s/\?([a-z]+)/$1/o;
3784	}
3785
3786        print $_,"\n";
3787}
3788
3789close STDOUT;
3790