xref: /openbmc/linux/drivers/crypto/vmx/aesp8-ppc.pl (revision ea47eed33a3fe3d919e6e3cf4e4eb5507b817188)
1#! /usr/bin/env perl
2# SPDX-License-Identifier: GPL-2.0
3
4# This code is taken from CRYPTOGAMs[1] and is included here using the option
5# in the license to distribute the code under the GPL. Therefore this program
6# is free software; you can redistribute it and/or modify it under the terms of
7# the GNU General Public License version 2 as published by the Free Software
8# Foundation.
9#
10# [1] https://www.openssl.org/~appro/cryptogams/
11
12# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
13# All rights reserved.
14#
15# Redistribution and use in source and binary forms, with or without
16# modification, are permitted provided that the following conditions
17# are met:
18#
19#       * Redistributions of source code must retain copyright notices,
20#         this list of conditions and the following disclaimer.
21#
22#       * Redistributions in binary form must reproduce the above
23#         copyright notice, this list of conditions and the following
24#         disclaimer in the documentation and/or other materials
25#         provided with the distribution.
26#
27#       * Neither the name of the CRYPTOGAMS nor the names of its
28#         copyright holder and contributors may be used to endorse or
29#         promote products derived from this software without specific
30#         prior written permission.
31#
32# ALTERNATIVELY, provided that this notice is retained in full, this
33# product may be distributed under the terms of the GNU General Public
34# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
35# those given above.
36#
37# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
38# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48
49# ====================================================================
50# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
51# project. The module is, however, dual licensed under OpenSSL and
52# CRYPTOGAMS licenses depending on where you obtain it. For further
53# details see http://www.openssl.org/~appro/cryptogams/.
54# ====================================================================
55#
56# This module implements support for AES instructions as per PowerISA
57# specification version 2.07, first implemented by POWER8 processor.
58# The module is endian-agnostic in the sense that it supports both big-
59# and little-endian cases. Data alignment in parallelizable modes is
60# handled with VSX loads and stores, which implies MSR.VSX flag being
61# set. It should also be noted that ISA specification doesn't prohibit
62# alignment exceptions for these instructions on page boundaries.
63# Initially alignment was handled in a pure AltiVec/VMX way [when data
64# is aligned programmatically, which in turn guarantees exception-
65# free execution], but it turned out to hamper performance when vcipher
66# instructions are interleaved. It's reckoned that eventual
67# misalignment penalties at page boundaries are on average lower
68# than the additional overhead of the pure AltiVec approach.
69#
70# May 2016
71#
72# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
73# systems were measured.
74#
75######################################################################
76# Current large-block performance in cycles per byte processed with
77# 128-bit key (less is better).
78#
79#		CBC en-/decrypt	CTR	XTS
80# POWER8[le]	3.96/0.72	0.74	1.1
81# POWER8[be]	3.75/0.65	0.66	1.0
82
# Usage: <this script> <flavour> [<output>]
#
# <flavour> selects the target ABI: a name containing "64" picks 8-byte
# pointers and the doubleword store/load/compare/shift mnemonics, a name
# containing "32" picks the 4-byte word forms; anything else is fatal.
# A flavour ending in "le" marks a little-endian target.
83$flavour = shift;
84
85if ($flavour =~ /64/) {
86	$SIZE_T	=8;
87	$LRSAVE	=2*$SIZE_T;
88	$STU	="stdu";
89	$POP	="ld";
90	$PUSH	="std";
91	$UCMP	="cmpld";
92	$SHL	="sldi";
93} elsif ($flavour =~ /32/) {
94	$SIZE_T	=4;
95	$LRSAVE	=$SIZE_T;
96	$STU	="stwu";
97	$POP	="lwz";
98	$PUSH	="stw";
99	$UCMP	="cmplw";
100	$SHL	="slwi";
101} else { die "nonsense $flavour"; }
102
103$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
104
# Look for the perlasm translator next to this script first, then under
# ../../perlasm/ relative to the script's directory.
105$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
106( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
107( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
108die "can't locate ppc-xlate.pl";
109
# From here on STDOUT is a pipe into the translator; the next command-line
# argument (if any) is passed through to it.
# The check must use low-precedence "or": with "||" the die attaches to the
# concatenated command string, which is always true, so a failed open()
# would go unnoticed.
110open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
111
# Stack frame size, exported symbol prefix, and fixed register names used
# by the code generators below.
112$FRAME=8*$SIZE_T;
113$prefix="aes_p8";
114
115$sp="r1";
116$vrsave="r12";
117
118#########################################################################
119{{{	# Key setup procedures						#
# General-purpose registers: $inp=r3 (user key), $bits=r4 (key length in
# bits), $out=r5 (key schedule); $ptr=r6, $cnt=r7 and $rounds=r8 are
# working registers.  Vector registers v0-v11 are named below.
120my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
121my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
122my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
123
# The heredoc below appends three pieces of assembly to $code:
#
#   rcon / Lconsts
#       A constant table (two round-constant rows, the permutation mask
#       used for "rotate-n-splat", and a zero row) followed by a helper
#       that leaves the address of the table in $ptr.
#
#   ${prefix}_set_encrypt_key (local label Lset_encrypt_key)
#       Expands the key at $inp into the schedule at $out.  The value
#       returned in r3 is 0 on success, -1 if $inp or $out is zero, and
#       -2 if $bits is below 128, above 256, or not a multiple of 64.
#       Round keys are written through a head/tail merge so that $out
#       need not be 16-byte aligned.  The round count (10, 12 or 14) is
#       stored as a word 240 bytes past the start of the schedule.
#
#   ${prefix}_set_decrypt_key
#       Calls Lset_encrypt_key and, if that returned 0, swaps the round
#       keys end-for-end in place, four words at a time.  It relies on
#       the values Lset_encrypt_key leaves in $out and $rounds.
#
# The heredoc text itself is emitted verbatim (after interpolation), so
# it must not be reformatted.
124$code.=<<___;
125.machine	"any"
126
127.text
128
129.align	7
130rcon:
131.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
132.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
133.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
134.long	0,0,0,0						?asis
135Lconsts:
136	mflr	r0
137	bcl	20,31,\$+4
138	mflr	$ptr	 #vvvvv "distance between . and rcon
139	addi	$ptr,$ptr,-0x48
140	mtlr	r0
141	blr
142	.long	0
143	.byte	0,12,0x14,0,0,0,0,0
144.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
145
146.globl	.${prefix}_set_encrypt_key
147Lset_encrypt_key:
148	mflr		r11
149	$PUSH		r11,$LRSAVE($sp)
150
151	li		$ptr,-1
152	${UCMP}i	$inp,0
153	beq-		Lenc_key_abort		# if ($inp==0) return -1;
154	${UCMP}i	$out,0
155	beq-		Lenc_key_abort		# if ($out==0) return -1;
156	li		$ptr,-2
157	cmpwi		$bits,128
158	blt-		Lenc_key_abort
159	cmpwi		$bits,256
160	bgt-		Lenc_key_abort
161	andi.		r0,$bits,0x3f
162	bne-		Lenc_key_abort
163
164	lis		r0,0xfff0
165	mfspr		$vrsave,256
166	mtspr		256,r0
167
168	bl		Lconsts
169	mtlr		r11
170
171	neg		r9,$inp
172	lvx		$in0,0,$inp
173	addi		$inp,$inp,15		# 15 is not typo
174	lvsr		$key,0,r9		# borrow $key
175	li		r8,0x20
176	cmpwi		$bits,192
177	lvx		$in1,0,$inp
178	le?vspltisb	$mask,0x0f		# borrow $mask
179	lvx		$rcon,0,$ptr
180	le?vxor		$key,$key,$mask		# adjust for byte swap
181	lvx		$mask,r8,$ptr
182	addi		$ptr,$ptr,0x10
183	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
184	li		$cnt,8
185	vxor		$zero,$zero,$zero
186	mtctr		$cnt
187
188	?lvsr		$outperm,0,$out
189	vspltisb	$outmask,-1
190	lvx		$outhead,0,$out
191	?vperm		$outmask,$zero,$outmask,$outperm
192
193	blt		Loop128
194	addi		$inp,$inp,8
195	beq		L192
196	addi		$inp,$inp,8
197	b		L256
198
199.align	4
200Loop128:
201	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
202	vsldoi		$tmp,$zero,$in0,12	# >>32
203	 vperm		$outtail,$in0,$in0,$outperm	# rotate
204	 vsel		$stage,$outhead,$outtail,$outmask
205	 vmr		$outhead,$outtail
206	vcipherlast	$key,$key,$rcon
207	 stvx		$stage,0,$out
208	 addi		$out,$out,16
209
210	vxor		$in0,$in0,$tmp
211	vsldoi		$tmp,$zero,$tmp,12	# >>32
212	vxor		$in0,$in0,$tmp
213	vsldoi		$tmp,$zero,$tmp,12	# >>32
214	vxor		$in0,$in0,$tmp
215	 vadduwm	$rcon,$rcon,$rcon
216	vxor		$in0,$in0,$key
217	bdnz		Loop128
218
219	lvx		$rcon,0,$ptr		# last two round keys
220
221	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
222	vsldoi		$tmp,$zero,$in0,12	# >>32
223	 vperm		$outtail,$in0,$in0,$outperm	# rotate
224	 vsel		$stage,$outhead,$outtail,$outmask
225	 vmr		$outhead,$outtail
226	vcipherlast	$key,$key,$rcon
227	 stvx		$stage,0,$out
228	 addi		$out,$out,16
229
230	vxor		$in0,$in0,$tmp
231	vsldoi		$tmp,$zero,$tmp,12	# >>32
232	vxor		$in0,$in0,$tmp
233	vsldoi		$tmp,$zero,$tmp,12	# >>32
234	vxor		$in0,$in0,$tmp
235	 vadduwm	$rcon,$rcon,$rcon
236	vxor		$in0,$in0,$key
237
238	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
239	vsldoi		$tmp,$zero,$in0,12	# >>32
240	 vperm		$outtail,$in0,$in0,$outperm	# rotate
241	 vsel		$stage,$outhead,$outtail,$outmask
242	 vmr		$outhead,$outtail
243	vcipherlast	$key,$key,$rcon
244	 stvx		$stage,0,$out
245	 addi		$out,$out,16
246
247	vxor		$in0,$in0,$tmp
248	vsldoi		$tmp,$zero,$tmp,12	# >>32
249	vxor		$in0,$in0,$tmp
250	vsldoi		$tmp,$zero,$tmp,12	# >>32
251	vxor		$in0,$in0,$tmp
252	vxor		$in0,$in0,$key
253	 vperm		$outtail,$in0,$in0,$outperm	# rotate
254	 vsel		$stage,$outhead,$outtail,$outmask
255	 vmr		$outhead,$outtail
256	 stvx		$stage,0,$out
257
258	addi		$inp,$out,15		# 15 is not typo
259	addi		$out,$out,0x50
260
261	li		$rounds,10
262	b		Ldone
263
264.align	4
265L192:
266	lvx		$tmp,0,$inp
267	li		$cnt,4
268	 vperm		$outtail,$in0,$in0,$outperm	# rotate
269	 vsel		$stage,$outhead,$outtail,$outmask
270	 vmr		$outhead,$outtail
271	 stvx		$stage,0,$out
272	 addi		$out,$out,16
273	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
274	vspltisb	$key,8			# borrow $key
275	mtctr		$cnt
276	vsububm		$mask,$mask,$key	# adjust the mask
277
278Loop192:
279	vperm		$key,$in1,$in1,$mask	# roate-n-splat
280	vsldoi		$tmp,$zero,$in0,12	# >>32
281	vcipherlast	$key,$key,$rcon
282
283	vxor		$in0,$in0,$tmp
284	vsldoi		$tmp,$zero,$tmp,12	# >>32
285	vxor		$in0,$in0,$tmp
286	vsldoi		$tmp,$zero,$tmp,12	# >>32
287	vxor		$in0,$in0,$tmp
288
289	 vsldoi		$stage,$zero,$in1,8
290	vspltw		$tmp,$in0,3
291	vxor		$tmp,$tmp,$in1
292	vsldoi		$in1,$zero,$in1,12	# >>32
293	 vadduwm	$rcon,$rcon,$rcon
294	vxor		$in1,$in1,$tmp
295	vxor		$in0,$in0,$key
296	vxor		$in1,$in1,$key
297	 vsldoi		$stage,$stage,$in0,8
298
299	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
300	vsldoi		$tmp,$zero,$in0,12	# >>32
301	 vperm		$outtail,$stage,$stage,$outperm	# rotate
302	 vsel		$stage,$outhead,$outtail,$outmask
303	 vmr		$outhead,$outtail
304	vcipherlast	$key,$key,$rcon
305	 stvx		$stage,0,$out
306	 addi		$out,$out,16
307
308	 vsldoi		$stage,$in0,$in1,8
309	vxor		$in0,$in0,$tmp
310	vsldoi		$tmp,$zero,$tmp,12	# >>32
311	 vperm		$outtail,$stage,$stage,$outperm	# rotate
312	 vsel		$stage,$outhead,$outtail,$outmask
313	 vmr		$outhead,$outtail
314	vxor		$in0,$in0,$tmp
315	vsldoi		$tmp,$zero,$tmp,12	# >>32
316	vxor		$in0,$in0,$tmp
317	 stvx		$stage,0,$out
318	 addi		$out,$out,16
319
320	vspltw		$tmp,$in0,3
321	vxor		$tmp,$tmp,$in1
322	vsldoi		$in1,$zero,$in1,12	# >>32
323	 vadduwm	$rcon,$rcon,$rcon
324	vxor		$in1,$in1,$tmp
325	vxor		$in0,$in0,$key
326	vxor		$in1,$in1,$key
327	 vperm		$outtail,$in0,$in0,$outperm	# rotate
328	 vsel		$stage,$outhead,$outtail,$outmask
329	 vmr		$outhead,$outtail
330	 stvx		$stage,0,$out
331	 addi		$inp,$out,15		# 15 is not typo
332	 addi		$out,$out,16
333	bdnz		Loop192
334
335	li		$rounds,12
336	addi		$out,$out,0x20
337	b		Ldone
338
339.align	4
340L256:
341	lvx		$tmp,0,$inp
342	li		$cnt,7
343	li		$rounds,14
344	 vperm		$outtail,$in0,$in0,$outperm	# rotate
345	 vsel		$stage,$outhead,$outtail,$outmask
346	 vmr		$outhead,$outtail
347	 stvx		$stage,0,$out
348	 addi		$out,$out,16
349	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
350	mtctr		$cnt
351
352Loop256:
353	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
354	vsldoi		$tmp,$zero,$in0,12	# >>32
355	 vperm		$outtail,$in1,$in1,$outperm	# rotate
356	 vsel		$stage,$outhead,$outtail,$outmask
357	 vmr		$outhead,$outtail
358	vcipherlast	$key,$key,$rcon
359	 stvx		$stage,0,$out
360	 addi		$out,$out,16
361
362	vxor		$in0,$in0,$tmp
363	vsldoi		$tmp,$zero,$tmp,12	# >>32
364	vxor		$in0,$in0,$tmp
365	vsldoi		$tmp,$zero,$tmp,12	# >>32
366	vxor		$in0,$in0,$tmp
367	 vadduwm	$rcon,$rcon,$rcon
368	vxor		$in0,$in0,$key
369	 vperm		$outtail,$in0,$in0,$outperm	# rotate
370	 vsel		$stage,$outhead,$outtail,$outmask
371	 vmr		$outhead,$outtail
372	 stvx		$stage,0,$out
373	 addi		$inp,$out,15		# 15 is not typo
374	 addi		$out,$out,16
375	bdz		Ldone
376
377	vspltw		$key,$in0,3		# just splat
378	vsldoi		$tmp,$zero,$in1,12	# >>32
379	vsbox		$key,$key
380
381	vxor		$in1,$in1,$tmp
382	vsldoi		$tmp,$zero,$tmp,12	# >>32
383	vxor		$in1,$in1,$tmp
384	vsldoi		$tmp,$zero,$tmp,12	# >>32
385	vxor		$in1,$in1,$tmp
386
387	vxor		$in1,$in1,$key
388	b		Loop256
389
390.align	4
391Ldone:
392	lvx		$in1,0,$inp		# redundant in aligned case
393	vsel		$in1,$outhead,$in1,$outmask
394	stvx		$in1,0,$inp
395	li		$ptr,0
396	mtspr		256,$vrsave
397	stw		$rounds,0($out)
398
399Lenc_key_abort:
400	mr		r3,$ptr
401	blr
402	.long		0
403	.byte		0,12,0x14,1,0,0,3,0
404	.long		0
405.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
406
407.globl	.${prefix}_set_decrypt_key
408	$STU		$sp,-$FRAME($sp)
409	mflr		r10
410	$PUSH		r10,$FRAME+$LRSAVE($sp)
411	bl		Lset_encrypt_key
412	mtlr		r10
413
414	cmpwi		r3,0
415	bne-		Ldec_key_abort
416
417	slwi		$cnt,$rounds,4
418	subi		$inp,$out,240		# first round key
419	srwi		$rounds,$rounds,1
420	add		$out,$inp,$cnt		# last round key
421	mtctr		$rounds
422
423Ldeckey:
424	lwz		r0, 0($inp)
425	lwz		r6, 4($inp)
426	lwz		r7, 8($inp)
427	lwz		r8, 12($inp)
428	addi		$inp,$inp,16
429	lwz		r9, 0($out)
430	lwz		r10,4($out)
431	lwz		r11,8($out)
432	lwz		r12,12($out)
433	stw		r0, 0($out)
434	stw		r6, 4($out)
435	stw		r7, 8($out)
436	stw		r8, 12($out)
437	subi		$out,$out,16
438	stw		r9, -16($inp)
439	stw		r10,-12($inp)
440	stw		r11,-8($inp)
441	stw		r12,-4($inp)
442	bdnz		Ldeckey
443
444	xor		r3,r3,r3		# return value
445Ldec_key_abort:
446	addi		$sp,$sp,$FRAME
447	blr
448	.long		0
449	.byte		0,12,4,1,0x80,0,3,0
450	.long		0
451.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
452___
453}}}
454#########################################################################
455{{{	# Single block en- and decrypt procedures			#
# gen_block($dir) appends the assembly for one single-block routine to
# $code.
#
#   $dir  "en" emits ${prefix}_encrypt built from vcipher/vcipherlast;
#         "de" emits ${prefix}_decrypt built from vncipher/vncipherlast.
#
# The emitted routine reads one 16-byte block at $inp (r3), processes it
# with the key schedule at $key (r5), whose round count is read from
# offset 240, and writes the result to $out (r4).  Input, output and key
# schedule are each realigned with lvsl + vperm.
#
# The sub was previously declared with an empty prototype "()" although
# it takes one argument, and therefore had to be called with the
# prototype-bypassing "&" form.  It is now a plain sub with an explicit
# argument list; "&gen_block(...)" callers, if any remain, still work.
456sub gen_block {
457my ($dir) = @_;
458my $n   = $dir eq "de" ? "n" : "";
459my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
460
461$code.=<<___;
462.globl	.${prefix}_${dir}crypt
463	lwz		$rounds,240($key)
464	lis		r0,0xfc00
465	mfspr		$vrsave,256
466	li		$idx,15			# 15 is not typo
467	mtspr		256,r0
468
469	lvx		v0,0,$inp
470	neg		r11,$out
471	lvx		v1,$idx,$inp
472	lvsl		v2,0,$inp		# inpperm
473	le?vspltisb	v4,0x0f
474	?lvsl		v3,0,r11		# outperm
475	le?vxor		v2,v2,v4
476	li		$idx,16
477	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
478	lvx		v1,0,$key
479	?lvsl		v5,0,$key		# keyperm
480	srwi		$rounds,$rounds,1
481	lvx		v2,$idx,$key
482	addi		$idx,$idx,16
483	subi		$rounds,$rounds,1
484	?vperm		v1,v1,v2,v5		# align round key
485
486	vxor		v0,v0,v1
487	lvx		v1,$idx,$key
488	addi		$idx,$idx,16
489	mtctr		$rounds
490
491Loop_${dir}c:
492	?vperm		v2,v2,v1,v5
493	v${n}cipher	v0,v0,v2
494	lvx		v2,$idx,$key
495	addi		$idx,$idx,16
496	?vperm		v1,v1,v2,v5
497	v${n}cipher	v0,v0,v1
498	lvx		v1,$idx,$key
499	addi		$idx,$idx,16
500	bdnz		Loop_${dir}c
501
502	?vperm		v2,v2,v1,v5
503	v${n}cipher	v0,v0,v2
504	lvx		v2,$idx,$key
505	?vperm		v1,v1,v2,v5
506	v${n}cipherlast	v0,v0,v1
507
508	vspltisb	v2,-1
509	vxor		v1,v1,v1
510	li		$idx,15			# 15 is not typo
511	?vperm		v2,v1,v2,v3		# outmask
512	le?vxor		v3,v3,v4
513	lvx		v1,0,$out		# outhead
514	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
515	vsel		v1,v1,v0,v2
516	lvx		v4,$idx,$out
517	stvx		v1,0,$out
518	vsel		v0,v0,v4,v2
519	stvx		v0,$idx,$out
520
521	mtspr		256,$vrsave
522	blr
523	.long		0
524	.byte		0,12,0x14,0,0,0,3,0
525	.long		0
526.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
527___
528}
529gen_block("en");
530gen_block("de");
531}}}
532#########################################################################
533{{{	# CBC en- and decrypt procedures				#
# General-purpose registers: $inp=r3 (input), $out=r4 (output), $len=r5
# (byte count), $key=r6 (key schedule), $ivp=r7 (IV), $enc=r8 (direction),
# $rounds=r9 and $idx=r10 (working registers).
534my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
535my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
536my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
537						map("v$_",(4..10));
# The heredoc below appends ${prefix}_cbc_encrypt to $code.
#
# The routine returns immediately when $len is below 16.  Otherwise it
# loads the IV from $ivp and processes whole 16-byte blocks while at
# least 16 bytes remain: encryption when $enc is non-zero (Lcbc_enc),
# decryption when it is zero (Lcbc_dec).  When decrypting with 128 or
# more bytes remaining it branches to _aesp8_cbc_decrypt8x.  On exit
# (Lcbc_done) the final output fragment is merged into memory and the
# updated IV is written back to $ivp.
#
# Input, output, IV and key schedule are each realigned with
# lvsl/lvsr + vperm, and the round count is read from offset 240 of the
# key schedule.
#
# The heredoc text itself is emitted verbatim (after interpolation), so
# it must not be reformatted.
538$code.=<<___;
539.globl	.${prefix}_cbc_encrypt
540	${UCMP}i	$len,16
541	bltlr-
542
543	cmpwi		$enc,0			# test direction
544	lis		r0,0xffe0
545	mfspr		$vrsave,256
546	mtspr		256,r0
547
548	li		$idx,15
549	vxor		$rndkey0,$rndkey0,$rndkey0
550	le?vspltisb	$tmp,0x0f
551
552	lvx		$ivec,0,$ivp		# load [unaligned] iv
553	lvsl		$inpperm,0,$ivp
554	lvx		$inptail,$idx,$ivp
555	le?vxor		$inpperm,$inpperm,$tmp
556	vperm		$ivec,$ivec,$inptail,$inpperm
557
558	neg		r11,$inp
559	?lvsl		$keyperm,0,$key		# prepare for unaligned key
560	lwz		$rounds,240($key)
561
562	lvsr		$inpperm,0,r11		# prepare for unaligned load
563	lvx		$inptail,0,$inp
564	addi		$inp,$inp,15		# 15 is not typo
565	le?vxor		$inpperm,$inpperm,$tmp
566
567	?lvsr		$outperm,0,$out		# prepare for unaligned store
568	vspltisb	$outmask,-1
569	lvx		$outhead,0,$out
570	?vperm		$outmask,$rndkey0,$outmask,$outperm
571	le?vxor		$outperm,$outperm,$tmp
572
573	srwi		$rounds,$rounds,1
574	li		$idx,16
575	subi		$rounds,$rounds,1
576	beq		Lcbc_dec
577
578Lcbc_enc:
579	vmr		$inout,$inptail
580	lvx		$inptail,0,$inp
581	addi		$inp,$inp,16
582	mtctr		$rounds
583	subi		$len,$len,16		# len-=16
584
585	lvx		$rndkey0,0,$key
586	 vperm		$inout,$inout,$inptail,$inpperm
587	lvx		$rndkey1,$idx,$key
588	addi		$idx,$idx,16
589	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
590	vxor		$inout,$inout,$rndkey0
591	lvx		$rndkey0,$idx,$key
592	addi		$idx,$idx,16
593	vxor		$inout,$inout,$ivec
594
595Loop_cbc_enc:
596	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
597	vcipher		$inout,$inout,$rndkey1
598	lvx		$rndkey1,$idx,$key
599	addi		$idx,$idx,16
600	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
601	vcipher		$inout,$inout,$rndkey0
602	lvx		$rndkey0,$idx,$key
603	addi		$idx,$idx,16
604	bdnz		Loop_cbc_enc
605
606	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
607	vcipher		$inout,$inout,$rndkey1
608	lvx		$rndkey1,$idx,$key
609	li		$idx,16
610	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
611	vcipherlast	$ivec,$inout,$rndkey0
612	${UCMP}i	$len,16
613
614	vperm		$tmp,$ivec,$ivec,$outperm
615	vsel		$inout,$outhead,$tmp,$outmask
616	vmr		$outhead,$tmp
617	stvx		$inout,0,$out
618	addi		$out,$out,16
619	bge		Lcbc_enc
620
621	b		Lcbc_done
622
623.align	4
624Lcbc_dec:
625	${UCMP}i	$len,128
626	bge		_aesp8_cbc_decrypt8x
627	vmr		$tmp,$inptail
628	lvx		$inptail,0,$inp
629	addi		$inp,$inp,16
630	mtctr		$rounds
631	subi		$len,$len,16		# len-=16
632
633	lvx		$rndkey0,0,$key
634	 vperm		$tmp,$tmp,$inptail,$inpperm
635	lvx		$rndkey1,$idx,$key
636	addi		$idx,$idx,16
637	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
638	vxor		$inout,$tmp,$rndkey0
639	lvx		$rndkey0,$idx,$key
640	addi		$idx,$idx,16
641
642Loop_cbc_dec:
643	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
644	vncipher	$inout,$inout,$rndkey1
645	lvx		$rndkey1,$idx,$key
646	addi		$idx,$idx,16
647	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
648	vncipher	$inout,$inout,$rndkey0
649	lvx		$rndkey0,$idx,$key
650	addi		$idx,$idx,16
651	bdnz		Loop_cbc_dec
652
653	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
654	vncipher	$inout,$inout,$rndkey1
655	lvx		$rndkey1,$idx,$key
656	li		$idx,16
657	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
658	vncipherlast	$inout,$inout,$rndkey0
659	${UCMP}i	$len,16
660
661	vxor		$inout,$inout,$ivec
662	vmr		$ivec,$tmp
663	vperm		$tmp,$inout,$inout,$outperm
664	vsel		$inout,$outhead,$tmp,$outmask
665	vmr		$outhead,$tmp
666	stvx		$inout,0,$out
667	addi		$out,$out,16
668	bge		Lcbc_dec
669
670Lcbc_done:
671	addi		$out,$out,-1
672	lvx		$inout,0,$out		# redundant in aligned case
673	vsel		$inout,$outhead,$inout,$outmask
674	stvx		$inout,0,$out
675
676	neg		$enc,$ivp		# write [unaligned] iv
677	li		$idx,15			# 15 is not typo
678	vxor		$rndkey0,$rndkey0,$rndkey0
679	vspltisb	$outmask,-1
680	le?vspltisb	$tmp,0x0f
681	?lvsl		$outperm,0,$enc
682	?vperm		$outmask,$rndkey0,$outmask,$outperm
683	le?vxor		$outperm,$outperm,$tmp
684	lvx		$outhead,0,$ivp
685	vperm		$ivec,$ivec,$ivec,$outperm
686	vsel		$inout,$outhead,$ivec,$outmask
687	lvx		$inptail,$idx,$ivp
688	stvx		$inout,0,$ivp
689	vsel		$inout,$ivec,$inptail,$outmask
690	stvx		$inout,$idx,$ivp
691
692	mtspr		256,$vrsave
693	blr
694	.long		0
695	.byte		0,12,0x14,0,0,0,6,0
696	.long		0
697___
698#########################################################################
699{{	# Optimized CBC decrypt procedure				#
700my $key_="r11";
701my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
702my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
703my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
704my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
705			# v26-v31 last 6 round keys
706my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
707
708$code.=<<___;
709.align	5
710_aesp8_cbc_decrypt8x:
711	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
712	li		r10,`$FRAME+8*16+15`
713	li		r11,`$FRAME+8*16+31`
714	stvx		v20,r10,$sp		# ABI says so
715	addi		r10,r10,32
716	stvx		v21,r11,$sp
717	addi		r11,r11,32
718	stvx		v22,r10,$sp
719	addi		r10,r10,32
720	stvx		v23,r11,$sp
721	addi		r11,r11,32
722	stvx		v24,r10,$sp
723	addi		r10,r10,32
724	stvx		v25,r11,$sp
725	addi		r11,r11,32
726	stvx		v26,r10,$sp
727	addi		r10,r10,32
728	stvx		v27,r11,$sp
729	addi		r11,r11,32
730	stvx		v28,r10,$sp
731	addi		r10,r10,32
732	stvx		v29,r11,$sp
733	addi		r11,r11,32
734	stvx		v30,r10,$sp
735	stvx		v31,r11,$sp
736	li		r0,-1
737	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
738	li		$x10,0x10
739	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
740	li		$x20,0x20
741	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
742	li		$x30,0x30
743	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
744	li		$x40,0x40
745	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
746	li		$x50,0x50
747	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
748	li		$x60,0x60
749	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
750	li		$x70,0x70
751	mtspr		256,r0
752
753	subi		$rounds,$rounds,3	# -4 in total
754	subi		$len,$len,128		# bias
755
756	lvx		$rndkey0,$x00,$key	# load key schedule
757	lvx		v30,$x10,$key
758	addi		$key,$key,0x20
759	lvx		v31,$x00,$key
760	?vperm		$rndkey0,$rndkey0,v30,$keyperm
761	addi		$key_,$sp,$FRAME+15
762	mtctr		$rounds
763
764Load_cbc_dec_key:
765	?vperm		v24,v30,v31,$keyperm
766	lvx		v30,$x10,$key
767	addi		$key,$key,0x20
768	stvx		v24,$x00,$key_		# off-load round[1]
769	?vperm		v25,v31,v30,$keyperm
770	lvx		v31,$x00,$key
771	stvx		v25,$x10,$key_		# off-load round[2]
772	addi		$key_,$key_,0x20
773	bdnz		Load_cbc_dec_key
774
775	lvx		v26,$x10,$key
776	?vperm		v24,v30,v31,$keyperm
777	lvx		v27,$x20,$key
778	stvx		v24,$x00,$key_		# off-load round[3]
779	?vperm		v25,v31,v26,$keyperm
780	lvx		v28,$x30,$key
781	stvx		v25,$x10,$key_		# off-load round[4]
782	addi		$key_,$sp,$FRAME+15	# rewind $key_
783	?vperm		v26,v26,v27,$keyperm
784	lvx		v29,$x40,$key
785	?vperm		v27,v27,v28,$keyperm
786	lvx		v30,$x50,$key
787	?vperm		v28,v28,v29,$keyperm
788	lvx		v31,$x60,$key
789	?vperm		v29,v29,v30,$keyperm
790	lvx		$out0,$x70,$key		# borrow $out0
791	?vperm		v30,v30,v31,$keyperm
792	lvx		v24,$x00,$key_		# pre-load round[1]
793	?vperm		v31,v31,$out0,$keyperm
794	lvx		v25,$x10,$key_		# pre-load round[2]
795
796	#lvx		$inptail,0,$inp		# "caller" already did this
797	#addi		$inp,$inp,15		# 15 is not typo
798	subi		$inp,$inp,15		# undo "caller"
799
800	 le?li		$idx,8
801	lvx_u		$in0,$x00,$inp		# load first 8 "words"
802	 le?lvsl	$inpperm,0,$idx
803	 le?vspltisb	$tmp,0x0f
804	lvx_u		$in1,$x10,$inp
805	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
806	lvx_u		$in2,$x20,$inp
807	 le?vperm	$in0,$in0,$in0,$inpperm
808	lvx_u		$in3,$x30,$inp
809	 le?vperm	$in1,$in1,$in1,$inpperm
810	lvx_u		$in4,$x40,$inp
811	 le?vperm	$in2,$in2,$in2,$inpperm
812	vxor		$out0,$in0,$rndkey0
813	lvx_u		$in5,$x50,$inp
814	 le?vperm	$in3,$in3,$in3,$inpperm
815	vxor		$out1,$in1,$rndkey0
816	lvx_u		$in6,$x60,$inp
817	 le?vperm	$in4,$in4,$in4,$inpperm
818	vxor		$out2,$in2,$rndkey0
819	lvx_u		$in7,$x70,$inp
820	addi		$inp,$inp,0x80
821	 le?vperm	$in5,$in5,$in5,$inpperm
822	vxor		$out3,$in3,$rndkey0
823	 le?vperm	$in6,$in6,$in6,$inpperm
824	vxor		$out4,$in4,$rndkey0
825	 le?vperm	$in7,$in7,$in7,$inpperm
826	vxor		$out5,$in5,$rndkey0
827	vxor		$out6,$in6,$rndkey0
828	vxor		$out7,$in7,$rndkey0
829
830	mtctr		$rounds
831	b		Loop_cbc_dec8x
832.align	5
833Loop_cbc_dec8x:
834	vncipher	$out0,$out0,v24
835	vncipher	$out1,$out1,v24
836	vncipher	$out2,$out2,v24
837	vncipher	$out3,$out3,v24
838	vncipher	$out4,$out4,v24
839	vncipher	$out5,$out5,v24
840	vncipher	$out6,$out6,v24
841	vncipher	$out7,$out7,v24
842	lvx		v24,$x20,$key_		# round[3]
843	addi		$key_,$key_,0x20
844
845	vncipher	$out0,$out0,v25
846	vncipher	$out1,$out1,v25
847	vncipher	$out2,$out2,v25
848	vncipher	$out3,$out3,v25
849	vncipher	$out4,$out4,v25
850	vncipher	$out5,$out5,v25
851	vncipher	$out6,$out6,v25
852	vncipher	$out7,$out7,v25
853	lvx		v25,$x10,$key_		# round[4]
854	bdnz		Loop_cbc_dec8x
855
856	subic		$len,$len,128		# $len-=128
857	vncipher	$out0,$out0,v24
858	vncipher	$out1,$out1,v24
859	vncipher	$out2,$out2,v24
860	vncipher	$out3,$out3,v24
861	vncipher	$out4,$out4,v24
862	vncipher	$out5,$out5,v24
863	vncipher	$out6,$out6,v24
864	vncipher	$out7,$out7,v24
865
866	subfe.		r0,r0,r0		# borrow?-1:0
867	vncipher	$out0,$out0,v25
868	vncipher	$out1,$out1,v25
869	vncipher	$out2,$out2,v25
870	vncipher	$out3,$out3,v25
871	vncipher	$out4,$out4,v25
872	vncipher	$out5,$out5,v25
873	vncipher	$out6,$out6,v25
874	vncipher	$out7,$out7,v25
875
876	and		r0,r0,$len
877	vncipher	$out0,$out0,v26
878	vncipher	$out1,$out1,v26
879	vncipher	$out2,$out2,v26
880	vncipher	$out3,$out3,v26
881	vncipher	$out4,$out4,v26
882	vncipher	$out5,$out5,v26
883	vncipher	$out6,$out6,v26
884	vncipher	$out7,$out7,v26
885
886	add		$inp,$inp,r0		# $inp is adjusted in such
887						# way that at exit from the
888						# loop inX-in7 are loaded
889						# with last "words"
890	vncipher	$out0,$out0,v27
891	vncipher	$out1,$out1,v27
892	vncipher	$out2,$out2,v27
893	vncipher	$out3,$out3,v27
894	vncipher	$out4,$out4,v27
895	vncipher	$out5,$out5,v27
896	vncipher	$out6,$out6,v27
897	vncipher	$out7,$out7,v27
898
899	addi		$key_,$sp,$FRAME+15	# rewind $key_
900	vncipher	$out0,$out0,v28
901	vncipher	$out1,$out1,v28
902	vncipher	$out2,$out2,v28
903	vncipher	$out3,$out3,v28
904	vncipher	$out4,$out4,v28
905	vncipher	$out5,$out5,v28
906	vncipher	$out6,$out6,v28
907	vncipher	$out7,$out7,v28
908	lvx		v24,$x00,$key_		# re-pre-load round[1]
909
910	vncipher	$out0,$out0,v29
911	vncipher	$out1,$out1,v29
912	vncipher	$out2,$out2,v29
913	vncipher	$out3,$out3,v29
914	vncipher	$out4,$out4,v29
915	vncipher	$out5,$out5,v29
916	vncipher	$out6,$out6,v29
917	vncipher	$out7,$out7,v29
918	lvx		v25,$x10,$key_		# re-pre-load round[2]
919
920	vncipher	$out0,$out0,v30
921	 vxor		$ivec,$ivec,v31		# xor with last round key
922	vncipher	$out1,$out1,v30
923	 vxor		$in0,$in0,v31
924	vncipher	$out2,$out2,v30
925	 vxor		$in1,$in1,v31
926	vncipher	$out3,$out3,v30
927	 vxor		$in2,$in2,v31
928	vncipher	$out4,$out4,v30
929	 vxor		$in3,$in3,v31
930	vncipher	$out5,$out5,v30
931	 vxor		$in4,$in4,v31
932	vncipher	$out6,$out6,v30
933	 vxor		$in5,$in5,v31
934	vncipher	$out7,$out7,v30
935	 vxor		$in6,$in6,v31
936
937	vncipherlast	$out0,$out0,$ivec
938	vncipherlast	$out1,$out1,$in0
939	 lvx_u		$in0,$x00,$inp		# load next input block
940	vncipherlast	$out2,$out2,$in1
941	 lvx_u		$in1,$x10,$inp
942	vncipherlast	$out3,$out3,$in2
943	 le?vperm	$in0,$in0,$in0,$inpperm
944	 lvx_u		$in2,$x20,$inp
945	vncipherlast	$out4,$out4,$in3
946	 le?vperm	$in1,$in1,$in1,$inpperm
947	 lvx_u		$in3,$x30,$inp
948	vncipherlast	$out5,$out5,$in4
949	 le?vperm	$in2,$in2,$in2,$inpperm
950	 lvx_u		$in4,$x40,$inp
951	vncipherlast	$out6,$out6,$in5
952	 le?vperm	$in3,$in3,$in3,$inpperm
953	 lvx_u		$in5,$x50,$inp
954	vncipherlast	$out7,$out7,$in6
955	 le?vperm	$in4,$in4,$in4,$inpperm
956	 lvx_u		$in6,$x60,$inp
957	vmr		$ivec,$in7
958	 le?vperm	$in5,$in5,$in5,$inpperm
959	 lvx_u		$in7,$x70,$inp
960	 addi		$inp,$inp,0x80
961
962	le?vperm	$out0,$out0,$out0,$inpperm
963	le?vperm	$out1,$out1,$out1,$inpperm
964	stvx_u		$out0,$x00,$out
965	 le?vperm	$in6,$in6,$in6,$inpperm
966	 vxor		$out0,$in0,$rndkey0
967	le?vperm	$out2,$out2,$out2,$inpperm
968	stvx_u		$out1,$x10,$out
969	 le?vperm	$in7,$in7,$in7,$inpperm
970	 vxor		$out1,$in1,$rndkey0
971	le?vperm	$out3,$out3,$out3,$inpperm
972	stvx_u		$out2,$x20,$out
973	 vxor		$out2,$in2,$rndkey0
974	le?vperm	$out4,$out4,$out4,$inpperm
975	stvx_u		$out3,$x30,$out
976	 vxor		$out3,$in3,$rndkey0
977	le?vperm	$out5,$out5,$out5,$inpperm
978	stvx_u		$out4,$x40,$out
979	 vxor		$out4,$in4,$rndkey0
980	le?vperm	$out6,$out6,$out6,$inpperm
981	stvx_u		$out5,$x50,$out
982	 vxor		$out5,$in5,$rndkey0
983	le?vperm	$out7,$out7,$out7,$inpperm
984	stvx_u		$out6,$x60,$out
985	 vxor		$out6,$in6,$rndkey0
986	stvx_u		$out7,$x70,$out
987	addi		$out,$out,0x80
988	 vxor		$out7,$in7,$rndkey0
989
990	mtctr		$rounds
991	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
992
993	addic.		$len,$len,128
994	beq		Lcbc_dec8x_done
995	nop
996	nop
997
998Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
999	vncipher	$out1,$out1,v24
1000	vncipher	$out2,$out2,v24
1001	vncipher	$out3,$out3,v24
1002	vncipher	$out4,$out4,v24
1003	vncipher	$out5,$out5,v24
1004	vncipher	$out6,$out6,v24
1005	vncipher	$out7,$out7,v24
1006	lvx		v24,$x20,$key_		# round[3]
1007	addi		$key_,$key_,0x20
1008
1009	vncipher	$out1,$out1,v25
1010	vncipher	$out2,$out2,v25
1011	vncipher	$out3,$out3,v25
1012	vncipher	$out4,$out4,v25
1013	vncipher	$out5,$out5,v25
1014	vncipher	$out6,$out6,v25
1015	vncipher	$out7,$out7,v25
1016	lvx		v25,$x10,$key_		# round[4]
1017	bdnz		Loop_cbc_dec8x_tail
1018
1019	vncipher	$out1,$out1,v24
1020	vncipher	$out2,$out2,v24
1021	vncipher	$out3,$out3,v24
1022	vncipher	$out4,$out4,v24
1023	vncipher	$out5,$out5,v24
1024	vncipher	$out6,$out6,v24
1025	vncipher	$out7,$out7,v24
1026
1027	vncipher	$out1,$out1,v25
1028	vncipher	$out2,$out2,v25
1029	vncipher	$out3,$out3,v25
1030	vncipher	$out4,$out4,v25
1031	vncipher	$out5,$out5,v25
1032	vncipher	$out6,$out6,v25
1033	vncipher	$out7,$out7,v25
1034
1035	vncipher	$out1,$out1,v26
1036	vncipher	$out2,$out2,v26
1037	vncipher	$out3,$out3,v26
1038	vncipher	$out4,$out4,v26
1039	vncipher	$out5,$out5,v26
1040	vncipher	$out6,$out6,v26
1041	vncipher	$out7,$out7,v26
1042
1043	vncipher	$out1,$out1,v27
1044	vncipher	$out2,$out2,v27
1045	vncipher	$out3,$out3,v27
1046	vncipher	$out4,$out4,v27
1047	vncipher	$out5,$out5,v27
1048	vncipher	$out6,$out6,v27
1049	vncipher	$out7,$out7,v27
1050
1051	vncipher	$out1,$out1,v28
1052	vncipher	$out2,$out2,v28
1053	vncipher	$out3,$out3,v28
1054	vncipher	$out4,$out4,v28
1055	vncipher	$out5,$out5,v28
1056	vncipher	$out6,$out6,v28
1057	vncipher	$out7,$out7,v28
1058
1059	vncipher	$out1,$out1,v29
1060	vncipher	$out2,$out2,v29
1061	vncipher	$out3,$out3,v29
1062	vncipher	$out4,$out4,v29
1063	vncipher	$out5,$out5,v29
1064	vncipher	$out6,$out6,v29
1065	vncipher	$out7,$out7,v29
1066
1067	vncipher	$out1,$out1,v30
1068	 vxor		$ivec,$ivec,v31		# last round key
1069	vncipher	$out2,$out2,v30
1070	 vxor		$in1,$in1,v31
1071	vncipher	$out3,$out3,v30
1072	 vxor		$in2,$in2,v31
1073	vncipher	$out4,$out4,v30
1074	 vxor		$in3,$in3,v31
1075	vncipher	$out5,$out5,v30
1076	 vxor		$in4,$in4,v31
1077	vncipher	$out6,$out6,v30
1078	 vxor		$in5,$in5,v31
1079	vncipher	$out7,$out7,v30
1080	 vxor		$in6,$in6,v31
1081
1082	cmplwi		$len,32			# switch($len)
1083	blt		Lcbc_dec8x_one
1084	nop
1085	beq		Lcbc_dec8x_two
1086	cmplwi		$len,64
1087	blt		Lcbc_dec8x_three
1088	nop
1089	beq		Lcbc_dec8x_four
1090	cmplwi		$len,96
1091	blt		Lcbc_dec8x_five
1092	nop
1093	beq		Lcbc_dec8x_six
1094
1095Lcbc_dec8x_seven:
1096	vncipherlast	$out1,$out1,$ivec
1097	vncipherlast	$out2,$out2,$in1
1098	vncipherlast	$out3,$out3,$in2
1099	vncipherlast	$out4,$out4,$in3
1100	vncipherlast	$out5,$out5,$in4
1101	vncipherlast	$out6,$out6,$in5
1102	vncipherlast	$out7,$out7,$in6
1103	vmr		$ivec,$in7
1104
1105	le?vperm	$out1,$out1,$out1,$inpperm
1106	le?vperm	$out2,$out2,$out2,$inpperm
1107	stvx_u		$out1,$x00,$out
1108	le?vperm	$out3,$out3,$out3,$inpperm
1109	stvx_u		$out2,$x10,$out
1110	le?vperm	$out4,$out4,$out4,$inpperm
1111	stvx_u		$out3,$x20,$out
1112	le?vperm	$out5,$out5,$out5,$inpperm
1113	stvx_u		$out4,$x30,$out
1114	le?vperm	$out6,$out6,$out6,$inpperm
1115	stvx_u		$out5,$x40,$out
1116	le?vperm	$out7,$out7,$out7,$inpperm
1117	stvx_u		$out6,$x50,$out
1118	stvx_u		$out7,$x60,$out
1119	addi		$out,$out,0x70
1120	b		Lcbc_dec8x_done
1121
1122.align	5
1123Lcbc_dec8x_six:
1124	vncipherlast	$out2,$out2,$ivec
1125	vncipherlast	$out3,$out3,$in2
1126	vncipherlast	$out4,$out4,$in3
1127	vncipherlast	$out5,$out5,$in4
1128	vncipherlast	$out6,$out6,$in5
1129	vncipherlast	$out7,$out7,$in6
1130	vmr		$ivec,$in7
1131
1132	le?vperm	$out2,$out2,$out2,$inpperm
1133	le?vperm	$out3,$out3,$out3,$inpperm
1134	stvx_u		$out2,$x00,$out
1135	le?vperm	$out4,$out4,$out4,$inpperm
1136	stvx_u		$out3,$x10,$out
1137	le?vperm	$out5,$out5,$out5,$inpperm
1138	stvx_u		$out4,$x20,$out
1139	le?vperm	$out6,$out6,$out6,$inpperm
1140	stvx_u		$out5,$x30,$out
1141	le?vperm	$out7,$out7,$out7,$inpperm
1142	stvx_u		$out6,$x40,$out
1143	stvx_u		$out7,$x50,$out
1144	addi		$out,$out,0x60
1145	b		Lcbc_dec8x_done
1146
1147.align	5
1148Lcbc_dec8x_five:
1149	vncipherlast	$out3,$out3,$ivec
1150	vncipherlast	$out4,$out4,$in3
1151	vncipherlast	$out5,$out5,$in4
1152	vncipherlast	$out6,$out6,$in5
1153	vncipherlast	$out7,$out7,$in6
1154	vmr		$ivec,$in7
1155
1156	le?vperm	$out3,$out3,$out3,$inpperm
1157	le?vperm	$out4,$out4,$out4,$inpperm
1158	stvx_u		$out3,$x00,$out
1159	le?vperm	$out5,$out5,$out5,$inpperm
1160	stvx_u		$out4,$x10,$out
1161	le?vperm	$out6,$out6,$out6,$inpperm
1162	stvx_u		$out5,$x20,$out
1163	le?vperm	$out7,$out7,$out7,$inpperm
1164	stvx_u		$out6,$x30,$out
1165	stvx_u		$out7,$x40,$out
1166	addi		$out,$out,0x50
1167	b		Lcbc_dec8x_done
1168
1169.align	5
1170Lcbc_dec8x_four:
1171	vncipherlast	$out4,$out4,$ivec
1172	vncipherlast	$out5,$out5,$in4
1173	vncipherlast	$out6,$out6,$in5
1174	vncipherlast	$out7,$out7,$in6
1175	vmr		$ivec,$in7
1176
1177	le?vperm	$out4,$out4,$out4,$inpperm
1178	le?vperm	$out5,$out5,$out5,$inpperm
1179	stvx_u		$out4,$x00,$out
1180	le?vperm	$out6,$out6,$out6,$inpperm
1181	stvx_u		$out5,$x10,$out
1182	le?vperm	$out7,$out7,$out7,$inpperm
1183	stvx_u		$out6,$x20,$out
1184	stvx_u		$out7,$x30,$out
1185	addi		$out,$out,0x40
1186	b		Lcbc_dec8x_done
1187
1188.align	5
1189Lcbc_dec8x_three:
1190	vncipherlast	$out5,$out5,$ivec
1191	vncipherlast	$out6,$out6,$in5
1192	vncipherlast	$out7,$out7,$in6
1193	vmr		$ivec,$in7
1194
1195	le?vperm	$out5,$out5,$out5,$inpperm
1196	le?vperm	$out6,$out6,$out6,$inpperm
1197	stvx_u		$out5,$x00,$out
1198	le?vperm	$out7,$out7,$out7,$inpperm
1199	stvx_u		$out6,$x10,$out
1200	stvx_u		$out7,$x20,$out
1201	addi		$out,$out,0x30
1202	b		Lcbc_dec8x_done
1203
1204.align	5
1205Lcbc_dec8x_two:
1206	vncipherlast	$out6,$out6,$ivec
1207	vncipherlast	$out7,$out7,$in6
1208	vmr		$ivec,$in7
1209
1210	le?vperm	$out6,$out6,$out6,$inpperm
1211	le?vperm	$out7,$out7,$out7,$inpperm
1212	stvx_u		$out6,$x00,$out
1213	stvx_u		$out7,$x10,$out
1214	addi		$out,$out,0x20
1215	b		Lcbc_dec8x_done
1216
1217.align	5
1218Lcbc_dec8x_one:
1219	vncipherlast	$out7,$out7,$ivec
1220	vmr		$ivec,$in7
1221
1222	le?vperm	$out7,$out7,$out7,$inpperm
1223	stvx_u		$out7,0,$out
1224	addi		$out,$out,0x10
1225
1226Lcbc_dec8x_done:
1227	le?vperm	$ivec,$ivec,$ivec,$inpperm
1228	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
1229
1230	li		r10,`$FRAME+15`
1231	li		r11,`$FRAME+31`
1232	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1233	addi		r10,r10,32
1234	stvx		$inpperm,r11,$sp
1235	addi		r11,r11,32
1236	stvx		$inpperm,r10,$sp
1237	addi		r10,r10,32
1238	stvx		$inpperm,r11,$sp
1239	addi		r11,r11,32
1240	stvx		$inpperm,r10,$sp
1241	addi		r10,r10,32
1242	stvx		$inpperm,r11,$sp
1243	addi		r11,r11,32
1244	stvx		$inpperm,r10,$sp
1245	addi		r10,r10,32
1246	stvx		$inpperm,r11,$sp
1247	addi		r11,r11,32
1248
1249	mtspr		256,$vrsave
1250	lvx		v20,r10,$sp		# ABI says so
1251	addi		r10,r10,32
1252	lvx		v21,r11,$sp
1253	addi		r11,r11,32
1254	lvx		v22,r10,$sp
1255	addi		r10,r10,32
1256	lvx		v23,r11,$sp
1257	addi		r11,r11,32
1258	lvx		v24,r10,$sp
1259	addi		r10,r10,32
1260	lvx		v25,r11,$sp
1261	addi		r11,r11,32
1262	lvx		v26,r10,$sp
1263	addi		r10,r10,32
1264	lvx		v27,r11,$sp
1265	addi		r11,r11,32
1266	lvx		v28,r10,$sp
1267	addi		r10,r10,32
1268	lvx		v29,r11,$sp
1269	addi		r11,r11,32
1270	lvx		v30,r10,$sp
1271	lvx		v31,r11,$sp
1272	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1273	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1274	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1275	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1276	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1277	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1278	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1279	blr
1280	.long		0
1281	.byte		0,12,0x14,0,0x80,6,6,0
1282	.long		0
1283.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1284___
1285}}	}}}
1286
1287#########################################################################
1288{{{	# CTR procedure[s]						#
# Register aliases for the CTR code.  The eight general-purpose names map to
# r3..r10 and the twelve vector names to v0..v11, in the order listed.
1289my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1290my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
1291my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1292						map("v$_",(4..11));
1293my $dat=$tmp;	# $dat is a second name for v3, shared with $tmp
1294
1295$code.=<<___;
# CTR entry point.  Returns at once when fewer than one block is requested,
# hands off to _aesp8_ctr32_encrypt8x when 8 or more blocks are requested,
# and otherwise primes the one-block-per-iteration loop at Loop_ctr32_enc.
1296.globl	.${prefix}_ctr32_encrypt_blocks
1297	${UCMP}i	$len,1			# fewer than one block?
1298	bltlr-					# then return, nothing is touched
1299
1300	lis		r0,0xfff0		# r0 = 0xfff00000, new vrsave mask
1301	mfspr		$vrsave,256		# keep the incoming vrsave (SPR 256)
1302	mtspr		256,r0
1303
1304	li		$idx,15
1305	vxor		$rndkey0,$rndkey0,$rndkey0	# all-zero vector
1306	le?vspltisb	$tmp,0x0f
1307
1308	lvx		$ivec,0,$ivp		# load [unaligned] iv
1309	lvsl		$inpperm,0,$ivp
1310	lvx		$inptail,$idx,$ivp
1311	 vspltisb	$one,1			# 0x010101..01
1312	le?vxor		$inpperm,$inpperm,$tmp
1313	vperm		$ivec,$ivec,$inptail,$inpperm
1314	 vsldoi		$one,$rndkey0,$one,1	# 0x000000..01
1315
1316	neg		r11,$inp
1317	?lvsl		$keyperm,0,$key		# prepare for unaligned key
1318	lwz		$rounds,240($key)	# 32-bit word at offset 240 of the key
1319
1320	lvsr		$inpperm,0,r11		# prepare for unaligned load
1321	lvx		$inptail,0,$inp
1322	addi		$inp,$inp,15		# 15 is not typo
1323	le?vxor		$inpperm,$inpperm,$tmp
1324
1325	srwi		$rounds,$rounds,1	# the loop does two rounds per pass,
1326	li		$idx,16
1327	subi		$rounds,$rounds,1	# so the loop count is rounds/2-1
1328
1329	${UCMP}i	$len,8			# 8 or more blocks?
1330	bge		_aesp8_ctr32_encrypt8x	# then use the 8x code
1331
1332	?lvsr		$outperm,0,$out		# prepare for unaligned store
1333	vspltisb	$outmask,-1		# 0xffffff..ff
1334	lvx		$outhead,0,$out
1335	?vperm		$outmask,$rndkey0,$outmask,$outperm
1336	le?vxor		$outperm,$outperm,$tmp
1337
1338	lvx		$rndkey0,0,$key
1339	mtctr		$rounds
1340	lvx		$rndkey1,$idx,$key
1341	addi		$idx,$idx,16
1342	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1343	vxor		$inout,$ivec,$rndkey0	# counter xored with the first round key
1344	lvx		$rndkey0,$idx,$key
1345	addi		$idx,$idx,16
1346	b		Loop_ctr32_enc
1347
# One-block-per-pass CTR loop, used when fewer than 8 blocks were requested.
# Each pass finishes encrypting the current counter block, xors it with one
# input block, stores the result through the unaligned-store mask, and starts
# the next counter block.  The counter is advanced with vadduqm, i.e. as one
# 128-bit integer, exactly as the 8x code does; the former vadduwm wrapped
# the low 32-bit word without carrying, so the two paths disagreed whenever
# that word overflowed.
1348.align	5
1349Loop_ctr32_enc:
1350	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1351	vcipher		$inout,$inout,$rndkey1
1352	lvx		$rndkey1,$idx,$key
1353	addi		$idx,$idx,16
1354	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1355	vcipher		$inout,$inout,$rndkey0
1356	lvx		$rndkey0,$idx,$key
1357	addi		$idx,$idx,16
1358	bdnz		Loop_ctr32_enc
1359
1360	vadduqm		$ivec,$ivec,$one	# 128-bit counter++, matches 8x path
1361	 vmr		$dat,$inptail
1362	 lvx		$inptail,0,$inp
1363	 addi		$inp,$inp,16
1364	 subic.		$len,$len,1		# blocks--
1365
1366	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1367	vcipher		$inout,$inout,$rndkey1
1368	lvx		$rndkey1,$idx,$key
1369	 vperm		$dat,$dat,$inptail,$inpperm
1370	 li		$idx,16			# restart the key schedule walk
1371	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
1372	 lvx		$rndkey0,0,$key
1373	vxor		$dat,$dat,$rndkey1	# last round key
1374	vcipherlast	$inout,$inout,$dat	# final round also folds in the input
1375
1376	 lvx		$rndkey1,$idx,$key
1377	 addi		$idx,$idx,16
1378	vperm		$inout,$inout,$inout,$outperm
1379	vsel		$dat,$outhead,$inout,$outmask
1380	 mtctr		$rounds
1381	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1382	vmr		$outhead,$inout
1383	 vxor		$inout,$ivec,$rndkey0	# next counter xored with first round key
1384	 lvx		$rndkey0,$idx,$key
1385	 addi		$idx,$idx,16
1386	stvx		$dat,0,$out
1387	addi		$out,$out,16
1388	bne		Loop_ctr32_enc		# flags come from the subic. above
1389
1390	addi		$out,$out,-1
1391	lvx		$inout,0,$out		# redundant in aligned case
1392	vsel		$inout,$outhead,$inout,$outmask
1393	stvx		$inout,0,$out
1394
1395	mtspr		256,$vrsave		# put the incoming vrsave back
1396	blr
1397	.long		0
1398	.byte		0,12,0x14,0,0,0,6,0
1399	.long		0
1400___
1401#########################################################################
1402{{	# Optimized CTR procedure					#
# Register aliases for the 8x CTR code.
1403my $key_="r11";	# walks the stack copy of the round keys
# $x00 is r0 and is only used in the base-register slot of indexed vector
# loads/stores; $x10..$x70 are loaded with 0x10..0x70 in the prologue.
1404my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1405my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1406my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1407my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
1408			# v26-v31 last 6 round keys
1409my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
# Only $two is referenced by the 8x CTR code that follows.
1410my ($two,$three,$four)=($outhead,$outperm,$outmask);
1411
1412$code.=<<___;
# 8x CTR code, entered from the CTR entry point when 8 or more blocks are
# requested.  This part builds the stack frame, saves v20-v31 and r26-r31,
# aligns the key schedule (first round keys go to the stack, the last six
# stay in v26-v31) and prepares the first eight counter blocks.
1413.align	5
1414_aesp8_ctr32_encrypt8x:
1415	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1416	li		r10,`$FRAME+8*16+15`
1417	li		r11,`$FRAME+8*16+31`
1418	stvx		v20,r10,$sp		# ABI says so
1419	addi		r10,r10,32
1420	stvx		v21,r11,$sp
1421	addi		r11,r11,32
1422	stvx		v22,r10,$sp
1423	addi		r10,r10,32
1424	stvx		v23,r11,$sp
1425	addi		r11,r11,32
1426	stvx		v24,r10,$sp
1427	addi		r10,r10,32
1428	stvx		v25,r11,$sp
1429	addi		r11,r11,32
1430	stvx		v26,r10,$sp
1431	addi		r10,r10,32
1432	stvx		v27,r11,$sp
1433	addi		r11,r11,32
1434	stvx		v28,r10,$sp
1435	addi		r10,r10,32
1436	stvx		v29,r11,$sp
1437	addi		r11,r11,32
1438	stvx		v30,r10,$sp
1439	stvx		v31,r11,$sp
1440	li		r0,-1			# all-ones vrsave mask, set below
1441	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
1442	li		$x10,0x10
1443	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1444	li		$x20,0x20
1445	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1446	li		$x30,0x30
1447	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1448	li		$x40,0x40
1449	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1450	li		$x50,0x50
1451	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1452	li		$x60,0x60
1453	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1454	li		$x70,0x70
1455	mtspr		256,r0
1456
1457	subi		$rounds,$rounds,3	# -4 in total
1458
1459	lvx		$rndkey0,$x00,$key	# load key schedule
1460	lvx		v30,$x10,$key
1461	addi		$key,$key,0x20
1462	lvx		v31,$x00,$key
1463	?vperm		$rndkey0,$rndkey0,v30,$keyperm
1464	addi		$key_,$sp,$FRAME+15	# start of the on-stack key copy
1465	mtctr		$rounds
1466
1467Load_ctr32_enc_key:
1468	?vperm		v24,v30,v31,$keyperm
1469	lvx		v30,$x10,$key
1470	addi		$key,$key,0x20
1471	stvx		v24,$x00,$key_		# off-load round[1]
1472	?vperm		v25,v31,v30,$keyperm
1473	lvx		v31,$x00,$key
1474	stvx		v25,$x10,$key_		# off-load round[2]
1475	addi		$key_,$key_,0x20
1476	bdnz		Load_ctr32_enc_key
1477
1478	lvx		v26,$x10,$key
1479	?vperm		v24,v30,v31,$keyperm
1480	lvx		v27,$x20,$key
1481	stvx		v24,$x00,$key_		# off-load round[3]
1482	?vperm		v25,v31,v26,$keyperm
1483	lvx		v28,$x30,$key
1484	stvx		v25,$x10,$key_		# off-load round[4]
1485	addi		$key_,$sp,$FRAME+15	# rewind $key_
1486	?vperm		v26,v26,v27,$keyperm
1487	lvx		v29,$x40,$key
1488	?vperm		v27,v27,v28,$keyperm
1489	lvx		v30,$x50,$key
1490	?vperm		v28,v28,v29,$keyperm
1491	lvx		v31,$x60,$key
1492	?vperm		v29,v29,v30,$keyperm
1493	lvx		$out0,$x70,$key		# borrow $out0
1494	?vperm		v30,v30,v31,$keyperm
1495	lvx		v24,$x00,$key_		# pre-load round[1]
1496	?vperm		v31,v31,$out0,$keyperm
1497	lvx		v25,$x10,$key_		# pre-load round[2]
1498
1499	vadduqm		$two,$one,$one		# 128-bit constant 2
1500	subi		$inp,$inp,15		# undo "caller"
1501	$SHL		$len,$len,4		# block count becomes byte count
1502
1503	vadduqm		$out1,$ivec,$one	# counter values ...
1504	vadduqm		$out2,$ivec,$two
1505	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1506	 le?li		$idx,8
1507	vadduqm		$out3,$out1,$two
1508	vxor		$out1,$out1,$rndkey0
1509	 le?lvsl	$inpperm,0,$idx
1510	vadduqm		$out4,$out2,$two
1511	vxor		$out2,$out2,$rndkey0
1512	 le?vspltisb	$tmp,0x0f
1513	vadduqm		$out5,$out3,$two
1514	vxor		$out3,$out3,$rndkey0
1515	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
1516	vadduqm		$out6,$out4,$two
1517	vxor		$out4,$out4,$rndkey0
1518	vadduqm		$out7,$out5,$two
1519	vxor		$out5,$out5,$rndkey0
1520	vadduqm		$ivec,$out6,$two	# next counter value
1521	vxor		$out6,$out6,$rndkey0
1522	vxor		$out7,$out7,$rndkey0
1523
1524	mtctr		$rounds
1525	b		Loop_ctr32_enc8x
# Main 8x CTR loop.  The counted part runs the first rounds with keys read
# back from the stack copy through v24/v25; the straight-line part below it
# finishes the rounds with v26-v30, loads eight input blocks and xors them
# with the last round key v31 so that vcipherlast yields the output directly.
# When at least one more full batch remains, the results are stored while
# the next eight counter blocks are started, and control re-enters the loop
# at Loop_ctr32_enc8x_middle.
1526.align	5
1527Loop_ctr32_enc8x:
1528	vcipher 	$out0,$out0,v24
1529	vcipher 	$out1,$out1,v24
1530	vcipher 	$out2,$out2,v24
1531	vcipher 	$out3,$out3,v24
1532	vcipher 	$out4,$out4,v24
1533	vcipher 	$out5,$out5,v24
1534	vcipher 	$out6,$out6,v24
1535	vcipher 	$out7,$out7,v24
1536Loop_ctr32_enc8x_middle:
1537	lvx		v24,$x20,$key_		# round[3]
1538	addi		$key_,$key_,0x20
1539
1540	vcipher 	$out0,$out0,v25
1541	vcipher 	$out1,$out1,v25
1542	vcipher 	$out2,$out2,v25
1543	vcipher 	$out3,$out3,v25
1544	vcipher 	$out4,$out4,v25
1545	vcipher 	$out5,$out5,v25
1546	vcipher 	$out6,$out6,v25
1547	vcipher 	$out7,$out7,v25
1548	lvx		v25,$x10,$key_		# round[4]
1549	bdnz		Loop_ctr32_enc8x
1550
1551	subic		r11,$len,256		# $len-256, borrow $key_
1552	vcipher 	$out0,$out0,v24
1553	vcipher 	$out1,$out1,v24
1554	vcipher 	$out2,$out2,v24
1555	vcipher 	$out3,$out3,v24
1556	vcipher 	$out4,$out4,v24
1557	vcipher 	$out5,$out5,v24
1558	vcipher 	$out6,$out6,v24
1559	vcipher 	$out7,$out7,v24
1560
1561	subfe		r0,r0,r0		# borrow?-1:0
1562	vcipher 	$out0,$out0,v25
1563	vcipher 	$out1,$out1,v25
1564	vcipher 	$out2,$out2,v25
1565	vcipher 	$out3,$out3,v25
1566	vcipher 	$out4,$out4,v25
1567	vcipher		$out5,$out5,v25
1568	vcipher		$out6,$out6,v25
1569	vcipher		$out7,$out7,v25
1570
1571	and		r0,r0,r11		# r0 = borrow ? $len-256 : 0
1572	addi		$key_,$sp,$FRAME+15	# rewind $key_
1573	vcipher		$out0,$out0,v26
1574	vcipher		$out1,$out1,v26
1575	vcipher		$out2,$out2,v26
1576	vcipher		$out3,$out3,v26
1577	vcipher		$out4,$out4,v26
1578	vcipher		$out5,$out5,v26
1579	vcipher		$out6,$out6,v26
1580	vcipher		$out7,$out7,v26
1581	lvx		v24,$x00,$key_		# re-pre-load round[1]
1582
1583	subic		$len,$len,129		# $len-=129
1584	vcipher		$out0,$out0,v27
1585	addi		$len,$len,1		# $len-=128 really
1586	vcipher		$out1,$out1,v27
1587	vcipher		$out2,$out2,v27
1588	vcipher		$out3,$out3,v27
1589	vcipher		$out4,$out4,v27
1590	vcipher		$out5,$out5,v27
1591	vcipher		$out6,$out6,v27
1592	vcipher		$out7,$out7,v27
1593	lvx		v25,$x10,$key_		# re-pre-load round[2]
1594
1595	vcipher		$out0,$out0,v28
1596	 lvx_u		$in0,$x00,$inp		# load input
1597	vcipher		$out1,$out1,v28
1598	 lvx_u		$in1,$x10,$inp
1599	vcipher		$out2,$out2,v28
1600	 lvx_u		$in2,$x20,$inp
1601	vcipher		$out3,$out3,v28
1602	 lvx_u		$in3,$x30,$inp
1603	vcipher		$out4,$out4,v28
1604	 lvx_u		$in4,$x40,$inp
1605	vcipher		$out5,$out5,v28
1606	 lvx_u		$in5,$x50,$inp
1607	vcipher		$out6,$out6,v28
1608	 lvx_u		$in6,$x60,$inp
1609	vcipher		$out7,$out7,v28
1610	 lvx_u		$in7,$x70,$inp
1611	 addi		$inp,$inp,0x80
1612
1613	vcipher		$out0,$out0,v29
1614	 le?vperm	$in0,$in0,$in0,$inpperm
1615	vcipher		$out1,$out1,v29
1616	 le?vperm	$in1,$in1,$in1,$inpperm
1617	vcipher		$out2,$out2,v29
1618	 le?vperm	$in2,$in2,$in2,$inpperm
1619	vcipher		$out3,$out3,v29
1620	 le?vperm	$in3,$in3,$in3,$inpperm
1621	vcipher		$out4,$out4,v29
1622	 le?vperm	$in4,$in4,$in4,$inpperm
1623	vcipher		$out5,$out5,v29
1624	 le?vperm	$in5,$in5,$in5,$inpperm
1625	vcipher		$out6,$out6,v29
1626	 le?vperm	$in6,$in6,$in6,$inpperm
1627	vcipher		$out7,$out7,v29
1628	 le?vperm	$in7,$in7,$in7,$inpperm
1629
1630	add		$inp,$inp,r0		# $inp is adjusted in such
1631						# way that at exit from the
1632						# loop inX-in7 are loaded
1633						# with last "words"
1634	subfe.		r0,r0,r0		# borrow?-1:0
1635	vcipher		$out0,$out0,v30
1636	 vxor		$in0,$in0,v31		# xor with last round key
1637	vcipher		$out1,$out1,v30
1638	 vxor		$in1,$in1,v31
1639	vcipher		$out2,$out2,v30
1640	 vxor		$in2,$in2,v31
1641	vcipher		$out3,$out3,v30
1642	 vxor		$in3,$in3,v31
1643	vcipher		$out4,$out4,v30
1644	 vxor		$in4,$in4,v31
1645	vcipher		$out5,$out5,v30
1646	 vxor		$in5,$in5,v31
1647	vcipher		$out6,$out6,v30
1648	 vxor		$in6,$in6,v31
1649	vcipher		$out7,$out7,v30
1650	 vxor		$in7,$in7,v31
1651
1652	bne		Lctr32_enc8x_break	# did $len-129 borrow?
1653
1654	vcipherlast	$in0,$out0,$in0
1655	vcipherlast	$in1,$out1,$in1
1656	 vadduqm	$out1,$ivec,$one	# counter values ...
1657	vcipherlast	$in2,$out2,$in2
1658	 vadduqm	$out2,$ivec,$two
1659	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1660	vcipherlast	$in3,$out3,$in3
1661	 vadduqm	$out3,$out1,$two
1662	 vxor		$out1,$out1,$rndkey0
1663	vcipherlast	$in4,$out4,$in4
1664	 vadduqm	$out4,$out2,$two
1665	 vxor		$out2,$out2,$rndkey0
1666	vcipherlast	$in5,$out5,$in5
1667	 vadduqm	$out5,$out3,$two
1668	 vxor		$out3,$out3,$rndkey0
1669	vcipherlast	$in6,$out6,$in6
1670	 vadduqm	$out6,$out4,$two
1671	 vxor		$out4,$out4,$rndkey0
1672	vcipherlast	$in7,$out7,$in7
1673	 vadduqm	$out7,$out5,$two
1674	 vxor		$out5,$out5,$rndkey0
1675	le?vperm	$in0,$in0,$in0,$inpperm
1676	 vadduqm	$ivec,$out6,$two	# next counter value
1677	 vxor		$out6,$out6,$rndkey0
1678	le?vperm	$in1,$in1,$in1,$inpperm
1679	 vxor		$out7,$out7,$rndkey0
1680	mtctr		$rounds
1681
1682	 vcipher	$out0,$out0,v24
1683	stvx_u		$in0,$x00,$out
1684	le?vperm	$in2,$in2,$in2,$inpperm
1685	 vcipher	$out1,$out1,v24
1686	stvx_u		$in1,$x10,$out
1687	le?vperm	$in3,$in3,$in3,$inpperm
1688	 vcipher	$out2,$out2,v24
1689	stvx_u		$in2,$x20,$out
1690	le?vperm	$in4,$in4,$in4,$inpperm
1691	 vcipher	$out3,$out3,v24
1692	stvx_u		$in3,$x30,$out
1693	le?vperm	$in5,$in5,$in5,$inpperm
1694	 vcipher	$out4,$out4,v24
1695	stvx_u		$in4,$x40,$out
1696	le?vperm	$in6,$in6,$in6,$inpperm
1697	 vcipher	$out5,$out5,v24
1698	stvx_u		$in5,$x50,$out
1699	le?vperm	$in7,$in7,$in7,$inpperm
1700	 vcipher	$out6,$out6,v24
1701	stvx_u		$in6,$x60,$out
1702	 vcipher	$out7,$out7,v24
1703	stvx_u		$in7,$x70,$out
1704	addi		$out,$out,0x80
1705
1706	b		Loop_ctr32_enc8x_middle
1707
# Dispatch for the final batch.  The loop has already subtracted 128 from
# the byte count, so the value tested here is (bytes left - 128): -0x70
# selects the one-block tail, 0 the eight-block tail.
1708.align	5
1709Lctr32_enc8x_break:
1710	cmpwi		$len,-0x60
1711	blt		Lctr32_enc8x_one
1712	nop
1713	beq		Lctr32_enc8x_two
1714	cmpwi		$len,-0x40
1715	blt		Lctr32_enc8x_three
1716	nop
1717	beq		Lctr32_enc8x_four
1718	cmpwi		$len,-0x20
1719	blt		Lctr32_enc8x_five
1720	nop
1721	beq		Lctr32_enc8x_six
1722	cmpwi		$len,0x00
1723	blt		Lctr32_enc8x_seven
1724
# Eight-block tail: finish all eight counter blocks against in0-in7 (already
# xored with the last round key) and store 0x80 bytes.
1725Lctr32_enc8x_eight:
1726	vcipherlast	$out0,$out0,$in0
1727	vcipherlast	$out1,$out1,$in1
1728	vcipherlast	$out2,$out2,$in2
1729	vcipherlast	$out3,$out3,$in3
1730	vcipherlast	$out4,$out4,$in4
1731	vcipherlast	$out5,$out5,$in5
1732	vcipherlast	$out6,$out6,$in6
1733	vcipherlast	$out7,$out7,$in7
1734
1735	le?vperm	$out0,$out0,$out0,$inpperm
1736	le?vperm	$out1,$out1,$out1,$inpperm
1737	stvx_u		$out0,$x00,$out
1738	le?vperm	$out2,$out2,$out2,$inpperm
1739	stvx_u		$out1,$x10,$out
1740	le?vperm	$out3,$out3,$out3,$inpperm
1741	stvx_u		$out2,$x20,$out
1742	le?vperm	$out4,$out4,$out4,$inpperm
1743	stvx_u		$out3,$x30,$out
1744	le?vperm	$out5,$out5,$out5,$inpperm
1745	stvx_u		$out4,$x40,$out
1746	le?vperm	$out6,$out6,$out6,$inpperm
1747	stvx_u		$out5,$x50,$out
1748	le?vperm	$out7,$out7,$out7,$inpperm
1749	stvx_u		$out6,$x60,$out
1750	stvx_u		$out7,$x70,$out
1751	addi		$out,$out,0x80		# advance past the 8 stored blocks
1752	b		Lctr32_enc8x_done
1753
# Seven-block tail: the first seven counter blocks are paired with in1-in7,
# because the input pointer was moved back so the last blocks sit at the end.
1754.align	5
1755Lctr32_enc8x_seven:
1756	vcipherlast	$out0,$out0,$in1
1757	vcipherlast	$out1,$out1,$in2
1758	vcipherlast	$out2,$out2,$in3
1759	vcipherlast	$out3,$out3,$in4
1760	vcipherlast	$out4,$out4,$in5
1761	vcipherlast	$out5,$out5,$in6
1762	vcipherlast	$out6,$out6,$in7
1763
1764	le?vperm	$out0,$out0,$out0,$inpperm
1765	le?vperm	$out1,$out1,$out1,$inpperm
1766	stvx_u		$out0,$x00,$out
1767	le?vperm	$out2,$out2,$out2,$inpperm
1768	stvx_u		$out1,$x10,$out
1769	le?vperm	$out3,$out3,$out3,$inpperm
1770	stvx_u		$out2,$x20,$out
1771	le?vperm	$out4,$out4,$out4,$inpperm
1772	stvx_u		$out3,$x30,$out
1773	le?vperm	$out5,$out5,$out5,$inpperm
1774	stvx_u		$out4,$x40,$out
1775	le?vperm	$out6,$out6,$out6,$inpperm
1776	stvx_u		$out5,$x50,$out
1777	stvx_u		$out6,$x60,$out
1778	addi		$out,$out,0x70		# advance past the 7 stored blocks
1779	b		Lctr32_enc8x_done
1780
# Six-block tail: the first six counter blocks are paired with in2-in7.
1781.align	5
1782Lctr32_enc8x_six:
1783	vcipherlast	$out0,$out0,$in2
1784	vcipherlast	$out1,$out1,$in3
1785	vcipherlast	$out2,$out2,$in4
1786	vcipherlast	$out3,$out3,$in5
1787	vcipherlast	$out4,$out4,$in6
1788	vcipherlast	$out5,$out5,$in7
1789
1790	le?vperm	$out0,$out0,$out0,$inpperm
1791	le?vperm	$out1,$out1,$out1,$inpperm
1792	stvx_u		$out0,$x00,$out
1793	le?vperm	$out2,$out2,$out2,$inpperm
1794	stvx_u		$out1,$x10,$out
1795	le?vperm	$out3,$out3,$out3,$inpperm
1796	stvx_u		$out2,$x20,$out
1797	le?vperm	$out4,$out4,$out4,$inpperm
1798	stvx_u		$out3,$x30,$out
1799	le?vperm	$out5,$out5,$out5,$inpperm
1800	stvx_u		$out4,$x40,$out
1801	stvx_u		$out5,$x50,$out
1802	addi		$out,$out,0x60		# advance past the 6 stored blocks
1803	b		Lctr32_enc8x_done
1804
# Five-block tail: the first five counter blocks are paired with in3-in7.
1805.align	5
1806Lctr32_enc8x_five:
1807	vcipherlast	$out0,$out0,$in3
1808	vcipherlast	$out1,$out1,$in4
1809	vcipherlast	$out2,$out2,$in5
1810	vcipherlast	$out3,$out3,$in6
1811	vcipherlast	$out4,$out4,$in7
1812
1813	le?vperm	$out0,$out0,$out0,$inpperm
1814	le?vperm	$out1,$out1,$out1,$inpperm
1815	stvx_u		$out0,$x00,$out
1816	le?vperm	$out2,$out2,$out2,$inpperm
1817	stvx_u		$out1,$x10,$out
1818	le?vperm	$out3,$out3,$out3,$inpperm
1819	stvx_u		$out2,$x20,$out
1820	le?vperm	$out4,$out4,$out4,$inpperm
1821	stvx_u		$out3,$x30,$out
1822	stvx_u		$out4,$x40,$out
1823	addi		$out,$out,0x50		# advance past the 5 stored blocks
1824	b		Lctr32_enc8x_done
1825
# Four-block tail: the first four counter blocks are paired with in4-in7.
1826.align	5
1827Lctr32_enc8x_four:
1828	vcipherlast	$out0,$out0,$in4
1829	vcipherlast	$out1,$out1,$in5
1830	vcipherlast	$out2,$out2,$in6
1831	vcipherlast	$out3,$out3,$in7
1832
1833	le?vperm	$out0,$out0,$out0,$inpperm
1834	le?vperm	$out1,$out1,$out1,$inpperm
1835	stvx_u		$out0,$x00,$out
1836	le?vperm	$out2,$out2,$out2,$inpperm
1837	stvx_u		$out1,$x10,$out
1838	le?vperm	$out3,$out3,$out3,$inpperm
1839	stvx_u		$out2,$x20,$out
1840	stvx_u		$out3,$x30,$out
1841	addi		$out,$out,0x40		# advance past the 4 stored blocks
1842	b		Lctr32_enc8x_done
1843
# Three-block tail: the first three counter blocks are paired with in5-in7.
# Exits through the CTR epilogue like every other CTR tail; the CBC epilogue
# it used to branch to additionally stores the counter register to the iv
# buffer, which is not something the CTR code does anywhere else.
1844.align	5
1845Lctr32_enc8x_three:
1846	vcipherlast	$out0,$out0,$in5
1847	vcipherlast	$out1,$out1,$in6
1848	vcipherlast	$out2,$out2,$in7
1849
1850	le?vperm	$out0,$out0,$out0,$inpperm
1851	le?vperm	$out1,$out1,$out1,$inpperm
1852	stvx_u		$out0,$x00,$out
1853	le?vperm	$out2,$out2,$out2,$inpperm
1854	stvx_u		$out1,$x10,$out
1855	stvx_u		$out2,$x20,$out
1856	addi		$out,$out,0x30		# advance past the 3 stored blocks
1857	b		Lctr32_enc8x_done
1858
# Two-block tail: the first two counter blocks are paired with in6-in7.
# Exits through the CTR epilogue like every other CTR tail; the CBC epilogue
# it used to branch to additionally stores the counter register to the iv
# buffer, which is not something the CTR code does anywhere else.
1859.align	5
1860Lctr32_enc8x_two:
1861	vcipherlast	$out0,$out0,$in6
1862	vcipherlast	$out1,$out1,$in7
1863
1864	le?vperm	$out0,$out0,$out0,$inpperm
1865	le?vperm	$out1,$out1,$out1,$inpperm
1866	stvx_u		$out0,$x00,$out
1867	stvx_u		$out1,$x10,$out
1868	addi		$out,$out,0x20		# advance past the 2 stored blocks
1869	b		Lctr32_enc8x_done
1870
# One-block tail: the first counter block is paired with in7; execution then
# falls through into the epilogue that follows.
1871.align	5
1872Lctr32_enc8x_one:
1873	vcipherlast	$out0,$out0,$in7
1874
1875	le?vperm	$out0,$out0,$out0,$inpperm
1876	stvx_u		$out0,0,$out
1877	addi		$out,$out,0x10		# advance past the stored block
1878
# Common exit of the 8x CTR code: overwrite the 8 on-stack round-key slots,
# restore vrsave, v20-v31 and r26-r31, release the frame and return.
1879Lctr32_enc8x_done:
1880	li		r10,`$FRAME+15`
1881	li		r11,`$FRAME+31`
1882	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1883	addi		r10,r10,32
1884	stvx		$inpperm,r11,$sp
1885	addi		r11,r11,32
1886	stvx		$inpperm,r10,$sp
1887	addi		r10,r10,32
1888	stvx		$inpperm,r11,$sp
1889	addi		r11,r11,32
1890	stvx		$inpperm,r10,$sp
1891	addi		r10,r10,32
1892	stvx		$inpperm,r11,$sp
1893	addi		r11,r11,32
1894	stvx		$inpperm,r10,$sp
1895	addi		r10,r10,32		# r10 is now the v20 save offset
1896	stvx		$inpperm,r11,$sp
1897	addi		r11,r11,32		# r11 is now the v21 save offset
1898
1899	mtspr		256,$vrsave		# put the incoming vrsave back
1900	lvx		v20,r10,$sp		# ABI says so
1901	addi		r10,r10,32
1902	lvx		v21,r11,$sp
1903	addi		r11,r11,32
1904	lvx		v22,r10,$sp
1905	addi		r10,r10,32
1906	lvx		v23,r11,$sp
1907	addi		r11,r11,32
1908	lvx		v24,r10,$sp
1909	addi		r10,r10,32
1910	lvx		v25,r11,$sp
1911	addi		r11,r11,32
1912	lvx		v26,r10,$sp
1913	addi		r10,r10,32
1914	lvx		v27,r11,$sp
1915	addi		r11,r11,32
1916	lvx		v28,r10,$sp
1917	addi		r10,r10,32
1918	lvx		v29,r11,$sp
1919	addi		r11,r11,32
1920	lvx		v30,r10,$sp
1921	lvx		v31,r11,$sp
1922	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1923	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1924	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1925	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1926	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1927	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1928	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`	# release the frame
1929	blr
1930	.long		0
1931	.byte		0,12,0x14,0,0x80,6,6,0
1932	.long		0
1933.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1934___
1935}}	}}}
1936
1937#########################################################################
1938{{{	# XTS procedures						#
1939# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
1940#                             const AES_KEY *key1, const AES_KEY *key2,	#
1941#                             [const] unsigned char iv[16]);		#
1942# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
1943# input tweak value is assumed to be encrypted already, and last tweak	#
1944# value, one suitable for consecutive call on same chunk of data, is	#
1945# written back to original buffer. In addition, in "tweak chaining"	#
1946# mode only complete input blocks are processed.			#
1947
# Register aliases for the XTS code: general-purpose names map to r3..r10
# and vector names to v0..v12, in the order listed.
1948my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
1949my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
1950my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
1951my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
1952my $taillen = $key2;	# $taillen shares r7 with $key2
1953
# After the swap $inp is r10 and $idx is r3; the entry code copies r3 into
# $inp before r3 is reused.
1954   ($inp,$idx) = ($idx,$inp);				# reassign
1955
1956$code.=<<___;
# XTS encryption entry point.  Returns -1 in r3 when the length is below 16.
# Otherwise loads the unaligned iv as the tweak and, when key2 is not NULL,
# encrypts the tweak with key2 and clears the iv pointer so that no tweak is
# written back at exit.
1957.globl	.${prefix}_xts_encrypt
1958	mr		$inp,r3				# reassign
1959	li		r3,-1				# return value if len is below 16
1960	${UCMP}i	$len,16
1961	bltlr-
1962
1963	lis		r0,0xfff0			# r0 = 0xfff00000, new vrsave mask
1964	mfspr		r12,256				# save vrsave
1965	li		r11,0
1966	mtspr		256,r0
1967
1968	vspltisb	$seven,0x07			# 0x070707..07
1969	le?lvsl		$leperm,r11,r11
1970	le?vspltisb	$tmp,0x0f
1971	le?vxor		$leperm,$leperm,$seven
1972
1973	li		$idx,15
1974	lvx		$tweak,0,$ivp			# load [unaligned] iv
1975	lvsl		$inpperm,0,$ivp
1976	lvx		$inptail,$idx,$ivp
1977	le?vxor		$inpperm,$inpperm,$tmp
1978	vperm		$tweak,$tweak,$inptail,$inpperm
1979
1980	neg		r11,$inp
1981	lvsr		$inpperm,0,r11			# prepare for unaligned load
1982	lvx		$inout,0,$inp
1983	addi		$inp,$inp,15			# 15 is not typo
1984	le?vxor		$inpperm,$inpperm,$tmp
1985
1986	${UCMP}i	$key2,0				# key2==NULL?
1987	beq		Lxts_enc_no_key2
1988
1989	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
1990	lwz		$rounds,240($key2)		# 32-bit word at offset 240 of key2
1991	srwi		$rounds,$rounds,1		# two rounds per loop pass,
1992	subi		$rounds,$rounds,1		# so the loop count is rounds/2-1
1993	li		$idx,16
1994
1995	lvx		$rndkey0,0,$key2
1996	lvx		$rndkey1,$idx,$key2
1997	addi		$idx,$idx,16
1998	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1999	vxor		$tweak,$tweak,$rndkey0
2000	lvx		$rndkey0,$idx,$key2
2001	addi		$idx,$idx,16
2002	mtctr		$rounds
2003
2004Ltweak_xts_enc:
2005	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2006	vcipher		$tweak,$tweak,$rndkey1
2007	lvx		$rndkey1,$idx,$key2
2008	addi		$idx,$idx,16
2009	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2010	vcipher		$tweak,$tweak,$rndkey0
2011	lvx		$rndkey0,$idx,$key2
2012	addi		$idx,$idx,16
2013	bdnz		Ltweak_xts_enc
2014
2015	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2016	vcipher		$tweak,$tweak,$rndkey1
2017	lvx		$rndkey1,$idx,$key2
2018	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2019	vcipherlast	$tweak,$tweak,$rndkey0
2020
2021	li		$ivp,0				# don't chain the tweak
2022	b		Lxts_enc
2023
# Common XTS encryption setup.  The no-key2 entry first rounds the length
# down to a multiple of 16.  Lengths of 96 and above go to the 6x code; the
# rest prime the one-block loop at Loop_xts_enc.
2024Lxts_enc_no_key2:
2025	li		$idx,-16
2026	and		$len,$len,$idx			# in "tweak chaining"
2027							# mode only complete
2028							# blocks are processed
2029Lxts_enc:
2030	lvx		$inptail,0,$inp
2031	addi		$inp,$inp,16
2032
2033	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2034	lwz		$rounds,240($key1)		# 32-bit word at offset 240 of key1
2035	srwi		$rounds,$rounds,1
2036	subi		$rounds,$rounds,1
2037	li		$idx,16
2038
2039	vslb		$eighty7,$seven,$seven		# 0x808080..80
2040	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2041	vspltisb	$tmp,1				# 0x010101..01
2042	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2043
2044	${UCMP}i	$len,96
2045	bge		_aesp8_xts_encrypt6x
2046
2047	andi.		$taillen,$len,15		# bytes beyond the last full block
2048	subic		r0,$len,32
2049	subi		$taillen,$taillen,16
2050	subfe		r0,r0,r0			# -1 if len is below 32, else 0
2051	and		r0,r0,$taillen
2052	add		$inp,$inp,r0			# pull input pointer back when len is below 32
2053
2054	lvx		$rndkey0,0,$key1
2055	lvx		$rndkey1,$idx,$key1
2056	addi		$idx,$idx,16
2057	vperm		$inout,$inout,$inptail,$inpperm
2058	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2059	vxor		$inout,$inout,$tweak
2060	vxor		$inout,$inout,$rndkey0
2061	lvx		$rndkey0,$idx,$key1
2062	addi		$idx,$idx,16
2063	mtctr		$rounds
2064	b		Loop_xts_enc
2065
# One-block XTS encryption loop.  Each pass encrypts one tweaked block and
# stores it, then derives the next tweak and primes the next block.  When
# fewer than 16 bytes remain, the leading bytes of the block just written
# are copied 16 bytes forward (Loop_xts_enc_steal), the partial input is
# merged with the previous output, and the loop runs one more time to
# overwrite that previous block.
2066.align	5
2067Loop_xts_enc:
2068	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2069	vcipher		$inout,$inout,$rndkey1
2070	lvx		$rndkey1,$idx,$key1
2071	addi		$idx,$idx,16
2072	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2073	vcipher		$inout,$inout,$rndkey0
2074	lvx		$rndkey0,$idx,$key1
2075	addi		$idx,$idx,16
2076	bdnz		Loop_xts_enc
2077
2078	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2079	vcipher		$inout,$inout,$rndkey1
2080	lvx		$rndkey1,$idx,$key1
2081	li		$idx,16
2082	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2083	vxor		$rndkey0,$rndkey0,$tweak	# fold tweak into the last round key
2084	vcipherlast	$output,$inout,$rndkey0
2085
2086	le?vperm	$tmp,$output,$output,$leperm
2087	be?nop
2088	le?stvx_u	$tmp,0,$out
2089	be?stvx_u	$output,0,$out
2090	addi		$out,$out,16
2091
2092	subic.		$len,$len,16
2093	beq		Lxts_enc_done
2094
2095	vmr		$inout,$inptail
2096	lvx		$inptail,0,$inp
2097	addi		$inp,$inp,16
2098	lvx		$rndkey0,0,$key1
2099	lvx		$rndkey1,$idx,$key1
2100	addi		$idx,$idx,16
2101
2102	subic		r0,$len,32
2103	subfe		r0,r0,r0			# -1 if len is below 32, else 0
2104	and		r0,r0,$taillen
2105	add		$inp,$inp,r0
2106
2107	vsrab		$tmp,$tweak,$seven		# next tweak value
2108	vaddubm		$tweak,$tweak,$tweak
2109	vsldoi		$tmp,$tmp,$tmp,15
2110	vand		$tmp,$tmp,$eighty7
2111	vxor		$tweak,$tweak,$tmp
2112
2113	vperm		$inout,$inout,$inptail,$inpperm
2114	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2115	vxor		$inout,$inout,$tweak
2116	vxor		$output,$output,$rndkey0	# just in case $len<16
2117	vxor		$inout,$inout,$rndkey0
2118	lvx		$rndkey0,$idx,$key1
2119	addi		$idx,$idx,16
2120
2121	mtctr		$rounds
2122	${UCMP}i	$len,16
2123	bge		Loop_xts_enc
2124
2125	vxor		$output,$output,$tweak
2126	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2127	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2128	vspltisb	$tmp,-1
2129	vperm		$inptail,$inptail,$tmp,$inpperm
2130	vsel		$inout,$inout,$output,$inptail
2131
2132	subi		r11,$out,17			# one byte before the block just stored
2133	subi		$out,$out,16			# step back onto that block
2134	mtctr		$len				# copy as many bytes as remain
2135	li		$len,16
2136Loop_xts_enc_steal:
2137	lbzu		r0,1(r11)
2138	stb		r0,16(r11)
2139	bdnz		Loop_xts_enc_steal
2140
2141	mtctr		$rounds
2142	b		Loop_xts_enc			# one more time...
2143
# XTS encryption exit.  When the iv pointer is still non-zero (it is cleared
# on the key2 path), the next tweak value is computed and stored back through
# it.  vrsave is then restored and 0 is returned in r3.
2144Lxts_enc_done:
2145	${UCMP}i	$ivp,0
2146	beq		Lxts_enc_ret
2147
2148	vsrab		$tmp,$tweak,$seven		# next tweak value
2149	vaddubm		$tweak,$tweak,$tweak
2150	vsldoi		$tmp,$tmp,$tmp,15
2151	vand		$tmp,$tmp,$eighty7
2152	vxor		$tweak,$tweak,$tmp
2153
2154	le?vperm	$tweak,$tweak,$tweak,$leperm
2155	stvx_u		$tweak,0,$ivp
2156
2157Lxts_enc_ret:
2158	mtspr		256,r12				# restore vrsave
2159	li		r3,0				# success return value
2160	blr
2161	.long		0
2162	.byte		0,12,0x04,0,0x80,6,6,0
2163	.long		0
2164.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2165
# ${prefix}_xts_decrypt: AES-XTS decryption, one block per iteration, with a
# branch to _aesp8_xts_decrypt6x once at least 96 bytes are to be processed.
# The input pointer arrives in r3 and is moved to $inp; r3 then carries the
# return value: -1 if $len is below 16, otherwise 0.
# With a non-NULL $key2 the IV is encrypted under $key2 to form the initial
# tweak and nothing is written back.  With a NULL $key2 the IV is used as
# the tweak directly, only complete blocks are processed, and the follow-on
# tweak is stored through $ivp on exit ("tweak chaining").
2166.globl	.${prefix}_xts_decrypt
2167	mr		$inp,r3				# reassign
2168	li		r3,-1
2169	${UCMP}i	$len,16
2170	bltlr-
2171
# Keep the incoming VRSAVE in r12 (restored at Lxts_dec_ret) and install
# 0xfff80000 while this routine runs.
2172	lis		r0,0xfff8
2173	mfspr		r12,256				# save vrsave
2174	li		r11,0
2175	mtspr		256,r0
2176
# r0 becomes 16 when $len is not a multiple of 16 and 0 otherwise, so one
# complete block is withheld from the main loop whenever a partial tail
# exists.  Ltail_xts_dec handles that block together with the tail.
2177	andi.		r0,$len,15
2178	neg		r0,r0
2179	andi.		r0,r0,16
2180	sub		$len,$len,r0
2181
2182	vspltisb	$seven,0x07			# 0x070707..07
2183	le?lvsl		$leperm,r11,r11
2184	le?vspltisb	$tmp,0x0f
2185	le?vxor		$leperm,$leperm,$seven
2186
2187	li		$idx,15
2188	lvx		$tweak,0,$ivp			# load [unaligned] iv
2189	lvsl		$inpperm,0,$ivp
2190	lvx		$inptail,$idx,$ivp
2191	le?vxor		$inpperm,$inpperm,$tmp
2192	vperm		$tweak,$tweak,$inptail,$inpperm
2193
2194	neg		r11,$inp
2195	lvsr		$inpperm,0,r11			# prepare for unaligned load
2196	lvx		$inout,0,$inp
2197	addi		$inp,$inp,15			# 15 is not typo
2198	le?vxor		$inpperm,$inpperm,$tmp
2199
2200	${UCMP}i	$key2,0				# key2==NULL?
2201	beq		Lxts_dec_no_key2
2202
# Encrypt the IV under $key2 (vcipher, two rounds per loop pass) to obtain
# the initial tweak.  The round count is read from offset 240 of the key.
2203	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2204	lwz		$rounds,240($key2)
2205	srwi		$rounds,$rounds,1
2206	subi		$rounds,$rounds,1
2207	li		$idx,16
2208
2209	lvx		$rndkey0,0,$key2
2210	lvx		$rndkey1,$idx,$key2
2211	addi		$idx,$idx,16
2212	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2213	vxor		$tweak,$tweak,$rndkey0
2214	lvx		$rndkey0,$idx,$key2
2215	addi		$idx,$idx,16
2216	mtctr		$rounds
2217
2218Ltweak_xts_dec:
2219	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2220	vcipher		$tweak,$tweak,$rndkey1
2221	lvx		$rndkey1,$idx,$key2
2222	addi		$idx,$idx,16
2223	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2224	vcipher		$tweak,$tweak,$rndkey0
2225	lvx		$rndkey0,$idx,$key2
2226	addi		$idx,$idx,16
2227	bdnz		Ltweak_xts_dec
2228
2229	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2230	vcipher		$tweak,$tweak,$rndkey1
2231	lvx		$rndkey1,$idx,$key2
2232	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2233	vcipherlast	$tweak,$tweak,$rndkey0
2234
2235	li		$ivp,0				# don't chain the tweak
2236	b		Lxts_dec
2237
# NULL key2: round $len up to the next multiple of 16.
2238Lxts_dec_no_key2:
2239	neg		$idx,$len
2240	andi.		$idx,$idx,15
2241	add		$len,$len,$idx			# in "tweak chaining"
2242							# mode only complete
2243							# blocks are processed
2244Lxts_dec:
2245	lvx		$inptail,0,$inp
2246	addi		$inp,$inp,16
2247
2248	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2249	lwz		$rounds,240($key1)
2250	srwi		$rounds,$rounds,1
2251	subi		$rounds,$rounds,1
2252	li		$idx,16
2253
# Build the constant that is masked into the shifted-out bits when the
# tweak is advanced to its next value.
2254	vslb		$eighty7,$seven,$seven		# 0x808080..80
2255	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2256	vspltisb	$tmp,1				# 0x010101..01
2257	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2258
# Six-block path for 96 bytes or more.  It is entered with a plain branch
# and therefore returns straight to our caller.
2259	${UCMP}i	$len,96
2260	bge		_aesp8_xts_decrypt6x
2261
# First block: assemble the unaligned input, whiten it with the tweak and
# apply round key 0.
2262	lvx		$rndkey0,0,$key1
2263	lvx		$rndkey1,$idx,$key1
2264	addi		$idx,$idx,16
2265	vperm		$inout,$inout,$inptail,$inpperm
2266	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2267	vxor		$inout,$inout,$tweak
2268	vxor		$inout,$inout,$rndkey0
2269	lvx		$rndkey0,$idx,$key1
2270	addi		$idx,$idx,16
2271	mtctr		$rounds
2272
# $len below 16 here means only the withheld block plus a partial tail
# remain, so go directly to the stealing code.
2273	${UCMP}i	$len,16
2274	blt		Ltail_xts_dec
2275	be?b		Loop_xts_dec
2276
2277.align	5
# Decrypt one block, two vncipher rounds per pass.  The last round key is
# xored with the tweak so that vncipherlast also removes the whitening.
2278Loop_xts_dec:
2279	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2280	vncipher	$inout,$inout,$rndkey1
2281	lvx		$rndkey1,$idx,$key1
2282	addi		$idx,$idx,16
2283	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2284	vncipher	$inout,$inout,$rndkey0
2285	lvx		$rndkey0,$idx,$key1
2286	addi		$idx,$idx,16
2287	bdnz		Loop_xts_dec
2288
2289	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2290	vncipher	$inout,$inout,$rndkey1
2291	lvx		$rndkey1,$idx,$key1
2292	li		$idx,16
2293	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2294	vxor		$rndkey0,$rndkey0,$tweak
2295	vncipherlast	$output,$inout,$rndkey0
2296
2297	le?vperm	$tmp,$output,$output,$leperm
2298	be?nop
2299	le?stvx_u	$tmp,0,$out
2300	be?stvx_u	$output,0,$out
2301	addi		$out,$out,16
2302
2303	subic.		$len,$len,16
2304	beq		Lxts_dec_done
2305
# Fetch the next block, restart the key schedule and advance the tweak.
2306	vmr		$inout,$inptail
2307	lvx		$inptail,0,$inp
2308	addi		$inp,$inp,16
2309	lvx		$rndkey0,0,$key1
2310	lvx		$rndkey1,$idx,$key1
2311	addi		$idx,$idx,16
2312
2313	vsrab		$tmp,$tweak,$seven		# next tweak value
2314	vaddubm		$tweak,$tweak,$tweak
2315	vsldoi		$tmp,$tmp,$tmp,15
2316	vand		$tmp,$tmp,$eighty7
2317	vxor		$tweak,$tweak,$tmp
2318
2319	vperm		$inout,$inout,$inptail,$inpperm
2320	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2321	vxor		$inout,$inout,$tweak
2322	vxor		$inout,$inout,$rndkey0
2323	lvx		$rndkey0,$idx,$key1
2324	addi		$idx,$idx,16
2325
2326	mtctr		$rounds
2327	${UCMP}i	$len,16
2328	bge		Loop_xts_dec
2329
# Ciphertext stealing.  The last complete block is decrypted under the
# following tweak ($tweak1) while $tweak is kept for the merged final
# block, so the whitening already applied with $tweak is swapped for
# $tweak1 by the two vxor instructions below.
2330Ltail_xts_dec:
2331	vsrab		$tmp,$tweak,$seven		# next tweak value
2332	vaddubm		$tweak1,$tweak,$tweak
2333	vsldoi		$tmp,$tmp,$tmp,15
2334	vand		$tmp,$tmp,$eighty7
2335	vxor		$tweak1,$tweak1,$tmp
2336
2337	subi		$inp,$inp,16
2338	add		$inp,$inp,$len
2339
2340	vxor		$inout,$inout,$tweak		# :-(
2341	vxor		$inout,$inout,$tweak1		# :-)
2342
2343Loop_xts_dec_short:
2344	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2345	vncipher	$inout,$inout,$rndkey1
2346	lvx		$rndkey1,$idx,$key1
2347	addi		$idx,$idx,16
2348	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2349	vncipher	$inout,$inout,$rndkey0
2350	lvx		$rndkey0,$idx,$key1
2351	addi		$idx,$idx,16
2352	bdnz		Loop_xts_dec_short
2353
2354	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2355	vncipher	$inout,$inout,$rndkey1
2356	lvx		$rndkey1,$idx,$key1
2357	li		$idx,16
2358	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2359	vxor		$rndkey0,$rndkey0,$tweak1
2360	vncipherlast	$output,$inout,$rndkey0
2361
# Store the block at $out without advancing $out.  The steal loop below
# copies its first $len bytes one block further on, and the final pass
# through Loop_xts_dec then overwrites this slot.
2362	le?vperm	$tmp,$output,$output,$leperm
2363	be?nop
2364	le?stvx_u	$tmp,0,$out
2365	be?stvx_u	$output,0,$out
2366
2367	vmr		$inout,$inptail
2368	lvx		$inptail,0,$inp
2369	#addi		$inp,$inp,16
2370	lvx		$rndkey0,0,$key1
2371	lvx		$rndkey1,$idx,$key1
2372	addi		$idx,$idx,16
2373	vperm		$inout,$inout,$inptail,$inpperm
2374	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2375
# Build a byte mask from $len and merge the partial input with the block
# just decrypted, then whiten the merged block with $tweak and round key 0.
2376	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2377	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2378	vspltisb	$tmp,-1
2379	vperm		$inptail,$inptail,$tmp,$inpperm
2380	vsel		$inout,$inout,$output,$inptail
2381
2382	vxor		$rndkey0,$rndkey0,$tweak
2383	vxor		$inout,$inout,$rndkey0
2384	lvx		$rndkey0,$idx,$key1
2385	addi		$idx,$idx,16
2386
# Copy $len bytes from $out to $out+16, one byte per pass, then set $len
# to 16 so that the pass through Loop_xts_dec is the last one.
2387	subi		r11,$out,1
2388	mtctr		$len
2389	li		$len,16
2390Loop_xts_dec_steal:
2391	lbzu		r0,1(r11)
2392	stb		r0,16(r11)
2393	bdnz		Loop_xts_dec_steal
2394
2395	mtctr		$rounds
2396	b		Loop_xts_dec			# one more time...
2397
# $ivp is zero when key2 was supplied, in which case no tweak is written
# back.  Otherwise the next tweak value is computed and stored through it.
2398Lxts_dec_done:
2399	${UCMP}i	$ivp,0
2400	beq		Lxts_dec_ret
2401
2402	vsrab		$tmp,$tweak,$seven		# next tweak value
2403	vaddubm		$tweak,$tweak,$tweak
2404	vsldoi		$tmp,$tmp,$tmp,15
2405	vand		$tmp,$tmp,$eighty7
2406	vxor		$tweak,$tweak,$tmp
2407
2408	le?vperm	$tweak,$tweak,$tweak,$leperm
2409	stvx_u		$tweak,0,$ivp
2410
2411Lxts_dec_ret:
2412	mtspr		256,r12				# restore vrsave
2413	li		r3,0
2414	blr
2415	.long		0
2416	.byte		0,12,0x04,0,0x80,6,6,0
2417	.long		0
2418.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2419___
2420#########################################################################
# Register names for the six-block XTS templates that follow.  Several of
# them alias registers already in use by the single-block code.
2421{{	# Optimized XTS procedures					#
# $key_ shares the register of $key2; the 6x code uses it as the pointer
# into the on-stack copy of the round keys.
2422my $key_=$key2;
# Index registers r0, r3, r26..r31.  The 6x prologues load $x10..$x70 with
# 0x10..0x70; $x00 is used where an index of zero is wanted and becomes the
# literal 0 on osx flavours.
2423my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2424    $x00=0 if ($flavour =~ /osx/);
# Six input blocks (v0..v5), six working/output blocks (v7, v12..v16) and
# six per-block tweaks (v17..v22).
2425my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
2426my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2427my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2428my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
2429			# v26-v31 last 6 round keys
# $keyperm is the same register as $out0 (v7), so the key permute vector is
# only valid until $out0 is first written.
2430my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
# $taillen reuses $x70 (r31): the 0x70 offset is needed only while the key
# schedule is loaded, after which the register holds the length modulo 16.
2431my $taillen=$x70;
2432
2433$code.=<<___;
# _aesp8_xts_encrypt6x: XTS encryption, six blocks per iteration.
# NOTE(review): presumably reached by a plain branch from the single-block
# encrypt routine once $len is at least 96, mirroring the decrypt side -
# confirm against the caller, which is outside this excerpt.
# The routine consumes register state prepared before entry ($inout,
# $inptail, $inpperm, $tweak, $keyperm, $rounds, $seven, $eighty7), builds
# its own stack frame and returns with blr after tearing it down.
2434.align	5
2435_aesp8_xts_encrypt6x:
# Prologue: allocate the frame, keep LR in r11, save v20-v31, VRSAVE and
# r26-r31, load the offset constants 0x10..0x70 and set VRSAVE to all ones.
2436	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2437	mflr		r11
2438	li		r7,`$FRAME+8*16+15`
2439	li		r3,`$FRAME+8*16+31`
2440	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2441	stvx		v20,r7,$sp		# ABI says so
2442	addi		r7,r7,32
2443	stvx		v21,r3,$sp
2444	addi		r3,r3,32
2445	stvx		v22,r7,$sp
2446	addi		r7,r7,32
2447	stvx		v23,r3,$sp
2448	addi		r3,r3,32
2449	stvx		v24,r7,$sp
2450	addi		r7,r7,32
2451	stvx		v25,r3,$sp
2452	addi		r3,r3,32
2453	stvx		v26,r7,$sp
2454	addi		r7,r7,32
2455	stvx		v27,r3,$sp
2456	addi		r3,r3,32
2457	stvx		v28,r7,$sp
2458	addi		r7,r7,32
2459	stvx		v29,r3,$sp
2460	addi		r3,r3,32
2461	stvx		v30,r7,$sp
2462	stvx		v31,r3,$sp
2463	li		r0,-1
2464	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
2465	li		$x10,0x10
2466	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2467	li		$x20,0x20
2468	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2469	li		$x30,0x30
2470	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2471	li		$x40,0x40
2472	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2473	li		$x50,0x50
2474	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2475	li		$x60,0x60
2476	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2477	li		$x70,0x70
2478	mtspr		256,r0
2479
2480	subi		$rounds,$rounds,3	# -4 in total
2481
# Key schedule: round key 0 stays in $rndkey0, the early round keys are
# aligned and copied to the stack (read back through $key_ via v24/v25),
# and the final round keys are kept resident in v26-v31.
2482	lvx		$rndkey0,$x00,$key1	# load key schedule
2483	lvx		v30,$x10,$key1
2484	addi		$key1,$key1,0x20
2485	lvx		v31,$x00,$key1
2486	?vperm		$rndkey0,$rndkey0,v30,$keyperm
2487	addi		$key_,$sp,$FRAME+15
2488	mtctr		$rounds
2489
2490Load_xts_enc_key:
2491	?vperm		v24,v30,v31,$keyperm
2492	lvx		v30,$x10,$key1
2493	addi		$key1,$key1,0x20
2494	stvx		v24,$x00,$key_		# off-load round[1]
2495	?vperm		v25,v31,v30,$keyperm
2496	lvx		v31,$x00,$key1
2497	stvx		v25,$x10,$key_		# off-load round[2]
2498	addi		$key_,$key_,0x20
2499	bdnz		Load_xts_enc_key
2500
2501	lvx		v26,$x10,$key1
2502	?vperm		v24,v30,v31,$keyperm
2503	lvx		v27,$x20,$key1
2504	stvx		v24,$x00,$key_		# off-load round[3]
2505	?vperm		v25,v31,v26,$keyperm
2506	lvx		v28,$x30,$key1
2507	stvx		v25,$x10,$key_		# off-load round[4]
2508	addi		$key_,$sp,$FRAME+15	# rewind $key_
2509	?vperm		v26,v26,v27,$keyperm
2510	lvx		v29,$x40,$key1
2511	?vperm		v27,v27,v28,$keyperm
2512	lvx		v30,$x50,$key1
2513	?vperm		v28,v28,v29,$keyperm
2514	lvx		v31,$x60,$key1
2515	?vperm		v29,v29,v30,$keyperm
2516	lvx		$twk5,$x70,$key1	# borrow $twk5
2517	?vperm		v30,v30,v31,$keyperm
2518	lvx		v24,$x00,$key_		# pre-load round[1]
2519	?vperm		v31,v31,$twk5,$keyperm
2520	lvx		v25,$x10,$key_		# pre-load round[2]
2521
# Prime six blocks.  Each twkN is the running tweak xored with round key 0,
# so one vxor both whitens the input and applies round 0.  The tweak is
# advanced after each block.  The first block comes from the registers
# loaded before entry; the remaining five are loaded here.
2522	 vperm		$in0,$inout,$inptail,$inpperm
2523	 subi		$inp,$inp,31		# undo "caller"
2524	vxor		$twk0,$tweak,$rndkey0
2525	vsrab		$tmp,$tweak,$seven	# next tweak value
2526	vaddubm		$tweak,$tweak,$tweak
2527	vsldoi		$tmp,$tmp,$tmp,15
2528	vand		$tmp,$tmp,$eighty7
2529	 vxor		$out0,$in0,$twk0
2530	vxor		$tweak,$tweak,$tmp
2531
2532	 lvx_u		$in1,$x10,$inp
2533	vxor		$twk1,$tweak,$rndkey0
2534	vsrab		$tmp,$tweak,$seven	# next tweak value
2535	vaddubm		$tweak,$tweak,$tweak
2536	vsldoi		$tmp,$tmp,$tmp,15
2537	 le?vperm	$in1,$in1,$in1,$leperm
2538	vand		$tmp,$tmp,$eighty7
2539	 vxor		$out1,$in1,$twk1
2540	vxor		$tweak,$tweak,$tmp
2541
# From here on $taillen holds $len modulo 16 and $len only counts
# complete blocks.
2542	 lvx_u		$in2,$x20,$inp
2543	 andi.		$taillen,$len,15
2544	vxor		$twk2,$tweak,$rndkey0
2545	vsrab		$tmp,$tweak,$seven	# next tweak value
2546	vaddubm		$tweak,$tweak,$tweak
2547	vsldoi		$tmp,$tmp,$tmp,15
2548	 le?vperm	$in2,$in2,$in2,$leperm
2549	vand		$tmp,$tmp,$eighty7
2550	 vxor		$out2,$in2,$twk2
2551	vxor		$tweak,$tweak,$tmp
2552
2553	 lvx_u		$in3,$x30,$inp
2554	 sub		$len,$len,$taillen
2555	vxor		$twk3,$tweak,$rndkey0
2556	vsrab		$tmp,$tweak,$seven	# next tweak value
2557	vaddubm		$tweak,$tweak,$tweak
2558	vsldoi		$tmp,$tmp,$tmp,15
2559	 le?vperm	$in3,$in3,$in3,$leperm
2560	vand		$tmp,$tmp,$eighty7
2561	 vxor		$out3,$in3,$twk3
2562	vxor		$tweak,$tweak,$tmp
2563
2564	 lvx_u		$in4,$x40,$inp
2565	 subi		$len,$len,0x60
2566	vxor		$twk4,$tweak,$rndkey0
2567	vsrab		$tmp,$tweak,$seven	# next tweak value
2568	vaddubm		$tweak,$tweak,$tweak
2569	vsldoi		$tmp,$tmp,$tmp,15
2570	 le?vperm	$in4,$in4,$in4,$leperm
2571	vand		$tmp,$tmp,$eighty7
2572	 vxor		$out4,$in4,$twk4
2573	vxor		$tweak,$tweak,$tmp
2574
2575	 lvx_u		$in5,$x50,$inp
2576	 addi		$inp,$inp,0x60
2577	vxor		$twk5,$tweak,$rndkey0
2578	vsrab		$tmp,$tweak,$seven	# next tweak value
2579	vaddubm		$tweak,$tweak,$tweak
2580	vsldoi		$tmp,$tmp,$tmp,15
2581	 le?vperm	$in5,$in5,$in5,$leperm
2582	vand		$tmp,$tmp,$eighty7
2583	 vxor		$out5,$in5,$twk5
2584	vxor		$tweak,$tweak,$tmp
2585
# Fold round key 0 into the last round key: twkN xor v31 then equals the
# plain tweak xor the last round key, which is what vcipherlast is fed.
2586	vxor		v31,v31,$rndkey0
2587	mtctr		$rounds
2588	b		Loop_xts_enc6x
2589
2590.align	5
# Main loop.  The counted part runs the early rounds from the stack copy,
# two rounds per pass.  The straight-line part that follows runs the
# remaining rounds from v24-v31 while interleaving the tweak updates, the
# loads for the next six blocks and the stores of the current six.
2591Loop_xts_enc6x:
2592	vcipher		$out0,$out0,v24
2593	vcipher		$out1,$out1,v24
2594	vcipher		$out2,$out2,v24
2595	vcipher		$out3,$out3,v24
2596	vcipher		$out4,$out4,v24
2597	vcipher		$out5,$out5,v24
2598	lvx		v24,$x20,$key_		# round[3]
2599	addi		$key_,$key_,0x20
2600
2601	vcipher		$out0,$out0,v25
2602	vcipher		$out1,$out1,v25
2603	vcipher		$out2,$out2,v25
2604	vcipher		$out3,$out3,v25
2605	vcipher		$out4,$out4,v25
2606	vcipher		$out5,$out5,v25
2607	lvx		v25,$x10,$key_		# round[4]
2608	bdnz		Loop_xts_enc6x
2609
2610	subic		$len,$len,96		# $len-=96
2611	 vxor		$in0,$twk0,v31		# xor with last round key
2612	vcipher		$out0,$out0,v24
2613	vcipher		$out1,$out1,v24
2614	 vsrab		$tmp,$tweak,$seven	# next tweak value
2615	 vxor		$twk0,$tweak,$rndkey0
2616	 vaddubm	$tweak,$tweak,$tweak
2617	vcipher		$out2,$out2,v24
2618	vcipher		$out3,$out3,v24
2619	 vsldoi		$tmp,$tmp,$tmp,15
2620	vcipher		$out4,$out4,v24
2621	vcipher		$out5,$out5,v24
2622
# The record form sets CR0 from r0.  That CR0 value is what the loop-back
# branch at the bottom of this iteration tests, so nothing in between may
# alter CR0.
2623	subfe.		r0,r0,r0		# borrow?-1:0
2624	 vand		$tmp,$tmp,$eighty7
2625	vcipher		$out0,$out0,v25
2626	vcipher		$out1,$out1,v25
2627	 vxor		$tweak,$tweak,$tmp
2628	vcipher		$out2,$out2,v25
2629	vcipher		$out3,$out3,v25
2630	 vxor		$in1,$twk1,v31
2631	 vsrab		$tmp,$tweak,$seven	# next tweak value
2632	 vxor		$twk1,$tweak,$rndkey0
2633	vcipher		$out4,$out4,v25
2634	vcipher		$out5,$out5,v25
2635
2636	and		r0,r0,$len
2637	 vaddubm	$tweak,$tweak,$tweak
2638	 vsldoi		$tmp,$tmp,$tmp,15
2639	vcipher		$out0,$out0,v26
2640	vcipher		$out1,$out1,v26
2641	 vand		$tmp,$tmp,$eighty7
2642	vcipher		$out2,$out2,v26
2643	vcipher		$out3,$out3,v26
2644	 vxor		$tweak,$tweak,$tmp
2645	vcipher		$out4,$out4,v26
2646	vcipher		$out5,$out5,v26
2647
2648	add		$inp,$inp,r0		# $inp is adjusted in such
2649						# way that at exit from the
2650						# loop inX-in5 are loaded
2651						# with last "words"
2652	 vxor		$in2,$twk2,v31
2653	 vsrab		$tmp,$tweak,$seven	# next tweak value
2654	 vxor		$twk2,$tweak,$rndkey0
2655	 vaddubm	$tweak,$tweak,$tweak
2656	vcipher		$out0,$out0,v27
2657	vcipher		$out1,$out1,v27
2658	 vsldoi		$tmp,$tmp,$tmp,15
2659	vcipher		$out2,$out2,v27
2660	vcipher		$out3,$out3,v27
2661	 vand		$tmp,$tmp,$eighty7
2662	vcipher		$out4,$out4,v27
2663	vcipher		$out5,$out5,v27
2664
2665	addi		$key_,$sp,$FRAME+15	# rewind $key_
2666	 vxor		$tweak,$tweak,$tmp
2667	vcipher		$out0,$out0,v28
2668	vcipher		$out1,$out1,v28
2669	 vxor		$in3,$twk3,v31
2670	 vsrab		$tmp,$tweak,$seven	# next tweak value
2671	 vxor		$twk3,$tweak,$rndkey0
2672	vcipher		$out2,$out2,v28
2673	vcipher		$out3,$out3,v28
2674	 vaddubm	$tweak,$tweak,$tweak
2675	 vsldoi		$tmp,$tmp,$tmp,15
2676	vcipher		$out4,$out4,v28
2677	vcipher		$out5,$out5,v28
2678	lvx		v24,$x00,$key_		# re-pre-load round[1]
2679	 vand		$tmp,$tmp,$eighty7
2680
2681	vcipher		$out0,$out0,v29
2682	vcipher		$out1,$out1,v29
2683	 vxor		$tweak,$tweak,$tmp
2684	vcipher		$out2,$out2,v29
2685	vcipher		$out3,$out3,v29
2686	 vxor		$in4,$twk4,v31
2687	 vsrab		$tmp,$tweak,$seven	# next tweak value
2688	 vxor		$twk4,$tweak,$rndkey0
2689	vcipher		$out4,$out4,v29
2690	vcipher		$out5,$out5,v29
2691	lvx		v25,$x10,$key_		# re-pre-load round[2]
2692	 vaddubm	$tweak,$tweak,$tweak
2693	 vsldoi		$tmp,$tmp,$tmp,15
2694
2695	vcipher		$out0,$out0,v30
2696	vcipher		$out1,$out1,v30
2697	 vand		$tmp,$tmp,$eighty7
2698	vcipher		$out2,$out2,v30
2699	vcipher		$out3,$out3,v30
2700	 vxor		$tweak,$tweak,$tmp
2701	vcipher		$out4,$out4,v30
2702	vcipher		$out5,$out5,v30
2703	 vxor		$in5,$twk5,v31
2704	 vsrab		$tmp,$tweak,$seven	# next tweak value
2705	 vxor		$twk5,$tweak,$rndkey0
2706
2707	vcipherlast	$out0,$out0,$in0
2708	 lvx_u		$in0,$x00,$inp		# load next input block
2709	 vaddubm	$tweak,$tweak,$tweak
2710	 vsldoi		$tmp,$tmp,$tmp,15
2711	vcipherlast	$out1,$out1,$in1
2712	 lvx_u		$in1,$x10,$inp
2713	vcipherlast	$out2,$out2,$in2
2714	 le?vperm	$in0,$in0,$in0,$leperm
2715	 lvx_u		$in2,$x20,$inp
2716	 vand		$tmp,$tmp,$eighty7
2717	vcipherlast	$out3,$out3,$in3
2718	 le?vperm	$in1,$in1,$in1,$leperm
2719	 lvx_u		$in3,$x30,$inp
2720	vcipherlast	$out4,$out4,$in4
2721	 le?vperm	$in2,$in2,$in2,$leperm
2722	 lvx_u		$in4,$x40,$inp
2723	 vxor		$tweak,$tweak,$tmp
2724	vcipherlast	$tmp,$out5,$in5		# last block might be needed
2725						# in stealing mode
2726	 le?vperm	$in3,$in3,$in3,$leperm
2727	 lvx_u		$in5,$x50,$inp
2728	 addi		$inp,$inp,0x60
2729	 le?vperm	$in4,$in4,$in4,$leperm
2730	 le?vperm	$in5,$in5,$in5,$leperm
2731
2732	le?vperm	$out0,$out0,$out0,$leperm
2733	le?vperm	$out1,$out1,$out1,$leperm
2734	stvx_u		$out0,$x00,$out		# store output
2735	 vxor		$out0,$in0,$twk0
2736	le?vperm	$out2,$out2,$out2,$leperm
2737	stvx_u		$out1,$x10,$out
2738	 vxor		$out1,$in1,$twk1
2739	le?vperm	$out3,$out3,$out3,$leperm
2740	stvx_u		$out2,$x20,$out
2741	 vxor		$out2,$in2,$twk2
2742	le?vperm	$out4,$out4,$out4,$leperm
2743	stvx_u		$out3,$x30,$out
2744	 vxor		$out3,$in3,$twk3
2745	le?vperm	$out5,$tmp,$tmp,$leperm
2746	stvx_u		$out4,$x40,$out
2747	 vxor		$out4,$in4,$twk4
2748	le?stvx_u	$out5,$x50,$out
2749	be?stvx_u	$tmp, $x50,$out
2750	 vxor		$out5,$in5,$twk5
2751	addi		$out,$out,0x60
2752
2753	mtctr		$rounds
2754	beq		Loop_xts_enc6x		# did $len-=96 borrow?
2755
# Fewer than six complete blocks remain.  Restore $len and dispatch on it:
# 0, 0x10, 0x20, 0x30, 0x40 or 0x50 bytes.  In every case the remaining
# blocks sit in the highest-numbered inN registers and are encrypted with
# twk0 upwards.
2756	addic.		$len,$len,0x60
2757	beq		Lxts_enc6x_zero
2758	cmpwi		$len,0x20
2759	blt		Lxts_enc6x_one
2760	nop
2761	beq		Lxts_enc6x_two
2762	cmpwi		$len,0x40
2763	blt		Lxts_enc6x_three
2764	nop
2765	beq		Lxts_enc6x_four
2766
# In the five/four/three/two cases the bne after the stores tests CR0 as
# left by the compare of $taillen with zero inside _aesp8_xts_enc5x.
2767Lxts_enc6x_five:
2768	vxor		$out0,$in1,$twk0
2769	vxor		$out1,$in2,$twk1
2770	vxor		$out2,$in3,$twk2
2771	vxor		$out3,$in4,$twk3
2772	vxor		$out4,$in5,$twk4
2773
2774	bl		_aesp8_xts_enc5x
2775
2776	le?vperm	$out0,$out0,$out0,$leperm
2777	vmr		$twk0,$twk5		# unused tweak
2778	le?vperm	$out1,$out1,$out1,$leperm
2779	stvx_u		$out0,$x00,$out		# store output
2780	le?vperm	$out2,$out2,$out2,$leperm
2781	stvx_u		$out1,$x10,$out
2782	le?vperm	$out3,$out3,$out3,$leperm
2783	stvx_u		$out2,$x20,$out
2784	vxor		$tmp,$out4,$twk5	# last block prep for stealing
2785	le?vperm	$out4,$out4,$out4,$leperm
2786	stvx_u		$out3,$x30,$out
2787	stvx_u		$out4,$x40,$out
2788	addi		$out,$out,0x50
2789	bne		Lxts_enc6x_steal
2790	b		Lxts_enc6x_done
2791
2792.align	4
2793Lxts_enc6x_four:
2794	vxor		$out0,$in2,$twk0
2795	vxor		$out1,$in3,$twk1
2796	vxor		$out2,$in4,$twk2
2797	vxor		$out3,$in5,$twk3
2798	vxor		$out4,$out4,$out4
2799
2800	bl		_aesp8_xts_enc5x
2801
2802	le?vperm	$out0,$out0,$out0,$leperm
2803	vmr		$twk0,$twk4		# unused tweak
2804	le?vperm	$out1,$out1,$out1,$leperm
2805	stvx_u		$out0,$x00,$out		# store output
2806	le?vperm	$out2,$out2,$out2,$leperm
2807	stvx_u		$out1,$x10,$out
2808	vxor		$tmp,$out3,$twk4	# last block prep for stealing
2809	le?vperm	$out3,$out3,$out3,$leperm
2810	stvx_u		$out2,$x20,$out
2811	stvx_u		$out3,$x30,$out
2812	addi		$out,$out,0x40
2813	bne		Lxts_enc6x_steal
2814	b		Lxts_enc6x_done
2815
2816.align	4
2817Lxts_enc6x_three:
2818	vxor		$out0,$in3,$twk0
2819	vxor		$out1,$in4,$twk1
2820	vxor		$out2,$in5,$twk2
2821	vxor		$out3,$out3,$out3
2822	vxor		$out4,$out4,$out4
2823
2824	bl		_aesp8_xts_enc5x
2825
2826	le?vperm	$out0,$out0,$out0,$leperm
2827	vmr		$twk0,$twk3		# unused tweak
2828	le?vperm	$out1,$out1,$out1,$leperm
2829	stvx_u		$out0,$x00,$out		# store output
2830	vxor		$tmp,$out2,$twk3	# last block prep for stealing
2831	le?vperm	$out2,$out2,$out2,$leperm
2832	stvx_u		$out1,$x10,$out
2833	stvx_u		$out2,$x20,$out
2834	addi		$out,$out,0x30
2835	bne		Lxts_enc6x_steal
2836	b		Lxts_enc6x_done
2837
2838.align	4
2839Lxts_enc6x_two:
2840	vxor		$out0,$in4,$twk0
2841	vxor		$out1,$in5,$twk1
2842	vxor		$out2,$out2,$out2
2843	vxor		$out3,$out3,$out3
2844	vxor		$out4,$out4,$out4
2845
2846	bl		_aesp8_xts_enc5x
2847
2848	le?vperm	$out0,$out0,$out0,$leperm
2849	vmr		$twk0,$twk2		# unused tweak
2850	vxor		$tmp,$out1,$twk2	# last block prep for stealing
2851	le?vperm	$out1,$out1,$out1,$leperm
2852	stvx_u		$out0,$x00,$out		# store output
2853	stvx_u		$out1,$x10,$out
2854	addi		$out,$out,0x20
2855	bne		Lxts_enc6x_steal
2856	b		Lxts_enc6x_done
2857
2858.align	4
# Single-block path.  Loop_xts_enc1x is also the re-entry point used by the
# stealing code to encrypt the merged final block; on that second pass
# $taillen is zero, so the bne at the end falls through to done.
2859Lxts_enc6x_one:
2860	vxor		$out0,$in5,$twk0
2861	nop
2862Loop_xts_enc1x:
2863	vcipher		$out0,$out0,v24
2864	lvx		v24,$x20,$key_		# round[3]
2865	addi		$key_,$key_,0x20
2866
2867	vcipher		$out0,$out0,v25
2868	lvx		v25,$x10,$key_		# round[4]
2869	bdnz		Loop_xts_enc1x
2870
2871	add		$inp,$inp,$taillen
2872	cmpwi		$taillen,0
2873	vcipher		$out0,$out0,v24
2874
2875	subi		$inp,$inp,16
2876	vcipher		$out0,$out0,v25
2877
2878	lvsr		$inpperm,0,$taillen
2879	vcipher		$out0,$out0,v26
2880
2881	lvx_u		$in0,0,$inp
2882	vcipher		$out0,$out0,v27
2883
2884	addi		$key_,$sp,$FRAME+15	# rewind $key_
2885	vcipher		$out0,$out0,v28
2886	lvx		v24,$x00,$key_		# re-pre-load round[1]
2887
2888	vcipher		$out0,$out0,v29
2889	lvx		v25,$x10,$key_		# re-pre-load round[2]
2890	 vxor		$twk0,$twk0,v31
2891
2892	le?vperm	$in0,$in0,$in0,$leperm
2893	vcipher		$out0,$out0,v30
2894
2895	vperm		$in0,$in0,$in0,$inpperm
2896	vcipherlast	$out0,$out0,$twk0
2897
2898	vmr		$twk0,$twk1		# unused tweak
2899	vxor		$tmp,$out0,$twk1	# last block prep for stealing
2900	le?vperm	$out0,$out0,$out0,$leperm
2901	stvx_u		$out0,$x00,$out		# store output
2902	addi		$out,$out,0x10
2903	bne		Lxts_enc6x_steal
2904	b		Lxts_enc6x_done
2905
2906.align	4
# No complete block is left.  Without a partial tail we are done; with one,
# load the tail bytes and whiten the last ciphertext block, kept in $tmp by
# the main loop, with the pending tweak before joining the stealing code.
2907Lxts_enc6x_zero:
2908	cmpwi		$taillen,0
2909	beq		Lxts_enc6x_done
2910
2911	add		$inp,$inp,$taillen
2912	subi		$inp,$inp,16
2913	lvx_u		$in0,0,$inp
2914	lvsr		$inpperm,0,$taillen	# $in5 is no more
2915	le?vperm	$in0,$in0,$in0,$leperm
2916	vperm		$in0,$in0,$in0,$inpperm
2917	vxor		$tmp,$tmp,$twk0
# Ciphertext stealing: merge the whitened tail input with the whitened last
# ciphertext block under a byte mask derived from $inpperm, copy $taillen
# bytes of the previous output block one block forward, step $out back by
# one block and encrypt the merged block through Loop_xts_enc1x.
2918Lxts_enc6x_steal:
2919	vxor		$in0,$in0,$twk0
2920	vxor		$out0,$out0,$out0
2921	vspltisb	$out1,-1
2922	vperm		$out0,$out0,$out1,$inpperm
2923	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
2924
2925	subi		r30,$out,17
2926	subi		$out,$out,16
2927	mtctr		$taillen
2928Loop_xts_enc6x_steal:
2929	lbzu		r0,1(r30)
2930	stb		r0,16(r30)
2931	bdnz		Loop_xts_enc6x_steal
2932
2933	li		$taillen,0
2934	mtctr		$rounds
2935	b		Loop_xts_enc1x		# one more time...
2936
2937.align	4
# With a non-zero $ivp, write back the next tweak.  $twk0 still carries the
# round key 0 xor, which is removed here before the store.
2938Lxts_enc6x_done:
2939	${UCMP}i	$ivp,0
2940	beq		Lxts_enc6x_ret
2941
2942	vxor		$tweak,$twk0,$rndkey0
2943	le?vperm	$tweak,$tweak,$tweak,$leperm
2944	stvx_u		$tweak,0,$ivp
2945
# Epilogue: restore LR, overwrite the eight stack slots that held round key
# copies, then restore VRSAVE, v20-v31 and r26-r31 and release the frame.
2946Lxts_enc6x_ret:
2947	mtlr		r11
2948	li		r10,`$FRAME+15`
2949	li		r11,`$FRAME+31`
2950	stvx		$seven,r10,$sp		# wipe copies of round keys
2951	addi		r10,r10,32
2952	stvx		$seven,r11,$sp
2953	addi		r11,r11,32
2954	stvx		$seven,r10,$sp
2955	addi		r10,r10,32
2956	stvx		$seven,r11,$sp
2957	addi		r11,r11,32
2958	stvx		$seven,r10,$sp
2959	addi		r10,r10,32
2960	stvx		$seven,r11,$sp
2961	addi		r11,r11,32
2962	stvx		$seven,r10,$sp
2963	addi		r10,r10,32
2964	stvx		$seven,r11,$sp
2965	addi		r11,r11,32
2966
2967	mtspr		256,$vrsave
2968	lvx		v20,r10,$sp		# ABI says so
2969	addi		r10,r10,32
2970	lvx		v21,r11,$sp
2971	addi		r11,r11,32
2972	lvx		v22,r10,$sp
2973	addi		r10,r10,32
2974	lvx		v23,r11,$sp
2975	addi		r11,r11,32
2976	lvx		v24,r10,$sp
2977	addi		r10,r10,32
2978	lvx		v25,r11,$sp
2979	addi		r11,r11,32
2980	lvx		v26,r10,$sp
2981	addi		r10,r10,32
2982	lvx		v27,r11,$sp
2983	addi		r11,r11,32
2984	lvx		v28,r10,$sp
2985	addi		r10,r10,32
2986	lvx		v29,r11,$sp
2987	addi		r11,r11,32
2988	lvx		v30,r10,$sp
2989	lvx		v31,r11,$sp
2990	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2991	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2992	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2993	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2994	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2995	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2996	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
2997	blr
2998	.long		0
2999	.byte		0,12,0x04,1,0x80,6,6,0
3000	.long		0
3001
# _aesp8_xts_enc5x: helper that encrypts up to five blocks held in
# $out0..$out4, called with bl from the tail cases of _aesp8_xts_encrypt6x.
# Expected on entry: blocks already xored with twk0..twk4, CTR loaded with
# $rounds, v24/v25 pre-loaded and $key_ pointing at the stack key copy.
# Side effects the callers rely on:
#   - CR0 reflects the compare of $taillen with zero
#   - $inp is moved to $taillen - 16 bytes past its entry value and the
#     block at that address is loaded into $in0, permuted by $inpperm
#   - $twk0 and $in1..$in4 are overwritten with tweak xor last round key
#   - v24/v25 are re-pre-loaded and $key_ is rewound
3002.align	5
3003_aesp8_xts_enc5x:
3004	vcipher		$out0,$out0,v24
3005	vcipher		$out1,$out1,v24
3006	vcipher		$out2,$out2,v24
3007	vcipher		$out3,$out3,v24
3008	vcipher		$out4,$out4,v24
3009	lvx		v24,$x20,$key_		# round[3]
3010	addi		$key_,$key_,0x20
3011
3012	vcipher		$out0,$out0,v25
3013	vcipher		$out1,$out1,v25
3014	vcipher		$out2,$out2,v25
3015	vcipher		$out3,$out3,v25
3016	vcipher		$out4,$out4,v25
3017	lvx		v25,$x10,$key_		# round[4]
3018	bdnz		_aesp8_xts_enc5x
3019
# The compare below sets the CR0 value that the callers test with bne
# after their stores; none of the following instructions modifies CR0.
3020	add		$inp,$inp,$taillen
3021	cmpwi		$taillen,0
3022	vcipher		$out0,$out0,v24
3023	vcipher		$out1,$out1,v24
3024	vcipher		$out2,$out2,v24
3025	vcipher		$out3,$out3,v24
3026	vcipher		$out4,$out4,v24
3027
3028	subi		$inp,$inp,16
3029	vcipher		$out0,$out0,v25
3030	vcipher		$out1,$out1,v25
3031	vcipher		$out2,$out2,v25
3032	vcipher		$out3,$out3,v25
3033	vcipher		$out4,$out4,v25
3034	 vxor		$twk0,$twk0,v31
3035
3036	vcipher		$out0,$out0,v26
3037	lvsr		$inpperm,r0,$taillen	# $in5 is no more
3038	vcipher		$out1,$out1,v26
3039	vcipher		$out2,$out2,v26
3040	vcipher		$out3,$out3,v26
3041	vcipher		$out4,$out4,v26
3042	 vxor		$in1,$twk1,v31
3043
3044	vcipher		$out0,$out0,v27
3045	lvx_u		$in0,0,$inp
3046	vcipher		$out1,$out1,v27
3047	vcipher		$out2,$out2,v27
3048	vcipher		$out3,$out3,v27
3049	vcipher		$out4,$out4,v27
3050	 vxor		$in2,$twk2,v31
3051
3052	addi		$key_,$sp,$FRAME+15	# rewind $key_
3053	vcipher		$out0,$out0,v28
3054	vcipher		$out1,$out1,v28
3055	vcipher		$out2,$out2,v28
3056	vcipher		$out3,$out3,v28
3057	vcipher		$out4,$out4,v28
3058	lvx		v24,$x00,$key_		# re-pre-load round[1]
3059	 vxor		$in3,$twk3,v31
3060
3061	vcipher		$out0,$out0,v29
3062	le?vperm	$in0,$in0,$in0,$leperm
3063	vcipher		$out1,$out1,v29
3064	vcipher		$out2,$out2,v29
3065	vcipher		$out3,$out3,v29
3066	vcipher		$out4,$out4,v29
3067	lvx		v25,$x10,$key_		# re-pre-load round[2]
3068	 vxor		$in4,$twk4,v31
3069
3070	vcipher		$out0,$out0,v30
3071	vperm		$in0,$in0,$in0,$inpperm
3072	vcipher		$out1,$out1,v30
3073	vcipher		$out2,$out2,v30
3074	vcipher		$out3,$out3,v30
3075	vcipher		$out4,$out4,v30
3076
3077	vcipherlast	$out0,$out0,$twk0
3078	vcipherlast	$out1,$out1,$in1
3079	vcipherlast	$out2,$out2,$in2
3080	vcipherlast	$out3,$out3,$in3
3081	vcipherlast	$out4,$out4,$in4
3082	blr
3083        .long   	0
3084        .byte   	0,12,0x14,0,0,0,0,0
3085
3086.align	5
3087_aesp8_xts_decrypt6x:
3088	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3089	mflr		r11
3090	li		r7,`$FRAME+8*16+15`
3091	li		r3,`$FRAME+8*16+31`
3092	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3093	stvx		v20,r7,$sp		# ABI says so
3094	addi		r7,r7,32
3095	stvx		v21,r3,$sp
3096	addi		r3,r3,32
3097	stvx		v22,r7,$sp
3098	addi		r7,r7,32
3099	stvx		v23,r3,$sp
3100	addi		r3,r3,32
3101	stvx		v24,r7,$sp
3102	addi		r7,r7,32
3103	stvx		v25,r3,$sp
3104	addi		r3,r3,32
3105	stvx		v26,r7,$sp
3106	addi		r7,r7,32
3107	stvx		v27,r3,$sp
3108	addi		r3,r3,32
3109	stvx		v28,r7,$sp
3110	addi		r7,r7,32
3111	stvx		v29,r3,$sp
3112	addi		r3,r3,32
3113	stvx		v30,r7,$sp
3114	stvx		v31,r3,$sp
3115	li		r0,-1
3116	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3117	li		$x10,0x10
3118	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3119	li		$x20,0x20
3120	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3121	li		$x30,0x30
3122	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3123	li		$x40,0x40
3124	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3125	li		$x50,0x50
3126	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3127	li		$x60,0x60
3128	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3129	li		$x70,0x70
3130	mtspr		256,r0
3131
3132	subi		$rounds,$rounds,3	# -4 in total
3133
3134	lvx		$rndkey0,$x00,$key1	# load key schedule
3135	lvx		v30,$x10,$key1
3136	addi		$key1,$key1,0x20
3137	lvx		v31,$x00,$key1
3138	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3139	addi		$key_,$sp,$FRAME+15
3140	mtctr		$rounds
3141
3142Load_xts_dec_key:
3143	?vperm		v24,v30,v31,$keyperm
3144	lvx		v30,$x10,$key1
3145	addi		$key1,$key1,0x20
3146	stvx		v24,$x00,$key_		# off-load round[1]
3147	?vperm		v25,v31,v30,$keyperm
3148	lvx		v31,$x00,$key1
3149	stvx		v25,$x10,$key_		# off-load round[2]
3150	addi		$key_,$key_,0x20
3151	bdnz		Load_xts_dec_key
3152
3153	lvx		v26,$x10,$key1
3154	?vperm		v24,v30,v31,$keyperm
3155	lvx		v27,$x20,$key1
3156	stvx		v24,$x00,$key_		# off-load round[3]
3157	?vperm		v25,v31,v26,$keyperm
3158	lvx		v28,$x30,$key1
3159	stvx		v25,$x10,$key_		# off-load round[4]
3160	addi		$key_,$sp,$FRAME+15	# rewind $key_
3161	?vperm		v26,v26,v27,$keyperm
3162	lvx		v29,$x40,$key1
3163	?vperm		v27,v27,v28,$keyperm
3164	lvx		v30,$x50,$key1
3165	?vperm		v28,v28,v29,$keyperm
3166	lvx		v31,$x60,$key1
3167	?vperm		v29,v29,v30,$keyperm
3168	lvx		$twk5,$x70,$key1	# borrow $twk5
3169	?vperm		v30,v30,v31,$keyperm
3170	lvx		v24,$x00,$key_		# pre-load round[1]
3171	?vperm		v31,v31,$twk5,$keyperm
3172	lvx		v25,$x10,$key_		# pre-load round[2]
3173
3174	 vperm		$in0,$inout,$inptail,$inpperm
3175	 subi		$inp,$inp,31		# undo "caller"
3176	vxor		$twk0,$tweak,$rndkey0
3177	vsrab		$tmp,$tweak,$seven	# next tweak value
3178	vaddubm		$tweak,$tweak,$tweak
3179	vsldoi		$tmp,$tmp,$tmp,15
3180	vand		$tmp,$tmp,$eighty7
3181	 vxor		$out0,$in0,$twk0
3182	vxor		$tweak,$tweak,$tmp
3183
3184	 lvx_u		$in1,$x10,$inp
3185	vxor		$twk1,$tweak,$rndkey0
3186	vsrab		$tmp,$tweak,$seven	# next tweak value
3187	vaddubm		$tweak,$tweak,$tweak
3188	vsldoi		$tmp,$tmp,$tmp,15
3189	 le?vperm	$in1,$in1,$in1,$leperm
3190	vand		$tmp,$tmp,$eighty7
3191	 vxor		$out1,$in1,$twk1
3192	vxor		$tweak,$tweak,$tmp
3193
3194	 lvx_u		$in2,$x20,$inp
3195	 andi.		$taillen,$len,15
3196	vxor		$twk2,$tweak,$rndkey0
3197	vsrab		$tmp,$tweak,$seven	# next tweak value
3198	vaddubm		$tweak,$tweak,$tweak
3199	vsldoi		$tmp,$tmp,$tmp,15
3200	 le?vperm	$in2,$in2,$in2,$leperm
3201	vand		$tmp,$tmp,$eighty7
3202	 vxor		$out2,$in2,$twk2
3203	vxor		$tweak,$tweak,$tmp
3204
3205	 lvx_u		$in3,$x30,$inp
3206	 sub		$len,$len,$taillen
3207	vxor		$twk3,$tweak,$rndkey0
3208	vsrab		$tmp,$tweak,$seven	# next tweak value
3209	vaddubm		$tweak,$tweak,$tweak
3210	vsldoi		$tmp,$tmp,$tmp,15
3211	 le?vperm	$in3,$in3,$in3,$leperm
3212	vand		$tmp,$tmp,$eighty7
3213	 vxor		$out3,$in3,$twk3
3214	vxor		$tweak,$tweak,$tmp
3215
3216	 lvx_u		$in4,$x40,$inp
3217	 subi		$len,$len,0x60
3218	vxor		$twk4,$tweak,$rndkey0
3219	vsrab		$tmp,$tweak,$seven	# next tweak value
3220	vaddubm		$tweak,$tweak,$tweak
3221	vsldoi		$tmp,$tmp,$tmp,15
3222	 le?vperm	$in4,$in4,$in4,$leperm
3223	vand		$tmp,$tmp,$eighty7
3224	 vxor		$out4,$in4,$twk4
3225	vxor		$tweak,$tweak,$tmp
3226
3227	 lvx_u		$in5,$x50,$inp
3228	 addi		$inp,$inp,0x60
3229	vxor		$twk5,$tweak,$rndkey0
3230	vsrab		$tmp,$tweak,$seven	# next tweak value
3231	vaddubm		$tweak,$tweak,$tweak
3232	vsldoi		$tmp,$tmp,$tmp,15
3233	 le?vperm	$in5,$in5,$in5,$leperm
3234	vand		$tmp,$tmp,$eighty7
3235	 vxor		$out5,$in5,$twk5
3236	vxor		$tweak,$tweak,$tmp
3237
3238	vxor		v31,v31,$rndkey0
3239	mtctr		$rounds
3240	b		Loop_xts_dec6x
3241
3242.align	5
3243Loop_xts_dec6x:
3244	vncipher	$out0,$out0,v24
3245	vncipher	$out1,$out1,v24
3246	vncipher	$out2,$out2,v24
3247	vncipher	$out3,$out3,v24
3248	vncipher	$out4,$out4,v24
3249	vncipher	$out5,$out5,v24
3250	lvx		v24,$x20,$key_		# round[3]
3251	addi		$key_,$key_,0x20
3252
3253	vncipher	$out0,$out0,v25
3254	vncipher	$out1,$out1,v25
3255	vncipher	$out2,$out2,v25
3256	vncipher	$out3,$out3,v25
3257	vncipher	$out4,$out4,v25
3258	vncipher	$out5,$out5,v25
3259	lvx		v25,$x10,$key_		# round[4]
3260	bdnz		Loop_xts_dec6x
3261
3262	subic		$len,$len,96		# $len-=96
3263	 vxor		$in0,$twk0,v31		# xor with last round key
3264	vncipher	$out0,$out0,v24
3265	vncipher	$out1,$out1,v24
3266	 vsrab		$tmp,$tweak,$seven	# next tweak value
3267	 vxor		$twk0,$tweak,$rndkey0
3268	 vaddubm	$tweak,$tweak,$tweak
3269	vncipher	$out2,$out2,v24
3270	vncipher	$out3,$out3,v24
3271	 vsldoi		$tmp,$tmp,$tmp,15
3272	vncipher	$out4,$out4,v24
3273	vncipher	$out5,$out5,v24
3274
3275	subfe.		r0,r0,r0		# borrow?-1:0
3276	 vand		$tmp,$tmp,$eighty7
3277	vncipher	$out0,$out0,v25
3278	vncipher	$out1,$out1,v25
3279	 vxor		$tweak,$tweak,$tmp
3280	vncipher	$out2,$out2,v25
3281	vncipher	$out3,$out3,v25
3282	 vxor		$in1,$twk1,v31
3283	 vsrab		$tmp,$tweak,$seven	# next tweak value
3284	 vxor		$twk1,$tweak,$rndkey0
3285	vncipher	$out4,$out4,v25
3286	vncipher	$out5,$out5,v25
3287
3288	and		r0,r0,$len
3289	 vaddubm	$tweak,$tweak,$tweak
3290	 vsldoi		$tmp,$tmp,$tmp,15
3291	vncipher	$out0,$out0,v26
3292	vncipher	$out1,$out1,v26
3293	 vand		$tmp,$tmp,$eighty7
3294	vncipher	$out2,$out2,v26
3295	vncipher	$out3,$out3,v26
3296	 vxor		$tweak,$tweak,$tmp
3297	vncipher	$out4,$out4,v26
3298	vncipher	$out5,$out5,v26
3299
3300	add		$inp,$inp,r0		# $inp is adjusted in such
3301						# way that at exit from the
3302						# loop inX-in5 are loaded
3303						# with last "words"
3304	 vxor		$in2,$twk2,v31
3305	 vsrab		$tmp,$tweak,$seven	# next tweak value
3306	 vxor		$twk2,$tweak,$rndkey0
3307	 vaddubm	$tweak,$tweak,$tweak
3308	vncipher	$out0,$out0,v27
3309	vncipher	$out1,$out1,v27
3310	 vsldoi		$tmp,$tmp,$tmp,15
3311	vncipher	$out2,$out2,v27
3312	vncipher	$out3,$out3,v27
3313	 vand		$tmp,$tmp,$eighty7
3314	vncipher	$out4,$out4,v27
3315	vncipher	$out5,$out5,v27
3316
3317	addi		$key_,$sp,$FRAME+15	# rewind $key_
3318	 vxor		$tweak,$tweak,$tmp
3319	vncipher	$out0,$out0,v28
3320	vncipher	$out1,$out1,v28
3321	 vxor		$in3,$twk3,v31
3322	 vsrab		$tmp,$tweak,$seven	# next tweak value
3323	 vxor		$twk3,$tweak,$rndkey0
3324	vncipher	$out2,$out2,v28
3325	vncipher	$out3,$out3,v28
3326	 vaddubm	$tweak,$tweak,$tweak
3327	 vsldoi		$tmp,$tmp,$tmp,15
3328	vncipher	$out4,$out4,v28
3329	vncipher	$out5,$out5,v28
3330	lvx		v24,$x00,$key_		# re-pre-load round[1]
3331	 vand		$tmp,$tmp,$eighty7
3332
3333	vncipher	$out0,$out0,v29
3334	vncipher	$out1,$out1,v29
3335	 vxor		$tweak,$tweak,$tmp
3336	vncipher	$out2,$out2,v29
3337	vncipher	$out3,$out3,v29
3338	 vxor		$in4,$twk4,v31
3339	 vsrab		$tmp,$tweak,$seven	# next tweak value
3340	 vxor		$twk4,$tweak,$rndkey0
3341	vncipher	$out4,$out4,v29
3342	vncipher	$out5,$out5,v29
3343	lvx		v25,$x10,$key_		# re-pre-load round[2]
3344	 vaddubm	$tweak,$tweak,$tweak
3345	 vsldoi		$tmp,$tmp,$tmp,15
3346
3347	vncipher	$out0,$out0,v30
3348	vncipher	$out1,$out1,v30
3349	 vand		$tmp,$tmp,$eighty7
3350	vncipher	$out2,$out2,v30
3351	vncipher	$out3,$out3,v30
3352	 vxor		$tweak,$tweak,$tmp
3353	vncipher	$out4,$out4,v30
3354	vncipher	$out5,$out5,v30
3355	 vxor		$in5,$twk5,v31
3356	 vsrab		$tmp,$tweak,$seven	# next tweak value
3357	 vxor		$twk5,$tweak,$rndkey0
3358
3359	vncipherlast	$out0,$out0,$in0
3360	 lvx_u		$in0,$x00,$inp		# load next input block
3361	 vaddubm	$tweak,$tweak,$tweak
3362	 vsldoi		$tmp,$tmp,$tmp,15
3363	vncipherlast	$out1,$out1,$in1
3364	 lvx_u		$in1,$x10,$inp
3365	vncipherlast	$out2,$out2,$in2
3366	 le?vperm	$in0,$in0,$in0,$leperm
3367	 lvx_u		$in2,$x20,$inp
3368	 vand		$tmp,$tmp,$eighty7
3369	vncipherlast	$out3,$out3,$in3
3370	 le?vperm	$in1,$in1,$in1,$leperm
3371	 lvx_u		$in3,$x30,$inp
3372	vncipherlast	$out4,$out4,$in4
3373	 le?vperm	$in2,$in2,$in2,$leperm
3374	 lvx_u		$in4,$x40,$inp
3375	 vxor		$tweak,$tweak,$tmp
3376	vncipherlast	$out5,$out5,$in5
3377	 le?vperm	$in3,$in3,$in3,$leperm
3378	 lvx_u		$in5,$x50,$inp
3379	 addi		$inp,$inp,0x60
3380	 le?vperm	$in4,$in4,$in4,$leperm
3381	 le?vperm	$in5,$in5,$in5,$leperm
3382
3383	le?vperm	$out0,$out0,$out0,$leperm
3384	le?vperm	$out1,$out1,$out1,$leperm
3385	stvx_u		$out0,$x00,$out		# store output
3386	 vxor		$out0,$in0,$twk0
3387	le?vperm	$out2,$out2,$out2,$leperm
3388	stvx_u		$out1,$x10,$out
3389	 vxor		$out1,$in1,$twk1
3390	le?vperm	$out3,$out3,$out3,$leperm
3391	stvx_u		$out2,$x20,$out
3392	 vxor		$out2,$in2,$twk2
3393	le?vperm	$out4,$out4,$out4,$leperm
3394	stvx_u		$out3,$x30,$out
3395	 vxor		$out3,$in3,$twk3
3396	le?vperm	$out5,$out5,$out5,$leperm
3397	stvx_u		$out4,$x40,$out
3398	 vxor		$out4,$in4,$twk4
3399	stvx_u		$out5,$x50,$out
3400	 vxor		$out5,$in5,$twk5
3401	addi		$out,$out,0x60
3402
3403	mtctr		$rounds
3404	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3405
3406	addic.		$len,$len,0x60
3407	beq		Lxts_dec6x_zero
3408	cmpwi		$len,0x20
3409	blt		Lxts_dec6x_one
3410	nop
3411	beq		Lxts_dec6x_two
3412	cmpwi		$len,0x40
3413	blt		Lxts_dec6x_three
3414	nop
3415	beq		Lxts_dec6x_four
3416
3417Lxts_dec6x_five:
3418	vxor		$out0,$in1,$twk0
3419	vxor		$out1,$in2,$twk1
3420	vxor		$out2,$in3,$twk2
3421	vxor		$out3,$in4,$twk3
3422	vxor		$out4,$in5,$twk4
3423
3424	bl		_aesp8_xts_dec5x
3425
3426	le?vperm	$out0,$out0,$out0,$leperm
3427	vmr		$twk0,$twk5		# unused tweak
3428	vxor		$twk1,$tweak,$rndkey0
3429	le?vperm	$out1,$out1,$out1,$leperm
3430	stvx_u		$out0,$x00,$out		# store output
3431	vxor		$out0,$in0,$twk1
3432	le?vperm	$out2,$out2,$out2,$leperm
3433	stvx_u		$out1,$x10,$out
3434	le?vperm	$out3,$out3,$out3,$leperm
3435	stvx_u		$out2,$x20,$out
3436	le?vperm	$out4,$out4,$out4,$leperm
3437	stvx_u		$out3,$x30,$out
3438	stvx_u		$out4,$x40,$out
3439	addi		$out,$out,0x50
3440	bne		Lxts_dec6x_steal
3441	b		Lxts_dec6x_done
3442
3443.align	4
3444Lxts_dec6x_four:
3445	vxor		$out0,$in2,$twk0
3446	vxor		$out1,$in3,$twk1
3447	vxor		$out2,$in4,$twk2
3448	vxor		$out3,$in5,$twk3
3449	vxor		$out4,$out4,$out4
3450
3451	bl		_aesp8_xts_dec5x
3452
3453	le?vperm	$out0,$out0,$out0,$leperm
3454	vmr		$twk0,$twk4		# unused tweak
3455	vmr		$twk1,$twk5
3456	le?vperm	$out1,$out1,$out1,$leperm
3457	stvx_u		$out0,$x00,$out		# store output
3458	vxor		$out0,$in0,$twk5
3459	le?vperm	$out2,$out2,$out2,$leperm
3460	stvx_u		$out1,$x10,$out
3461	le?vperm	$out3,$out3,$out3,$leperm
3462	stvx_u		$out2,$x20,$out
3463	stvx_u		$out3,$x30,$out
3464	addi		$out,$out,0x40
3465	bne		Lxts_dec6x_steal
3466	b		Lxts_dec6x_done
3467
3468.align	4
3469Lxts_dec6x_three:
3470	vxor		$out0,$in3,$twk0
3471	vxor		$out1,$in4,$twk1
3472	vxor		$out2,$in5,$twk2
3473	vxor		$out3,$out3,$out3
3474	vxor		$out4,$out4,$out4
3475
3476	bl		_aesp8_xts_dec5x
3477
3478	le?vperm	$out0,$out0,$out0,$leperm
3479	vmr		$twk0,$twk3		# unused tweak
3480	vmr		$twk1,$twk4
3481	le?vperm	$out1,$out1,$out1,$leperm
3482	stvx_u		$out0,$x00,$out		# store output
3483	vxor		$out0,$in0,$twk4
3484	le?vperm	$out2,$out2,$out2,$leperm
3485	stvx_u		$out1,$x10,$out
3486	stvx_u		$out2,$x20,$out
3487	addi		$out,$out,0x30
3488	bne		Lxts_dec6x_steal
3489	b		Lxts_dec6x_done
3490
3491.align	4
3492Lxts_dec6x_two:
3493	vxor		$out0,$in4,$twk0
3494	vxor		$out1,$in5,$twk1
3495	vxor		$out2,$out2,$out2
3496	vxor		$out3,$out3,$out3
3497	vxor		$out4,$out4,$out4
3498
3499	bl		_aesp8_xts_dec5x
3500
3501	le?vperm	$out0,$out0,$out0,$leperm
3502	vmr		$twk0,$twk2		# unused tweak
3503	vmr		$twk1,$twk3
3504	le?vperm	$out1,$out1,$out1,$leperm
3505	stvx_u		$out0,$x00,$out		# store output
3506	vxor		$out0,$in0,$twk3
3507	stvx_u		$out1,$x10,$out
3508	addi		$out,$out,0x20
3509	bne		Lxts_dec6x_steal
3510	b		Lxts_dec6x_done
3511
3512.align	4
3513Lxts_dec6x_one:
3514	vxor		$out0,$in5,$twk0
3515	nop
3516Loop_xts_dec1x:
3517	vncipher	$out0,$out0,v24
3518	lvx		v24,$x20,$key_		# round[3]
3519	addi		$key_,$key_,0x20
3520
3521	vncipher	$out0,$out0,v25
3522	lvx		v25,$x10,$key_		# round[4]
3523	bdnz		Loop_xts_dec1x
3524
3525	subi		r0,$taillen,1
3526	vncipher	$out0,$out0,v24
3527
3528	andi.		r0,r0,16
3529	cmpwi		$taillen,0
3530	vncipher	$out0,$out0,v25
3531
3532	sub		$inp,$inp,r0
3533	vncipher	$out0,$out0,v26
3534
3535	lvx_u		$in0,0,$inp
3536	vncipher	$out0,$out0,v27
3537
3538	addi		$key_,$sp,$FRAME+15	# rewind $key_
3539	vncipher	$out0,$out0,v28
3540	lvx		v24,$x00,$key_		# re-pre-load round[1]
3541
3542	vncipher	$out0,$out0,v29
3543	lvx		v25,$x10,$key_		# re-pre-load round[2]
3544	 vxor		$twk0,$twk0,v31
3545
3546	le?vperm	$in0,$in0,$in0,$leperm
3547	vncipher	$out0,$out0,v30
3548
3549	mtctr		$rounds
3550	vncipherlast	$out0,$out0,$twk0
3551
3552	vmr		$twk0,$twk1		# unused tweak
3553	vmr		$twk1,$twk2
3554	le?vperm	$out0,$out0,$out0,$leperm
3555	stvx_u		$out0,$x00,$out		# store output
3556	addi		$out,$out,0x10
3557	vxor		$out0,$in0,$twk2
3558	bne		Lxts_dec6x_steal
3559	b		Lxts_dec6x_done
3560
3561.align	4
3562Lxts_dec6x_zero:
3563	cmpwi		$taillen,0
3564	beq		Lxts_dec6x_done
3565
3566	lvx_u		$in0,0,$inp
3567	le?vperm	$in0,$in0,$in0,$leperm
3568	vxor		$out0,$in0,$twk1
3569Lxts_dec6x_steal:
3570	vncipher	$out0,$out0,v24
3571	lvx		v24,$x20,$key_		# round[3]
3572	addi		$key_,$key_,0x20
3573
3574	vncipher	$out0,$out0,v25
3575	lvx		v25,$x10,$key_		# round[4]
3576	bdnz		Lxts_dec6x_steal
3577
3578	add		$inp,$inp,$taillen
3579	vncipher	$out0,$out0,v24
3580
3581	cmpwi		$taillen,0
3582	vncipher	$out0,$out0,v25
3583
3584	lvx_u		$in0,0,$inp
3585	vncipher	$out0,$out0,v26
3586
3587	lvsr		$inpperm,0,$taillen	# $in5 is no more
3588	vncipher	$out0,$out0,v27
3589
3590	addi		$key_,$sp,$FRAME+15	# rewind $key_
3591	vncipher	$out0,$out0,v28
3592	lvx		v24,$x00,$key_		# re-pre-load round[1]
3593
3594	vncipher	$out0,$out0,v29
3595	lvx		v25,$x10,$key_		# re-pre-load round[2]
3596	 vxor		$twk1,$twk1,v31
3597
3598	le?vperm	$in0,$in0,$in0,$leperm
3599	vncipher	$out0,$out0,v30
3600
3601	vperm		$in0,$in0,$in0,$inpperm
3602	vncipherlast	$tmp,$out0,$twk1
3603
3604	le?vperm	$out0,$tmp,$tmp,$leperm
3605	le?stvx_u	$out0,0,$out
3606	be?stvx_u	$tmp,0,$out
3607
3608	vxor		$out0,$out0,$out0
3609	vspltisb	$out1,-1
3610	vperm		$out0,$out0,$out1,$inpperm
3611	vsel		$out0,$in0,$tmp,$out0
3612	vxor		$out0,$out0,$twk0
3613
3614	subi		r30,$out,1
3615	mtctr		$taillen
3616Loop_xts_dec6x_steal:
3617	lbzu		r0,1(r30)
3618	stb		r0,16(r30)
3619	bdnz		Loop_xts_dec6x_steal
3620
3621	li		$taillen,0
3622	mtctr		$rounds
3623	b		Loop_xts_dec1x		# one more time...
3624
3625.align	4
3626Lxts_dec6x_done:
3627	${UCMP}i	$ivp,0
3628	beq		Lxts_dec6x_ret
3629
3630	vxor		$tweak,$twk0,$rndkey0
3631	le?vperm	$tweak,$tweak,$tweak,$leperm
3632	stvx_u		$tweak,0,$ivp
3633
3634Lxts_dec6x_ret:
3635	mtlr		r11
3636	li		r10,`$FRAME+15`
3637	li		r11,`$FRAME+31`
3638	stvx		$seven,r10,$sp		# wipe copies of round keys
3639	addi		r10,r10,32
3640	stvx		$seven,r11,$sp
3641	addi		r11,r11,32
3642	stvx		$seven,r10,$sp
3643	addi		r10,r10,32
3644	stvx		$seven,r11,$sp
3645	addi		r11,r11,32
3646	stvx		$seven,r10,$sp
3647	addi		r10,r10,32
3648	stvx		$seven,r11,$sp
3649	addi		r11,r11,32
3650	stvx		$seven,r10,$sp
3651	addi		r10,r10,32
3652	stvx		$seven,r11,$sp
3653	addi		r11,r11,32
3654
3655	mtspr		256,$vrsave
3656	lvx		v20,r10,$sp		# ABI says so
3657	addi		r10,r10,32
3658	lvx		v21,r11,$sp
3659	addi		r11,r11,32
3660	lvx		v22,r10,$sp
3661	addi		r10,r10,32
3662	lvx		v23,r11,$sp
3663	addi		r11,r11,32
3664	lvx		v24,r10,$sp
3665	addi		r10,r10,32
3666	lvx		v25,r11,$sp
3667	addi		r11,r11,32
3668	lvx		v26,r10,$sp
3669	addi		r10,r10,32
3670	lvx		v27,r11,$sp
3671	addi		r11,r11,32
3672	lvx		v28,r10,$sp
3673	addi		r10,r10,32
3674	lvx		v29,r11,$sp
3675	addi		r11,r11,32
3676	lvx		v30,r10,$sp
3677	lvx		v31,r11,$sp
3678	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3679	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3680	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3681	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3682	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3683	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3684	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3685	blr
3686	.long		0
3687	.byte		0,12,0x04,1,0x80,6,6,0
3688	.long		0
3689
3690.align	5
3691_aesp8_xts_dec5x:
3692	vncipher	$out0,$out0,v24
3693	vncipher	$out1,$out1,v24
3694	vncipher	$out2,$out2,v24
3695	vncipher	$out3,$out3,v24
3696	vncipher	$out4,$out4,v24
3697	lvx		v24,$x20,$key_		# round[3]
3698	addi		$key_,$key_,0x20
3699
3700	vncipher	$out0,$out0,v25
3701	vncipher	$out1,$out1,v25
3702	vncipher	$out2,$out2,v25
3703	vncipher	$out3,$out3,v25
3704	vncipher	$out4,$out4,v25
3705	lvx		v25,$x10,$key_		# round[4]
3706	bdnz		_aesp8_xts_dec5x
3707
3708	subi		r0,$taillen,1
3709	vncipher	$out0,$out0,v24
3710	vncipher	$out1,$out1,v24
3711	vncipher	$out2,$out2,v24
3712	vncipher	$out3,$out3,v24
3713	vncipher	$out4,$out4,v24
3714
3715	andi.		r0,r0,16
3716	cmpwi		$taillen,0
3717	vncipher	$out0,$out0,v25
3718	vncipher	$out1,$out1,v25
3719	vncipher	$out2,$out2,v25
3720	vncipher	$out3,$out3,v25
3721	vncipher	$out4,$out4,v25
3722	 vxor		$twk0,$twk0,v31
3723
3724	sub		$inp,$inp,r0
3725	vncipher	$out0,$out0,v26
3726	vncipher	$out1,$out1,v26
3727	vncipher	$out2,$out2,v26
3728	vncipher	$out3,$out3,v26
3729	vncipher	$out4,$out4,v26
3730	 vxor		$in1,$twk1,v31
3731
3732	vncipher	$out0,$out0,v27
3733	lvx_u		$in0,0,$inp
3734	vncipher	$out1,$out1,v27
3735	vncipher	$out2,$out2,v27
3736	vncipher	$out3,$out3,v27
3737	vncipher	$out4,$out4,v27
3738	 vxor		$in2,$twk2,v31
3739
3740	addi		$key_,$sp,$FRAME+15	# rewind $key_
3741	vncipher	$out0,$out0,v28
3742	vncipher	$out1,$out1,v28
3743	vncipher	$out2,$out2,v28
3744	vncipher	$out3,$out3,v28
3745	vncipher	$out4,$out4,v28
3746	lvx		v24,$x00,$key_		# re-pre-load round[1]
3747	 vxor		$in3,$twk3,v31
3748
3749	vncipher	$out0,$out0,v29
3750	le?vperm	$in0,$in0,$in0,$leperm
3751	vncipher	$out1,$out1,v29
3752	vncipher	$out2,$out2,v29
3753	vncipher	$out3,$out3,v29
3754	vncipher	$out4,$out4,v29
3755	lvx		v25,$x10,$key_		# re-pre-load round[2]
3756	 vxor		$in4,$twk4,v31
3757
3758	vncipher	$out0,$out0,v30
3759	vncipher	$out1,$out1,v30
3760	vncipher	$out2,$out2,v30
3761	vncipher	$out3,$out3,v30
3762	vncipher	$out4,$out4,v30
3763
3764	vncipherlast	$out0,$out0,$twk0
3765	vncipherlast	$out1,$out1,$in1
3766	vncipherlast	$out2,$out2,$in2
3767	vncipherlast	$out3,$out3,$in3
3768	vncipherlast	$out4,$out4,$in4
3769	mtctr		$rounds
3770	blr
3771        .long   	0
3772        .byte   	0,12,0x14,0,0,0,0,0
3773___
3774}}	}}}
3775
# Post-process the assembly text accumulated in $code, one line at a time,
# and print the result to STDOUT:
#   1. every `...` span is replaced by the value of eval()ing its contents;
#   2. while still inside the leading constants table, .long/.byte rows
#      that end in a "?xxx" marker are re-emitted as plain .byte rows,
#      transformed for little-endian flavours as the marker requests;
#   3. the "le?", "be?" and "?insn" endian markers are resolved according
#      to whether $flavour ends in "le".
my $consts=1;			# true until the "Lconsts:" label is seen
foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
	    # Copy the captures right away so that later pattern matches
	    # cannot disturb them.
	    my ($kind,$list,$conv)=($1,$2,$3);
	    my @bytes=();

	    # convert to endian-agnostic format
	    if ($kind eq "long") {
	      # each 32-bit value becomes four bytes, most significant first
	      foreach (split(/,\s*/,$list)) {
		my $word = /^0/?oct:int;	# leading 0 => octal/hex literal
		push @bytes,($word>>24)&0xff,($word>>16)&0xff,($word>>8)&0xff,$word&0xff;
	      }
	    } else {
		@bytes = map(/^0/?oct:int,split(/,\s*/,$list));
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv)  {
		    # ?inv: XOR every byte with 0xf
		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
		    # ?rev: reverse the byte order of the whole row
		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
		    # any other marker: bytes are emitted unchanged
		}
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	# (at most one rewrite is applied per line: the "or" chain stops at
	# the first substitution that succeeds)
	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o	or
	    s/\?lvsr/lvsl/o	or
	    s/\?lvsl/lvsr/o	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/le\?/#le#/o	or
	    s/be\?//o		or
	    s/\?([a-z]+)/$1/o;
	}

        print $_,"\n";
}

# Buffered write errors only surface at close time, and closing a piped
# handle also reports a non-zero exit of the process on the other end
# (NOTE(review): how STDOUT is opened is outside this excerpt - confirm).
# Fail loudly so that a truncated assembly file is never mistaken for a
# successful run.
close(STDOUT) or die "error closing STDOUT: $! (wait status $?)";
3829