xref: /openbmc/linux/drivers/crypto/vmx/aesp8-ppc.pl (revision e0d07278)
1#! /usr/bin/env perl
2# SPDX-License-Identifier: GPL-2.0
3
4# This code is taken from CRYPTOGAMs[1] and is included here using the option
5# in the license to distribute the code under the GPL. Therefore this program
6# is free software; you can redistribute it and/or modify it under the terms of
7# the GNU General Public License version 2 as published by the Free Software
8# Foundation.
9#
10# [1] https://www.openssl.org/~appro/cryptogams/
11
12# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
13# All rights reserved.
14#
15# Redistribution and use in source and binary forms, with or without
16# modification, are permitted provided that the following conditions
17# are met:
18#
19#       * Redistributions of source code must retain copyright notices,
20#         this list of conditions and the following disclaimer.
21#
22#       * Redistributions in binary form must reproduce the above
23#         copyright notice, this list of conditions and the following
24#         disclaimer in the documentation and/or other materials
25#         provided with the distribution.
26#
27#       * Neither the name of the CRYPTOGAMS nor the names of its
28#         copyright holder and contributors may be used to endorse or
29#         promote products derived from this software without specific
30#         prior written permission.
31#
32# ALTERNATIVELY, provided that this notice is retained in full, this
33# product may be distributed under the terms of the GNU General Public
34# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
35# those given above.
36#
37# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
38# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48
49# ====================================================================
50# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
51# project. The module is, however, dual licensed under OpenSSL and
52# CRYPTOGAMS licenses depending on where you obtain it. For further
53# details see https://www.openssl.org/~appro/cryptogams/.
54# ====================================================================
55#
56# This module implements support for AES instructions as per PowerISA
57# specification version 2.07, first implemented by POWER8 processor.
58# The module is endian-agnostic in sense that it supports both big-
59# and little-endian cases. Data alignment in parallelizable modes is
60# handled with VSX loads and stores, which implies MSR.VSX flag being
61# set. It should also be noted that ISA specification doesn't prohibit
62# alignment exceptions for these instructions on page boundaries.
63# Initially alignment was handled in a pure AltiVec/VMX way [where data
64# is aligned programmatically, which in turn guarantees exception-
65# free execution], but it turned out to hamper performance when vcipher
66# instructions are interleaved. It's reckoned that eventual
67# misalignment penalties at page boundaries are on average lower
68# than the additional overhead of the pure AltiVec approach.
69#
70# May 2016
71#
72# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
73# systems were measured.
74#
75######################################################################
76# Current large-block performance in cycles per byte processed with
77# 128-bit key (less is better).
78#
79#		CBC en-/decrypt	CTR	XTS
80# POWER8[le]	3.96/0.72	0.74	1.1
81# POWER8[be]	3.75/0.65	0.66	1.0
82
# First command-line argument: the target flavour.  A "64" or "32" anywhere
# in it selects the pointer width, and a trailing "le" selects little-endian.
$flavour = shift;

# Width-dependent mnemonics and sizes that are interpolated into the
# assembly templates below.
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;	# offset at which the link register is saved
	$STU	="stdu";	# store-with-update, used to allocate a frame
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";	# unsigned pointer-width compare
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

# Non-zero (equal to $SIZE_T) for little-endian flavours, 0 otherwise.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Directory part of this script's own path, including the trailing
# separator.  Take it from the match result explicitly: when $0 carries no
# directory the pattern does not match, and $1 must not be trusted then.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# The translator lives either next to this script or in ../../perlasm/.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through ppc-xlate.pl.
# The next command-line argument is handed to it unchanged (presumably the
# output file name -- confirm against ppc-xlate.pl).
#
# Low-precedence "or" is required here: with "||" the die() attaches to the
# command string, which is always true, so a failed open() went unnoticed.
my $output = shift;
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;	# size of the minimal stack frame used by the templates
$prefix="aes_p8";	# prefix of every exported symbol

$sp="r1";		# stack pointer register
$vrsave="r12";		# scratch register that holds the caller's SPR 256 value
117
118#########################################################################
119{{{	# Key setup procedures						#
#
# Appends the key-schedule code to $code.  Everything between the heredoc
# markers is emitted verbatim (after variable interpolation), so comments
# meant for maintainers of this script belong out here, not inside it.
#
# Pieces emitted:
#
#   rcon / Lconsts
#	Four 16-byte constants (0x01000000 splat, 0x1b000000 splat, a
#	0x0d0e0f0c splat used as the "rotate-n-splat" permute mask, and
#	zeros), followed by a helper that uses bcl/mflr to obtain its own
#	address and returns the address of rcon in $ptr.
#
#   .${prefix}_set_encrypt_key	(r3 = $inp, r4 = $bits, r5 = $out)
#	Returns -1 in r3 if $inp or $out is 0, and -2 if $bits is below
#	128, above 256, or has any of its low six bits set.  Otherwise it
#	expands the key in Loop128, L192/Loop192 or L256/Loop256, writing
#	each round key with a vperm/vsel/stvx sequence so that $out may be
#	unaligned, stores the round count (10, 12 or 14) as a 32-bit word
#	240 bytes past the start of the schedule, restores SPR 256 from
#	$vrsave and returns 0.
#
#   .${prefix}_set_decrypt_key	(same arguments)
#	Allocates a $FRAME-byte stack frame and calls Lset_encrypt_key.  On
#	success it walks the schedule from both ends (rounds/2 iterations
#	of Ldeckey), swapping 16-byte round keys so their order is reversed,
#	then returns 0; a non-zero return from Lset_encrypt_key is passed
#	back unchanged.
#
# NOTE(review): the "?" and "le?" instruction prefixes and the "?rev" /
# "?asis" data suffixes are presumably endianness directives interpreted by
# ppc-xlate.pl -- confirm there.
#
# General-purpose registers: r3..r8, in argument order.
120my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
# Vector registers used by the key expansion itself.
121my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
# Vector registers used to write round keys to a possibly unaligned $out.
122my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
123
124$code.=<<___;
125.machine	"any"
126
127.text
128
129.align	7
130rcon:
131.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
132.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
133.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
134.long	0,0,0,0						?asis
135Lconsts:
136	mflr	r0
137	bcl	20,31,\$+4
138	mflr	$ptr	 #vvvvv "distance between . and rcon
139	addi	$ptr,$ptr,-0x48
140	mtlr	r0
141	blr
142	.long	0
143	.byte	0,12,0x14,0,0,0,0,0
144.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
145
146.globl	.${prefix}_set_encrypt_key
147Lset_encrypt_key:
148	mflr		r11
149	$PUSH		r11,$LRSAVE($sp)
150
151	li		$ptr,-1
152	${UCMP}i	$inp,0
153	beq-		Lenc_key_abort		# if ($inp==0) return -1;
154	${UCMP}i	$out,0
155	beq-		Lenc_key_abort		# if ($out==0) return -1;
156	li		$ptr,-2
157	cmpwi		$bits,128
158	blt-		Lenc_key_abort
159	cmpwi		$bits,256
160	bgt-		Lenc_key_abort
161	andi.		r0,$bits,0x3f
162	bne-		Lenc_key_abort
163
164	lis		r0,0xfff0
165	mfspr		$vrsave,256
166	mtspr		256,r0
167
168	bl		Lconsts
169	mtlr		r11
170
171	neg		r9,$inp
172	lvx		$in0,0,$inp
173	addi		$inp,$inp,15		# 15 is not typo
174	lvsr		$key,0,r9		# borrow $key
175	li		r8,0x20
176	cmpwi		$bits,192
177	lvx		$in1,0,$inp
178	le?vspltisb	$mask,0x0f		# borrow $mask
179	lvx		$rcon,0,$ptr
180	le?vxor		$key,$key,$mask		# adjust for byte swap
181	lvx		$mask,r8,$ptr
182	addi		$ptr,$ptr,0x10
183	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
184	li		$cnt,8
185	vxor		$zero,$zero,$zero
186	mtctr		$cnt
187
188	?lvsr		$outperm,0,$out
189	vspltisb	$outmask,-1
190	lvx		$outhead,0,$out
191	?vperm		$outmask,$zero,$outmask,$outperm
192
193	blt		Loop128
194	addi		$inp,$inp,8
195	beq		L192
196	addi		$inp,$inp,8
197	b		L256
198
199.align	4
200Loop128:
201	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
202	vsldoi		$tmp,$zero,$in0,12	# >>32
203	 vperm		$outtail,$in0,$in0,$outperm	# rotate
204	 vsel		$stage,$outhead,$outtail,$outmask
205	 vmr		$outhead,$outtail
206	vcipherlast	$key,$key,$rcon
207	 stvx		$stage,0,$out
208	 addi		$out,$out,16
209
210	vxor		$in0,$in0,$tmp
211	vsldoi		$tmp,$zero,$tmp,12	# >>32
212	vxor		$in0,$in0,$tmp
213	vsldoi		$tmp,$zero,$tmp,12	# >>32
214	vxor		$in0,$in0,$tmp
215	 vadduwm	$rcon,$rcon,$rcon
216	vxor		$in0,$in0,$key
217	bdnz		Loop128
218
219	lvx		$rcon,0,$ptr		# last two round keys
220
221	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
222	vsldoi		$tmp,$zero,$in0,12	# >>32
223	 vperm		$outtail,$in0,$in0,$outperm	# rotate
224	 vsel		$stage,$outhead,$outtail,$outmask
225	 vmr		$outhead,$outtail
226	vcipherlast	$key,$key,$rcon
227	 stvx		$stage,0,$out
228	 addi		$out,$out,16
229
230	vxor		$in0,$in0,$tmp
231	vsldoi		$tmp,$zero,$tmp,12	# >>32
232	vxor		$in0,$in0,$tmp
233	vsldoi		$tmp,$zero,$tmp,12	# >>32
234	vxor		$in0,$in0,$tmp
235	 vadduwm	$rcon,$rcon,$rcon
236	vxor		$in0,$in0,$key
237
238	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
239	vsldoi		$tmp,$zero,$in0,12	# >>32
240	 vperm		$outtail,$in0,$in0,$outperm	# rotate
241	 vsel		$stage,$outhead,$outtail,$outmask
242	 vmr		$outhead,$outtail
243	vcipherlast	$key,$key,$rcon
244	 stvx		$stage,0,$out
245	 addi		$out,$out,16
246
247	vxor		$in0,$in0,$tmp
248	vsldoi		$tmp,$zero,$tmp,12	# >>32
249	vxor		$in0,$in0,$tmp
250	vsldoi		$tmp,$zero,$tmp,12	# >>32
251	vxor		$in0,$in0,$tmp
252	vxor		$in0,$in0,$key
253	 vperm		$outtail,$in0,$in0,$outperm	# rotate
254	 vsel		$stage,$outhead,$outtail,$outmask
255	 vmr		$outhead,$outtail
256	 stvx		$stage,0,$out
257
258	addi		$inp,$out,15		# 15 is not typo
259	addi		$out,$out,0x50
260
261	li		$rounds,10
262	b		Ldone
263
264.align	4
265L192:
266	lvx		$tmp,0,$inp
267	li		$cnt,4
268	 vperm		$outtail,$in0,$in0,$outperm	# rotate
269	 vsel		$stage,$outhead,$outtail,$outmask
270	 vmr		$outhead,$outtail
271	 stvx		$stage,0,$out
272	 addi		$out,$out,16
273	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
274	vspltisb	$key,8			# borrow $key
275	mtctr		$cnt
276	vsububm		$mask,$mask,$key	# adjust the mask
277
278Loop192:
279	vperm		$key,$in1,$in1,$mask	# roate-n-splat
280	vsldoi		$tmp,$zero,$in0,12	# >>32
281	vcipherlast	$key,$key,$rcon
282
283	vxor		$in0,$in0,$tmp
284	vsldoi		$tmp,$zero,$tmp,12	# >>32
285	vxor		$in0,$in0,$tmp
286	vsldoi		$tmp,$zero,$tmp,12	# >>32
287	vxor		$in0,$in0,$tmp
288
289	 vsldoi		$stage,$zero,$in1,8
290	vspltw		$tmp,$in0,3
291	vxor		$tmp,$tmp,$in1
292	vsldoi		$in1,$zero,$in1,12	# >>32
293	 vadduwm	$rcon,$rcon,$rcon
294	vxor		$in1,$in1,$tmp
295	vxor		$in0,$in0,$key
296	vxor		$in1,$in1,$key
297	 vsldoi		$stage,$stage,$in0,8
298
299	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
300	vsldoi		$tmp,$zero,$in0,12	# >>32
301	 vperm		$outtail,$stage,$stage,$outperm	# rotate
302	 vsel		$stage,$outhead,$outtail,$outmask
303	 vmr		$outhead,$outtail
304	vcipherlast	$key,$key,$rcon
305	 stvx		$stage,0,$out
306	 addi		$out,$out,16
307
308	 vsldoi		$stage,$in0,$in1,8
309	vxor		$in0,$in0,$tmp
310	vsldoi		$tmp,$zero,$tmp,12	# >>32
311	 vperm		$outtail,$stage,$stage,$outperm	# rotate
312	 vsel		$stage,$outhead,$outtail,$outmask
313	 vmr		$outhead,$outtail
314	vxor		$in0,$in0,$tmp
315	vsldoi		$tmp,$zero,$tmp,12	# >>32
316	vxor		$in0,$in0,$tmp
317	 stvx		$stage,0,$out
318	 addi		$out,$out,16
319
320	vspltw		$tmp,$in0,3
321	vxor		$tmp,$tmp,$in1
322	vsldoi		$in1,$zero,$in1,12	# >>32
323	 vadduwm	$rcon,$rcon,$rcon
324	vxor		$in1,$in1,$tmp
325	vxor		$in0,$in0,$key
326	vxor		$in1,$in1,$key
327	 vperm		$outtail,$in0,$in0,$outperm	# rotate
328	 vsel		$stage,$outhead,$outtail,$outmask
329	 vmr		$outhead,$outtail
330	 stvx		$stage,0,$out
331	 addi		$inp,$out,15		# 15 is not typo
332	 addi		$out,$out,16
333	bdnz		Loop192
334
335	li		$rounds,12
336	addi		$out,$out,0x20
337	b		Ldone
338
339.align	4
340L256:
341	lvx		$tmp,0,$inp
342	li		$cnt,7
343	li		$rounds,14
344	 vperm		$outtail,$in0,$in0,$outperm	# rotate
345	 vsel		$stage,$outhead,$outtail,$outmask
346	 vmr		$outhead,$outtail
347	 stvx		$stage,0,$out
348	 addi		$out,$out,16
349	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
350	mtctr		$cnt
351
352Loop256:
353	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
354	vsldoi		$tmp,$zero,$in0,12	# >>32
355	 vperm		$outtail,$in1,$in1,$outperm	# rotate
356	 vsel		$stage,$outhead,$outtail,$outmask
357	 vmr		$outhead,$outtail
358	vcipherlast	$key,$key,$rcon
359	 stvx		$stage,0,$out
360	 addi		$out,$out,16
361
362	vxor		$in0,$in0,$tmp
363	vsldoi		$tmp,$zero,$tmp,12	# >>32
364	vxor		$in0,$in0,$tmp
365	vsldoi		$tmp,$zero,$tmp,12	# >>32
366	vxor		$in0,$in0,$tmp
367	 vadduwm	$rcon,$rcon,$rcon
368	vxor		$in0,$in0,$key
369	 vperm		$outtail,$in0,$in0,$outperm	# rotate
370	 vsel		$stage,$outhead,$outtail,$outmask
371	 vmr		$outhead,$outtail
372	 stvx		$stage,0,$out
373	 addi		$inp,$out,15		# 15 is not typo
374	 addi		$out,$out,16
375	bdz		Ldone
376
377	vspltw		$key,$in0,3		# just splat
378	vsldoi		$tmp,$zero,$in1,12	# >>32
379	vsbox		$key,$key
380
381	vxor		$in1,$in1,$tmp
382	vsldoi		$tmp,$zero,$tmp,12	# >>32
383	vxor		$in1,$in1,$tmp
384	vsldoi		$tmp,$zero,$tmp,12	# >>32
385	vxor		$in1,$in1,$tmp
386
387	vxor		$in1,$in1,$key
388	b		Loop256
389
390.align	4
391Ldone:
392	lvx		$in1,0,$inp		# redundant in aligned case
393	vsel		$in1,$outhead,$in1,$outmask
394	stvx		$in1,0,$inp
395	li		$ptr,0
396	mtspr		256,$vrsave
397	stw		$rounds,0($out)
398
399Lenc_key_abort:
400	mr		r3,$ptr
401	blr
402	.long		0
403	.byte		0,12,0x14,1,0,0,3,0
404	.long		0
405.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
406
407.globl	.${prefix}_set_decrypt_key
408	$STU		$sp,-$FRAME($sp)
409	mflr		r10
410	$PUSH		r10,$FRAME+$LRSAVE($sp)
411	bl		Lset_encrypt_key
412	mtlr		r10
413
414	cmpwi		r3,0
415	bne-		Ldec_key_abort
416
417	slwi		$cnt,$rounds,4
418	subi		$inp,$out,240		# first round key
419	srwi		$rounds,$rounds,1
420	add		$out,$inp,$cnt		# last round key
421	mtctr		$rounds
422
423Ldeckey:
424	lwz		r0, 0($inp)
425	lwz		r6, 4($inp)
426	lwz		r7, 8($inp)
427	lwz		r8, 12($inp)
428	addi		$inp,$inp,16
429	lwz		r9, 0($out)
430	lwz		r10,4($out)
431	lwz		r11,8($out)
432	lwz		r12,12($out)
433	stw		r0, 0($out)
434	stw		r6, 4($out)
435	stw		r7, 8($out)
436	stw		r8, 12($out)
437	subi		$out,$out,16
438	stw		r9, -16($inp)
439	stw		r10,-12($inp)
440	stw		r11,-8($inp)
441	stw		r12,-4($inp)
442	bdnz		Ldeckey
443
444	xor		r3,r3,r3		# return value
445Ldec_key_abort:
446	addi		$sp,$sp,$FRAME
447	blr
448	.long		0
449	.byte		0,12,4,1,0x80,0,3,0
450	.long		0
451.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
452___
453}}}
454#########################################################################
{{{	# Single block en- and decrypt procedures			#
#
# gen_block($dir)
#
# Appends the single-block routine .${prefix}_${dir}crypt to $code.
#
#   $dir	"en" or "de".  For "de" the cipher mnemonics get an "n"
#		inserted (vncipher/vncipherlast); anything else yields
#		vcipher/vcipherlast.  Note that this lexical shadows the
#		file-level $dir inside the sub.
#
# The emitted routine takes r3 = input block, r4 = output block and
# r5 = key schedule.  It reads the round count from 240($key), loads the
# input and each round key as a pair of lvx results merged with vperm so
# that none of the three pointers has to be 16-byte aligned, runs two
# rounds per Loop_${dir}c iteration (rounds/2-1 iterations) plus two more
# after the loop, and merges the result into memory with vsel so that bytes
# outside the 16-byte destination are written back unchanged.
#
# The sub deliberately has no prototype: it takes one argument, and the
# former empty prototype only worked because callers bypassed it with "&".
sub gen_block {
my ($dir) = @_;
my $n   = $dir eq "de" ? "n" : "";
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
.globl	.${prefix}_${dir}crypt
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds

Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
532#########################################################################
533{{{	# CBC en- and decrypt procedures				#
534my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
535my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
536my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
537						map("v$_",(4..10));
538$code.=<<___;
539.globl	.${prefix}_cbc_encrypt
540	${UCMP}i	$len,16
541	bltlr-
542
543	cmpwi		$enc,0			# test direction
544	lis		r0,0xffe0
545	mfspr		$vrsave,256
546	mtspr		256,r0
547
548	li		$idx,15
549	vxor		$rndkey0,$rndkey0,$rndkey0
550	le?vspltisb	$tmp,0x0f
551
552	lvx		$ivec,0,$ivp		# load [unaligned] iv
553	lvsl		$inpperm,0,$ivp
554	lvx		$inptail,$idx,$ivp
555	le?vxor		$inpperm,$inpperm,$tmp
556	vperm		$ivec,$ivec,$inptail,$inpperm
557
558	neg		r11,$inp
559	?lvsl		$keyperm,0,$key		# prepare for unaligned key
560	lwz		$rounds,240($key)
561
562	lvsr		$inpperm,0,r11		# prepare for unaligned load
563	lvx		$inptail,0,$inp
564	addi		$inp,$inp,15		# 15 is not typo
565	le?vxor		$inpperm,$inpperm,$tmp
566
567	?lvsr		$outperm,0,$out		# prepare for unaligned store
568	vspltisb	$outmask,-1
569	lvx		$outhead,0,$out
570	?vperm		$outmask,$rndkey0,$outmask,$outperm
571	le?vxor		$outperm,$outperm,$tmp
572
573	srwi		$rounds,$rounds,1
574	li		$idx,16
575	subi		$rounds,$rounds,1
576	beq		Lcbc_dec
577
578Lcbc_enc:
579	vmr		$inout,$inptail
580	lvx		$inptail,0,$inp
581	addi		$inp,$inp,16
582	mtctr		$rounds
583	subi		$len,$len,16		# len-=16
584
585	lvx		$rndkey0,0,$key
586	 vperm		$inout,$inout,$inptail,$inpperm
587	lvx		$rndkey1,$idx,$key
588	addi		$idx,$idx,16
589	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
590	vxor		$inout,$inout,$rndkey0
591	lvx		$rndkey0,$idx,$key
592	addi		$idx,$idx,16
593	vxor		$inout,$inout,$ivec
594
595Loop_cbc_enc:
596	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
597	vcipher		$inout,$inout,$rndkey1
598	lvx		$rndkey1,$idx,$key
599	addi		$idx,$idx,16
600	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
601	vcipher		$inout,$inout,$rndkey0
602	lvx		$rndkey0,$idx,$key
603	addi		$idx,$idx,16
604	bdnz		Loop_cbc_enc
605
606	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
607	vcipher		$inout,$inout,$rndkey1
608	lvx		$rndkey1,$idx,$key
609	li		$idx,16
610	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
611	vcipherlast	$ivec,$inout,$rndkey0
612	${UCMP}i	$len,16
613
614	vperm		$tmp,$ivec,$ivec,$outperm
615	vsel		$inout,$outhead,$tmp,$outmask
616	vmr		$outhead,$tmp
617	stvx		$inout,0,$out
618	addi		$out,$out,16
619	bge		Lcbc_enc
620
621	b		Lcbc_done
622
623.align	4
624Lcbc_dec:
625	${UCMP}i	$len,128
626	bge		_aesp8_cbc_decrypt8x
627	vmr		$tmp,$inptail
628	lvx		$inptail,0,$inp
629	addi		$inp,$inp,16
630	mtctr		$rounds
631	subi		$len,$len,16		# len-=16
632
633	lvx		$rndkey0,0,$key
634	 vperm		$tmp,$tmp,$inptail,$inpperm
635	lvx		$rndkey1,$idx,$key
636	addi		$idx,$idx,16
637	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
638	vxor		$inout,$tmp,$rndkey0
639	lvx		$rndkey0,$idx,$key
640	addi		$idx,$idx,16
641
642Loop_cbc_dec:
643	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
644	vncipher	$inout,$inout,$rndkey1
645	lvx		$rndkey1,$idx,$key
646	addi		$idx,$idx,16
647	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
648	vncipher	$inout,$inout,$rndkey0
649	lvx		$rndkey0,$idx,$key
650	addi		$idx,$idx,16
651	bdnz		Loop_cbc_dec
652
653	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
654	vncipher	$inout,$inout,$rndkey1
655	lvx		$rndkey1,$idx,$key
656	li		$idx,16
657	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
658	vncipherlast	$inout,$inout,$rndkey0
659	${UCMP}i	$len,16
660
661	vxor		$inout,$inout,$ivec
662	vmr		$ivec,$tmp
663	vperm		$tmp,$inout,$inout,$outperm
664	vsel		$inout,$outhead,$tmp,$outmask
665	vmr		$outhead,$tmp
666	stvx		$inout,0,$out
667	addi		$out,$out,16
668	bge		Lcbc_dec
669
670Lcbc_done:
671	addi		$out,$out,-1
672	lvx		$inout,0,$out		# redundant in aligned case
673	vsel		$inout,$outhead,$inout,$outmask
674	stvx		$inout,0,$out
675
676	neg		$enc,$ivp		# write [unaligned] iv
677	li		$idx,15			# 15 is not typo
678	vxor		$rndkey0,$rndkey0,$rndkey0
679	vspltisb	$outmask,-1
680	le?vspltisb	$tmp,0x0f
681	?lvsl		$outperm,0,$enc
682	?vperm		$outmask,$rndkey0,$outmask,$outperm
683	le?vxor		$outperm,$outperm,$tmp
684	lvx		$outhead,0,$ivp
685	vperm		$ivec,$ivec,$ivec,$outperm
686	vsel		$inout,$outhead,$ivec,$outmask
687	lvx		$inptail,$idx,$ivp
688	stvx		$inout,0,$ivp
689	vsel		$inout,$ivec,$inptail,$outmask
690	stvx		$inout,$idx,$ivp
691
692	mtspr		256,$vrsave
693	blr
694	.long		0
695	.byte		0,12,0x14,0,0,0,6,0
696	.long		0
697___
698#########################################################################
699{{	# Optimized CBC decrypt procedure				#
700my $key_="r11";
701my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
702my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
703my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
704my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
705			# v26-v31 last 6 round keys
706my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
707
708$code.=<<___;
709.align	5
710_aesp8_cbc_decrypt8x:
711	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
712	li		r10,`$FRAME+8*16+15`
713	li		r11,`$FRAME+8*16+31`
714	stvx		v20,r10,$sp		# ABI says so
715	addi		r10,r10,32
716	stvx		v21,r11,$sp
717	addi		r11,r11,32
718	stvx		v22,r10,$sp
719	addi		r10,r10,32
720	stvx		v23,r11,$sp
721	addi		r11,r11,32
722	stvx		v24,r10,$sp
723	addi		r10,r10,32
724	stvx		v25,r11,$sp
725	addi		r11,r11,32
726	stvx		v26,r10,$sp
727	addi		r10,r10,32
728	stvx		v27,r11,$sp
729	addi		r11,r11,32
730	stvx		v28,r10,$sp
731	addi		r10,r10,32
732	stvx		v29,r11,$sp
733	addi		r11,r11,32
734	stvx		v30,r10,$sp
735	stvx		v31,r11,$sp
736	li		r0,-1
737	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
738	li		$x10,0x10
739	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
740	li		$x20,0x20
741	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
742	li		$x30,0x30
743	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
744	li		$x40,0x40
745	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
746	li		$x50,0x50
747	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
748	li		$x60,0x60
749	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
750	li		$x70,0x70
751	mtspr		256,r0
752
753	subi		$rounds,$rounds,3	# -4 in total
754	subi		$len,$len,128		# bias
755
756	lvx		$rndkey0,$x00,$key	# load key schedule
757	lvx		v30,$x10,$key
758	addi		$key,$key,0x20
759	lvx		v31,$x00,$key
760	?vperm		$rndkey0,$rndkey0,v30,$keyperm
761	addi		$key_,$sp,$FRAME+15
762	mtctr		$rounds
763
764Load_cbc_dec_key:
765	?vperm		v24,v30,v31,$keyperm
766	lvx		v30,$x10,$key
767	addi		$key,$key,0x20
768	stvx		v24,$x00,$key_		# off-load round[1]
769	?vperm		v25,v31,v30,$keyperm
770	lvx		v31,$x00,$key
771	stvx		v25,$x10,$key_		# off-load round[2]
772	addi		$key_,$key_,0x20
773	bdnz		Load_cbc_dec_key
774
775	lvx		v26,$x10,$key
776	?vperm		v24,v30,v31,$keyperm
777	lvx		v27,$x20,$key
778	stvx		v24,$x00,$key_		# off-load round[3]
779	?vperm		v25,v31,v26,$keyperm
780	lvx		v28,$x30,$key
781	stvx		v25,$x10,$key_		# off-load round[4]
782	addi		$key_,$sp,$FRAME+15	# rewind $key_
783	?vperm		v26,v26,v27,$keyperm
784	lvx		v29,$x40,$key
785	?vperm		v27,v27,v28,$keyperm
786	lvx		v30,$x50,$key
787	?vperm		v28,v28,v29,$keyperm
788	lvx		v31,$x60,$key
789	?vperm		v29,v29,v30,$keyperm
790	lvx		$out0,$x70,$key		# borrow $out0
791	?vperm		v30,v30,v31,$keyperm
792	lvx		v24,$x00,$key_		# pre-load round[1]
793	?vperm		v31,v31,$out0,$keyperm
794	lvx		v25,$x10,$key_		# pre-load round[2]
795
796	#lvx		$inptail,0,$inp		# "caller" already did this
797	#addi		$inp,$inp,15		# 15 is not typo
798	subi		$inp,$inp,15		# undo "caller"
799
800	 le?li		$idx,8
801	lvx_u		$in0,$x00,$inp		# load first 8 "words"
802	 le?lvsl	$inpperm,0,$idx
803	 le?vspltisb	$tmp,0x0f
804	lvx_u		$in1,$x10,$inp
805	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
806	lvx_u		$in2,$x20,$inp
807	 le?vperm	$in0,$in0,$in0,$inpperm
808	lvx_u		$in3,$x30,$inp
809	 le?vperm	$in1,$in1,$in1,$inpperm
810	lvx_u		$in4,$x40,$inp
811	 le?vperm	$in2,$in2,$in2,$inpperm
812	vxor		$out0,$in0,$rndkey0
813	lvx_u		$in5,$x50,$inp
814	 le?vperm	$in3,$in3,$in3,$inpperm
815	vxor		$out1,$in1,$rndkey0
816	lvx_u		$in6,$x60,$inp
817	 le?vperm	$in4,$in4,$in4,$inpperm
818	vxor		$out2,$in2,$rndkey0
819	lvx_u		$in7,$x70,$inp
820	addi		$inp,$inp,0x80
821	 le?vperm	$in5,$in5,$in5,$inpperm
822	vxor		$out3,$in3,$rndkey0
823	 le?vperm	$in6,$in6,$in6,$inpperm
824	vxor		$out4,$in4,$rndkey0
825	 le?vperm	$in7,$in7,$in7,$inpperm
826	vxor		$out5,$in5,$rndkey0
827	vxor		$out6,$in6,$rndkey0
828	vxor		$out7,$in7,$rndkey0
829
830	mtctr		$rounds
831	b		Loop_cbc_dec8x
832.align	5
833Loop_cbc_dec8x:
834	vncipher	$out0,$out0,v24
835	vncipher	$out1,$out1,v24
836	vncipher	$out2,$out2,v24
837	vncipher	$out3,$out3,v24
838	vncipher	$out4,$out4,v24
839	vncipher	$out5,$out5,v24
840	vncipher	$out6,$out6,v24
841	vncipher	$out7,$out7,v24
842	lvx		v24,$x20,$key_		# round[3]
843	addi		$key_,$key_,0x20
844
845	vncipher	$out0,$out0,v25
846	vncipher	$out1,$out1,v25
847	vncipher	$out2,$out2,v25
848	vncipher	$out3,$out3,v25
849	vncipher	$out4,$out4,v25
850	vncipher	$out5,$out5,v25
851	vncipher	$out6,$out6,v25
852	vncipher	$out7,$out7,v25
853	lvx		v25,$x10,$key_		# round[4]
854	bdnz		Loop_cbc_dec8x
855
856	subic		$len,$len,128		# $len-=128
857	vncipher	$out0,$out0,v24
858	vncipher	$out1,$out1,v24
859	vncipher	$out2,$out2,v24
860	vncipher	$out3,$out3,v24
861	vncipher	$out4,$out4,v24
862	vncipher	$out5,$out5,v24
863	vncipher	$out6,$out6,v24
864	vncipher	$out7,$out7,v24
865
866	subfe.		r0,r0,r0		# borrow?-1:0
867	vncipher	$out0,$out0,v25
868	vncipher	$out1,$out1,v25
869	vncipher	$out2,$out2,v25
870	vncipher	$out3,$out3,v25
871	vncipher	$out4,$out4,v25
872	vncipher	$out5,$out5,v25
873	vncipher	$out6,$out6,v25
874	vncipher	$out7,$out7,v25
875
876	and		r0,r0,$len
877	vncipher	$out0,$out0,v26
878	vncipher	$out1,$out1,v26
879	vncipher	$out2,$out2,v26
880	vncipher	$out3,$out3,v26
881	vncipher	$out4,$out4,v26
882	vncipher	$out5,$out5,v26
883	vncipher	$out6,$out6,v26
884	vncipher	$out7,$out7,v26
885
886	add		$inp,$inp,r0		# $inp is adjusted in such
887						# way that at exit from the
888						# loop inX-in7 are loaded
889						# with last "words"
890	vncipher	$out0,$out0,v27
891	vncipher	$out1,$out1,v27
892	vncipher	$out2,$out2,v27
893	vncipher	$out3,$out3,v27
894	vncipher	$out4,$out4,v27
895	vncipher	$out5,$out5,v27
896	vncipher	$out6,$out6,v27
897	vncipher	$out7,$out7,v27
898
899	addi		$key_,$sp,$FRAME+15	# rewind $key_
900	vncipher	$out0,$out0,v28
901	vncipher	$out1,$out1,v28
902	vncipher	$out2,$out2,v28
903	vncipher	$out3,$out3,v28
904	vncipher	$out4,$out4,v28
905	vncipher	$out5,$out5,v28
906	vncipher	$out6,$out6,v28
907	vncipher	$out7,$out7,v28
908	lvx		v24,$x00,$key_		# re-pre-load round[1]
909
910	vncipher	$out0,$out0,v29
911	vncipher	$out1,$out1,v29
912	vncipher	$out2,$out2,v29
913	vncipher	$out3,$out3,v29
914	vncipher	$out4,$out4,v29
915	vncipher	$out5,$out5,v29
916	vncipher	$out6,$out6,v29
917	vncipher	$out7,$out7,v29
918	lvx		v25,$x10,$key_		# re-pre-load round[2]
919
920	vncipher	$out0,$out0,v30
921	 vxor		$ivec,$ivec,v31		# xor with last round key
922	vncipher	$out1,$out1,v30
923	 vxor		$in0,$in0,v31
924	vncipher	$out2,$out2,v30
925	 vxor		$in1,$in1,v31
926	vncipher	$out3,$out3,v30
927	 vxor		$in2,$in2,v31
928	vncipher	$out4,$out4,v30
929	 vxor		$in3,$in3,v31
930	vncipher	$out5,$out5,v30
931	 vxor		$in4,$in4,v31
932	vncipher	$out6,$out6,v30
933	 vxor		$in5,$in5,v31
934	vncipher	$out7,$out7,v30
935	 vxor		$in6,$in6,v31
936
937	vncipherlast	$out0,$out0,$ivec
938	vncipherlast	$out1,$out1,$in0
939	 lvx_u		$in0,$x00,$inp		# load next input block
940	vncipherlast	$out2,$out2,$in1
941	 lvx_u		$in1,$x10,$inp
942	vncipherlast	$out3,$out3,$in2
943	 le?vperm	$in0,$in0,$in0,$inpperm
944	 lvx_u		$in2,$x20,$inp
945	vncipherlast	$out4,$out4,$in3
946	 le?vperm	$in1,$in1,$in1,$inpperm
947	 lvx_u		$in3,$x30,$inp
948	vncipherlast	$out5,$out5,$in4
949	 le?vperm	$in2,$in2,$in2,$inpperm
950	 lvx_u		$in4,$x40,$inp
951	vncipherlast	$out6,$out6,$in5
952	 le?vperm	$in3,$in3,$in3,$inpperm
953	 lvx_u		$in5,$x50,$inp
954	vncipherlast	$out7,$out7,$in6
955	 le?vperm	$in4,$in4,$in4,$inpperm
956	 lvx_u		$in6,$x60,$inp
957	vmr		$ivec,$in7
958	 le?vperm	$in5,$in5,$in5,$inpperm
959	 lvx_u		$in7,$x70,$inp
960	 addi		$inp,$inp,0x80
961
962	le?vperm	$out0,$out0,$out0,$inpperm
963	le?vperm	$out1,$out1,$out1,$inpperm
964	stvx_u		$out0,$x00,$out
965	 le?vperm	$in6,$in6,$in6,$inpperm
966	 vxor		$out0,$in0,$rndkey0
967	le?vperm	$out2,$out2,$out2,$inpperm
968	stvx_u		$out1,$x10,$out
969	 le?vperm	$in7,$in7,$in7,$inpperm
970	 vxor		$out1,$in1,$rndkey0
971	le?vperm	$out3,$out3,$out3,$inpperm
972	stvx_u		$out2,$x20,$out
973	 vxor		$out2,$in2,$rndkey0
974	le?vperm	$out4,$out4,$out4,$inpperm
975	stvx_u		$out3,$x30,$out
976	 vxor		$out3,$in3,$rndkey0
977	le?vperm	$out5,$out5,$out5,$inpperm
978	stvx_u		$out4,$x40,$out
979	 vxor		$out4,$in4,$rndkey0
980	le?vperm	$out6,$out6,$out6,$inpperm
981	stvx_u		$out5,$x50,$out
982	 vxor		$out5,$in5,$rndkey0
983	le?vperm	$out7,$out7,$out7,$inpperm
984	stvx_u		$out6,$x60,$out
985	 vxor		$out6,$in6,$rndkey0
986	stvx_u		$out7,$x70,$out
987	addi		$out,$out,0x80
988	 vxor		$out7,$in7,$rndkey0
989
990	mtctr		$rounds
991	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
992
993	addic.		$len,$len,128
994	beq		Lcbc_dec8x_done
995	nop
996	nop
997
998Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
999	vncipher	$out1,$out1,v24
1000	vncipher	$out2,$out2,v24
1001	vncipher	$out3,$out3,v24
1002	vncipher	$out4,$out4,v24
1003	vncipher	$out5,$out5,v24
1004	vncipher	$out6,$out6,v24
1005	vncipher	$out7,$out7,v24
1006	lvx		v24,$x20,$key_		# round[3]
1007	addi		$key_,$key_,0x20
1008
1009	vncipher	$out1,$out1,v25
1010	vncipher	$out2,$out2,v25
1011	vncipher	$out3,$out3,v25
1012	vncipher	$out4,$out4,v25
1013	vncipher	$out5,$out5,v25
1014	vncipher	$out6,$out6,v25
1015	vncipher	$out7,$out7,v25
1016	lvx		v25,$x10,$key_		# round[4]
1017	bdnz		Loop_cbc_dec8x_tail
1018
1019	vncipher	$out1,$out1,v24
1020	vncipher	$out2,$out2,v24
1021	vncipher	$out3,$out3,v24
1022	vncipher	$out4,$out4,v24
1023	vncipher	$out5,$out5,v24
1024	vncipher	$out6,$out6,v24
1025	vncipher	$out7,$out7,v24
1026
1027	vncipher	$out1,$out1,v25
1028	vncipher	$out2,$out2,v25
1029	vncipher	$out3,$out3,v25
1030	vncipher	$out4,$out4,v25
1031	vncipher	$out5,$out5,v25
1032	vncipher	$out6,$out6,v25
1033	vncipher	$out7,$out7,v25
1034
1035	vncipher	$out1,$out1,v26
1036	vncipher	$out2,$out2,v26
1037	vncipher	$out3,$out3,v26
1038	vncipher	$out4,$out4,v26
1039	vncipher	$out5,$out5,v26
1040	vncipher	$out6,$out6,v26
1041	vncipher	$out7,$out7,v26
1042
1043	vncipher	$out1,$out1,v27
1044	vncipher	$out2,$out2,v27
1045	vncipher	$out3,$out3,v27
1046	vncipher	$out4,$out4,v27
1047	vncipher	$out5,$out5,v27
1048	vncipher	$out6,$out6,v27
1049	vncipher	$out7,$out7,v27
1050
1051	vncipher	$out1,$out1,v28
1052	vncipher	$out2,$out2,v28
1053	vncipher	$out3,$out3,v28
1054	vncipher	$out4,$out4,v28
1055	vncipher	$out5,$out5,v28
1056	vncipher	$out6,$out6,v28
1057	vncipher	$out7,$out7,v28
1058
1059	vncipher	$out1,$out1,v29
1060	vncipher	$out2,$out2,v29
1061	vncipher	$out3,$out3,v29
1062	vncipher	$out4,$out4,v29
1063	vncipher	$out5,$out5,v29
1064	vncipher	$out6,$out6,v29
1065	vncipher	$out7,$out7,v29
1066
1067	vncipher	$out1,$out1,v30
1068	 vxor		$ivec,$ivec,v31		# last round key
1069	vncipher	$out2,$out2,v30
1070	 vxor		$in1,$in1,v31
1071	vncipher	$out3,$out3,v30
1072	 vxor		$in2,$in2,v31
1073	vncipher	$out4,$out4,v30
1074	 vxor		$in3,$in3,v31
1075	vncipher	$out5,$out5,v30
1076	 vxor		$in4,$in4,v31
1077	vncipher	$out6,$out6,v30
1078	 vxor		$in5,$in5,v31
1079	vncipher	$out7,$out7,v30
1080	 vxor		$in6,$in6,v31
1081
1082	cmplwi		$len,32			# switch($len)
1083	blt		Lcbc_dec8x_one
1084	nop
1085	beq		Lcbc_dec8x_two
1086	cmplwi		$len,64
1087	blt		Lcbc_dec8x_three
1088	nop
1089	beq		Lcbc_dec8x_four
1090	cmplwi		$len,96
1091	blt		Lcbc_dec8x_five
1092	nop
1093	beq		Lcbc_dec8x_six
1094
1095Lcbc_dec8x_seven:
1096	vncipherlast	$out1,$out1,$ivec
1097	vncipherlast	$out2,$out2,$in1
1098	vncipherlast	$out3,$out3,$in2
1099	vncipherlast	$out4,$out4,$in3
1100	vncipherlast	$out5,$out5,$in4
1101	vncipherlast	$out6,$out6,$in5
1102	vncipherlast	$out7,$out7,$in6
1103	vmr		$ivec,$in7
1104
1105	le?vperm	$out1,$out1,$out1,$inpperm
1106	le?vperm	$out2,$out2,$out2,$inpperm
1107	stvx_u		$out1,$x00,$out
1108	le?vperm	$out3,$out3,$out3,$inpperm
1109	stvx_u		$out2,$x10,$out
1110	le?vperm	$out4,$out4,$out4,$inpperm
1111	stvx_u		$out3,$x20,$out
1112	le?vperm	$out5,$out5,$out5,$inpperm
1113	stvx_u		$out4,$x30,$out
1114	le?vperm	$out6,$out6,$out6,$inpperm
1115	stvx_u		$out5,$x40,$out
1116	le?vperm	$out7,$out7,$out7,$inpperm
1117	stvx_u		$out6,$x50,$out
1118	stvx_u		$out7,$x60,$out
1119	addi		$out,$out,0x70
1120	b		Lcbc_dec8x_done
1121
1122.align	5
1123Lcbc_dec8x_six:
1124	vncipherlast	$out2,$out2,$ivec
1125	vncipherlast	$out3,$out3,$in2
1126	vncipherlast	$out4,$out4,$in3
1127	vncipherlast	$out5,$out5,$in4
1128	vncipherlast	$out6,$out6,$in5
1129	vncipherlast	$out7,$out7,$in6
1130	vmr		$ivec,$in7
1131
1132	le?vperm	$out2,$out2,$out2,$inpperm
1133	le?vperm	$out3,$out3,$out3,$inpperm
1134	stvx_u		$out2,$x00,$out
1135	le?vperm	$out4,$out4,$out4,$inpperm
1136	stvx_u		$out3,$x10,$out
1137	le?vperm	$out5,$out5,$out5,$inpperm
1138	stvx_u		$out4,$x20,$out
1139	le?vperm	$out6,$out6,$out6,$inpperm
1140	stvx_u		$out5,$x30,$out
1141	le?vperm	$out7,$out7,$out7,$inpperm
1142	stvx_u		$out6,$x40,$out
1143	stvx_u		$out7,$x50,$out
1144	addi		$out,$out,0x60
1145	b		Lcbc_dec8x_done
1146
1147.align	5
1148Lcbc_dec8x_five:
1149	vncipherlast	$out3,$out3,$ivec
1150	vncipherlast	$out4,$out4,$in3
1151	vncipherlast	$out5,$out5,$in4
1152	vncipherlast	$out6,$out6,$in5
1153	vncipherlast	$out7,$out7,$in6
1154	vmr		$ivec,$in7
1155
1156	le?vperm	$out3,$out3,$out3,$inpperm
1157	le?vperm	$out4,$out4,$out4,$inpperm
1158	stvx_u		$out3,$x00,$out
1159	le?vperm	$out5,$out5,$out5,$inpperm
1160	stvx_u		$out4,$x10,$out
1161	le?vperm	$out6,$out6,$out6,$inpperm
1162	stvx_u		$out5,$x20,$out
1163	le?vperm	$out7,$out7,$out7,$inpperm
1164	stvx_u		$out6,$x30,$out
1165	stvx_u		$out7,$x40,$out
1166	addi		$out,$out,0x50
1167	b		Lcbc_dec8x_done
1168
1169.align	5
1170Lcbc_dec8x_four:
1171	vncipherlast	$out4,$out4,$ivec
1172	vncipherlast	$out5,$out5,$in4
1173	vncipherlast	$out6,$out6,$in5
1174	vncipherlast	$out7,$out7,$in6
1175	vmr		$ivec,$in7
1176
1177	le?vperm	$out4,$out4,$out4,$inpperm
1178	le?vperm	$out5,$out5,$out5,$inpperm
1179	stvx_u		$out4,$x00,$out
1180	le?vperm	$out6,$out6,$out6,$inpperm
1181	stvx_u		$out5,$x10,$out
1182	le?vperm	$out7,$out7,$out7,$inpperm
1183	stvx_u		$out6,$x20,$out
1184	stvx_u		$out7,$x30,$out
1185	addi		$out,$out,0x40
1186	b		Lcbc_dec8x_done
1187
1188.align	5
1189Lcbc_dec8x_three:
1190	vncipherlast	$out5,$out5,$ivec
1191	vncipherlast	$out6,$out6,$in5
1192	vncipherlast	$out7,$out7,$in6
1193	vmr		$ivec,$in7
1194
1195	le?vperm	$out5,$out5,$out5,$inpperm
1196	le?vperm	$out6,$out6,$out6,$inpperm
1197	stvx_u		$out5,$x00,$out
1198	le?vperm	$out7,$out7,$out7,$inpperm
1199	stvx_u		$out6,$x10,$out
1200	stvx_u		$out7,$x20,$out
1201	addi		$out,$out,0x30
1202	b		Lcbc_dec8x_done
1203
1204.align	5
1205Lcbc_dec8x_two:
1206	vncipherlast	$out6,$out6,$ivec
1207	vncipherlast	$out7,$out7,$in6
1208	vmr		$ivec,$in7
1209
1210	le?vperm	$out6,$out6,$out6,$inpperm
1211	le?vperm	$out7,$out7,$out7,$inpperm
1212	stvx_u		$out6,$x00,$out
1213	stvx_u		$out7,$x10,$out
1214	addi		$out,$out,0x20
1215	b		Lcbc_dec8x_done
1216
1217.align	5
1218Lcbc_dec8x_one:
1219	vncipherlast	$out7,$out7,$ivec
1220	vmr		$ivec,$in7
1221
1222	le?vperm	$out7,$out7,$out7,$inpperm
1223	stvx_u		$out7,0,$out
1224	addi		$out,$out,0x10
1225
1226Lcbc_dec8x_done:
1227	le?vperm	$ivec,$ivec,$ivec,$inpperm
1228	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
1229
1230	li		r10,`$FRAME+15`
1231	li		r11,`$FRAME+31`
1232	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1233	addi		r10,r10,32
1234	stvx		$inpperm,r11,$sp
1235	addi		r11,r11,32
1236	stvx		$inpperm,r10,$sp
1237	addi		r10,r10,32
1238	stvx		$inpperm,r11,$sp
1239	addi		r11,r11,32
1240	stvx		$inpperm,r10,$sp
1241	addi		r10,r10,32
1242	stvx		$inpperm,r11,$sp
1243	addi		r11,r11,32
1244	stvx		$inpperm,r10,$sp
1245	addi		r10,r10,32
1246	stvx		$inpperm,r11,$sp
1247	addi		r11,r11,32
1248
1249	mtspr		256,$vrsave
1250	lvx		v20,r10,$sp		# ABI says so
1251	addi		r10,r10,32
1252	lvx		v21,r11,$sp
1253	addi		r11,r11,32
1254	lvx		v22,r10,$sp
1255	addi		r10,r10,32
1256	lvx		v23,r11,$sp
1257	addi		r11,r11,32
1258	lvx		v24,r10,$sp
1259	addi		r10,r10,32
1260	lvx		v25,r11,$sp
1261	addi		r11,r11,32
1262	lvx		v26,r10,$sp
1263	addi		r10,r10,32
1264	lvx		v27,r11,$sp
1265	addi		r11,r11,32
1266	lvx		v28,r10,$sp
1267	addi		r10,r10,32
1268	lvx		v29,r11,$sp
1269	addi		r11,r11,32
1270	lvx		v30,r10,$sp
1271	lvx		v31,r11,$sp
1272	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1273	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1274	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1275	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1276	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1277	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1278	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1279	blr
1280	.long		0
1281	.byte		0,12,0x14,0,0x80,6,6,0
1282	.long		0
1283.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1284___
1285}}	}}}
1286
1287#########################################################################
1288{{{	# CTR procedure[s]						#
1289
1290####################### WARNING: Here be dragons! #######################
1291#
1292# This code is written as 'ctr32', based on a 32-bit counter used
1293# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
1294# a 128-bit counter.
1295#
1296# This leads to subtle changes from the upstream code: the counter
1297# is incremented with vadduqm (quadword) rather than vadduwm (word).
1298# This occurs in both the bulk (8 blocks at a time) path, and in the
1299# individual block path. Be aware of this when doing updates.
1300#
1301# See:
1302# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
1303# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
1304# https://github.com/openssl/openssl/pull/8942
1305#
1306#########################################################################
# Register map for the CTR code; these names are interpolated into the
# assembly heredocs that follow.
#   r3..r10 -> $inp,$out,$len,$key,$ivp,$x10,$rounds,$idx, in that order.
#              $len is counted down one block at a time by the scalar loop
#              ("blocks--").  $x10 (r8) is not referenced by the
#              one-block-at-a-time code; the 8x path declares it again.
#   v0..v3  -> $rndkey0,$rndkey1,$inout,$tmp
#   v4..v11 -> $ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,
#              $keyperm,$one
1307my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1308my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
1309my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1310						map("v$_",(4..11));
# $dat shares v3 with $tmp: the scalar path uses $tmp only while the
# permutation masks are being set up (before Loop_ctr32_enc) and $dat only
# inside the encryption loop.
1311my $dat=$tmp;
1312
# ${prefix}_ctr32_encrypt_blocks: entry point plus the one-block-at-a-time
# path.
#  - Returns at once when $len is below 1.
#  - Copies VRSAVE (SPR 256) into $vrsave, installs its own mask and writes
#    $vrsave back before returning.
#  - Loads the possibly unaligned counter block from $ivp into $ivec and
#    builds $one from a zero vector and a splat of 1 shifted together by one
#    byte, i.e. the 128-bit value 1 that vadduqm adds to the counter.
#  - Reads the round count from offset 240 of the key structure and derives
#    the loop count from it (halved, minus one).
#  - Hands requests of 8 or more blocks to _aesp8_ctr32_encrypt8x.
#  - Otherwise encrypts $ivec, folds the input block into the final
#    vcipherlast (input xor last round key), bumps $ivec as a quadword and
#    writes the result out through the unaligned-store permute/mask pair.
#  - No store through $ivp appears in this routine.
# The heredoc body is emitted as-is and is intentionally left unchanged.
1313$code.=<<___;
1314.globl	.${prefix}_ctr32_encrypt_blocks
1315	${UCMP}i	$len,1
1316	bltlr-
1317
1318	lis		r0,0xfff0
1319	mfspr		$vrsave,256
1320	mtspr		256,r0
1321
1322	li		$idx,15
1323	vxor		$rndkey0,$rndkey0,$rndkey0
1324	le?vspltisb	$tmp,0x0f
1325
1326	lvx		$ivec,0,$ivp		# load [unaligned] iv
1327	lvsl		$inpperm,0,$ivp
1328	lvx		$inptail,$idx,$ivp
1329	 vspltisb	$one,1
1330	le?vxor		$inpperm,$inpperm,$tmp
1331	vperm		$ivec,$ivec,$inptail,$inpperm
1332	 vsldoi		$one,$rndkey0,$one,1
1333
1334	neg		r11,$inp
1335	?lvsl		$keyperm,0,$key		# prepare for unaligned key
1336	lwz		$rounds,240($key)
1337
1338	lvsr		$inpperm,0,r11		# prepare for unaligned load
1339	lvx		$inptail,0,$inp
1340	addi		$inp,$inp,15		# 15 is not typo
1341	le?vxor		$inpperm,$inpperm,$tmp
1342
1343	srwi		$rounds,$rounds,1
1344	li		$idx,16
1345	subi		$rounds,$rounds,1
1346
1347	${UCMP}i	$len,8
1348	bge		_aesp8_ctr32_encrypt8x
1349
1350	?lvsr		$outperm,0,$out		# prepare for unaligned store
1351	vspltisb	$outmask,-1
1352	lvx		$outhead,0,$out
1353	?vperm		$outmask,$rndkey0,$outmask,$outperm
1354	le?vxor		$outperm,$outperm,$tmp
1355
1356	lvx		$rndkey0,0,$key
1357	mtctr		$rounds
1358	lvx		$rndkey1,$idx,$key
1359	addi		$idx,$idx,16
1360	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1361	vxor		$inout,$ivec,$rndkey0
1362	lvx		$rndkey0,$idx,$key
1363	addi		$idx,$idx,16
1364	b		Loop_ctr32_enc
1365
1366.align	5
1367Loop_ctr32_enc:
1368	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1369	vcipher		$inout,$inout,$rndkey1
1370	lvx		$rndkey1,$idx,$key
1371	addi		$idx,$idx,16
1372	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1373	vcipher		$inout,$inout,$rndkey0
1374	lvx		$rndkey0,$idx,$key
1375	addi		$idx,$idx,16
1376	bdnz		Loop_ctr32_enc
1377
1378	vadduqm		$ivec,$ivec,$one	# Kernel change for 128-bit
1379	 vmr		$dat,$inptail
1380	 lvx		$inptail,0,$inp
1381	 addi		$inp,$inp,16
1382	 subic.		$len,$len,1		# blocks--
1383
1384	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1385	vcipher		$inout,$inout,$rndkey1
1386	lvx		$rndkey1,$idx,$key
1387	 vperm		$dat,$dat,$inptail,$inpperm
1388	 li		$idx,16
1389	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
1390	 lvx		$rndkey0,0,$key
1391	vxor		$dat,$dat,$rndkey1	# last round key
1392	vcipherlast	$inout,$inout,$dat
1393
1394	 lvx		$rndkey1,$idx,$key
1395	 addi		$idx,$idx,16
1396	vperm		$inout,$inout,$inout,$outperm
1397	vsel		$dat,$outhead,$inout,$outmask
1398	 mtctr		$rounds
1399	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1400	vmr		$outhead,$inout
1401	 vxor		$inout,$ivec,$rndkey0
1402	 lvx		$rndkey0,$idx,$key
1403	 addi		$idx,$idx,16
1404	stvx		$dat,0,$out
1405	addi		$out,$out,16
1406	bne		Loop_ctr32_enc
1407
1408	addi		$out,$out,-1
1409	lvx		$inout,0,$out		# redundant in aligned case
1410	vsel		$inout,$outhead,$inout,$outmask
1411	stvx		$inout,0,$out
1412
1413	mtspr		256,$vrsave
1414	blr
1415	.long		0
1416	.byte		0,12,0x14,0,0,0,6,0
1417	.long		0
1418___
1419#########################################################################
1420{{	# Optimized CTR procedure					#
# Register map for the 8-blocks-at-a-time CTR path.
# $key_ (r11) walks the on-stack copy of the round keys; the generated code
# points it at $sp+$FRAME+15 and rewinds it there after each pass.
1421my $key_="r11";
# $x10..$x70 (r8, r26..r31) are loaded with the constants 0x10..0x70 and
# used as index registers.  $x00 is r0 and is only ever placed in the index
# slot of lvx/stvx/lvx_u/stvx_u.
# NOTE(review): that relies on the PowerPC rule that r0 in the base slot of
# an indexed load/store reads as zero -- r0 is also used as a scratch
# register below, so it is not kept at 0; confirm against the ISA.
1422my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
# Input blocks live in v0..v3, v10, v12..v14; output/counter blocks in
# v15..v22.
1423my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1424my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1425my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
1426			# v26-v31 last 6 round keys
# $tmp is v3 and $keyperm is v10, the same registers the entry point uses
# for them, so values set up there stay valid under these names.
1427my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
# $two/$three/$four reuse the unaligned-store registers (v7..v9); only $two
# is referenced by the code below.
1428my ($two,$three,$four)=($outhead,$outperm,$outmask);
1429
# _aesp8_ctr32_encrypt8x: bulk path, reached from the entry point when at
# least 8 blocks were requested.
#  - Builds a stack frame of $FRAME+21*16+6*$SIZE_T bytes and saves v20-v31,
#    VRSAVE and r26-r31 in it.  On exit v20-v31 and r26-r31 are reloaded
#    from the frame and VRSAVE is rewritten from $vrsave.
#  - Loads $x10..$x70 with the constants 0x10..0x70.
#  - Copies the permuted leading round keys to the frame at $sp+$FRAME+15
#    (addressed through $key_) and keeps the remaining keys in v26-v31, v31
#    being the last round key.
#  - Shifts $len left by 4, then steps it down by 128 per loop iteration.
#  - Runs eight counter blocks through the rounds in parallel; every counter
#    is derived with vadduqm (128-bit adds).
#  - Lctr32_enc8x_break dispatches on what is left of $len to finish one to
#    eight trailing blocks.
#  - Lctr32_enc8x_done overwrites the on-stack round-key copies with
#    $inpperm before restoring registers and returning.
# The heredoc body is emitted as-is and is intentionally left unchanged.
1430$code.=<<___;
1431.align	5
1432_aesp8_ctr32_encrypt8x:
1433	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1434	li		r10,`$FRAME+8*16+15`
1435	li		r11,`$FRAME+8*16+31`
1436	stvx		v20,r10,$sp		# ABI says so
1437	addi		r10,r10,32
1438	stvx		v21,r11,$sp
1439	addi		r11,r11,32
1440	stvx		v22,r10,$sp
1441	addi		r10,r10,32
1442	stvx		v23,r11,$sp
1443	addi		r11,r11,32
1444	stvx		v24,r10,$sp
1445	addi		r10,r10,32
1446	stvx		v25,r11,$sp
1447	addi		r11,r11,32
1448	stvx		v26,r10,$sp
1449	addi		r10,r10,32
1450	stvx		v27,r11,$sp
1451	addi		r11,r11,32
1452	stvx		v28,r10,$sp
1453	addi		r10,r10,32
1454	stvx		v29,r11,$sp
1455	addi		r11,r11,32
1456	stvx		v30,r10,$sp
1457	stvx		v31,r11,$sp
1458	li		r0,-1
1459	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
1460	li		$x10,0x10
1461	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1462	li		$x20,0x20
1463	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1464	li		$x30,0x30
1465	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1466	li		$x40,0x40
1467	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1468	li		$x50,0x50
1469	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1470	li		$x60,0x60
1471	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1472	li		$x70,0x70
1473	mtspr		256,r0
1474
1475	subi		$rounds,$rounds,3	# -4 in total
1476
1477	lvx		$rndkey0,$x00,$key	# load key schedule
1478	lvx		v30,$x10,$key
1479	addi		$key,$key,0x20
1480	lvx		v31,$x00,$key
1481	?vperm		$rndkey0,$rndkey0,v30,$keyperm
1482	addi		$key_,$sp,$FRAME+15
1483	mtctr		$rounds
1484
1485Load_ctr32_enc_key:
1486	?vperm		v24,v30,v31,$keyperm
1487	lvx		v30,$x10,$key
1488	addi		$key,$key,0x20
1489	stvx		v24,$x00,$key_		# off-load round[1]
1490	?vperm		v25,v31,v30,$keyperm
1491	lvx		v31,$x00,$key
1492	stvx		v25,$x10,$key_		# off-load round[2]
1493	addi		$key_,$key_,0x20
1494	bdnz		Load_ctr32_enc_key
1495
1496	lvx		v26,$x10,$key
1497	?vperm		v24,v30,v31,$keyperm
1498	lvx		v27,$x20,$key
1499	stvx		v24,$x00,$key_		# off-load round[3]
1500	?vperm		v25,v31,v26,$keyperm
1501	lvx		v28,$x30,$key
1502	stvx		v25,$x10,$key_		# off-load round[4]
1503	addi		$key_,$sp,$FRAME+15	# rewind $key_
1504	?vperm		v26,v26,v27,$keyperm
1505	lvx		v29,$x40,$key
1506	?vperm		v27,v27,v28,$keyperm
1507	lvx		v30,$x50,$key
1508	?vperm		v28,v28,v29,$keyperm
1509	lvx		v31,$x60,$key
1510	?vperm		v29,v29,v30,$keyperm
1511	lvx		$out0,$x70,$key		# borrow $out0
1512	?vperm		v30,v30,v31,$keyperm
1513	lvx		v24,$x00,$key_		# pre-load round[1]
1514	?vperm		v31,v31,$out0,$keyperm
1515	lvx		v25,$x10,$key_		# pre-load round[2]
1516
1517	vadduqm		$two,$one,$one
1518	subi		$inp,$inp,15		# undo "caller"
1519	$SHL		$len,$len,4
1520
1521	vadduqm		$out1,$ivec,$one	# counter values ...
1522	vadduqm		$out2,$ivec,$two	# (do all ctr adds as 128-bit)
1523	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1524	 le?li		$idx,8
1525	vadduqm		$out3,$out1,$two
1526	vxor		$out1,$out1,$rndkey0
1527	 le?lvsl	$inpperm,0,$idx
1528	vadduqm		$out4,$out2,$two
1529	vxor		$out2,$out2,$rndkey0
1530	 le?vspltisb	$tmp,0x0f
1531	vadduqm		$out5,$out3,$two
1532	vxor		$out3,$out3,$rndkey0
1533	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
1534	vadduqm		$out6,$out4,$two
1535	vxor		$out4,$out4,$rndkey0
1536	vadduqm		$out7,$out5,$two
1537	vxor		$out5,$out5,$rndkey0
1538	vadduqm		$ivec,$out6,$two	# next counter value
1539	vxor		$out6,$out6,$rndkey0
1540	vxor		$out7,$out7,$rndkey0
1541
1542	mtctr		$rounds
1543	b		Loop_ctr32_enc8x
1544.align	5
1545Loop_ctr32_enc8x:
1546	vcipher 	$out0,$out0,v24
1547	vcipher 	$out1,$out1,v24
1548	vcipher 	$out2,$out2,v24
1549	vcipher 	$out3,$out3,v24
1550	vcipher 	$out4,$out4,v24
1551	vcipher 	$out5,$out5,v24
1552	vcipher 	$out6,$out6,v24
1553	vcipher 	$out7,$out7,v24
1554Loop_ctr32_enc8x_middle:
1555	lvx		v24,$x20,$key_		# round[3]
1556	addi		$key_,$key_,0x20
1557
1558	vcipher 	$out0,$out0,v25
1559	vcipher 	$out1,$out1,v25
1560	vcipher 	$out2,$out2,v25
1561	vcipher 	$out3,$out3,v25
1562	vcipher 	$out4,$out4,v25
1563	vcipher 	$out5,$out5,v25
1564	vcipher 	$out6,$out6,v25
1565	vcipher 	$out7,$out7,v25
1566	lvx		v25,$x10,$key_		# round[4]
1567	bdnz		Loop_ctr32_enc8x
1568
1569	subic		r11,$len,256		# $len-256, borrow $key_
1570	vcipher 	$out0,$out0,v24
1571	vcipher 	$out1,$out1,v24
1572	vcipher 	$out2,$out2,v24
1573	vcipher 	$out3,$out3,v24
1574	vcipher 	$out4,$out4,v24
1575	vcipher 	$out5,$out5,v24
1576	vcipher 	$out6,$out6,v24
1577	vcipher 	$out7,$out7,v24
1578
1579	subfe		r0,r0,r0		# borrow?-1:0
1580	vcipher 	$out0,$out0,v25
1581	vcipher 	$out1,$out1,v25
1582	vcipher 	$out2,$out2,v25
1583	vcipher 	$out3,$out3,v25
1584	vcipher 	$out4,$out4,v25
1585	vcipher		$out5,$out5,v25
1586	vcipher		$out6,$out6,v25
1587	vcipher		$out7,$out7,v25
1588
1589	and		r0,r0,r11
1590	addi		$key_,$sp,$FRAME+15	# rewind $key_
1591	vcipher		$out0,$out0,v26
1592	vcipher		$out1,$out1,v26
1593	vcipher		$out2,$out2,v26
1594	vcipher		$out3,$out3,v26
1595	vcipher		$out4,$out4,v26
1596	vcipher		$out5,$out5,v26
1597	vcipher		$out6,$out6,v26
1598	vcipher		$out7,$out7,v26
1599	lvx		v24,$x00,$key_		# re-pre-load round[1]
1600
1601	subic		$len,$len,129		# $len-=129
1602	vcipher		$out0,$out0,v27
1603	addi		$len,$len,1		# $len-=128 really
1604	vcipher		$out1,$out1,v27
1605	vcipher		$out2,$out2,v27
1606	vcipher		$out3,$out3,v27
1607	vcipher		$out4,$out4,v27
1608	vcipher		$out5,$out5,v27
1609	vcipher		$out6,$out6,v27
1610	vcipher		$out7,$out7,v27
1611	lvx		v25,$x10,$key_		# re-pre-load round[2]
1612
1613	vcipher		$out0,$out0,v28
1614	 lvx_u		$in0,$x00,$inp		# load input
1615	vcipher		$out1,$out1,v28
1616	 lvx_u		$in1,$x10,$inp
1617	vcipher		$out2,$out2,v28
1618	 lvx_u		$in2,$x20,$inp
1619	vcipher		$out3,$out3,v28
1620	 lvx_u		$in3,$x30,$inp
1621	vcipher		$out4,$out4,v28
1622	 lvx_u		$in4,$x40,$inp
1623	vcipher		$out5,$out5,v28
1624	 lvx_u		$in5,$x50,$inp
1625	vcipher		$out6,$out6,v28
1626	 lvx_u		$in6,$x60,$inp
1627	vcipher		$out7,$out7,v28
1628	 lvx_u		$in7,$x70,$inp
1629	 addi		$inp,$inp,0x80
1630
1631	vcipher		$out0,$out0,v29
1632	 le?vperm	$in0,$in0,$in0,$inpperm
1633	vcipher		$out1,$out1,v29
1634	 le?vperm	$in1,$in1,$in1,$inpperm
1635	vcipher		$out2,$out2,v29
1636	 le?vperm	$in2,$in2,$in2,$inpperm
1637	vcipher		$out3,$out3,v29
1638	 le?vperm	$in3,$in3,$in3,$inpperm
1639	vcipher		$out4,$out4,v29
1640	 le?vperm	$in4,$in4,$in4,$inpperm
1641	vcipher		$out5,$out5,v29
1642	 le?vperm	$in5,$in5,$in5,$inpperm
1643	vcipher		$out6,$out6,v29
1644	 le?vperm	$in6,$in6,$in6,$inpperm
1645	vcipher		$out7,$out7,v29
1646	 le?vperm	$in7,$in7,$in7,$inpperm
1647
1648	add		$inp,$inp,r0		# $inp is adjusted in such
1649						# way that at exit from the
1650						# loop inX-in7 are loaded
1651						# with last "words"
1652	subfe.		r0,r0,r0		# borrow?-1:0
1653	vcipher		$out0,$out0,v30
1654	 vxor		$in0,$in0,v31		# xor with last round key
1655	vcipher		$out1,$out1,v30
1656	 vxor		$in1,$in1,v31
1657	vcipher		$out2,$out2,v30
1658	 vxor		$in2,$in2,v31
1659	vcipher		$out3,$out3,v30
1660	 vxor		$in3,$in3,v31
1661	vcipher		$out4,$out4,v30
1662	 vxor		$in4,$in4,v31
1663	vcipher		$out5,$out5,v30
1664	 vxor		$in5,$in5,v31
1665	vcipher		$out6,$out6,v30
1666	 vxor		$in6,$in6,v31
1667	vcipher		$out7,$out7,v30
1668	 vxor		$in7,$in7,v31
1669
1670	bne		Lctr32_enc8x_break	# did $len-129 borrow?
1671
1672	vcipherlast	$in0,$out0,$in0
1673	vcipherlast	$in1,$out1,$in1
1674	 vadduqm	$out1,$ivec,$one	# counter values ...
1675	vcipherlast	$in2,$out2,$in2
1676	 vadduqm	$out2,$ivec,$two
1677	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1678	vcipherlast	$in3,$out3,$in3
1679	 vadduqm	$out3,$out1,$two
1680	 vxor		$out1,$out1,$rndkey0
1681	vcipherlast	$in4,$out4,$in4
1682	 vadduqm	$out4,$out2,$two
1683	 vxor		$out2,$out2,$rndkey0
1684	vcipherlast	$in5,$out5,$in5
1685	 vadduqm	$out5,$out3,$two
1686	 vxor		$out3,$out3,$rndkey0
1687	vcipherlast	$in6,$out6,$in6
1688	 vadduqm	$out6,$out4,$two
1689	 vxor		$out4,$out4,$rndkey0
1690	vcipherlast	$in7,$out7,$in7
1691	 vadduqm	$out7,$out5,$two
1692	 vxor		$out5,$out5,$rndkey0
1693	le?vperm	$in0,$in0,$in0,$inpperm
1694	 vadduqm	$ivec,$out6,$two	# next counter value
1695	 vxor		$out6,$out6,$rndkey0
1696	le?vperm	$in1,$in1,$in1,$inpperm
1697	 vxor		$out7,$out7,$rndkey0
1698	mtctr		$rounds
1699
1700	 vcipher	$out0,$out0,v24
1701	stvx_u		$in0,$x00,$out
1702	le?vperm	$in2,$in2,$in2,$inpperm
1703	 vcipher	$out1,$out1,v24
1704	stvx_u		$in1,$x10,$out
1705	le?vperm	$in3,$in3,$in3,$inpperm
1706	 vcipher	$out2,$out2,v24
1707	stvx_u		$in2,$x20,$out
1708	le?vperm	$in4,$in4,$in4,$inpperm
1709	 vcipher	$out3,$out3,v24
1710	stvx_u		$in3,$x30,$out
1711	le?vperm	$in5,$in5,$in5,$inpperm
1712	 vcipher	$out4,$out4,v24
1713	stvx_u		$in4,$x40,$out
1714	le?vperm	$in6,$in6,$in6,$inpperm
1715	 vcipher	$out5,$out5,v24
1716	stvx_u		$in5,$x50,$out
1717	le?vperm	$in7,$in7,$in7,$inpperm
1718	 vcipher	$out6,$out6,v24
1719	stvx_u		$in6,$x60,$out
1720	 vcipher	$out7,$out7,v24
1721	stvx_u		$in7,$x70,$out
1722	addi		$out,$out,0x80
1723
1724	b		Loop_ctr32_enc8x_middle
1725
1726.align	5
1727Lctr32_enc8x_break:
1728	cmpwi		$len,-0x60
1729	blt		Lctr32_enc8x_one
1730	nop
1731	beq		Lctr32_enc8x_two
1732	cmpwi		$len,-0x40
1733	blt		Lctr32_enc8x_three
1734	nop
1735	beq		Lctr32_enc8x_four
1736	cmpwi		$len,-0x20
1737	blt		Lctr32_enc8x_five
1738	nop
1739	beq		Lctr32_enc8x_six
1740	cmpwi		$len,0x00
1741	blt		Lctr32_enc8x_seven
1742
1743Lctr32_enc8x_eight:
1744	vcipherlast	$out0,$out0,$in0
1745	vcipherlast	$out1,$out1,$in1
1746	vcipherlast	$out2,$out2,$in2
1747	vcipherlast	$out3,$out3,$in3
1748	vcipherlast	$out4,$out4,$in4
1749	vcipherlast	$out5,$out5,$in5
1750	vcipherlast	$out6,$out6,$in6
1751	vcipherlast	$out7,$out7,$in7
1752
1753	le?vperm	$out0,$out0,$out0,$inpperm
1754	le?vperm	$out1,$out1,$out1,$inpperm
1755	stvx_u		$out0,$x00,$out
1756	le?vperm	$out2,$out2,$out2,$inpperm
1757	stvx_u		$out1,$x10,$out
1758	le?vperm	$out3,$out3,$out3,$inpperm
1759	stvx_u		$out2,$x20,$out
1760	le?vperm	$out4,$out4,$out4,$inpperm
1761	stvx_u		$out3,$x30,$out
1762	le?vperm	$out5,$out5,$out5,$inpperm
1763	stvx_u		$out4,$x40,$out
1764	le?vperm	$out6,$out6,$out6,$inpperm
1765	stvx_u		$out5,$x50,$out
1766	le?vperm	$out7,$out7,$out7,$inpperm
1767	stvx_u		$out6,$x60,$out
1768	stvx_u		$out7,$x70,$out
1769	addi		$out,$out,0x80
1770	b		Lctr32_enc8x_done
1771
1772.align	5
1773Lctr32_enc8x_seven:
1774	vcipherlast	$out0,$out0,$in1
1775	vcipherlast	$out1,$out1,$in2
1776	vcipherlast	$out2,$out2,$in3
1777	vcipherlast	$out3,$out3,$in4
1778	vcipherlast	$out4,$out4,$in5
1779	vcipherlast	$out5,$out5,$in6
1780	vcipherlast	$out6,$out6,$in7
1781
1782	le?vperm	$out0,$out0,$out0,$inpperm
1783	le?vperm	$out1,$out1,$out1,$inpperm
1784	stvx_u		$out0,$x00,$out
1785	le?vperm	$out2,$out2,$out2,$inpperm
1786	stvx_u		$out1,$x10,$out
1787	le?vperm	$out3,$out3,$out3,$inpperm
1788	stvx_u		$out2,$x20,$out
1789	le?vperm	$out4,$out4,$out4,$inpperm
1790	stvx_u		$out3,$x30,$out
1791	le?vperm	$out5,$out5,$out5,$inpperm
1792	stvx_u		$out4,$x40,$out
1793	le?vperm	$out6,$out6,$out6,$inpperm
1794	stvx_u		$out5,$x50,$out
1795	stvx_u		$out6,$x60,$out
1796	addi		$out,$out,0x70
1797	b		Lctr32_enc8x_done
1798
1799.align	5
1800Lctr32_enc8x_six:
1801	vcipherlast	$out0,$out0,$in2
1802	vcipherlast	$out1,$out1,$in3
1803	vcipherlast	$out2,$out2,$in4
1804	vcipherlast	$out3,$out3,$in5
1805	vcipherlast	$out4,$out4,$in6
1806	vcipherlast	$out5,$out5,$in7
1807
1808	le?vperm	$out0,$out0,$out0,$inpperm
1809	le?vperm	$out1,$out1,$out1,$inpperm
1810	stvx_u		$out0,$x00,$out
1811	le?vperm	$out2,$out2,$out2,$inpperm
1812	stvx_u		$out1,$x10,$out
1813	le?vperm	$out3,$out3,$out3,$inpperm
1814	stvx_u		$out2,$x20,$out
1815	le?vperm	$out4,$out4,$out4,$inpperm
1816	stvx_u		$out3,$x30,$out
1817	le?vperm	$out5,$out5,$out5,$inpperm
1818	stvx_u		$out4,$x40,$out
1819	stvx_u		$out5,$x50,$out
1820	addi		$out,$out,0x60
1821	b		Lctr32_enc8x_done
1822
1823.align	5
1824Lctr32_enc8x_five:
1825	vcipherlast	$out0,$out0,$in3
1826	vcipherlast	$out1,$out1,$in4
1827	vcipherlast	$out2,$out2,$in5
1828	vcipherlast	$out3,$out3,$in6
1829	vcipherlast	$out4,$out4,$in7
1830
1831	le?vperm	$out0,$out0,$out0,$inpperm
1832	le?vperm	$out1,$out1,$out1,$inpperm
1833	stvx_u		$out0,$x00,$out
1834	le?vperm	$out2,$out2,$out2,$inpperm
1835	stvx_u		$out1,$x10,$out
1836	le?vperm	$out3,$out3,$out3,$inpperm
1837	stvx_u		$out2,$x20,$out
1838	le?vperm	$out4,$out4,$out4,$inpperm
1839	stvx_u		$out3,$x30,$out
1840	stvx_u		$out4,$x40,$out
1841	addi		$out,$out,0x50
1842	b		Lctr32_enc8x_done
1843
1844.align	5
1845Lctr32_enc8x_four:
1846	vcipherlast	$out0,$out0,$in4
1847	vcipherlast	$out1,$out1,$in5
1848	vcipherlast	$out2,$out2,$in6
1849	vcipherlast	$out3,$out3,$in7
1850
1851	le?vperm	$out0,$out0,$out0,$inpperm
1852	le?vperm	$out1,$out1,$out1,$inpperm
1853	stvx_u		$out0,$x00,$out
1854	le?vperm	$out2,$out2,$out2,$inpperm
1855	stvx_u		$out1,$x10,$out
1856	le?vperm	$out3,$out3,$out3,$inpperm
1857	stvx_u		$out2,$x20,$out
1858	stvx_u		$out3,$x30,$out
1859	addi		$out,$out,0x40
1860	b		Lctr32_enc8x_done
1861
1862.align	5
1863Lctr32_enc8x_three:
1864	vcipherlast	$out0,$out0,$in5
1865	vcipherlast	$out1,$out1,$in6
1866	vcipherlast	$out2,$out2,$in7
1867
1868	le?vperm	$out0,$out0,$out0,$inpperm
1869	le?vperm	$out1,$out1,$out1,$inpperm
1870	stvx_u		$out0,$x00,$out
1871	le?vperm	$out2,$out2,$out2,$inpperm
1872	stvx_u		$out1,$x10,$out
1873	stvx_u		$out2,$x20,$out
1874	addi		$out,$out,0x30
1875	b		Lctr32_enc8x_done
1876
1877.align	5
1878Lctr32_enc8x_two:
1879	vcipherlast	$out0,$out0,$in6
1880	vcipherlast	$out1,$out1,$in7
1881
1882	le?vperm	$out0,$out0,$out0,$inpperm
1883	le?vperm	$out1,$out1,$out1,$inpperm
1884	stvx_u		$out0,$x00,$out
1885	stvx_u		$out1,$x10,$out
1886	addi		$out,$out,0x20
1887	b		Lctr32_enc8x_done
1888
1889.align	5
1890Lctr32_enc8x_one:
1891	vcipherlast	$out0,$out0,$in7
1892
1893	le?vperm	$out0,$out0,$out0,$inpperm
1894	stvx_u		$out0,0,$out
1895	addi		$out,$out,0x10
1896
1897Lctr32_enc8x_done:
1898	li		r10,`$FRAME+15`
1899	li		r11,`$FRAME+31`
1900	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1901	addi		r10,r10,32
1902	stvx		$inpperm,r11,$sp
1903	addi		r11,r11,32
1904	stvx		$inpperm,r10,$sp
1905	addi		r10,r10,32
1906	stvx		$inpperm,r11,$sp
1907	addi		r11,r11,32
1908	stvx		$inpperm,r10,$sp
1909	addi		r10,r10,32
1910	stvx		$inpperm,r11,$sp
1911	addi		r11,r11,32
1912	stvx		$inpperm,r10,$sp
1913	addi		r10,r10,32
1914	stvx		$inpperm,r11,$sp
1915	addi		r11,r11,32
1916
1917	mtspr		256,$vrsave
1918	lvx		v20,r10,$sp		# ABI says so
1919	addi		r10,r10,32
1920	lvx		v21,r11,$sp
1921	addi		r11,r11,32
1922	lvx		v22,r10,$sp
1923	addi		r10,r10,32
1924	lvx		v23,r11,$sp
1925	addi		r11,r11,32
1926	lvx		v24,r10,$sp
1927	addi		r10,r10,32
1928	lvx		v25,r11,$sp
1929	addi		r11,r11,32
1930	lvx		v26,r10,$sp
1931	addi		r10,r10,32
1932	lvx		v27,r11,$sp
1933	addi		r11,r11,32
1934	lvx		v28,r10,$sp
1935	addi		r10,r10,32
1936	lvx		v29,r11,$sp
1937	addi		r11,r11,32
1938	lvx		v30,r10,$sp
1939	lvx		v31,r11,$sp
1940	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1941	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1942	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1943	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1944	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1945	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1946	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1947	blr
1948	.long		0
1949	.byte		0,12,0x14,0,0x80,6,6,0
1950	.long		0
1951.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1952___
1953}}	}}}
1954
1955#########################################################################
1956{{{	# XTS procedures						#
1957# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
1958#                             const AES_KEY *key1, const AES_KEY *key2,	#
1959#                             [const] unsigned char iv[16]);		#
1960# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
1961# input tweak value is assumed to be encrypted already, and last tweak	#
1962# value, one suitable for consecutive call on same chunk of data, is	#
1963# written back to original buffer. In addition, in "tweak chaining"	#
1964# mode only complete input blocks are processed.			#
1965
# Register map for the XTS routines; the names are interpolated into the
# assembly heredoc that follows.
#   r3..r10 -> $inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx (before the swap
#              at the end of this group).
#   v0..v12 -> handed out in the order the names are listed.
1966my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
1967my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
1968my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
1969my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
# $taillen reuses $key2's register (r7); in the visible xts_encrypt code it
# is first written only after the last use of $key2.
1970my $taillen = $key2;
1971
# After this swap $inp names r10 and $idx names r3.  The generated entry
# code copies r3 into $inp ("mr $inp,r3") and then uses r3 for the return
# value (-1 on the early exit, 0 on the normal exit of xts_encrypt).
1972   ($inp,$idx) = ($idx,$inp);				# reassign
1973
1974$code.=<<___;
1975.globl	.${prefix}_xts_encrypt
1976	mr		$inp,r3				# reassign
1977	li		r3,-1
1978	${UCMP}i	$len,16
1979	bltlr-
1980
1981	lis		r0,0xfff0
1982	mfspr		r12,256				# save vrsave
1983	li		r11,0
1984	mtspr		256,r0
1985
1986	vspltisb	$seven,0x07			# 0x070707..07
1987	le?lvsl		$leperm,r11,r11
1988	le?vspltisb	$tmp,0x0f
1989	le?vxor		$leperm,$leperm,$seven
1990
1991	li		$idx,15
1992	lvx		$tweak,0,$ivp			# load [unaligned] iv
1993	lvsl		$inpperm,0,$ivp
1994	lvx		$inptail,$idx,$ivp
1995	le?vxor		$inpperm,$inpperm,$tmp
1996	vperm		$tweak,$tweak,$inptail,$inpperm
1997
1998	neg		r11,$inp
1999	lvsr		$inpperm,0,r11			# prepare for unaligned load
2000	lvx		$inout,0,$inp
2001	addi		$inp,$inp,15			# 15 is not typo
2002	le?vxor		$inpperm,$inpperm,$tmp
2003
2004	${UCMP}i	$key2,0				# key2==NULL?
2005	beq		Lxts_enc_no_key2
2006
2007	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2008	lwz		$rounds,240($key2)
2009	srwi		$rounds,$rounds,1
2010	subi		$rounds,$rounds,1
2011	li		$idx,16
2012
2013	lvx		$rndkey0,0,$key2
2014	lvx		$rndkey1,$idx,$key2
2015	addi		$idx,$idx,16
2016	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2017	vxor		$tweak,$tweak,$rndkey0
2018	lvx		$rndkey0,$idx,$key2
2019	addi		$idx,$idx,16
2020	mtctr		$rounds
2021
2022Ltweak_xts_enc:
2023	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2024	vcipher		$tweak,$tweak,$rndkey1
2025	lvx		$rndkey1,$idx,$key2
2026	addi		$idx,$idx,16
2027	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2028	vcipher		$tweak,$tweak,$rndkey0
2029	lvx		$rndkey0,$idx,$key2
2030	addi		$idx,$idx,16
2031	bdnz		Ltweak_xts_enc
2032
2033	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2034	vcipher		$tweak,$tweak,$rndkey1
2035	lvx		$rndkey1,$idx,$key2
2036	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2037	vcipherlast	$tweak,$tweak,$rndkey0
2038
2039	li		$ivp,0				# don't chain the tweak
2040	b		Lxts_enc
2041
2042Lxts_enc_no_key2:
2043	li		$idx,-16
2044	and		$len,$len,$idx			# in "tweak chaining"
2045							# mode only complete
2046							# blocks are processed
2047Lxts_enc:
2048	lvx		$inptail,0,$inp
2049	addi		$inp,$inp,16
2050
2051	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2052	lwz		$rounds,240($key1)
2053	srwi		$rounds,$rounds,1
2054	subi		$rounds,$rounds,1
2055	li		$idx,16
2056
2057	vslb		$eighty7,$seven,$seven		# 0x808080..80
2058	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2059	vspltisb	$tmp,1				# 0x010101..01
2060	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2061
2062	${UCMP}i	$len,96
2063	bge		_aesp8_xts_encrypt6x
2064
2065	andi.		$taillen,$len,15
2066	subic		r0,$len,32
2067	subi		$taillen,$taillen,16
2068	subfe		r0,r0,r0
2069	and		r0,r0,$taillen
2070	add		$inp,$inp,r0
2071
2072	lvx		$rndkey0,0,$key1
2073	lvx		$rndkey1,$idx,$key1
2074	addi		$idx,$idx,16
2075	vperm		$inout,$inout,$inptail,$inpperm
2076	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2077	vxor		$inout,$inout,$tweak
2078	vxor		$inout,$inout,$rndkey0
2079	lvx		$rndkey0,$idx,$key1
2080	addi		$idx,$idx,16
2081	mtctr		$rounds
2082	b		Loop_xts_enc
2083
2084.align	5
2085Loop_xts_enc:
2086	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2087	vcipher		$inout,$inout,$rndkey1
2088	lvx		$rndkey1,$idx,$key1
2089	addi		$idx,$idx,16
2090	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2091	vcipher		$inout,$inout,$rndkey0
2092	lvx		$rndkey0,$idx,$key1
2093	addi		$idx,$idx,16
2094	bdnz		Loop_xts_enc
2095
2096	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2097	vcipher		$inout,$inout,$rndkey1
2098	lvx		$rndkey1,$idx,$key1
2099	li		$idx,16
2100	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2101	vxor		$rndkey0,$rndkey0,$tweak
2102	vcipherlast	$output,$inout,$rndkey0
2103
2104	le?vperm	$tmp,$output,$output,$leperm
2105	be?nop
2106	le?stvx_u	$tmp,0,$out
2107	be?stvx_u	$output,0,$out
2108	addi		$out,$out,16
2109
2110	subic.		$len,$len,16
2111	beq		Lxts_enc_done
2112
2113	vmr		$inout,$inptail
2114	lvx		$inptail,0,$inp
2115	addi		$inp,$inp,16
2116	lvx		$rndkey0,0,$key1
2117	lvx		$rndkey1,$idx,$key1
2118	addi		$idx,$idx,16
2119
2120	subic		r0,$len,32
2121	subfe		r0,r0,r0
2122	and		r0,r0,$taillen
2123	add		$inp,$inp,r0
2124
2125	vsrab		$tmp,$tweak,$seven		# next tweak value
2126	vaddubm		$tweak,$tweak,$tweak
2127	vsldoi		$tmp,$tmp,$tmp,15
2128	vand		$tmp,$tmp,$eighty7
2129	vxor		$tweak,$tweak,$tmp
2130
2131	vperm		$inout,$inout,$inptail,$inpperm
2132	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2133	vxor		$inout,$inout,$tweak
2134	vxor		$output,$output,$rndkey0	# just in case $len<16
2135	vxor		$inout,$inout,$rndkey0
2136	lvx		$rndkey0,$idx,$key1
2137	addi		$idx,$idx,16
2138
2139	mtctr		$rounds
2140	${UCMP}i	$len,16
2141	bge		Loop_xts_enc
2142
2143	vxor		$output,$output,$tweak
2144	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2145	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2146	vspltisb	$tmp,-1
2147	vperm		$inptail,$inptail,$tmp,$inpperm
2148	vsel		$inout,$inout,$output,$inptail
2149
2150	subi		r11,$out,17
2151	subi		$out,$out,16
2152	mtctr		$len
2153	li		$len,16
2154Loop_xts_enc_steal:
2155	lbzu		r0,1(r11)
2156	stb		r0,16(r11)
2157	bdnz		Loop_xts_enc_steal
2158
2159	mtctr		$rounds
2160	b		Loop_xts_enc			# one more time...
2161
2162Lxts_enc_done:
2163	${UCMP}i	$ivp,0
2164	beq		Lxts_enc_ret
2165
2166	vsrab		$tmp,$tweak,$seven		# next tweak value
2167	vaddubm		$tweak,$tweak,$tweak
2168	vsldoi		$tmp,$tmp,$tmp,15
2169	vand		$tmp,$tmp,$eighty7
2170	vxor		$tweak,$tweak,$tmp
2171
2172	le?vperm	$tweak,$tweak,$tweak,$leperm
2173	stvx_u		$tweak,0,$ivp
2174
2175Lxts_enc_ret:
2176	mtspr		256,r12				# restore vrsave
2177	li		r3,0
2178	blr
2179	.long		0
2180	.byte		0,12,0x04,0,0x80,6,6,0
2181	.long		0
2182.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2183
# xts_decrypt entry point.
# In:  r3 is moved to $inp; the code also reads $out, $len, $key1, $key2
#      and $ivp (their register assignment is made outside this section).
# Out: r3 = -1 when $len is below 16 (nothing is processed), r3 = 0 otherwise.
# vrsave is kept in r12 for the duration and restored before returning.
# A non-NULL $key2 encrypts the loaded iv into the initial tweak and clears
# $ivp, so no tweak is stored back. A NULL $key2 uses the loaded iv as the
# tweak directly and stores the following tweak through $ivp on exit.
# When the adjusted $len is 96 or more, control goes to _aesp8_xts_decrypt6x.
2184.globl	.${prefix}_xts_decrypt
2185	mr		$inp,r3				# reassign
2186	li		r3,-1
2187	${UCMP}i	$len,16
2188	bltlr-
2189
2190	lis		r0,0xfff8
2191	mfspr		r12,256				# save vrsave
2192	li		r11,0
2193	mtspr		256,r0
2194
	# If $len is not a multiple of 16, r0 becomes 16 and one full block
	# is held back from $len for the tail handling further down.
2195	andi.		r0,$len,15
2196	neg		r0,r0
2197	andi.		r0,r0,16
2198	sub		$len,$len,r0
2199
2200	vspltisb	$seven,0x07			# 0x070707..07
2201	le?lvsl		$leperm,r11,r11
2202	le?vspltisb	$tmp,0x0f
2203	le?vxor		$leperm,$leperm,$seven
2204
2205	li		$idx,15
2206	lvx		$tweak,0,$ivp			# load [unaligned] iv
2207	lvsl		$inpperm,0,$ivp
2208	lvx		$inptail,$idx,$ivp
2209	le?vxor		$inpperm,$inpperm,$tmp
2210	vperm		$tweak,$tweak,$inptail,$inpperm
2211
2212	neg		r11,$inp
2213	lvsr		$inpperm,0,r11			# prepare for unaligned load
2214	lvx		$inout,0,$inp
2215	addi		$inp,$inp,15			# 15 is not typo
2216	le?vxor		$inpperm,$inpperm,$tmp
2217
2218	${UCMP}i	$key2,0				# key2==NULL?
2219	beq		Lxts_dec_no_key2
2220
	# key2 present: run the iv through the key2 schedule with vcipher
	# (encryption rounds) to obtain the initial tweak.
2221	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2222	lwz		$rounds,240($key2)
2223	srwi		$rounds,$rounds,1
2224	subi		$rounds,$rounds,1
2225	li		$idx,16
2226
2227	lvx		$rndkey0,0,$key2
2228	lvx		$rndkey1,$idx,$key2
2229	addi		$idx,$idx,16
2230	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2231	vxor		$tweak,$tweak,$rndkey0
2232	lvx		$rndkey0,$idx,$key2
2233	addi		$idx,$idx,16
2234	mtctr		$rounds
2235
2236Ltweak_xts_dec:
2237	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2238	vcipher		$tweak,$tweak,$rndkey1
2239	lvx		$rndkey1,$idx,$key2
2240	addi		$idx,$idx,16
2241	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2242	vcipher		$tweak,$tweak,$rndkey0
2243	lvx		$rndkey0,$idx,$key2
2244	addi		$idx,$idx,16
2245	bdnz		Ltweak_xts_dec
2246
2247	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2248	vcipher		$tweak,$tweak,$rndkey1
2249	lvx		$rndkey1,$idx,$key2
2250	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2251	vcipherlast	$tweak,$tweak,$rndkey0
2252
2253	li		$ivp,0				# don't chain the tweak
2254	b		Lxts_dec
2255
2256Lxts_dec_no_key2:
	# Round $len up to a multiple of 16.
2257	neg		$idx,$len
2258	andi.		$idx,$idx,15
2259	add		$len,$len,$idx			# in "tweak chaining"
2260							# mode only complete
2261							# blocks are processed
2262Lxts_dec:
2263	lvx		$inptail,0,$inp
2264	addi		$inp,$inp,16
2265
2266	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2267	lwz		$rounds,240($key1)
2268	srwi		$rounds,$rounds,1
2269	subi		$rounds,$rounds,1
2270	li		$idx,16
2271
2272	vslb		$eighty7,$seven,$seven		# 0x808080..80
2273	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2274	vspltisb	$tmp,1				# 0x010101..01
2275	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2276
2277	${UCMP}i	$len,96
2278	bge		_aesp8_xts_decrypt6x
2279
	# Single-block path: align the first input block, fold in the tweak
	# and round key 0, then enter the round loop (or the tail code when
	# fewer than 16 bytes remain in $len).
2280	lvx		$rndkey0,0,$key1
2281	lvx		$rndkey1,$idx,$key1
2282	addi		$idx,$idx,16
2283	vperm		$inout,$inout,$inptail,$inpperm
2284	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2285	vxor		$inout,$inout,$tweak
2286	vxor		$inout,$inout,$rndkey0
2287	lvx		$rndkey0,$idx,$key1
2288	addi		$idx,$idx,16
2289	mtctr		$rounds
2290
2291	${UCMP}i	$len,16
2292	blt		Ltail_xts_dec
2293	be?b		Loop_xts_dec
2294
2295.align	5
2296Loop_xts_dec:
2297	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2298	vncipher	$inout,$inout,$rndkey1
2299	lvx		$rndkey1,$idx,$key1
2300	addi		$idx,$idx,16
2301	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2302	vncipher	$inout,$inout,$rndkey0
2303	lvx		$rndkey0,$idx,$key1
2304	addi		$idx,$idx,16
2305	bdnz		Loop_xts_dec
2306
	# The tweak is xored into the last round key, so vncipherlast applies
	# the final round and the output tweak xor in one step.
2307	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2308	vncipher	$inout,$inout,$rndkey1
2309	lvx		$rndkey1,$idx,$key1
2310	li		$idx,16
2311	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2312	vxor		$rndkey0,$rndkey0,$tweak
2313	vncipherlast	$output,$inout,$rndkey0
2314
2315	le?vperm	$tmp,$output,$output,$leperm
2316	be?nop
2317	le?stvx_u	$tmp,0,$out
2318	be?stvx_u	$output,0,$out
2319	addi		$out,$out,16
2320
2321	subic.		$len,$len,16
2322	beq		Lxts_dec_done
2323
2324	vmr		$inout,$inptail
2325	lvx		$inptail,0,$inp
2326	addi		$inp,$inp,16
2327	lvx		$rndkey0,0,$key1
2328	lvx		$rndkey1,$idx,$key1
2329	addi		$idx,$idx,16
2330
2331	vsrab		$tmp,$tweak,$seven		# next tweak value
2332	vaddubm		$tweak,$tweak,$tweak
2333	vsldoi		$tmp,$tmp,$tmp,15
2334	vand		$tmp,$tmp,$eighty7
2335	vxor		$tweak,$tweak,$tmp
2336
2337	vperm		$inout,$inout,$inptail,$inpperm
2338	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2339	vxor		$inout,$inout,$tweak
2340	vxor		$inout,$inout,$rndkey0
2341	lvx		$rndkey0,$idx,$key1
2342	addi		$idx,$idx,16
2343
2344	mtctr		$rounds
2345	${UCMP}i	$len,16
2346	bge		Loop_xts_dec
2347
2348Ltail_xts_dec:
	# Fewer than 16 bytes remain in $len. The block already prepared in
	# $inout is decrypted with the following tweak ($tweak1); the merged
	# final block built afterwards uses the current $tweak.
2349	vsrab		$tmp,$tweak,$seven		# next tweak value
2350	vaddubm		$tweak1,$tweak,$tweak
2351	vsldoi		$tmp,$tmp,$tmp,15
2352	vand		$tmp,$tmp,$eighty7
2353	vxor		$tweak1,$tweak1,$tmp
2354
2355	subi		$inp,$inp,16
2356	add		$inp,$inp,$len
2357
	# Undo the $tweak xor applied earlier and apply $tweak1 instead.
2358	vxor		$inout,$inout,$tweak		# :-(
2359	vxor		$inout,$inout,$tweak1		# :-)
2360
2361Loop_xts_dec_short:
2362	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2363	vncipher	$inout,$inout,$rndkey1
2364	lvx		$rndkey1,$idx,$key1
2365	addi		$idx,$idx,16
2366	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2367	vncipher	$inout,$inout,$rndkey0
2368	lvx		$rndkey0,$idx,$key1
2369	addi		$idx,$idx,16
2370	bdnz		Loop_xts_dec_short
2371
2372	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2373	vncipher	$inout,$inout,$rndkey1
2374	lvx		$rndkey1,$idx,$key1
2375	li		$idx,16
2376	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2377	vxor		$rndkey0,$rndkey0,$tweak1
2378	vncipherlast	$output,$inout,$rndkey0
2379
2380	le?vperm	$tmp,$output,$output,$leperm
2381	be?nop
2382	le?stvx_u	$tmp,0,$out
2383	be?stvx_u	$output,0,$out
2384
2385	vmr		$inout,$inptail
2386	lvx		$inptail,0,$inp
2387	#addi		$inp,$inp,16
2388	lvx		$rndkey0,0,$key1
2389	lvx		$rndkey1,$idx,$key1
2390	addi		$idx,$idx,16
2391	vperm		$inout,$inout,$inptail,$inpperm
2392	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2393
	# Build a byte mask from $len and use it to merge the freshly loaded
	# input bytes with the block that was just decrypted ($output).
2394	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2395	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2396	vspltisb	$tmp,-1
2397	vperm		$inptail,$inptail,$tmp,$inpperm
2398	vsel		$inout,$inout,$output,$inptail
2399
2400	vxor		$rndkey0,$rndkey0,$tweak
2401	vxor		$inout,$inout,$rndkey0
2402	lvx		$rndkey0,$idx,$key1
2403	addi		$idx,$idx,16
2404
	# Copy $len bytes of the block just stored at $out to the 16 bytes
	# that follow it; $len is then set to 16 so that the extra pass
	# through Loop_xts_dec ends at Lxts_dec_done.
2405	subi		r11,$out,1
2406	mtctr		$len
2407	li		$len,16
2408Loop_xts_dec_steal:
2409	lbzu		r0,1(r11)
2410	stb		r0,16(r11)
2411	bdnz		Loop_xts_dec_steal
2412
2413	mtctr		$rounds
2414	b		Loop_xts_dec			# one more time...
2415
2416Lxts_dec_done:
	# Store the following tweak through $ivp unless $ivp is zero.
2417	${UCMP}i	$ivp,0
2418	beq		Lxts_dec_ret
2419
2420	vsrab		$tmp,$tweak,$seven		# next tweak value
2421	vaddubm		$tweak,$tweak,$tweak
2422	vsldoi		$tmp,$tmp,$tmp,15
2423	vand		$tmp,$tmp,$eighty7
2424	vxor		$tweak,$tweak,$tmp
2425
2426	le?vperm	$tweak,$tweak,$tweak,$leperm
2427	stvx_u		$tweak,0,$ivp
2428
2429Lxts_dec_ret:
2430	mtspr		256,r12				# restore vrsave
2431	li		r3,0
2432	blr
2433	.long		0
2434	.byte		0,12,0x04,0,0x80,6,6,0
2435	.long		0
2436.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2437___
2438#########################################################################
2439{{	# Optimized XTS procedures					#
# Register aliases for the 6x-interleaved XTS routines emitted below.
# $key_ reuses whatever register $key2 names; the 6x code points it at the
# copy of the round keys it keeps on the stack.
2440my $key_=$key2;
# GPRs holding the constant index offsets 0x10..0x70 (loaded with li in the
# 6x prologue); $x00 is r0, or a literal 0 for the osx flavour.
2441my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2442    $x00=0 if ($flavour =~ /osx/);
# Six input blocks, six in-flight output blocks and six tweaks.
2443my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
2444my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2445my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2446my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
2447			# v26-v31 last 6 round keys
2448my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
# $taillen shares the register of $x70: the 0x70 offset is used while loading
# the key schedule, after which the register is overwritten with $len & 15.
2449my $taillen=$x70;
2450
# _aesp8_xts_encrypt6x: six-blocks-at-a-time XTS encryption body.
# It is reached by a branch (not a call) and continues with the caller's
# register state: $inout/$inptail/$inpperm hold the first, still unaligned,
# input block, $tweak the current tweak, $seven/$eighty7 the tweak-update
# constants and $rounds the loop count derived from the key schedule.
# The routine
#   - allocates a frame of FRAME+21*16+6*SIZE_T bytes, saving v20-v31,
#     r26-r31, vrsave and LR (LR also stays in r11 until the epilogue);
#   - copies the leading round keys, aligned, to the stack at sp+FRAME+15 and
#     keeps the trailing ones in v26-v31;
#   - keeps six tweaks in twk0..twk5, each pre-xored with round key 0, and
#     v31 pre-xored with round key 0 as well, so that (twkN xor v31) can be
#     fed straight to vcipherlast;
#   - processes 96 bytes per main-loop iteration, then dispatches on the
#     remaining whole blocks (zero to five) and on taillen for the partial
#     final block ("stealing");
#   - on exit stores the next tweak through ivp unless ivp is zero, wipes the
#     stack copy of the round keys, restores the saved registers and returns
#     with blr.
2451$code.=<<___;
2452.align	5
2453_aesp8_xts_encrypt6x:
2454	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2455	mflr		r11
2456	li		r7,`$FRAME+8*16+15`
2457	li		r3,`$FRAME+8*16+31`
2458	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2459	stvx		v20,r7,$sp		# ABI says so
2460	addi		r7,r7,32
2461	stvx		v21,r3,$sp
2462	addi		r3,r3,32
2463	stvx		v22,r7,$sp
2464	addi		r7,r7,32
2465	stvx		v23,r3,$sp
2466	addi		r3,r3,32
2467	stvx		v24,r7,$sp
2468	addi		r7,r7,32
2469	stvx		v25,r3,$sp
2470	addi		r3,r3,32
2471	stvx		v26,r7,$sp
2472	addi		r7,r7,32
2473	stvx		v27,r3,$sp
2474	addi		r3,r3,32
2475	stvx		v28,r7,$sp
2476	addi		r7,r7,32
2477	stvx		v29,r3,$sp
2478	addi		r3,r3,32
2479	stvx		v30,r7,$sp
2480	stvx		v31,r3,$sp
2481	li		r0,-1
2482	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
2483	li		$x10,0x10
2484	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2485	li		$x20,0x20
2486	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2487	li		$x30,0x30
2488	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2489	li		$x40,0x40
2490	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2491	li		$x50,0x50
2492	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2493	li		$x60,0x60
2494	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2495	li		$x70,0x70
2496	mtspr		256,r0
2497
2498	subi		$rounds,$rounds,3	# -4 in total
2499
2500	lvx		$rndkey0,$x00,$key1	# load key schedule
2501	lvx		v30,$x10,$key1
2502	addi		$key1,$key1,0x20
2503	lvx		v31,$x00,$key1
2504	?vperm		$rndkey0,$rndkey0,v30,$keyperm
2505	addi		$key_,$sp,$FRAME+15
2506	mtctr		$rounds
2507
2508Load_xts_enc_key:
2509	?vperm		v24,v30,v31,$keyperm
2510	lvx		v30,$x10,$key1
2511	addi		$key1,$key1,0x20
2512	stvx		v24,$x00,$key_		# off-load round[1]
2513	?vperm		v25,v31,v30,$keyperm
2514	lvx		v31,$x00,$key1
2515	stvx		v25,$x10,$key_		# off-load round[2]
2516	addi		$key_,$key_,0x20
2517	bdnz		Load_xts_enc_key
2518
2519	lvx		v26,$x10,$key1
2520	?vperm		v24,v30,v31,$keyperm
2521	lvx		v27,$x20,$key1
2522	stvx		v24,$x00,$key_		# off-load round[3]
2523	?vperm		v25,v31,v26,$keyperm
2524	lvx		v28,$x30,$key1
2525	stvx		v25,$x10,$key_		# off-load round[4]
2526	addi		$key_,$sp,$FRAME+15	# rewind $key_
2527	?vperm		v26,v26,v27,$keyperm
2528	lvx		v29,$x40,$key1
2529	?vperm		v27,v27,v28,$keyperm
2530	lvx		v30,$x50,$key1
2531	?vperm		v28,v28,v29,$keyperm
2532	lvx		v31,$x60,$key1
2533	?vperm		v29,v29,v30,$keyperm
2534	lvx		$twk5,$x70,$key1	# borrow $twk5
2535	?vperm		v30,v30,v31,$keyperm
2536	lvx		v24,$x00,$key_		# pre-load round[1]
2537	?vperm		v31,v31,$twk5,$keyperm
2538	lvx		v25,$x10,$key_		# pre-load round[2]
2539
	# Prime the pipeline: six tweaks (each xored with round key 0) and the
	# first six input blocks, xored with their tweak, go into out0..out5.
	# Instructions indented by one extra space belong to the data stream
	# interleaved with the tweak computation.
2540	 vperm		$in0,$inout,$inptail,$inpperm
2541	 subi		$inp,$inp,31		# undo "caller"
2542	vxor		$twk0,$tweak,$rndkey0
2543	vsrab		$tmp,$tweak,$seven	# next tweak value
2544	vaddubm		$tweak,$tweak,$tweak
2545	vsldoi		$tmp,$tmp,$tmp,15
2546	vand		$tmp,$tmp,$eighty7
2547	 vxor		$out0,$in0,$twk0
2548	vxor		$tweak,$tweak,$tmp
2549
2550	 lvx_u		$in1,$x10,$inp
2551	vxor		$twk1,$tweak,$rndkey0
2552	vsrab		$tmp,$tweak,$seven	# next tweak value
2553	vaddubm		$tweak,$tweak,$tweak
2554	vsldoi		$tmp,$tmp,$tmp,15
2555	 le?vperm	$in1,$in1,$in1,$leperm
2556	vand		$tmp,$tmp,$eighty7
2557	 vxor		$out1,$in1,$twk1
2558	vxor		$tweak,$tweak,$tmp
2559
2560	 lvx_u		$in2,$x20,$inp
2561	 andi.		$taillen,$len,15
2562	vxor		$twk2,$tweak,$rndkey0
2563	vsrab		$tmp,$tweak,$seven	# next tweak value
2564	vaddubm		$tweak,$tweak,$tweak
2565	vsldoi		$tmp,$tmp,$tmp,15
2566	 le?vperm	$in2,$in2,$in2,$leperm
2567	vand		$tmp,$tmp,$eighty7
2568	 vxor		$out2,$in2,$twk2
2569	vxor		$tweak,$tweak,$tmp
2570
2571	 lvx_u		$in3,$x30,$inp
2572	 sub		$len,$len,$taillen
2573	vxor		$twk3,$tweak,$rndkey0
2574	vsrab		$tmp,$tweak,$seven	# next tweak value
2575	vaddubm		$tweak,$tweak,$tweak
2576	vsldoi		$tmp,$tmp,$tmp,15
2577	 le?vperm	$in3,$in3,$in3,$leperm
2578	vand		$tmp,$tmp,$eighty7
2579	 vxor		$out3,$in3,$twk3
2580	vxor		$tweak,$tweak,$tmp
2581
2582	 lvx_u		$in4,$x40,$inp
2583	 subi		$len,$len,0x60
2584	vxor		$twk4,$tweak,$rndkey0
2585	vsrab		$tmp,$tweak,$seven	# next tweak value
2586	vaddubm		$tweak,$tweak,$tweak
2587	vsldoi		$tmp,$tmp,$tmp,15
2588	 le?vperm	$in4,$in4,$in4,$leperm
2589	vand		$tmp,$tmp,$eighty7
2590	 vxor		$out4,$in4,$twk4
2591	vxor		$tweak,$tweak,$tmp
2592
2593	 lvx_u		$in5,$x50,$inp
2594	 addi		$inp,$inp,0x60
2595	vxor		$twk5,$tweak,$rndkey0
2596	vsrab		$tmp,$tweak,$seven	# next tweak value
2597	vaddubm		$tweak,$tweak,$tweak
2598	vsldoi		$tmp,$tmp,$tmp,15
2599	 le?vperm	$in5,$in5,$in5,$leperm
2600	vand		$tmp,$tmp,$eighty7
2601	 vxor		$out5,$in5,$twk5
2602	vxor		$tweak,$tweak,$tmp
2603
	# The tweaks carry round key 0; xoring it into v31 as well makes it
	# cancel when a tweak is later combined with v31 for vcipherlast.
2604	vxor		v31,v31,$rndkey0
2605	mtctr		$rounds
2606	b		Loop_xts_enc6x
2607
2608.align	5
2609Loop_xts_enc6x:
2610	vcipher		$out0,$out0,v24
2611	vcipher		$out1,$out1,v24
2612	vcipher		$out2,$out2,v24
2613	vcipher		$out3,$out3,v24
2614	vcipher		$out4,$out4,v24
2615	vcipher		$out5,$out5,v24
2616	lvx		v24,$x20,$key_		# round[3]
2617	addi		$key_,$key_,0x20
2618
2619	vcipher		$out0,$out0,v25
2620	vcipher		$out1,$out1,v25
2621	vcipher		$out2,$out2,v25
2622	vcipher		$out3,$out3,v25
2623	vcipher		$out4,$out4,v25
2624	vcipher		$out5,$out5,v25
2625	lvx		v25,$x10,$key_		# round[4]
2626	bdnz		Loop_xts_enc6x
2627
	# Remaining rounds, interleaved with: the six tweaks for the next
	# iteration, the last-round inputs (twkN xor v31) parked in in0..in5,
	# and the borrow test on $len whose CR0 result is consumed by the
	# beq at the bottom of the loop.
2628	subic		$len,$len,96		# $len-=96
2629	 vxor		$in0,$twk0,v31		# xor with last round key
2630	vcipher		$out0,$out0,v24
2631	vcipher		$out1,$out1,v24
2632	 vsrab		$tmp,$tweak,$seven	# next tweak value
2633	 vxor		$twk0,$tweak,$rndkey0
2634	 vaddubm	$tweak,$tweak,$tweak
2635	vcipher		$out2,$out2,v24
2636	vcipher		$out3,$out3,v24
2637	 vsldoi		$tmp,$tmp,$tmp,15
2638	vcipher		$out4,$out4,v24
2639	vcipher		$out5,$out5,v24
2640
2641	subfe.		r0,r0,r0		# borrow?-1:0
2642	 vand		$tmp,$tmp,$eighty7
2643	vcipher		$out0,$out0,v25
2644	vcipher		$out1,$out1,v25
2645	 vxor		$tweak,$tweak,$tmp
2646	vcipher		$out2,$out2,v25
2647	vcipher		$out3,$out3,v25
2648	 vxor		$in1,$twk1,v31
2649	 vsrab		$tmp,$tweak,$seven	# next tweak value
2650	 vxor		$twk1,$tweak,$rndkey0
2651	vcipher		$out4,$out4,v25
2652	vcipher		$out5,$out5,v25
2653
2654	and		r0,r0,$len
2655	 vaddubm	$tweak,$tweak,$tweak
2656	 vsldoi		$tmp,$tmp,$tmp,15
2657	vcipher		$out0,$out0,v26
2658	vcipher		$out1,$out1,v26
2659	 vand		$tmp,$tmp,$eighty7
2660	vcipher		$out2,$out2,v26
2661	vcipher		$out3,$out3,v26
2662	 vxor		$tweak,$tweak,$tmp
2663	vcipher		$out4,$out4,v26
2664	vcipher		$out5,$out5,v26
2665
2666	add		$inp,$inp,r0		# $inp is adjusted in such
2667						# way that at exit from the
2668						# loop inX-in5 are loaded
2669						# with last "words"
2670	 vxor		$in2,$twk2,v31
2671	 vsrab		$tmp,$tweak,$seven	# next tweak value
2672	 vxor		$twk2,$tweak,$rndkey0
2673	 vaddubm	$tweak,$tweak,$tweak
2674	vcipher		$out0,$out0,v27
2675	vcipher		$out1,$out1,v27
2676	 vsldoi		$tmp,$tmp,$tmp,15
2677	vcipher		$out2,$out2,v27
2678	vcipher		$out3,$out3,v27
2679	 vand		$tmp,$tmp,$eighty7
2680	vcipher		$out4,$out4,v27
2681	vcipher		$out5,$out5,v27
2682
2683	addi		$key_,$sp,$FRAME+15	# rewind $key_
2684	 vxor		$tweak,$tweak,$tmp
2685	vcipher		$out0,$out0,v28
2686	vcipher		$out1,$out1,v28
2687	 vxor		$in3,$twk3,v31
2688	 vsrab		$tmp,$tweak,$seven	# next tweak value
2689	 vxor		$twk3,$tweak,$rndkey0
2690	vcipher		$out2,$out2,v28
2691	vcipher		$out3,$out3,v28
2692	 vaddubm	$tweak,$tweak,$tweak
2693	 vsldoi		$tmp,$tmp,$tmp,15
2694	vcipher		$out4,$out4,v28
2695	vcipher		$out5,$out5,v28
2696	lvx		v24,$x00,$key_		# re-pre-load round[1]
2697	 vand		$tmp,$tmp,$eighty7
2698
2699	vcipher		$out0,$out0,v29
2700	vcipher		$out1,$out1,v29
2701	 vxor		$tweak,$tweak,$tmp
2702	vcipher		$out2,$out2,v29
2703	vcipher		$out3,$out3,v29
2704	 vxor		$in4,$twk4,v31
2705	 vsrab		$tmp,$tweak,$seven	# next tweak value
2706	 vxor		$twk4,$tweak,$rndkey0
2707	vcipher		$out4,$out4,v29
2708	vcipher		$out5,$out5,v29
2709	lvx		v25,$x10,$key_		# re-pre-load round[2]
2710	 vaddubm	$tweak,$tweak,$tweak
2711	 vsldoi		$tmp,$tmp,$tmp,15
2712
2713	vcipher		$out0,$out0,v30
2714	vcipher		$out1,$out1,v30
2715	 vand		$tmp,$tmp,$eighty7
2716	vcipher		$out2,$out2,v30
2717	vcipher		$out3,$out3,v30
2718	 vxor		$tweak,$tweak,$tmp
2719	vcipher		$out4,$out4,v30
2720	vcipher		$out5,$out5,v30
2721	 vxor		$in5,$twk5,v31
2722	 vsrab		$tmp,$tweak,$seven	# next tweak value
2723	 vxor		$twk5,$tweak,$rndkey0
2724
2725	vcipherlast	$out0,$out0,$in0
2726	 lvx_u		$in0,$x00,$inp		# load next input block
2727	 vaddubm	$tweak,$tweak,$tweak
2728	 vsldoi		$tmp,$tmp,$tmp,15
2729	vcipherlast	$out1,$out1,$in1
2730	 lvx_u		$in1,$x10,$inp
2731	vcipherlast	$out2,$out2,$in2
2732	 le?vperm	$in0,$in0,$in0,$leperm
2733	 lvx_u		$in2,$x20,$inp
2734	 vand		$tmp,$tmp,$eighty7
2735	vcipherlast	$out3,$out3,$in3
2736	 le?vperm	$in1,$in1,$in1,$leperm
2737	 lvx_u		$in3,$x30,$inp
2738	vcipherlast	$out4,$out4,$in4
2739	 le?vperm	$in2,$in2,$in2,$leperm
2740	 lvx_u		$in4,$x40,$inp
2741	 vxor		$tweak,$tweak,$tmp
2742	vcipherlast	$tmp,$out5,$in5		# last block might be needed
2743						# in stealing mode
2744	 le?vperm	$in3,$in3,$in3,$leperm
2745	 lvx_u		$in5,$x50,$inp
2746	 addi		$inp,$inp,0x60
2747	 le?vperm	$in4,$in4,$in4,$leperm
2748	 le?vperm	$in5,$in5,$in5,$leperm
2749
2750	le?vperm	$out0,$out0,$out0,$leperm
2751	le?vperm	$out1,$out1,$out1,$leperm
2752	stvx_u		$out0,$x00,$out		# store output
2753	 vxor		$out0,$in0,$twk0
2754	le?vperm	$out2,$out2,$out2,$leperm
2755	stvx_u		$out1,$x10,$out
2756	 vxor		$out1,$in1,$twk1
2757	le?vperm	$out3,$out3,$out3,$leperm
2758	stvx_u		$out2,$x20,$out
2759	 vxor		$out2,$in2,$twk2
2760	le?vperm	$out4,$out4,$out4,$leperm
2761	stvx_u		$out3,$x30,$out
2762	 vxor		$out3,$in3,$twk3
2763	le?vperm	$out5,$tmp,$tmp,$leperm
2764	stvx_u		$out4,$x40,$out
2765	 vxor		$out4,$in4,$twk4
2766	le?stvx_u	$out5,$x50,$out
2767	be?stvx_u	$tmp, $x50,$out
2768	 vxor		$out5,$in5,$twk5
2769	addi		$out,$out,0x60
2770
	# CR0 still holds the result of the subfe. above (no record-form
	# instruction was executed in between): eq means r0 was 0, i.e. the
	# subtraction did not borrow and another six blocks follow.
2771	mtctr		$rounds
2772	beq		Loop_xts_enc6x		# did $len-=96 borrow?
2773
	# Add the 0x60 back and dispatch on the number of whole blocks left.
2774	addic.		$len,$len,0x60
2775	beq		Lxts_enc6x_zero
2776	cmpwi		$len,0x20
2777	blt		Lxts_enc6x_one
2778	nop
2779	beq		Lxts_enc6x_two
2780	cmpwi		$len,0x40
2781	blt		Lxts_enc6x_three
2782	nop
2783	beq		Lxts_enc6x_four
2784
	# In each of the N-block tails below the last N loaded inputs are
	# paired with twk0.. in order, unused lanes are zeroed, and
	# _aesp8_xts_enc5x does the rounds. The bne after the call tests CR0
	# as left by the compare of taillen with 0 inside that helper.
2785Lxts_enc6x_five:
2786	vxor		$out0,$in1,$twk0
2787	vxor		$out1,$in2,$twk1
2788	vxor		$out2,$in3,$twk2
2789	vxor		$out3,$in4,$twk3
2790	vxor		$out4,$in5,$twk4
2791
2792	bl		_aesp8_xts_enc5x
2793
2794	le?vperm	$out0,$out0,$out0,$leperm
2795	vmr		$twk0,$twk5		# unused tweak
2796	le?vperm	$out1,$out1,$out1,$leperm
2797	stvx_u		$out0,$x00,$out		# store output
2798	le?vperm	$out2,$out2,$out2,$leperm
2799	stvx_u		$out1,$x10,$out
2800	le?vperm	$out3,$out3,$out3,$leperm
2801	stvx_u		$out2,$x20,$out
2802	vxor		$tmp,$out4,$twk5	# last block prep for stealing
2803	le?vperm	$out4,$out4,$out4,$leperm
2804	stvx_u		$out3,$x30,$out
2805	stvx_u		$out4,$x40,$out
2806	addi		$out,$out,0x50
2807	bne		Lxts_enc6x_steal
2808	b		Lxts_enc6x_done
2809
2810.align	4
2811Lxts_enc6x_four:
2812	vxor		$out0,$in2,$twk0
2813	vxor		$out1,$in3,$twk1
2814	vxor		$out2,$in4,$twk2
2815	vxor		$out3,$in5,$twk3
2816	vxor		$out4,$out4,$out4
2817
2818	bl		_aesp8_xts_enc5x
2819
2820	le?vperm	$out0,$out0,$out0,$leperm
2821	vmr		$twk0,$twk4		# unused tweak
2822	le?vperm	$out1,$out1,$out1,$leperm
2823	stvx_u		$out0,$x00,$out		# store output
2824	le?vperm	$out2,$out2,$out2,$leperm
2825	stvx_u		$out1,$x10,$out
2826	vxor		$tmp,$out3,$twk4	# last block prep for stealing
2827	le?vperm	$out3,$out3,$out3,$leperm
2828	stvx_u		$out2,$x20,$out
2829	stvx_u		$out3,$x30,$out
2830	addi		$out,$out,0x40
2831	bne		Lxts_enc6x_steal
2832	b		Lxts_enc6x_done
2833
2834.align	4
2835Lxts_enc6x_three:
2836	vxor		$out0,$in3,$twk0
2837	vxor		$out1,$in4,$twk1
2838	vxor		$out2,$in5,$twk2
2839	vxor		$out3,$out3,$out3
2840	vxor		$out4,$out4,$out4
2841
2842	bl		_aesp8_xts_enc5x
2843
2844	le?vperm	$out0,$out0,$out0,$leperm
2845	vmr		$twk0,$twk3		# unused tweak
2846	le?vperm	$out1,$out1,$out1,$leperm
2847	stvx_u		$out0,$x00,$out		# store output
2848	vxor		$tmp,$out2,$twk3	# last block prep for stealing
2849	le?vperm	$out2,$out2,$out2,$leperm
2850	stvx_u		$out1,$x10,$out
2851	stvx_u		$out2,$x20,$out
2852	addi		$out,$out,0x30
2853	bne		Lxts_enc6x_steal
2854	b		Lxts_enc6x_done
2855
2856.align	4
2857Lxts_enc6x_two:
2858	vxor		$out0,$in4,$twk0
2859	vxor		$out1,$in5,$twk1
2860	vxor		$out2,$out2,$out2
2861	vxor		$out3,$out3,$out3
2862	vxor		$out4,$out4,$out4
2863
2864	bl		_aesp8_xts_enc5x
2865
2866	le?vperm	$out0,$out0,$out0,$leperm
2867	vmr		$twk0,$twk2		# unused tweak
2868	vxor		$tmp,$out1,$twk2	# last block prep for stealing
2869	le?vperm	$out1,$out1,$out1,$leperm
2870	stvx_u		$out0,$x00,$out		# store output
2871	stvx_u		$out1,$x10,$out
2872	addi		$out,$out,0x20
2873	bne		Lxts_enc6x_steal
2874	b		Lxts_enc6x_done
2875
2876.align	4
2877Lxts_enc6x_one:
2878	vxor		$out0,$in5,$twk0
2879	nop
	# Single-block encryption. It is also re-entered from the stealing
	# code below with taillen forced to 0, so that the bne at its end
	# falls through to Lxts_enc6x_done on that second pass.
2880Loop_xts_enc1x:
2881	vcipher		$out0,$out0,v24
2882	lvx		v24,$x20,$key_		# round[3]
2883	addi		$key_,$key_,0x20
2884
2885	vcipher		$out0,$out0,v25
2886	lvx		v25,$x10,$key_		# round[4]
2887	bdnz		Loop_xts_enc1x
2888
2889	add		$inp,$inp,$taillen
2890	cmpwi		$taillen,0
2891	vcipher		$out0,$out0,v24
2892
2893	subi		$inp,$inp,16
2894	vcipher		$out0,$out0,v25
2895
2896	lvsr		$inpperm,0,$taillen
2897	vcipher		$out0,$out0,v26
2898
2899	lvx_u		$in0,0,$inp
2900	vcipher		$out0,$out0,v27
2901
2902	addi		$key_,$sp,$FRAME+15	# rewind $key_
2903	vcipher		$out0,$out0,v28
2904	lvx		v24,$x00,$key_		# re-pre-load round[1]
2905
2906	vcipher		$out0,$out0,v29
2907	lvx		v25,$x10,$key_		# re-pre-load round[2]
2908	 vxor		$twk0,$twk0,v31
2909
2910	le?vperm	$in0,$in0,$in0,$leperm
2911	vcipher		$out0,$out0,v30
2912
2913	vperm		$in0,$in0,$in0,$inpperm
2914	vcipherlast	$out0,$out0,$twk0
2915
2916	vmr		$twk0,$twk1		# unused tweak
2917	vxor		$tmp,$out0,$twk1	# last block prep for stealing
2918	le?vperm	$out0,$out0,$out0,$leperm
2919	stvx_u		$out0,$x00,$out		# store output
2920	addi		$out,$out,0x10
2921	bne		Lxts_enc6x_steal
2922	b		Lxts_enc6x_done
2923
2924.align	4
2925Lxts_enc6x_zero:
	# No whole block is left. With a non-zero taillen, load the final 16
	# input bytes and fall into the stealing code; $tmp still holds the
	# sixth output block of the last main-loop iteration.
2926	cmpwi		$taillen,0
2927	beq		Lxts_enc6x_done
2928
2929	add		$inp,$inp,$taillen
2930	subi		$inp,$inp,16
2931	lvx_u		$in0,0,$inp
2932	lvsr		$inpperm,0,$taillen	# $in5 is no more
2933	le?vperm	$in0,$in0,$in0,$leperm
2934	vperm		$in0,$in0,$in0,$inpperm
2935	vxor		$tmp,$tmp,$twk0
2936Lxts_enc6x_steal:
	# Merge the trailing input bytes with the previous output block under
	# a byte mask derived from taillen, then copy taillen bytes of the
	# previously stored block to the 16 bytes that follow it.
2937	vxor		$in0,$in0,$twk0
2938	vxor		$out0,$out0,$out0
2939	vspltisb	$out1,-1
2940	vperm		$out0,$out0,$out1,$inpperm
2941	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
2942
2943	subi		r30,$out,17
2944	subi		$out,$out,16
2945	mtctr		$taillen
2946Loop_xts_enc6x_steal:
2947	lbzu		r0,1(r30)
2948	stb		r0,16(r30)
2949	bdnz		Loop_xts_enc6x_steal
2950
2951	li		$taillen,0
2952	mtctr		$rounds
2953	b		Loop_xts_enc1x		# one more time...
2954
2955.align	4
2956Lxts_enc6x_done:
	# twk0 carries round key 0; xoring it out again yields the plain
	# tweak that is stored through $ivp (skipped when $ivp is zero).
2957	${UCMP}i	$ivp,0
2958	beq		Lxts_enc6x_ret
2959
2960	vxor		$tweak,$twk0,$rndkey0
2961	le?vperm	$tweak,$tweak,$tweak,$leperm
2962	stvx_u		$tweak,0,$ivp
2963
2964Lxts_enc6x_ret:
2965	mtlr		r11
2966	li		r10,`$FRAME+15`
2967	li		r11,`$FRAME+31`
2968	stvx		$seven,r10,$sp		# wipe copies of round keys
2969	addi		r10,r10,32
2970	stvx		$seven,r11,$sp
2971	addi		r11,r11,32
2972	stvx		$seven,r10,$sp
2973	addi		r10,r10,32
2974	stvx		$seven,r11,$sp
2975	addi		r11,r11,32
2976	stvx		$seven,r10,$sp
2977	addi		r10,r10,32
2978	stvx		$seven,r11,$sp
2979	addi		r11,r11,32
2980	stvx		$seven,r10,$sp
2981	addi		r10,r10,32
2982	stvx		$seven,r11,$sp
2983	addi		r11,r11,32
2984
2985	mtspr		256,$vrsave
2986	lvx		v20,r10,$sp		# ABI says so
2987	addi		r10,r10,32
2988	lvx		v21,r11,$sp
2989	addi		r11,r11,32
2990	lvx		v22,r10,$sp
2991	addi		r10,r10,32
2992	lvx		v23,r11,$sp
2993	addi		r11,r11,32
2994	lvx		v24,r10,$sp
2995	addi		r10,r10,32
2996	lvx		v25,r11,$sp
2997	addi		r11,r11,32
2998	lvx		v26,r10,$sp
2999	addi		r10,r10,32
3000	lvx		v27,r11,$sp
3001	addi		r11,r11,32
3002	lvx		v28,r10,$sp
3003	addi		r10,r10,32
3004	lvx		v29,r11,$sp
3005	addi		r11,r11,32
3006	lvx		v30,r10,$sp
3007	lvx		v31,r11,$sp
3008	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3009	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3010	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3011	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3012	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3013	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3014	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3015	blr
3016	.long		0
3017	.byte		0,12,0x04,1,0x80,6,6,0
3018	.long		0
3019
# _aesp8_xts_enc5x: helper called with bl from the 2..5-block tails of
# _aesp8_xts_encrypt6x. It runs all rounds on out0..out4 in parallel.
# Expected on entry: CTR loaded with the round-loop count, v24/v25
# pre-loaded, the stack key pointer rewound, out0..out4 already xored with
# their tweaks, twk0..twk4 holding those tweaks.
# Besides encrypting, it deliberately leaves behind state its callers use:
#   - CR0 from comparing taillen with 0 (tested by the bne after the call);
#   - the input pointer advanced by taillen-16, and in0 loaded from there and
#     permuted by the lvsr pattern of taillen (input for the stealing code);
#   - the stack key pointer rewound and v24/v25 re-pre-loaded;
#   - twk0 xored with v31 in place, in1..in4 overwritten with twk1..twk4
#     xored with v31 (the operands of vcipherlast).
# It returns with blr and does not touch LR itself.
3020.align	5
3021_aesp8_xts_enc5x:
3022	vcipher		$out0,$out0,v24
3023	vcipher		$out1,$out1,v24
3024	vcipher		$out2,$out2,v24
3025	vcipher		$out3,$out3,v24
3026	vcipher		$out4,$out4,v24
3027	lvx		v24,$x20,$key_		# round[3]
3028	addi		$key_,$key_,0x20
3029
3030	vcipher		$out0,$out0,v25
3031	vcipher		$out1,$out1,v25
3032	vcipher		$out2,$out2,v25
3033	vcipher		$out3,$out3,v25
3034	vcipher		$out4,$out4,v25
3035	lvx		v25,$x10,$key_		# round[4]
3036	bdnz		_aesp8_xts_enc5x
3037
	# The compare below is the last CR0 update in this helper; callers
	# branch on it after the return.
3038	add		$inp,$inp,$taillen
3039	cmpwi		$taillen,0
3040	vcipher		$out0,$out0,v24
3041	vcipher		$out1,$out1,v24
3042	vcipher		$out2,$out2,v24
3043	vcipher		$out3,$out3,v24
3044	vcipher		$out4,$out4,v24
3045
3046	subi		$inp,$inp,16
3047	vcipher		$out0,$out0,v25
3048	vcipher		$out1,$out1,v25
3049	vcipher		$out2,$out2,v25
3050	vcipher		$out3,$out3,v25
3051	vcipher		$out4,$out4,v25
3052	 vxor		$twk0,$twk0,v31
3053
3054	vcipher		$out0,$out0,v26
3055	lvsr		$inpperm,r0,$taillen	# $in5 is no more
3056	vcipher		$out1,$out1,v26
3057	vcipher		$out2,$out2,v26
3058	vcipher		$out3,$out3,v26
3059	vcipher		$out4,$out4,v26
3060	 vxor		$in1,$twk1,v31
3061
3062	vcipher		$out0,$out0,v27
3063	lvx_u		$in0,0,$inp
3064	vcipher		$out1,$out1,v27
3065	vcipher		$out2,$out2,v27
3066	vcipher		$out3,$out3,v27
3067	vcipher		$out4,$out4,v27
3068	 vxor		$in2,$twk2,v31
3069
3070	addi		$key_,$sp,$FRAME+15	# rewind $key_
3071	vcipher		$out0,$out0,v28
3072	vcipher		$out1,$out1,v28
3073	vcipher		$out2,$out2,v28
3074	vcipher		$out3,$out3,v28
3075	vcipher		$out4,$out4,v28
3076	lvx		v24,$x00,$key_		# re-pre-load round[1]
3077	 vxor		$in3,$twk3,v31
3078
3079	vcipher		$out0,$out0,v29
3080	le?vperm	$in0,$in0,$in0,$leperm
3081	vcipher		$out1,$out1,v29
3082	vcipher		$out2,$out2,v29
3083	vcipher		$out3,$out3,v29
3084	vcipher		$out4,$out4,v29
3085	lvx		v25,$x10,$key_		# re-pre-load round[2]
3086	 vxor		$in4,$twk4,v31
3087
3088	vcipher		$out0,$out0,v30
3089	vperm		$in0,$in0,$in0,$inpperm
3090	vcipher		$out1,$out1,v30
3091	vcipher		$out2,$out2,v30
3092	vcipher		$out3,$out3,v30
3093	vcipher		$out4,$out4,v30
3094
3095	vcipherlast	$out0,$out0,$twk0
3096	vcipherlast	$out1,$out1,$in1
3097	vcipherlast	$out2,$out2,$in2
3098	vcipherlast	$out3,$out3,$in3
3099	vcipherlast	$out4,$out4,$in4
3100	blr
3101        .long   	0
3102        .byte   	0,12,0x14,0,0,0,0,0
3103
3104.align	5
3105_aesp8_xts_decrypt6x:
3106	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3107	mflr		r11
3108	li		r7,`$FRAME+8*16+15`
3109	li		r3,`$FRAME+8*16+31`
3110	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3111	stvx		v20,r7,$sp		# ABI says so
3112	addi		r7,r7,32
3113	stvx		v21,r3,$sp
3114	addi		r3,r3,32
3115	stvx		v22,r7,$sp
3116	addi		r7,r7,32
3117	stvx		v23,r3,$sp
3118	addi		r3,r3,32
3119	stvx		v24,r7,$sp
3120	addi		r7,r7,32
3121	stvx		v25,r3,$sp
3122	addi		r3,r3,32
3123	stvx		v26,r7,$sp
3124	addi		r7,r7,32
3125	stvx		v27,r3,$sp
3126	addi		r3,r3,32
3127	stvx		v28,r7,$sp
3128	addi		r7,r7,32
3129	stvx		v29,r3,$sp
3130	addi		r3,r3,32
3131	stvx		v30,r7,$sp
3132	stvx		v31,r3,$sp
3133	li		r0,-1
3134	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3135	li		$x10,0x10
3136	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3137	li		$x20,0x20
3138	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3139	li		$x30,0x30
3140	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3141	li		$x40,0x40
3142	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3143	li		$x50,0x50
3144	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3145	li		$x60,0x60
3146	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3147	li		$x70,0x70
3148	mtspr		256,r0
3149
3150	subi		$rounds,$rounds,3	# -4 in total
3151
3152	lvx		$rndkey0,$x00,$key1	# load key schedule
3153	lvx		v30,$x10,$key1
3154	addi		$key1,$key1,0x20
3155	lvx		v31,$x00,$key1
3156	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3157	addi		$key_,$sp,$FRAME+15
3158	mtctr		$rounds
3159
3160Load_xts_dec_key:
3161	?vperm		v24,v30,v31,$keyperm
3162	lvx		v30,$x10,$key1
3163	addi		$key1,$key1,0x20
3164	stvx		v24,$x00,$key_		# off-load round[1]
3165	?vperm		v25,v31,v30,$keyperm
3166	lvx		v31,$x00,$key1
3167	stvx		v25,$x10,$key_		# off-load round[2]
3168	addi		$key_,$key_,0x20
3169	bdnz		Load_xts_dec_key
3170
3171	lvx		v26,$x10,$key1
3172	?vperm		v24,v30,v31,$keyperm
3173	lvx		v27,$x20,$key1
3174	stvx		v24,$x00,$key_		# off-load round[3]
3175	?vperm		v25,v31,v26,$keyperm
3176	lvx		v28,$x30,$key1
3177	stvx		v25,$x10,$key_		# off-load round[4]
3178	addi		$key_,$sp,$FRAME+15	# rewind $key_
3179	?vperm		v26,v26,v27,$keyperm
3180	lvx		v29,$x40,$key1
3181	?vperm		v27,v27,v28,$keyperm
3182	lvx		v30,$x50,$key1
3183	?vperm		v28,v28,v29,$keyperm
3184	lvx		v31,$x60,$key1
3185	?vperm		v29,v29,v30,$keyperm
3186	lvx		$twk5,$x70,$key1	# borrow $twk5
3187	?vperm		v30,v30,v31,$keyperm
3188	lvx		v24,$x00,$key_		# pre-load round[1]
3189	?vperm		v31,v31,$twk5,$keyperm
3190	lvx		v25,$x10,$key_		# pre-load round[2]
3191
3192	 vperm		$in0,$inout,$inptail,$inpperm
3193	 subi		$inp,$inp,31		# undo "caller"
3194	vxor		$twk0,$tweak,$rndkey0
3195	vsrab		$tmp,$tweak,$seven	# next tweak value
3196	vaddubm		$tweak,$tweak,$tweak
3197	vsldoi		$tmp,$tmp,$tmp,15
3198	vand		$tmp,$tmp,$eighty7
3199	 vxor		$out0,$in0,$twk0
3200	vxor		$tweak,$tweak,$tmp
3201
3202	 lvx_u		$in1,$x10,$inp
3203	vxor		$twk1,$tweak,$rndkey0
3204	vsrab		$tmp,$tweak,$seven	# next tweak value
3205	vaddubm		$tweak,$tweak,$tweak
3206	vsldoi		$tmp,$tmp,$tmp,15
3207	 le?vperm	$in1,$in1,$in1,$leperm
3208	vand		$tmp,$tmp,$eighty7
3209	 vxor		$out1,$in1,$twk1
3210	vxor		$tweak,$tweak,$tmp
3211
3212	 lvx_u		$in2,$x20,$inp
3213	 andi.		$taillen,$len,15
3214	vxor		$twk2,$tweak,$rndkey0
3215	vsrab		$tmp,$tweak,$seven	# next tweak value
3216	vaddubm		$tweak,$tweak,$tweak
3217	vsldoi		$tmp,$tmp,$tmp,15
3218	 le?vperm	$in2,$in2,$in2,$leperm
3219	vand		$tmp,$tmp,$eighty7
3220	 vxor		$out2,$in2,$twk2
3221	vxor		$tweak,$tweak,$tmp
3222
3223	 lvx_u		$in3,$x30,$inp
3224	 sub		$len,$len,$taillen
3225	vxor		$twk3,$tweak,$rndkey0
3226	vsrab		$tmp,$tweak,$seven	# next tweak value
3227	vaddubm		$tweak,$tweak,$tweak
3228	vsldoi		$tmp,$tmp,$tmp,15
3229	 le?vperm	$in3,$in3,$in3,$leperm
3230	vand		$tmp,$tmp,$eighty7
3231	 vxor		$out3,$in3,$twk3
3232	vxor		$tweak,$tweak,$tmp
3233
3234	 lvx_u		$in4,$x40,$inp
3235	 subi		$len,$len,0x60
3236	vxor		$twk4,$tweak,$rndkey0
3237	vsrab		$tmp,$tweak,$seven	# next tweak value
3238	vaddubm		$tweak,$tweak,$tweak
3239	vsldoi		$tmp,$tmp,$tmp,15
3240	 le?vperm	$in4,$in4,$in4,$leperm
3241	vand		$tmp,$tmp,$eighty7
3242	 vxor		$out4,$in4,$twk4
3243	vxor		$tweak,$tweak,$tmp
3244
3245	 lvx_u		$in5,$x50,$inp
3246	 addi		$inp,$inp,0x60
3247	vxor		$twk5,$tweak,$rndkey0
3248	vsrab		$tmp,$tweak,$seven	# next tweak value
3249	vaddubm		$tweak,$tweak,$tweak
3250	vsldoi		$tmp,$tmp,$tmp,15
3251	 le?vperm	$in5,$in5,$in5,$leperm
3252	vand		$tmp,$tmp,$eighty7
3253	 vxor		$out5,$in5,$twk5
3254	vxor		$tweak,$tweak,$tmp
3255
3256	vxor		v31,v31,$rndkey0
3257	mtctr		$rounds
3258	b		Loop_xts_dec6x
3259
3260.align	5
3261Loop_xts_dec6x:
3262	vncipher	$out0,$out0,v24
3263	vncipher	$out1,$out1,v24
3264	vncipher	$out2,$out2,v24
3265	vncipher	$out3,$out3,v24
3266	vncipher	$out4,$out4,v24
3267	vncipher	$out5,$out5,v24
3268	lvx		v24,$x20,$key_		# round[3]
3269	addi		$key_,$key_,0x20
3270
3271	vncipher	$out0,$out0,v25
3272	vncipher	$out1,$out1,v25
3273	vncipher	$out2,$out2,v25
3274	vncipher	$out3,$out3,v25
3275	vncipher	$out4,$out4,v25
3276	vncipher	$out5,$out5,v25
3277	lvx		v25,$x10,$key_		# round[4]
3278	bdnz		Loop_xts_dec6x
3279
3280	subic		$len,$len,96		# $len-=96
3281	 vxor		$in0,$twk0,v31		# xor with last round key
3282	vncipher	$out0,$out0,v24
3283	vncipher	$out1,$out1,v24
3284	 vsrab		$tmp,$tweak,$seven	# next tweak value
3285	 vxor		$twk0,$tweak,$rndkey0
3286	 vaddubm	$tweak,$tweak,$tweak
3287	vncipher	$out2,$out2,v24
3288	vncipher	$out3,$out3,v24
3289	 vsldoi		$tmp,$tmp,$tmp,15
3290	vncipher	$out4,$out4,v24
3291	vncipher	$out5,$out5,v24
3292
3293	subfe.		r0,r0,r0		# borrow?-1:0
3294	 vand		$tmp,$tmp,$eighty7
3295	vncipher	$out0,$out0,v25
3296	vncipher	$out1,$out1,v25
3297	 vxor		$tweak,$tweak,$tmp
3298	vncipher	$out2,$out2,v25
3299	vncipher	$out3,$out3,v25
3300	 vxor		$in1,$twk1,v31
3301	 vsrab		$tmp,$tweak,$seven	# next tweak value
3302	 vxor		$twk1,$tweak,$rndkey0
3303	vncipher	$out4,$out4,v25
3304	vncipher	$out5,$out5,v25
3305
3306	and		r0,r0,$len
3307	 vaddubm	$tweak,$tweak,$tweak
3308	 vsldoi		$tmp,$tmp,$tmp,15
3309	vncipher	$out0,$out0,v26
3310	vncipher	$out1,$out1,v26
3311	 vand		$tmp,$tmp,$eighty7
3312	vncipher	$out2,$out2,v26
3313	vncipher	$out3,$out3,v26
3314	 vxor		$tweak,$tweak,$tmp
3315	vncipher	$out4,$out4,v26
3316	vncipher	$out5,$out5,v26
3317
3318	add		$inp,$inp,r0		# $inp is adjusted in such
3319						# way that at exit from the
3320						# loop inX-in5 are loaded
3321						# with last "words"
3322	 vxor		$in2,$twk2,v31
3323	 vsrab		$tmp,$tweak,$seven	# next tweak value
3324	 vxor		$twk2,$tweak,$rndkey0
3325	 vaddubm	$tweak,$tweak,$tweak
3326	vncipher	$out0,$out0,v27
3327	vncipher	$out1,$out1,v27
3328	 vsldoi		$tmp,$tmp,$tmp,15
3329	vncipher	$out2,$out2,v27
3330	vncipher	$out3,$out3,v27
3331	 vand		$tmp,$tmp,$eighty7
3332	vncipher	$out4,$out4,v27
3333	vncipher	$out5,$out5,v27
3334
3335	addi		$key_,$sp,$FRAME+15	# rewind $key_
3336	 vxor		$tweak,$tweak,$tmp
3337	vncipher	$out0,$out0,v28
3338	vncipher	$out1,$out1,v28
3339	 vxor		$in3,$twk3,v31
3340	 vsrab		$tmp,$tweak,$seven	# next tweak value
3341	 vxor		$twk3,$tweak,$rndkey0
3342	vncipher	$out2,$out2,v28
3343	vncipher	$out3,$out3,v28
3344	 vaddubm	$tweak,$tweak,$tweak
3345	 vsldoi		$tmp,$tmp,$tmp,15
3346	vncipher	$out4,$out4,v28
3347	vncipher	$out5,$out5,v28
3348	lvx		v24,$x00,$key_		# re-pre-load round[1]
3349	 vand		$tmp,$tmp,$eighty7
3350
3351	vncipher	$out0,$out0,v29
3352	vncipher	$out1,$out1,v29
3353	 vxor		$tweak,$tweak,$tmp
3354	vncipher	$out2,$out2,v29
3355	vncipher	$out3,$out3,v29
3356	 vxor		$in4,$twk4,v31
3357	 vsrab		$tmp,$tweak,$seven	# next tweak value
3358	 vxor		$twk4,$tweak,$rndkey0
3359	vncipher	$out4,$out4,v29
3360	vncipher	$out5,$out5,v29
3361	lvx		v25,$x10,$key_		# re-pre-load round[2]
3362	 vaddubm	$tweak,$tweak,$tweak
3363	 vsldoi		$tmp,$tmp,$tmp,15
3364
3365	vncipher	$out0,$out0,v30
3366	vncipher	$out1,$out1,v30
3367	 vand		$tmp,$tmp,$eighty7
3368	vncipher	$out2,$out2,v30
3369	vncipher	$out3,$out3,v30
3370	 vxor		$tweak,$tweak,$tmp
3371	vncipher	$out4,$out4,v30
3372	vncipher	$out5,$out5,v30
3373	 vxor		$in5,$twk5,v31
3374	 vsrab		$tmp,$tweak,$seven	# next tweak value
3375	 vxor		$twk5,$tweak,$rndkey0
3376
3377	vncipherlast	$out0,$out0,$in0
3378	 lvx_u		$in0,$x00,$inp		# load next input block
3379	 vaddubm	$tweak,$tweak,$tweak
3380	 vsldoi		$tmp,$tmp,$tmp,15
3381	vncipherlast	$out1,$out1,$in1
3382	 lvx_u		$in1,$x10,$inp
3383	vncipherlast	$out2,$out2,$in2
3384	 le?vperm	$in0,$in0,$in0,$leperm
3385	 lvx_u		$in2,$x20,$inp
3386	 vand		$tmp,$tmp,$eighty7
3387	vncipherlast	$out3,$out3,$in3
3388	 le?vperm	$in1,$in1,$in1,$leperm
3389	 lvx_u		$in3,$x30,$inp
3390	vncipherlast	$out4,$out4,$in4
3391	 le?vperm	$in2,$in2,$in2,$leperm
3392	 lvx_u		$in4,$x40,$inp
3393	 vxor		$tweak,$tweak,$tmp
3394	vncipherlast	$out5,$out5,$in5
3395	 le?vperm	$in3,$in3,$in3,$leperm
3396	 lvx_u		$in5,$x50,$inp
3397	 addi		$inp,$inp,0x60
3398	 le?vperm	$in4,$in4,$in4,$leperm
3399	 le?vperm	$in5,$in5,$in5,$leperm
3400
3401	le?vperm	$out0,$out0,$out0,$leperm
3402	le?vperm	$out1,$out1,$out1,$leperm
3403	stvx_u		$out0,$x00,$out		# store output
3404	 vxor		$out0,$in0,$twk0
3405	le?vperm	$out2,$out2,$out2,$leperm
3406	stvx_u		$out1,$x10,$out
3407	 vxor		$out1,$in1,$twk1
3408	le?vperm	$out3,$out3,$out3,$leperm
3409	stvx_u		$out2,$x20,$out
3410	 vxor		$out2,$in2,$twk2
3411	le?vperm	$out4,$out4,$out4,$leperm
3412	stvx_u		$out3,$x30,$out
3413	 vxor		$out3,$in3,$twk3
3414	le?vperm	$out5,$out5,$out5,$leperm
3415	stvx_u		$out4,$x40,$out
3416	 vxor		$out4,$in4,$twk4
3417	stvx_u		$out5,$x50,$out
3418	 vxor		$out5,$in5,$twk5
3419	addi		$out,$out,0x60
3420
3421	mtctr		$rounds
3422	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3423
3424	addic.		$len,$len,0x60
3425	beq		Lxts_dec6x_zero
3426	cmpwi		$len,0x20
3427	blt		Lxts_dec6x_one
3428	nop
3429	beq		Lxts_dec6x_two
3430	cmpwi		$len,0x40
3431	blt		Lxts_dec6x_three
3432	nop
3433	beq		Lxts_dec6x_four
3434
3435Lxts_dec6x_five:
3436	vxor		$out0,$in1,$twk0
3437	vxor		$out1,$in2,$twk1
3438	vxor		$out2,$in3,$twk2
3439	vxor		$out3,$in4,$twk3
3440	vxor		$out4,$in5,$twk4
3441
3442	bl		_aesp8_xts_dec5x
3443
3444	le?vperm	$out0,$out0,$out0,$leperm
3445	vmr		$twk0,$twk5		# unused tweak
3446	vxor		$twk1,$tweak,$rndkey0
3447	le?vperm	$out1,$out1,$out1,$leperm
3448	stvx_u		$out0,$x00,$out		# store output
3449	vxor		$out0,$in0,$twk1
3450	le?vperm	$out2,$out2,$out2,$leperm
3451	stvx_u		$out1,$x10,$out
3452	le?vperm	$out3,$out3,$out3,$leperm
3453	stvx_u		$out2,$x20,$out
3454	le?vperm	$out4,$out4,$out4,$leperm
3455	stvx_u		$out3,$x30,$out
3456	stvx_u		$out4,$x40,$out
3457	addi		$out,$out,0x50
3458	bne		Lxts_dec6x_steal
3459	b		Lxts_dec6x_done
3460
3461.align	4
3462Lxts_dec6x_four:
3463	vxor		$out0,$in2,$twk0
3464	vxor		$out1,$in3,$twk1
3465	vxor		$out2,$in4,$twk2
3466	vxor		$out3,$in5,$twk3
3467	vxor		$out4,$out4,$out4
3468
3469	bl		_aesp8_xts_dec5x
3470
3471	le?vperm	$out0,$out0,$out0,$leperm
3472	vmr		$twk0,$twk4		# unused tweak
3473	vmr		$twk1,$twk5
3474	le?vperm	$out1,$out1,$out1,$leperm
3475	stvx_u		$out0,$x00,$out		# store output
3476	vxor		$out0,$in0,$twk5
3477	le?vperm	$out2,$out2,$out2,$leperm
3478	stvx_u		$out1,$x10,$out
3479	le?vperm	$out3,$out3,$out3,$leperm
3480	stvx_u		$out2,$x20,$out
3481	stvx_u		$out3,$x30,$out
3482	addi		$out,$out,0x40
3483	bne		Lxts_dec6x_steal
3484	b		Lxts_dec6x_done
3485
3486.align	4
3487Lxts_dec6x_three:
3488	vxor		$out0,$in3,$twk0
3489	vxor		$out1,$in4,$twk1
3490	vxor		$out2,$in5,$twk2
3491	vxor		$out3,$out3,$out3
3492	vxor		$out4,$out4,$out4
3493
3494	bl		_aesp8_xts_dec5x
3495
3496	le?vperm	$out0,$out0,$out0,$leperm
3497	vmr		$twk0,$twk3		# unused tweak
3498	vmr		$twk1,$twk4
3499	le?vperm	$out1,$out1,$out1,$leperm
3500	stvx_u		$out0,$x00,$out		# store output
3501	vxor		$out0,$in0,$twk4
3502	le?vperm	$out2,$out2,$out2,$leperm
3503	stvx_u		$out1,$x10,$out
3504	stvx_u		$out2,$x20,$out
3505	addi		$out,$out,0x30
3506	bne		Lxts_dec6x_steal
3507	b		Lxts_dec6x_done
3508
3509.align	4
3510Lxts_dec6x_two:
3511	vxor		$out0,$in4,$twk0
3512	vxor		$out1,$in5,$twk1
3513	vxor		$out2,$out2,$out2
3514	vxor		$out3,$out3,$out3
3515	vxor		$out4,$out4,$out4
3516
3517	bl		_aesp8_xts_dec5x
3518
3519	le?vperm	$out0,$out0,$out0,$leperm
3520	vmr		$twk0,$twk2		# unused tweak
3521	vmr		$twk1,$twk3
3522	le?vperm	$out1,$out1,$out1,$leperm
3523	stvx_u		$out0,$x00,$out		# store output
3524	vxor		$out0,$in0,$twk3
3525	stvx_u		$out1,$x10,$out
3526	addi		$out,$out,0x20
3527	bne		Lxts_dec6x_steal
3528	b		Lxts_dec6x_done
3529
3530.align	4
3531Lxts_dec6x_one:
3532	vxor		$out0,$in5,$twk0
3533	nop
3534Loop_xts_dec1x:
3535	vncipher	$out0,$out0,v24
3536	lvx		v24,$x20,$key_		# round[3]
3537	addi		$key_,$key_,0x20
3538
3539	vncipher	$out0,$out0,v25
3540	lvx		v25,$x10,$key_		# round[4]
3541	bdnz		Loop_xts_dec1x
3542
3543	subi		r0,$taillen,1
3544	vncipher	$out0,$out0,v24
3545
3546	andi.		r0,r0,16
3547	cmpwi		$taillen,0
3548	vncipher	$out0,$out0,v25
3549
3550	sub		$inp,$inp,r0
3551	vncipher	$out0,$out0,v26
3552
3553	lvx_u		$in0,0,$inp
3554	vncipher	$out0,$out0,v27
3555
3556	addi		$key_,$sp,$FRAME+15	# rewind $key_
3557	vncipher	$out0,$out0,v28
3558	lvx		v24,$x00,$key_		# re-pre-load round[1]
3559
3560	vncipher	$out0,$out0,v29
3561	lvx		v25,$x10,$key_		# re-pre-load round[2]
3562	 vxor		$twk0,$twk0,v31
3563
3564	le?vperm	$in0,$in0,$in0,$leperm
3565	vncipher	$out0,$out0,v30
3566
3567	mtctr		$rounds
3568	vncipherlast	$out0,$out0,$twk0
3569
3570	vmr		$twk0,$twk1		# unused tweak
3571	vmr		$twk1,$twk2
3572	le?vperm	$out0,$out0,$out0,$leperm
3573	stvx_u		$out0,$x00,$out		# store output
3574	addi		$out,$out,0x10
3575	vxor		$out0,$in0,$twk2
3576	bne		Lxts_dec6x_steal
3577	b		Lxts_dec6x_done
3578
3579.align	4
3580Lxts_dec6x_zero:
3581	cmpwi		$taillen,0
3582	beq		Lxts_dec6x_done
3583
3584	lvx_u		$in0,0,$inp
3585	le?vperm	$in0,$in0,$in0,$leperm
3586	vxor		$out0,$in0,$twk1
3587Lxts_dec6x_steal:
3588	vncipher	$out0,$out0,v24
3589	lvx		v24,$x20,$key_		# round[3]
3590	addi		$key_,$key_,0x20
3591
3592	vncipher	$out0,$out0,v25
3593	lvx		v25,$x10,$key_		# round[4]
3594	bdnz		Lxts_dec6x_steal
3595
3596	add		$inp,$inp,$taillen
3597	vncipher	$out0,$out0,v24
3598
3599	cmpwi		$taillen,0
3600	vncipher	$out0,$out0,v25
3601
3602	lvx_u		$in0,0,$inp
3603	vncipher	$out0,$out0,v26
3604
3605	lvsr		$inpperm,0,$taillen	# $in5 is no more
3606	vncipher	$out0,$out0,v27
3607
3608	addi		$key_,$sp,$FRAME+15	# rewind $key_
3609	vncipher	$out0,$out0,v28
3610	lvx		v24,$x00,$key_		# re-pre-load round[1]
3611
3612	vncipher	$out0,$out0,v29
3613	lvx		v25,$x10,$key_		# re-pre-load round[2]
3614	 vxor		$twk1,$twk1,v31
3615
3616	le?vperm	$in0,$in0,$in0,$leperm
3617	vncipher	$out0,$out0,v30
3618
3619	vperm		$in0,$in0,$in0,$inpperm
3620	vncipherlast	$tmp,$out0,$twk1
3621
3622	le?vperm	$out0,$tmp,$tmp,$leperm
3623	le?stvx_u	$out0,0,$out
3624	be?stvx_u	$tmp,0,$out
3625
3626	vxor		$out0,$out0,$out0
3627	vspltisb	$out1,-1
3628	vperm		$out0,$out0,$out1,$inpperm
3629	vsel		$out0,$in0,$tmp,$out0
3630	vxor		$out0,$out0,$twk0
3631
3632	subi		r30,$out,1
3633	mtctr		$taillen
3634Loop_xts_dec6x_steal:
3635	lbzu		r0,1(r30)
3636	stb		r0,16(r30)
3637	bdnz		Loop_xts_dec6x_steal
3638
3639	li		$taillen,0
3640	mtctr		$rounds
3641	b		Loop_xts_dec1x		# one more time...
3642
3643.align	4
3644Lxts_dec6x_done:
3645	${UCMP}i	$ivp,0
3646	beq		Lxts_dec6x_ret
3647
3648	vxor		$tweak,$twk0,$rndkey0
3649	le?vperm	$tweak,$tweak,$tweak,$leperm
3650	stvx_u		$tweak,0,$ivp
3651
3652Lxts_dec6x_ret:
3653	mtlr		r11
3654	li		r10,`$FRAME+15`
3655	li		r11,`$FRAME+31`
3656	stvx		$seven,r10,$sp		# wipe copies of round keys
3657	addi		r10,r10,32
3658	stvx		$seven,r11,$sp
3659	addi		r11,r11,32
3660	stvx		$seven,r10,$sp
3661	addi		r10,r10,32
3662	stvx		$seven,r11,$sp
3663	addi		r11,r11,32
3664	stvx		$seven,r10,$sp
3665	addi		r10,r10,32
3666	stvx		$seven,r11,$sp
3667	addi		r11,r11,32
3668	stvx		$seven,r10,$sp
3669	addi		r10,r10,32
3670	stvx		$seven,r11,$sp
3671	addi		r11,r11,32
3672
3673	mtspr		256,$vrsave
3674	lvx		v20,r10,$sp		# ABI says so
3675	addi		r10,r10,32
3676	lvx		v21,r11,$sp
3677	addi		r11,r11,32
3678	lvx		v22,r10,$sp
3679	addi		r10,r10,32
3680	lvx		v23,r11,$sp
3681	addi		r11,r11,32
3682	lvx		v24,r10,$sp
3683	addi		r10,r10,32
3684	lvx		v25,r11,$sp
3685	addi		r11,r11,32
3686	lvx		v26,r10,$sp
3687	addi		r10,r10,32
3688	lvx		v27,r11,$sp
3689	addi		r11,r11,32
3690	lvx		v28,r10,$sp
3691	addi		r10,r10,32
3692	lvx		v29,r11,$sp
3693	addi		r11,r11,32
3694	lvx		v30,r10,$sp
3695	lvx		v31,r11,$sp
3696	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3697	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3698	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3699	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3700	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3701	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3702	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3703	blr
3704	.long		0
3705	.byte		0,12,0x04,1,0x80,6,6,0
3706	.long		0
3707
3708.align	5
3709_aesp8_xts_dec5x:
3710	vncipher	$out0,$out0,v24
3711	vncipher	$out1,$out1,v24
3712	vncipher	$out2,$out2,v24
3713	vncipher	$out3,$out3,v24
3714	vncipher	$out4,$out4,v24
3715	lvx		v24,$x20,$key_		# round[3]
3716	addi		$key_,$key_,0x20
3717
3718	vncipher	$out0,$out0,v25
3719	vncipher	$out1,$out1,v25
3720	vncipher	$out2,$out2,v25
3721	vncipher	$out3,$out3,v25
3722	vncipher	$out4,$out4,v25
3723	lvx		v25,$x10,$key_		# round[4]
3724	bdnz		_aesp8_xts_dec5x
3725
3726	subi		r0,$taillen,1
3727	vncipher	$out0,$out0,v24
3728	vncipher	$out1,$out1,v24
3729	vncipher	$out2,$out2,v24
3730	vncipher	$out3,$out3,v24
3731	vncipher	$out4,$out4,v24
3732
3733	andi.		r0,r0,16
3734	cmpwi		$taillen,0
3735	vncipher	$out0,$out0,v25
3736	vncipher	$out1,$out1,v25
3737	vncipher	$out2,$out2,v25
3738	vncipher	$out3,$out3,v25
3739	vncipher	$out4,$out4,v25
3740	 vxor		$twk0,$twk0,v31
3741
3742	sub		$inp,$inp,r0
3743	vncipher	$out0,$out0,v26
3744	vncipher	$out1,$out1,v26
3745	vncipher	$out2,$out2,v26
3746	vncipher	$out3,$out3,v26
3747	vncipher	$out4,$out4,v26
3748	 vxor		$in1,$twk1,v31
3749
3750	vncipher	$out0,$out0,v27
3751	lvx_u		$in0,0,$inp
3752	vncipher	$out1,$out1,v27
3753	vncipher	$out2,$out2,v27
3754	vncipher	$out3,$out3,v27
3755	vncipher	$out4,$out4,v27
3756	 vxor		$in2,$twk2,v31
3757
3758	addi		$key_,$sp,$FRAME+15	# rewind $key_
3759	vncipher	$out0,$out0,v28
3760	vncipher	$out1,$out1,v28
3761	vncipher	$out2,$out2,v28
3762	vncipher	$out3,$out3,v28
3763	vncipher	$out4,$out4,v28
3764	lvx		v24,$x00,$key_		# re-pre-load round[1]
3765	 vxor		$in3,$twk3,v31
3766
3767	vncipher	$out0,$out0,v29
3768	le?vperm	$in0,$in0,$in0,$leperm
3769	vncipher	$out1,$out1,v29
3770	vncipher	$out2,$out2,v29
3771	vncipher	$out3,$out3,v29
3772	vncipher	$out4,$out4,v29
3773	lvx		v25,$x10,$key_		# re-pre-load round[2]
3774	 vxor		$in4,$twk4,v31
3775
3776	vncipher	$out0,$out0,v30
3777	vncipher	$out1,$out1,v30
3778	vncipher	$out2,$out2,v30
3779	vncipher	$out3,$out3,v30
3780	vncipher	$out4,$out4,v30
3781
3782	vncipherlast	$out0,$out0,$twk0
3783	vncipherlast	$out1,$out1,$in1
3784	vncipherlast	$out2,$out2,$in2
3785	vncipherlast	$out3,$out3,$in3
3786	vncipherlast	$out4,$out4,$in4
3787	mtctr		$rounds
3788	blr
3789        .long   	0
3790        .byte   	0,12,0x14,0,0,0,0,0
3791___
3792}}	}}}
3793
# Post-process the accumulated $code one line at a time and print the
# result to STDOUT.  For every line:
#   1. each `...` span is replaced by the value of evaluating its contents
#      as a Perl expression;
#   2. while the constants table is still being read (i.e. until a line
#      containing "Lconsts:" has been seen), a ".long"/".byte" directive
#      that ends in a "?tag" is re-emitted as a plain ".byte" list,
#      adjusted for little-endian flavours, and nothing else is printed
#      for that line;
#   3. otherwise the first applicable endian marker ("le?", "be?" or a
#      '?'-prefixed mnemonic) is resolved according to $flavour and the
#      line is printed.
my $consts=1;			# cleared once "Lconsts:" has been seen
foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
	    # give the captures names: directive, operand list, "?tag"
	    my ($directive,$values,$conv)=($1,$2,$3);
	    my @bytes=();

	    # convert to endian-agnostic format; a value with a leading 0
	    # goes through oct() (which also understands 0x...), anything
	    # else through int()
	    if ($directive eq "long") {
	      foreach (split(/,\s*/,$values)) {
		my $word = /^0/?oct:int;
		# split each 32-bit value into 4 bytes, most significant first
		push @bytes,($word>>24)&0xff,($word>>16)&0xff,
			    ($word>>8)&0xff,$word&0xff;
	      }
	    } else {
		@bytes = map(/^0/?oct:int,split(/,\s*/,$values));
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv)  {
		    # ?inv: xor every byte with 0xf
		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
		    # ?rev: reverse the order of the bytes
		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
		}
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	# Only the first substitution that matches is applied to a line.
	if ($flavour =~ /le$/o) {	# little-endian
	    # keep "le?" lines, turn the "be?" prefix into "#be#"
	    s/le\?//o		or
	    s/be\?/#be#/o	or
	    # ?lvsr <-> ?lvsl are exchanged
	    s/\?lvsr/lvsl/o	or
	    s/\?lvsl/lvsr/o	or
	    # ?vperm: swap the 2nd and 3rd operands
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    # ?vsldoi: swap the 2nd and 3rd operands, shift N becomes "16-N"
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    # ?vspltw: word index N becomes "3-N"
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    # turn the "le?" prefix into "#le#", keep "be?" lines, and
	    # just drop the '?' in front of a mnemonic
	    s/le\?/#le#/o	or
	    s/be\?//o		or
	    s/\?([a-z]+)/$1/o;
	}

        print $_,"\n";
}

# Buffered output is only flushed for certain at close, so a failed write
# (full disk, closed pipe, ...) shows up here; fail loudly rather than
# leave a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
3847