xref: /openbmc/linux/arch/powerpc/crypto/chacha-p10le-8x.S (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
1*62d9e475SDanny Tsen/* SPDX-License-Identifier: GPL-2.0-or-later */
2*62d9e475SDanny Tsen#
3*62d9e475SDanny Tsen# Accelerated chacha20 implementation for ppc64le.
4*62d9e475SDanny Tsen#
5*62d9e475SDanny Tsen# Copyright 2023- IBM Corp. All rights reserved
6*62d9e475SDanny Tsen#
7*62d9e475SDanny Tsen#===================================================================================
8*62d9e475SDanny Tsen# Written by Danny Tsen <dtsen@us.ibm.com>
9*62d9e475SDanny Tsen#
10*62d9e475SDanny Tsen# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
11*62d9e475SDanny Tsen#				 size_t len, int nrounds);
12*62d9e475SDanny Tsen#
13*62d9e475SDanny Tsen# do rounds,  8 quarter rounds
14*62d9e475SDanny Tsen# 1.  a += b; d ^= a; d <<<= 16;
15*62d9e475SDanny Tsen# 2.  c += d; b ^= c; b <<<= 12;
16*62d9e475SDanny Tsen# 3.  a += b; d ^= a; d <<<= 8;
17*62d9e475SDanny Tsen# 4.  c += d; b ^= c; b <<<= 7
18*62d9e475SDanny Tsen#
19*62d9e475SDanny Tsen# row1 = (row1 + row2),  row4 = row1 xor row4,  row4 rotate each word by 16
20*62d9e475SDanny Tsen# row3 = (row3 + row4),  row2 = row3 xor row2,  row2 rotate each word by 12
21*62d9e475SDanny Tsen# row1 = (row1 + row2), row4 = row1 xor row4,  row4 rotate each word by 8
22*62d9e475SDanny Tsen# row3 = (row3 + row4), row2 = row3 xor row2,  row2 rotate each word by 7
23*62d9e475SDanny Tsen#
24*62d9e475SDanny Tsen# 4 blocks (a b c d)
25*62d9e475SDanny Tsen#
26*62d9e475SDanny Tsen# a0 b0 c0 d0
27*62d9e475SDanny Tsen# a1 b1 c1 d1
28*62d9e475SDanny Tsen# ...
29*62d9e475SDanny Tsen# a4 b4 c4 d4
30*62d9e475SDanny Tsen# ...
31*62d9e475SDanny Tsen# a8 b8 c8 d8
32*62d9e475SDanny Tsen# ...
33*62d9e475SDanny Tsen# a12 b12 c12 d12
34*62d9e475SDanny Tsen# a13 ...
35*62d9e475SDanny Tsen# a14 ...
36*62d9e475SDanny Tsen# a15 b15 c15 d15
37*62d9e475SDanny Tsen#
38*62d9e475SDanny Tsen# Column round (v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
39*62d9e475SDanny Tsen# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
40*62d9e475SDanny Tsen#
41*62d9e475SDanny Tsen
42*62d9e475SDanny Tsen#include <asm/ppc_asm.h>
43*62d9e475SDanny Tsen#include <asm/asm-offsets.h>
44*62d9e475SDanny Tsen#include <asm/asm-compat.h>
45*62d9e475SDanny Tsen#include <linux/linkage.h>
46*62d9e475SDanny Tsen
47*62d9e475SDanny Tsen.machine	"any"
48*62d9e475SDanny Tsen.text
49*62d9e475SDanny Tsen
# SAVE_GPR: store the 64-bit general-purpose register GPR at byte offset
# OFFSET from the address held in register FRAME (std).
50*62d9e475SDanny Tsen.macro	SAVE_GPR GPR OFFSET FRAME
51*62d9e475SDanny Tsen	std	\GPR,\OFFSET(\FRAME)
52*62d9e475SDanny Tsen.endm
53*62d9e475SDanny Tsen
# SAVE_VRS: store vector register VRS at FRAME + OFFSET using the indexed
# store stvx. r16 is loaded with OFFSET to serve as the index register, so
# its previous value is lost.
54*62d9e475SDanny Tsen.macro	SAVE_VRS VRS OFFSET FRAME
55*62d9e475SDanny Tsen	li	16, \OFFSET
56*62d9e475SDanny Tsen	stvx	\VRS, 16, \FRAME
57*62d9e475SDanny Tsen.endm
58*62d9e475SDanny Tsen
# SAVE_VSX: store vector-scalar register VSX at FRAME + OFFSET using the
# indexed store stxvx. r16 is loaded with OFFSET to serve as the index
# register, so its previous value is lost.
59*62d9e475SDanny Tsen.macro	SAVE_VSX VSX OFFSET FRAME
60*62d9e475SDanny Tsen	li	16, \OFFSET
61*62d9e475SDanny Tsen	stxvx	\VSX, 16, \FRAME
62*62d9e475SDanny Tsen.endm
63*62d9e475SDanny Tsen
# RESTORE_GPR: reload the 64-bit general-purpose register GPR from byte
# offset OFFSET relative to the address held in register FRAME (ld).
64*62d9e475SDanny Tsen.macro	RESTORE_GPR GPR OFFSET FRAME
65*62d9e475SDanny Tsen	ld	\GPR,\OFFSET(\FRAME)
66*62d9e475SDanny Tsen.endm
67*62d9e475SDanny Tsen
# RESTORE_VRS: reload vector register VRS from FRAME + OFFSET using the
# indexed load lvx. r16 is loaded with OFFSET to serve as the index
# register, so its previous value is lost.
68*62d9e475SDanny Tsen.macro	RESTORE_VRS VRS OFFSET FRAME
69*62d9e475SDanny Tsen	li	16, \OFFSET
70*62d9e475SDanny Tsen	lvx	\VRS, 16, \FRAME
71*62d9e475SDanny Tsen.endm
72*62d9e475SDanny Tsen
# RESTORE_VSX: reload vector-scalar register VSX from FRAME + OFFSET using
# the indexed load lxvx. r16 is loaded with OFFSET to serve as the index
# register, so its previous value is lost.
73*62d9e475SDanny Tsen.macro	RESTORE_VSX VSX OFFSET FRAME
74*62d9e475SDanny Tsen	li	16, \OFFSET
75*62d9e475SDanny Tsen	lxvx	\VSX, 16, \FRAME
76*62d9e475SDanny Tsen.endm
77*62d9e475SDanny Tsen
# SAVE_REGS: function prologue. Stores LR at 16(r1) of the frame that is
# current on entry, allocates a 752-byte stack frame and saves r14-r31,
# v20-v31 and vs14-vs31 into it. Frame layout relative to the new r1:
#   112..248   r14-r31
#   256..432   v20-v31   (r9 + 0..176)
#   448..720   vs14-vs31 (r9 + 192..464)
# Scratch registers: r0 (LR copy), r9 (r1 + 256) and r16 (index register of
# SAVE_VRS and SAVE_VSX, used only after r16 itself has been stored).
78*62d9e475SDanny Tsen.macro SAVE_REGS
79*62d9e475SDanny Tsen	mflr 0
80*62d9e475SDanny Tsen	std 0, 16(1)
81*62d9e475SDanny Tsen	stdu 1,-752(1)
82*62d9e475SDanny Tsen
	# general-purpose registers r14-r31
83*62d9e475SDanny Tsen	SAVE_GPR 14, 112, 1
84*62d9e475SDanny Tsen	SAVE_GPR 15, 120, 1
85*62d9e475SDanny Tsen	SAVE_GPR 16, 128, 1
86*62d9e475SDanny Tsen	SAVE_GPR 17, 136, 1
87*62d9e475SDanny Tsen	SAVE_GPR 18, 144, 1
88*62d9e475SDanny Tsen	SAVE_GPR 19, 152, 1
89*62d9e475SDanny Tsen	SAVE_GPR 20, 160, 1
90*62d9e475SDanny Tsen	SAVE_GPR 21, 168, 1
91*62d9e475SDanny Tsen	SAVE_GPR 22, 176, 1
92*62d9e475SDanny Tsen	SAVE_GPR 23, 184, 1
93*62d9e475SDanny Tsen	SAVE_GPR 24, 192, 1
94*62d9e475SDanny Tsen	SAVE_GPR 25, 200, 1
95*62d9e475SDanny Tsen	SAVE_GPR 26, 208, 1
96*62d9e475SDanny Tsen	SAVE_GPR 27, 216, 1
97*62d9e475SDanny Tsen	SAVE_GPR 28, 224, 1
98*62d9e475SDanny Tsen	SAVE_GPR 29, 232, 1
99*62d9e475SDanny Tsen	SAVE_GPR 30, 240, 1
100*62d9e475SDanny Tsen	SAVE_GPR 31, 248, 1
101*62d9e475SDanny Tsen
	# vector registers v20-v31, addressed from r9 = r1 + 256
102*62d9e475SDanny Tsen	addi	9, 1, 256
103*62d9e475SDanny Tsen	SAVE_VRS 20, 0, 9
104*62d9e475SDanny Tsen	SAVE_VRS 21, 16, 9
105*62d9e475SDanny Tsen	SAVE_VRS 22, 32, 9
106*62d9e475SDanny Tsen	SAVE_VRS 23, 48, 9
107*62d9e475SDanny Tsen	SAVE_VRS 24, 64, 9
108*62d9e475SDanny Tsen	SAVE_VRS 25, 80, 9
109*62d9e475SDanny Tsen	SAVE_VRS 26, 96, 9
110*62d9e475SDanny Tsen	SAVE_VRS 27, 112, 9
111*62d9e475SDanny Tsen	SAVE_VRS 28, 128, 9
112*62d9e475SDanny Tsen	SAVE_VRS 29, 144, 9
113*62d9e475SDanny Tsen	SAVE_VRS 30, 160, 9
114*62d9e475SDanny Tsen	SAVE_VRS 31, 176, 9
115*62d9e475SDanny Tsen
	# vector-scalar registers vs14-vs31, full 16 bytes each
116*62d9e475SDanny Tsen	SAVE_VSX 14, 192, 9
117*62d9e475SDanny Tsen	SAVE_VSX 15, 208, 9
118*62d9e475SDanny Tsen	SAVE_VSX 16, 224, 9
119*62d9e475SDanny Tsen	SAVE_VSX 17, 240, 9
120*62d9e475SDanny Tsen	SAVE_VSX 18, 256, 9
121*62d9e475SDanny Tsen	SAVE_VSX 19, 272, 9
122*62d9e475SDanny Tsen	SAVE_VSX 20, 288, 9
123*62d9e475SDanny Tsen	SAVE_VSX 21, 304, 9
124*62d9e475SDanny Tsen	SAVE_VSX 22, 320, 9
125*62d9e475SDanny Tsen	SAVE_VSX 23, 336, 9
126*62d9e475SDanny Tsen	SAVE_VSX 24, 352, 9
127*62d9e475SDanny Tsen	SAVE_VSX 25, 368, 9
128*62d9e475SDanny Tsen	SAVE_VSX 26, 384, 9
129*62d9e475SDanny Tsen	SAVE_VSX 27, 400, 9
130*62d9e475SDanny Tsen	SAVE_VSX 28, 416, 9
131*62d9e475SDanny Tsen	SAVE_VSX 29, 432, 9
132*62d9e475SDanny Tsen	SAVE_VSX 30, 448, 9
133*62d9e475SDanny Tsen	SAVE_VSX 31, 464, 9
134*62d9e475SDanny Tsen.endm # SAVE_REGS
135*62d9e475SDanny Tsen
# RESTORE_REGS: function epilogue, the mirror image of SAVE_REGS. Reloads
# v20-v31, vs14-vs31 and r14-r31 from the 752-byte frame, pops the frame
# and reloads LR from 16(r1). The vector restores come first because they
# use r16 as index register; r16 gets its saved value back afterwards in
# the general-purpose register group. r0 and r9 are left modified.
136*62d9e475SDanny Tsen.macro RESTORE_REGS
	# vector registers v20-v31, addressed from r9 = r1 + 256
137*62d9e475SDanny Tsen	addi	9, 1, 256
138*62d9e475SDanny Tsen	RESTORE_VRS 20, 0, 9
139*62d9e475SDanny Tsen	RESTORE_VRS 21, 16, 9
140*62d9e475SDanny Tsen	RESTORE_VRS 22, 32, 9
141*62d9e475SDanny Tsen	RESTORE_VRS 23, 48, 9
142*62d9e475SDanny Tsen	RESTORE_VRS 24, 64, 9
143*62d9e475SDanny Tsen	RESTORE_VRS 25, 80, 9
144*62d9e475SDanny Tsen	RESTORE_VRS 26, 96, 9
145*62d9e475SDanny Tsen	RESTORE_VRS 27, 112, 9
146*62d9e475SDanny Tsen	RESTORE_VRS 28, 128, 9
147*62d9e475SDanny Tsen	RESTORE_VRS 29, 144, 9
148*62d9e475SDanny Tsen	RESTORE_VRS 30, 160, 9
149*62d9e475SDanny Tsen	RESTORE_VRS 31, 176, 9
150*62d9e475SDanny Tsen
	# vector-scalar registers vs14-vs31
151*62d9e475SDanny Tsen	RESTORE_VSX 14, 192, 9
152*62d9e475SDanny Tsen	RESTORE_VSX 15, 208, 9
153*62d9e475SDanny Tsen	RESTORE_VSX 16, 224, 9
154*62d9e475SDanny Tsen	RESTORE_VSX 17, 240, 9
155*62d9e475SDanny Tsen	RESTORE_VSX 18, 256, 9
156*62d9e475SDanny Tsen	RESTORE_VSX 19, 272, 9
157*62d9e475SDanny Tsen	RESTORE_VSX 20, 288, 9
158*62d9e475SDanny Tsen	RESTORE_VSX 21, 304, 9
159*62d9e475SDanny Tsen	RESTORE_VSX 22, 320, 9
160*62d9e475SDanny Tsen	RESTORE_VSX 23, 336, 9
161*62d9e475SDanny Tsen	RESTORE_VSX 24, 352, 9
162*62d9e475SDanny Tsen	RESTORE_VSX 25, 368, 9
163*62d9e475SDanny Tsen	RESTORE_VSX 26, 384, 9
164*62d9e475SDanny Tsen	RESTORE_VSX 27, 400, 9
165*62d9e475SDanny Tsen	RESTORE_VSX 28, 416, 9
166*62d9e475SDanny Tsen	RESTORE_VSX 29, 432, 9
167*62d9e475SDanny Tsen	RESTORE_VSX 30, 448, 9
168*62d9e475SDanny Tsen	RESTORE_VSX 31, 464, 9
169*62d9e475SDanny Tsen
	# general-purpose registers r14-r31 (this also restores r16)
170*62d9e475SDanny Tsen	RESTORE_GPR 14, 112, 1
171*62d9e475SDanny Tsen	RESTORE_GPR 15, 120, 1
172*62d9e475SDanny Tsen	RESTORE_GPR 16, 128, 1
173*62d9e475SDanny Tsen	RESTORE_GPR 17, 136, 1
174*62d9e475SDanny Tsen	RESTORE_GPR 18, 144, 1
175*62d9e475SDanny Tsen	RESTORE_GPR 19, 152, 1
176*62d9e475SDanny Tsen	RESTORE_GPR 20, 160, 1
177*62d9e475SDanny Tsen	RESTORE_GPR 21, 168, 1
178*62d9e475SDanny Tsen	RESTORE_GPR 22, 176, 1
179*62d9e475SDanny Tsen	RESTORE_GPR 23, 184, 1
180*62d9e475SDanny Tsen	RESTORE_GPR 24, 192, 1
181*62d9e475SDanny Tsen	RESTORE_GPR 25, 200, 1
182*62d9e475SDanny Tsen	RESTORE_GPR 26, 208, 1
183*62d9e475SDanny Tsen	RESTORE_GPR 27, 216, 1
184*62d9e475SDanny Tsen	RESTORE_GPR 28, 224, 1
185*62d9e475SDanny Tsen	RESTORE_GPR 29, 232, 1
186*62d9e475SDanny Tsen	RESTORE_GPR 30, 240, 1
187*62d9e475SDanny Tsen	RESTORE_GPR 31, 248, 1
188*62d9e475SDanny Tsen
	# pop the frame, then fetch LR from the slot written by SAVE_REGS
189*62d9e475SDanny Tsen	addi    1, 1, 752
190*62d9e475SDanny Tsen	ld 0, 16(1)
191*62d9e475SDanny Tsen	mtlr 0
192*62d9e475SDanny Tsen.endm # RESTORE_REGS
193*62d9e475SDanny Tsen
# QT_loop_8x: one double round (a column round followed by a diagonal
# round, see the two QR lines below) on eight blocks at once. v0-v15 hold
# the working state of one group of four blocks and v16-v31 that of a
# second group, one state word per register. Since every vector register
# is occupied by state, the rotate constants are kept in vs20-vs23 and
# swapped into v25 or v28 only around the step that needs them, with vs0
# holding the displaced state register meanwhile:
#   vs20  vpermxor mask, first step of a quarter round  (xor + rotate)
#   vs21  vrlw count,    second step
#   vs22  vpermxor mask, third step
#   vs23  vrlw count,    fourth step
# The steps correspond to the four numbered quarter-round lines in the
# file header. Clobbers vs0.
194*62d9e475SDanny Tsen.macro QT_loop_8x
195*62d9e475SDanny Tsen	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
196*62d9e475SDanny Tsen	xxlor	0, 32+25, 32+25		# park v25 in vs0
197*62d9e475SDanny Tsen	xxlor	32+25, 20, 20		# v25 = vs20 (mask for step 1)
198*62d9e475SDanny Tsen	vadduwm 0, 0, 4
199*62d9e475SDanny Tsen	vadduwm 1, 1, 5
200*62d9e475SDanny Tsen	vadduwm 2, 2, 6
201*62d9e475SDanny Tsen	vadduwm 3, 3, 7
202*62d9e475SDanny Tsen	  vadduwm 16, 16, 20
203*62d9e475SDanny Tsen	  vadduwm 17, 17, 21
204*62d9e475SDanny Tsen	  vadduwm 18, 18, 22
205*62d9e475SDanny Tsen	  vadduwm 19, 19, 23
206*62d9e475SDanny Tsen
207*62d9e475SDanny Tsen	  vpermxor 12, 12, 0, 25
208*62d9e475SDanny Tsen	  vpermxor 13, 13, 1, 25
209*62d9e475SDanny Tsen	  vpermxor 14, 14, 2, 25
210*62d9e475SDanny Tsen	  vpermxor 15, 15, 3, 25
211*62d9e475SDanny Tsen	  vpermxor 28, 28, 16, 25
212*62d9e475SDanny Tsen	  vpermxor 29, 29, 17, 25
213*62d9e475SDanny Tsen	  vpermxor 30, 30, 18, 25
214*62d9e475SDanny Tsen	  vpermxor 31, 31, 19, 25
215*62d9e475SDanny Tsen	xxlor	32+25, 0, 0		# v25 state word back from vs0
216*62d9e475SDanny Tsen	vadduwm 8, 8, 12
217*62d9e475SDanny Tsen	vadduwm 9, 9, 13
218*62d9e475SDanny Tsen	vadduwm 10, 10, 14
219*62d9e475SDanny Tsen	vadduwm 11, 11, 15
220*62d9e475SDanny Tsen	  vadduwm 24, 24, 28
221*62d9e475SDanny Tsen	  vadduwm 25, 25, 29
222*62d9e475SDanny Tsen	  vadduwm 26, 26, 30
223*62d9e475SDanny Tsen	  vadduwm 27, 27, 31
224*62d9e475SDanny Tsen	vxor 4, 4, 8
225*62d9e475SDanny Tsen	vxor 5, 5, 9
226*62d9e475SDanny Tsen	vxor 6, 6, 10
227*62d9e475SDanny Tsen	vxor 7, 7, 11
228*62d9e475SDanny Tsen	  vxor 20, 20, 24
229*62d9e475SDanny Tsen	  vxor 21, 21, 25
230*62d9e475SDanny Tsen	  vxor 22, 22, 26
231*62d9e475SDanny Tsen	  vxor 23, 23, 27
232*62d9e475SDanny Tsen
233*62d9e475SDanny Tsen	xxlor	0, 32+25, 32+25		# park v25 in vs0
234*62d9e475SDanny Tsen	xxlor	32+25, 21, 21		# v25 = vs21 (rotate count for step 2)
235*62d9e475SDanny Tsen	vrlw 4, 4, 25  #
236*62d9e475SDanny Tsen	vrlw 5, 5, 25
237*62d9e475SDanny Tsen	vrlw 6, 6, 25
238*62d9e475SDanny Tsen	vrlw 7, 7, 25
239*62d9e475SDanny Tsen	  vrlw 20, 20, 25  #
240*62d9e475SDanny Tsen	  vrlw 21, 21, 25
241*62d9e475SDanny Tsen	  vrlw 22, 22, 25
242*62d9e475SDanny Tsen	  vrlw 23, 23, 25
243*62d9e475SDanny Tsen	xxlor	32+25, 0, 0		# v25 state word back from vs0
244*62d9e475SDanny Tsen	vadduwm 0, 0, 4
245*62d9e475SDanny Tsen	vadduwm 1, 1, 5
246*62d9e475SDanny Tsen	vadduwm 2, 2, 6
247*62d9e475SDanny Tsen	vadduwm 3, 3, 7
248*62d9e475SDanny Tsen	  vadduwm 16, 16, 20
249*62d9e475SDanny Tsen	  vadduwm 17, 17, 21
250*62d9e475SDanny Tsen	  vadduwm 18, 18, 22
251*62d9e475SDanny Tsen	  vadduwm 19, 19, 23
252*62d9e475SDanny Tsen
253*62d9e475SDanny Tsen	xxlor	0, 32+25, 32+25		# park v25 in vs0
254*62d9e475SDanny Tsen	xxlor	32+25, 22, 22		# v25 = vs22 (mask for step 3)
255*62d9e475SDanny Tsen	  vpermxor 12, 12, 0, 25
256*62d9e475SDanny Tsen	  vpermxor 13, 13, 1, 25
257*62d9e475SDanny Tsen	  vpermxor 14, 14, 2, 25
258*62d9e475SDanny Tsen	  vpermxor 15, 15, 3, 25
259*62d9e475SDanny Tsen	  vpermxor 28, 28, 16, 25
260*62d9e475SDanny Tsen	  vpermxor 29, 29, 17, 25
261*62d9e475SDanny Tsen	  vpermxor 30, 30, 18, 25
262*62d9e475SDanny Tsen	  vpermxor 31, 31, 19, 25
263*62d9e475SDanny Tsen	xxlor	32+25, 0, 0		# v25 state word back from vs0
264*62d9e475SDanny Tsen	vadduwm 8, 8, 12
265*62d9e475SDanny Tsen	vadduwm 9, 9, 13
266*62d9e475SDanny Tsen	vadduwm 10, 10, 14
267*62d9e475SDanny Tsen	vadduwm 11, 11, 15
268*62d9e475SDanny Tsen	  vadduwm 24, 24, 28
269*62d9e475SDanny Tsen	  vadduwm 25, 25, 29
270*62d9e475SDanny Tsen	  vadduwm 26, 26, 30
271*62d9e475SDanny Tsen	  vadduwm 27, 27, 31
272*62d9e475SDanny Tsen	xxlor	0, 32+28, 32+28		# park v28 in vs0
273*62d9e475SDanny Tsen	xxlor	32+28, 23, 23		# v28 = vs23 (rotate count for step 4)
274*62d9e475SDanny Tsen	vxor 4, 4, 8
275*62d9e475SDanny Tsen	vxor 5, 5, 9
276*62d9e475SDanny Tsen	vxor 6, 6, 10
277*62d9e475SDanny Tsen	vxor 7, 7, 11
278*62d9e475SDanny Tsen	  vxor 20, 20, 24
279*62d9e475SDanny Tsen	  vxor 21, 21, 25
280*62d9e475SDanny Tsen	  vxor 22, 22, 26
281*62d9e475SDanny Tsen	  vxor 23, 23, 27
282*62d9e475SDanny Tsen	vrlw 4, 4, 28  #
283*62d9e475SDanny Tsen	vrlw 5, 5, 28
284*62d9e475SDanny Tsen	vrlw 6, 6, 28
285*62d9e475SDanny Tsen	vrlw 7, 7, 28
286*62d9e475SDanny Tsen	  vrlw 20, 20, 28  #
287*62d9e475SDanny Tsen	  vrlw 21, 21, 28
288*62d9e475SDanny Tsen	  vrlw 22, 22, 28
289*62d9e475SDanny Tsen	  vrlw 23, 23, 28
290*62d9e475SDanny Tsen	xxlor	32+28, 0, 0		# v28 state word back from vs0
291*62d9e475SDanny Tsen
292*62d9e475SDanny Tsen	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
293*62d9e475SDanny Tsen	xxlor	0, 32+25, 32+25		# park v25 in vs0
294*62d9e475SDanny Tsen	xxlor	32+25, 20, 20		# v25 = vs20 (mask for step 1)
295*62d9e475SDanny Tsen	vadduwm 0, 0, 5
296*62d9e475SDanny Tsen	vadduwm 1, 1, 6
297*62d9e475SDanny Tsen	vadduwm 2, 2, 7
298*62d9e475SDanny Tsen	vadduwm 3, 3, 4
299*62d9e475SDanny Tsen	  vadduwm 16, 16, 21
300*62d9e475SDanny Tsen	  vadduwm 17, 17, 22
301*62d9e475SDanny Tsen	  vadduwm 18, 18, 23
302*62d9e475SDanny Tsen	  vadduwm 19, 19, 20
303*62d9e475SDanny Tsen
304*62d9e475SDanny Tsen	  vpermxor 15, 15, 0, 25
305*62d9e475SDanny Tsen	  vpermxor 12, 12, 1, 25
306*62d9e475SDanny Tsen	  vpermxor 13, 13, 2, 25
307*62d9e475SDanny Tsen	  vpermxor 14, 14, 3, 25
308*62d9e475SDanny Tsen	  vpermxor 31, 31, 16, 25
309*62d9e475SDanny Tsen	  vpermxor 28, 28, 17, 25
310*62d9e475SDanny Tsen	  vpermxor 29, 29, 18, 25
311*62d9e475SDanny Tsen	  vpermxor 30, 30, 19, 25
312*62d9e475SDanny Tsen
313*62d9e475SDanny Tsen	xxlor	32+25, 0, 0		# v25 state word back from vs0
314*62d9e475SDanny Tsen	vadduwm 10, 10, 15
315*62d9e475SDanny Tsen	vadduwm 11, 11, 12
316*62d9e475SDanny Tsen	vadduwm 8, 8, 13
317*62d9e475SDanny Tsen	vadduwm 9, 9, 14
318*62d9e475SDanny Tsen	  vadduwm 26, 26, 31
319*62d9e475SDanny Tsen	  vadduwm 27, 27, 28
320*62d9e475SDanny Tsen	  vadduwm 24, 24, 29
321*62d9e475SDanny Tsen	  vadduwm 25, 25, 30
322*62d9e475SDanny Tsen	vxor 5, 5, 10
323*62d9e475SDanny Tsen	vxor 6, 6, 11
324*62d9e475SDanny Tsen	vxor 7, 7, 8
325*62d9e475SDanny Tsen	vxor 4, 4, 9
326*62d9e475SDanny Tsen	  vxor 21, 21, 26
327*62d9e475SDanny Tsen	  vxor 22, 22, 27
328*62d9e475SDanny Tsen	  vxor 23, 23, 24
329*62d9e475SDanny Tsen	  vxor 20, 20, 25
330*62d9e475SDanny Tsen
331*62d9e475SDanny Tsen	xxlor	0, 32+25, 32+25		# park v25 in vs0
332*62d9e475SDanny Tsen	xxlor	32+25, 21, 21		# v25 = vs21 (rotate count for step 2)
333*62d9e475SDanny Tsen	vrlw 5, 5, 25
334*62d9e475SDanny Tsen	vrlw 6, 6, 25
335*62d9e475SDanny Tsen	vrlw 7, 7, 25
336*62d9e475SDanny Tsen	vrlw 4, 4, 25
337*62d9e475SDanny Tsen	  vrlw 21, 21, 25
338*62d9e475SDanny Tsen	  vrlw 22, 22, 25
339*62d9e475SDanny Tsen	  vrlw 23, 23, 25
340*62d9e475SDanny Tsen	  vrlw 20, 20, 25
341*62d9e475SDanny Tsen	xxlor	32+25, 0, 0		# v25 state word back from vs0
342*62d9e475SDanny Tsen
343*62d9e475SDanny Tsen	vadduwm 0, 0, 5
344*62d9e475SDanny Tsen	vadduwm 1, 1, 6
345*62d9e475SDanny Tsen	vadduwm 2, 2, 7
346*62d9e475SDanny Tsen	vadduwm 3, 3, 4
347*62d9e475SDanny Tsen	  vadduwm 16, 16, 21
348*62d9e475SDanny Tsen	  vadduwm 17, 17, 22
349*62d9e475SDanny Tsen	  vadduwm 18, 18, 23
350*62d9e475SDanny Tsen	  vadduwm 19, 19, 20
351*62d9e475SDanny Tsen
352*62d9e475SDanny Tsen	xxlor	0, 32+25, 32+25		# park v25 in vs0
353*62d9e475SDanny Tsen	xxlor	32+25, 22, 22		# v25 = vs22 (mask for step 3)
354*62d9e475SDanny Tsen	  vpermxor 15, 15, 0, 25
355*62d9e475SDanny Tsen	  vpermxor 12, 12, 1, 25
356*62d9e475SDanny Tsen	  vpermxor 13, 13, 2, 25
357*62d9e475SDanny Tsen	  vpermxor 14, 14, 3, 25
358*62d9e475SDanny Tsen	  vpermxor 31, 31, 16, 25
359*62d9e475SDanny Tsen	  vpermxor 28, 28, 17, 25
360*62d9e475SDanny Tsen	  vpermxor 29, 29, 18, 25
361*62d9e475SDanny Tsen	  vpermxor 30, 30, 19, 25
362*62d9e475SDanny Tsen	xxlor	32+25, 0, 0		# v25 state word back from vs0
363*62d9e475SDanny Tsen
364*62d9e475SDanny Tsen	vadduwm 10, 10, 15
365*62d9e475SDanny Tsen	vadduwm 11, 11, 12
366*62d9e475SDanny Tsen	vadduwm 8, 8, 13
367*62d9e475SDanny Tsen	vadduwm 9, 9, 14
368*62d9e475SDanny Tsen	  vadduwm 26, 26, 31
369*62d9e475SDanny Tsen	  vadduwm 27, 27, 28
370*62d9e475SDanny Tsen	  vadduwm 24, 24, 29
371*62d9e475SDanny Tsen	  vadduwm 25, 25, 30
372*62d9e475SDanny Tsen
373*62d9e475SDanny Tsen	xxlor	0, 32+28, 32+28		# park v28 in vs0
374*62d9e475SDanny Tsen	xxlor	32+28, 23, 23		# v28 = vs23 (rotate count for step 4)
375*62d9e475SDanny Tsen	vxor 5, 5, 10
376*62d9e475SDanny Tsen	vxor 6, 6, 11
377*62d9e475SDanny Tsen	vxor 7, 7, 8
378*62d9e475SDanny Tsen	vxor 4, 4, 9
379*62d9e475SDanny Tsen	  vxor 21, 21, 26
380*62d9e475SDanny Tsen	  vxor 22, 22, 27
381*62d9e475SDanny Tsen	  vxor 23, 23, 24
382*62d9e475SDanny Tsen	  vxor 20, 20, 25
383*62d9e475SDanny Tsen	vrlw 5, 5, 28
384*62d9e475SDanny Tsen	vrlw 6, 6, 28
385*62d9e475SDanny Tsen	vrlw 7, 7, 28
386*62d9e475SDanny Tsen	vrlw 4, 4, 28
387*62d9e475SDanny Tsen	  vrlw 21, 21, 28
388*62d9e475SDanny Tsen	  vrlw 22, 22, 28
389*62d9e475SDanny Tsen	  vrlw 23, 23, 28
390*62d9e475SDanny Tsen	  vrlw 20, 20, 28
391*62d9e475SDanny Tsen	xxlor	32+28, 0, 0		# v28 state word back from vs0
392*62d9e475SDanny Tsen.endm
393*62d9e475SDanny Tsen
# QT_loop_4x: one double round (a column round followed by a diagonal
# round, see the two QR lines below) on four blocks at once. v0-v15 hold
# the working state, one state word per register with one block per lane.
# The rotate constants are expected in
#   v20  vpermxor mask, first step of a quarter round  (xor + rotate)
#   v21  vrlw count,    second step
#   v22  vpermxor mask, third step
#   v23  vrlw count,    fourth step
# and are not modified. The steps correspond to the four numbered
# quarter-round lines in the file header.
394*62d9e475SDanny Tsen.macro QT_loop_4x
395*62d9e475SDanny Tsen	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
396*62d9e475SDanny Tsen	vadduwm 0, 0, 4
397*62d9e475SDanny Tsen	vadduwm 1, 1, 5
398*62d9e475SDanny Tsen	vadduwm 2, 2, 6
399*62d9e475SDanny Tsen	vadduwm 3, 3, 7
400*62d9e475SDanny Tsen	  vpermxor 12, 12, 0, 20
401*62d9e475SDanny Tsen	  vpermxor 13, 13, 1, 20
402*62d9e475SDanny Tsen	  vpermxor 14, 14, 2, 20
403*62d9e475SDanny Tsen	  vpermxor 15, 15, 3, 20
404*62d9e475SDanny Tsen	vadduwm 8, 8, 12
405*62d9e475SDanny Tsen	vadduwm 9, 9, 13
406*62d9e475SDanny Tsen	vadduwm 10, 10, 14
407*62d9e475SDanny Tsen	vadduwm 11, 11, 15
408*62d9e475SDanny Tsen	vxor 4, 4, 8
409*62d9e475SDanny Tsen	vxor 5, 5, 9
410*62d9e475SDanny Tsen	vxor 6, 6, 10
411*62d9e475SDanny Tsen	vxor 7, 7, 11
412*62d9e475SDanny Tsen	vrlw 4, 4, 21
413*62d9e475SDanny Tsen	vrlw 5, 5, 21
414*62d9e475SDanny Tsen	vrlw 6, 6, 21
415*62d9e475SDanny Tsen	vrlw 7, 7, 21
416*62d9e475SDanny Tsen	vadduwm 0, 0, 4
417*62d9e475SDanny Tsen	vadduwm 1, 1, 5
418*62d9e475SDanny Tsen	vadduwm 2, 2, 6
419*62d9e475SDanny Tsen	vadduwm 3, 3, 7
420*62d9e475SDanny Tsen	  vpermxor 12, 12, 0, 22
421*62d9e475SDanny Tsen	  vpermxor 13, 13, 1, 22
422*62d9e475SDanny Tsen	  vpermxor 14, 14, 2, 22
423*62d9e475SDanny Tsen	  vpermxor 15, 15, 3, 22
424*62d9e475SDanny Tsen	vadduwm 8, 8, 12
425*62d9e475SDanny Tsen	vadduwm 9, 9, 13
426*62d9e475SDanny Tsen	vadduwm 10, 10, 14
427*62d9e475SDanny Tsen	vadduwm 11, 11, 15
428*62d9e475SDanny Tsen	vxor 4, 4, 8
429*62d9e475SDanny Tsen	vxor 5, 5, 9
430*62d9e475SDanny Tsen	vxor 6, 6, 10
431*62d9e475SDanny Tsen	vxor 7, 7, 11
432*62d9e475SDanny Tsen	vrlw 4, 4, 23
433*62d9e475SDanny Tsen	vrlw 5, 5, 23
434*62d9e475SDanny Tsen	vrlw 6, 6, 23
435*62d9e475SDanny Tsen	vrlw 7, 7, 23
436*62d9e475SDanny Tsen
437*62d9e475SDanny Tsen	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
438*62d9e475SDanny Tsen	vadduwm 0, 0, 5
439*62d9e475SDanny Tsen	vadduwm 1, 1, 6
440*62d9e475SDanny Tsen	vadduwm 2, 2, 7
441*62d9e475SDanny Tsen	vadduwm 3, 3, 4
442*62d9e475SDanny Tsen	  vpermxor 15, 15, 0, 20
443*62d9e475SDanny Tsen	  vpermxor 12, 12, 1, 20
444*62d9e475SDanny Tsen	  vpermxor 13, 13, 2, 20
445*62d9e475SDanny Tsen	  vpermxor 14, 14, 3, 20
446*62d9e475SDanny Tsen	vadduwm 10, 10, 15
447*62d9e475SDanny Tsen	vadduwm 11, 11, 12
448*62d9e475SDanny Tsen	vadduwm 8, 8, 13
449*62d9e475SDanny Tsen	vadduwm 9, 9, 14
450*62d9e475SDanny Tsen	vxor 5, 5, 10
451*62d9e475SDanny Tsen	vxor 6, 6, 11
452*62d9e475SDanny Tsen	vxor 7, 7, 8
453*62d9e475SDanny Tsen	vxor 4, 4, 9
454*62d9e475SDanny Tsen	vrlw 5, 5, 21
455*62d9e475SDanny Tsen	vrlw 6, 6, 21
456*62d9e475SDanny Tsen	vrlw 7, 7, 21
457*62d9e475SDanny Tsen	vrlw 4, 4, 21
458*62d9e475SDanny Tsen	vadduwm 0, 0, 5
459*62d9e475SDanny Tsen	vadduwm 1, 1, 6
460*62d9e475SDanny Tsen	vadduwm 2, 2, 7
461*62d9e475SDanny Tsen	vadduwm 3, 3, 4
462*62d9e475SDanny Tsen	  vpermxor 15, 15, 0, 22
463*62d9e475SDanny Tsen	  vpermxor 12, 12, 1, 22
464*62d9e475SDanny Tsen	  vpermxor 13, 13, 2, 22
465*62d9e475SDanny Tsen	  vpermxor 14, 14, 3, 22
466*62d9e475SDanny Tsen	vadduwm 10, 10, 15
467*62d9e475SDanny Tsen	vadduwm 11, 11, 12
468*62d9e475SDanny Tsen	vadduwm 8, 8, 13
469*62d9e475SDanny Tsen	vadduwm 9, 9, 14
470*62d9e475SDanny Tsen	vxor 5, 5, 10
471*62d9e475SDanny Tsen	vxor 6, 6, 11
472*62d9e475SDanny Tsen	vxor 7, 7, 8
473*62d9e475SDanny Tsen	vxor 4, 4, 9
474*62d9e475SDanny Tsen	vrlw 5, 5, 23
475*62d9e475SDanny Tsen	vrlw 6, 6, 23
476*62d9e475SDanny Tsen	vrlw 7, 7, 23
477*62d9e475SDanny Tsen	vrlw 4, 4, 23
478*62d9e475SDanny Tsen.endm
479*62d9e475SDanny Tsen
480*62d9e475SDanny Tsen# Transpose
#
# TP_4x a0 a1 a2 a3: transpose, in place, the 4x4 matrix of 32-bit words
# held in the four vector registers a0..a3 (given as VR numbers; the macro
# adds 32 to address them as VSRs). On entry each register holds one state
# word for four blocks, on exit each register holds four consecutive
# state words of one block. vs10-vs13 are used as scratch and are
# overwritten.
481*62d9e475SDanny Tsen.macro TP_4x a0 a1 a2 a3
482*62d9e475SDanny Tsen	xxmrghw  10, 32+\a0, 32+\a1	# a0, a1, b0, b1
483*62d9e475SDanny Tsen	xxmrghw  11, 32+\a2, 32+\a3	# a2, a3, b2, b3
484*62d9e475SDanny Tsen	xxmrglw  12, 32+\a0, 32+\a1	# c0, c1, d0, d1
485*62d9e475SDanny Tsen	xxmrglw  13, 32+\a2, 32+\a3	# c2, c3, d2, d3
486*62d9e475SDanny Tsen	xxpermdi	32+\a0, 10, 11, 0	# a0, a1, a2, a3
487*62d9e475SDanny Tsen	xxpermdi	32+\a1, 10, 11, 3	# b0, b1, b2, b3
488*62d9e475SDanny Tsen	xxpermdi	32+\a2, 12, 13, 0	# c0, c1, c2, c3
489*62d9e475SDanny Tsen	xxpermdi	32+\a3, 12, 13, 3	# d0, d1, d2, d3
490*62d9e475SDanny Tsen.endm
491*62d9e475SDanny Tsen
492*62d9e475SDanny Tsen# key stream = working state + state
#
# Add_state S: add the four input-state rows, word by word, to the sixteen
# transposed working vectors v(S+0)..v(S+15). The rows are read from
# v(16-S)..v(19-S): with S = 0 they must be in v16-v19, with S = 16 in
# v0-v3 (the only two values used in this file). Row 0 is added to vectors
# S+0..S+3, row 1 to S+4..S+7, row 2 to S+8..S+11 and row 3 to
# S+12..S+15.
493*62d9e475SDanny Tsen.macro Add_state S
494*62d9e475SDanny Tsen	vadduwm \S+0, \S+0, 16-\S
495*62d9e475SDanny Tsen	vadduwm \S+4, \S+4, 17-\S
496*62d9e475SDanny Tsen	vadduwm \S+8, \S+8, 18-\S
497*62d9e475SDanny Tsen	vadduwm \S+12, \S+12, 19-\S
498*62d9e475SDanny Tsen
499*62d9e475SDanny Tsen	vadduwm \S+1, \S+1, 16-\S
500*62d9e475SDanny Tsen	vadduwm \S+5, \S+5, 17-\S
501*62d9e475SDanny Tsen	vadduwm \S+9, \S+9, 18-\S
502*62d9e475SDanny Tsen	vadduwm \S+13, \S+13, 19-\S
503*62d9e475SDanny Tsen
504*62d9e475SDanny Tsen	vadduwm \S+2, \S+2, 16-\S
505*62d9e475SDanny Tsen	vadduwm \S+6, \S+6, 17-\S
506*62d9e475SDanny Tsen	vadduwm \S+10, \S+10, 18-\S
507*62d9e475SDanny Tsen	vadduwm \S+14, \S+14, 19-\S
508*62d9e475SDanny Tsen
509*62d9e475SDanny Tsen	vadduwm	\S+3, \S+3, 16-\S
510*62d9e475SDanny Tsen	vadduwm	\S+7, \S+7, 17-\S
511*62d9e475SDanny Tsen	vadduwm	\S+11, \S+11, 18-\S
512*62d9e475SDanny Tsen	vadduwm	\S+15, \S+15, 19-\S
513*62d9e475SDanny Tsen.endm
514*62d9e475SDanny Tsen
515*62d9e475SDanny Tsen#
516*62d9e475SDanny Tsen# write 256 bytes
517*62d9e475SDanny Tsen#
# Write_256 S: xor 256 bytes of input with the key stream held in
# v(S+0)..v(S+15) and store the result.
#   input  address = r5 + r14 (kept in r9)
#   output address = r4 + r14 (kept in r16)
# r17-r31 must hold the index values 16, 32, ..., 240 (set up by the
# caller of this macro). The key stream registers are consumed in the
# order S+0, S+4, S+8, S+12, S+1, S+5, ... so that each group of four
# 16-byte stores makes up 64 consecutive output bytes.
# Overwrites r9, r16, vs0-vs15 and the sixteen key stream registers.
518*62d9e475SDanny Tsen.macro Write_256 S
519*62d9e475SDanny Tsen	add 9, 14, 5
520*62d9e475SDanny Tsen	add 16, 14, 4
	# load 16 x 16 input bytes into vs0-vs15
521*62d9e475SDanny Tsen	lxvw4x 0, 0, 9
522*62d9e475SDanny Tsen	lxvw4x 1, 17, 9
523*62d9e475SDanny Tsen	lxvw4x 2, 18, 9
524*62d9e475SDanny Tsen	lxvw4x 3, 19, 9
525*62d9e475SDanny Tsen	lxvw4x 4, 20, 9
526*62d9e475SDanny Tsen	lxvw4x 5, 21, 9
527*62d9e475SDanny Tsen	lxvw4x 6, 22, 9
528*62d9e475SDanny Tsen	lxvw4x 7, 23, 9
529*62d9e475SDanny Tsen	lxvw4x 8, 24, 9
530*62d9e475SDanny Tsen	lxvw4x 9, 25, 9
531*62d9e475SDanny Tsen	lxvw4x 10, 26, 9
532*62d9e475SDanny Tsen	lxvw4x 11, 27, 9
533*62d9e475SDanny Tsen	lxvw4x 12, 28, 9
534*62d9e475SDanny Tsen	lxvw4x 13, 29, 9
535*62d9e475SDanny Tsen	lxvw4x 14, 30, 9
536*62d9e475SDanny Tsen	lxvw4x 15, 31, 9
537*62d9e475SDanny Tsen
	# key stream ^= input (S+32 addresses vector register S as a VSR)
538*62d9e475SDanny Tsen	xxlxor \S+32, \S+32, 0
539*62d9e475SDanny Tsen	xxlxor \S+36, \S+36, 1
540*62d9e475SDanny Tsen	xxlxor \S+40, \S+40, 2
541*62d9e475SDanny Tsen	xxlxor \S+44, \S+44, 3
542*62d9e475SDanny Tsen	xxlxor \S+33, \S+33, 4
543*62d9e475SDanny Tsen	xxlxor \S+37, \S+37, 5
544*62d9e475SDanny Tsen	xxlxor \S+41, \S+41, 6
545*62d9e475SDanny Tsen	xxlxor \S+45, \S+45, 7
546*62d9e475SDanny Tsen	xxlxor \S+34, \S+34, 8
547*62d9e475SDanny Tsen	xxlxor \S+38, \S+38, 9
548*62d9e475SDanny Tsen	xxlxor \S+42, \S+42, 10
549*62d9e475SDanny Tsen	xxlxor \S+46, \S+46, 11
550*62d9e475SDanny Tsen	xxlxor \S+35, \S+35, 12
551*62d9e475SDanny Tsen	xxlxor \S+39, \S+39, 13
552*62d9e475SDanny Tsen	xxlxor \S+43, \S+43, 14
553*62d9e475SDanny Tsen	xxlxor \S+47, \S+47, 15
554*62d9e475SDanny Tsen
	# store the 256 result bytes
555*62d9e475SDanny Tsen	stxvw4x \S+32, 0, 16
556*62d9e475SDanny Tsen	stxvw4x \S+36, 17, 16
557*62d9e475SDanny Tsen	stxvw4x \S+40, 18, 16
558*62d9e475SDanny Tsen	stxvw4x \S+44, 19, 16
559*62d9e475SDanny Tsen
560*62d9e475SDanny Tsen	stxvw4x \S+33, 20, 16
561*62d9e475SDanny Tsen	stxvw4x \S+37, 21, 16
562*62d9e475SDanny Tsen	stxvw4x \S+41, 22, 16
563*62d9e475SDanny Tsen	stxvw4x \S+45, 23, 16
564*62d9e475SDanny Tsen
565*62d9e475SDanny Tsen	stxvw4x \S+34, 24, 16
566*62d9e475SDanny Tsen	stxvw4x \S+38, 25, 16
567*62d9e475SDanny Tsen	stxvw4x \S+42, 26, 16
568*62d9e475SDanny Tsen	stxvw4x \S+46, 27, 16
569*62d9e475SDanny Tsen
570*62d9e475SDanny Tsen	stxvw4x \S+35, 28, 16
571*62d9e475SDanny Tsen	stxvw4x \S+39, 29, 16
572*62d9e475SDanny Tsen	stxvw4x \S+43, 30, 16
573*62d9e475SDanny Tsen	stxvw4x \S+47, 31, 16
574*62d9e475SDanny Tsen
575*62d9e475SDanny Tsen.endm
576*62d9e475SDanny Tsen
577*62d9e475SDanny Tsen#
578*62d9e475SDanny Tsen# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
579*62d9e475SDanny Tsen#
# Register use as visible in this routine:
#   r3   state    16 words; only read here (lxvw4x), never stored back
#   r4   dst      output, written 256 bytes at a time by Write_256
#   r5   src      input, read 256 bytes at a time by Write_256
#   r6   len      byte count; copied to r15, which counts down by 256
#   r7   nrounds  CTR is loaded with nrounds >> 1, one double round each
#   r14  running byte offset into src and dst
#   r17-r31  the constants 16, 32, ..., 240 used as index registers
#
# While at least 512 bytes remain, Loop_8x produces eight blocks per pass;
# whatever is left is handled four blocks (256 bytes) per pass by Loop_4x.
# Only whole 256-byte chunks are processed: a remainder of fewer than 256
# bytes is left untouched, so neither buffer is accessed beyond len.
# Returns immediately (with r3 = 0) when len <= 0.
#
# The block counter is state word 12. Per-lane offsets are kept in v30
# (and v31 = v30 + 4 for the second group of the 8x path) and added to the
# splatted counter word; the counter in memory is not advanced here.
#
580*62d9e475SDanny TsenSYM_FUNC_START(chacha_p10le_8x)
581*62d9e475SDanny Tsen.align 5
582*62d9e475SDanny Tsen	cmpdi	6, 0
583*62d9e475SDanny Tsen	ble	Out_no_chacha
584*62d9e475SDanny Tsen
585*62d9e475SDanny Tsen	SAVE_REGS
586*62d9e475SDanny Tsen
587*62d9e475SDanny Tsen	# r17 - r31 mainly for Write_256 macro.
588*62d9e475SDanny Tsen	li	17, 16
589*62d9e475SDanny Tsen	li	18, 32
590*62d9e475SDanny Tsen	li	19, 48
591*62d9e475SDanny Tsen	li	20, 64
592*62d9e475SDanny Tsen	li	21, 80
593*62d9e475SDanny Tsen	li	22, 96
594*62d9e475SDanny Tsen	li	23, 112
595*62d9e475SDanny Tsen	li	24, 128
596*62d9e475SDanny Tsen	li	25, 144
597*62d9e475SDanny Tsen	li	26, 160
598*62d9e475SDanny Tsen	li	27, 176
599*62d9e475SDanny Tsen	li	28, 192
600*62d9e475SDanny Tsen	li	29, 208
601*62d9e475SDanny Tsen	li	30, 224
602*62d9e475SDanny Tsen	li	31, 240
603*62d9e475SDanny Tsen
604*62d9e475SDanny Tsen	mr 15, 6			# len
605*62d9e475SDanny Tsen	li 14, 0			# offset to inp and outp
606*62d9e475SDanny Tsen
607*62d9e475SDanny Tsen        lxvw4x	48, 0, 3		#  vr16, constants
608*62d9e475SDanny Tsen	lxvw4x	49, 17, 3		#  vr17, key 1
609*62d9e475SDanny Tsen	lxvw4x	50, 18, 3		#  vr18, key 2
610*62d9e475SDanny Tsen	lxvw4x	51, 19, 3		#  vr19, counter, nonce
611*62d9e475SDanny Tsen
612*62d9e475SDanny Tsen	# create (0, 1, 2, 3) counters
613*62d9e475SDanny Tsen	vspltisw 0, 0
614*62d9e475SDanny Tsen	vspltisw 1, 1
615*62d9e475SDanny Tsen	vspltisw 2, 2
616*62d9e475SDanny Tsen	vspltisw 3, 3
617*62d9e475SDanny Tsen	vmrghw	4, 0, 1
618*62d9e475SDanny Tsen	vmrglw	5, 2, 3
619*62d9e475SDanny Tsen	vsldoi	30, 4, 5, 8		# vr30 counter, 4 (0, 1, 2, 3)
620*62d9e475SDanny Tsen
621*62d9e475SDanny Tsen	vspltisw 21, 12			# vr21 = 12 in every word (vrlw count)
622*62d9e475SDanny Tsen	vspltisw 23, 7			# vr23 = 7 in every word (vrlw count)
623*62d9e475SDanny Tsen
624*62d9e475SDanny Tsen	addis	11, 2, permx@toc@ha	# r11 = address of permx, TOC relative
625*62d9e475SDanny Tsen	addi	11, 11, permx@toc@l
626*62d9e475SDanny Tsen	lxvw4x	32+20, 0, 11		# vr20 = first permx row
627*62d9e475SDanny Tsen	lxvw4x	32+22, 17, 11		# vr22 = second permx row
628*62d9e475SDanny Tsen
629*62d9e475SDanny Tsen	sradi	8, 7, 1			# r8 = nrounds >> 1 (double rounds)
630*62d9e475SDanny Tsen
631*62d9e475SDanny Tsen	mtctr 8
632*62d9e475SDanny Tsen
633*62d9e475SDanny Tsen	# keep the four input state rows in vs16-vs19
634*62d9e475SDanny Tsen	xxlor	16, 48, 48
635*62d9e475SDanny Tsen	xxlor	17, 49, 49
636*62d9e475SDanny Tsen	xxlor	18, 50, 50
637*62d9e475SDanny Tsen	xxlor	19, 51, 51
638*62d9e475SDanny Tsen
639*62d9e475SDanny Tsen	vspltisw 25, 4
640*62d9e475SDanny Tsen	vspltisw 26, 8
641*62d9e475SDanny Tsen
642*62d9e475SDanny Tsen	xxlor	25, 32+26, 32+26	# vs25 = (8, 8, 8, 8)
643*62d9e475SDanny Tsen	xxlor	24, 32+25, 32+25	# vs24 = (4, 4, 4, 4)
644*62d9e475SDanny Tsen
645*62d9e475SDanny Tsen	vadduwm	31, 30, 25		# counter = (0, 1, 2, 3) + (4, 4, 4, 4)
646*62d9e475SDanny Tsen	xxlor	30, 32+30, 32+30	# vs30 = offsets for blocks 0-3
647*62d9e475SDanny Tsen	xxlor	31, 32+31, 32+31	# vs31 = offsets for blocks 4-7
648*62d9e475SDanny Tsen
	# rotate constants for QT_loop_8x, which reads them from vs20-vs23
649*62d9e475SDanny Tsen	xxlor	20, 32+20, 32+20
650*62d9e475SDanny Tsen	xxlor	21, 32+21, 32+21
651*62d9e475SDanny Tsen	xxlor	22, 32+22, 32+22
652*62d9e475SDanny Tsen	xxlor	23, 32+23, 32+23
653*62d9e475SDanny Tsen
654*62d9e475SDanny Tsen	cmpdi	6, 512
655*62d9e475SDanny Tsen	blt	Loop_last
656*62d9e475SDanny Tsen
657*62d9e475SDanny TsenLoop_8x:
	# Splat every state word across the four lanes: vr0-vr15 become the
	# working state of blocks 0-3, vr16-vr31 that of blocks 4-7.
658*62d9e475SDanny Tsen	xxspltw  32+0, 16, 0
659*62d9e475SDanny Tsen	xxspltw  32+1, 16, 1
660*62d9e475SDanny Tsen	xxspltw  32+2, 16, 2
661*62d9e475SDanny Tsen	xxspltw  32+3, 16, 3
662*62d9e475SDanny Tsen
663*62d9e475SDanny Tsen	xxspltw  32+4, 17, 0
664*62d9e475SDanny Tsen	xxspltw  32+5, 17, 1
665*62d9e475SDanny Tsen	xxspltw  32+6, 17, 2
666*62d9e475SDanny Tsen	xxspltw  32+7, 17, 3
667*62d9e475SDanny Tsen	xxspltw  32+8, 18, 0
668*62d9e475SDanny Tsen	xxspltw  32+9, 18, 1
669*62d9e475SDanny Tsen	xxspltw  32+10, 18, 2
670*62d9e475SDanny Tsen	xxspltw  32+11, 18, 3
671*62d9e475SDanny Tsen	xxspltw  32+12, 19, 0
672*62d9e475SDanny Tsen	xxspltw  32+13, 19, 1
673*62d9e475SDanny Tsen	xxspltw  32+14, 19, 2
674*62d9e475SDanny Tsen	xxspltw  32+15, 19, 3
675*62d9e475SDanny Tsen	vadduwm	12, 12, 30	# increase counter
676*62d9e475SDanny Tsen
677*62d9e475SDanny Tsen	xxspltw  32+16, 16, 0
678*62d9e475SDanny Tsen	xxspltw  32+17, 16, 1
679*62d9e475SDanny Tsen	xxspltw  32+18, 16, 2
680*62d9e475SDanny Tsen	xxspltw  32+19, 16, 3
681*62d9e475SDanny Tsen
682*62d9e475SDanny Tsen	xxspltw  32+20, 17, 0
683*62d9e475SDanny Tsen	xxspltw  32+21, 17, 1
684*62d9e475SDanny Tsen	xxspltw  32+22, 17, 2
685*62d9e475SDanny Tsen	xxspltw  32+23, 17, 3
686*62d9e475SDanny Tsen	xxspltw  32+24, 18, 0
687*62d9e475SDanny Tsen	xxspltw  32+25, 18, 1
688*62d9e475SDanny Tsen	xxspltw  32+26, 18, 2
689*62d9e475SDanny Tsen	xxspltw  32+27, 18, 3
690*62d9e475SDanny Tsen	xxspltw  32+28, 19, 0
691*62d9e475SDanny Tsen	xxspltw  32+29, 19, 1
692*62d9e475SDanny Tsen	vadduwm	28, 28, 31	# increase counter
	# vr30 and vr31 are overwritten with state from here on; the counter
	# offsets survive in vs30 and vs31.
693*62d9e475SDanny Tsen	xxspltw  32+30, 19, 2
694*62d9e475SDanny Tsen	xxspltw  32+31, 19, 3
695*62d9e475SDanny Tsen
696*62d9e475SDanny Tsen.align 5
697*62d9e475SDanny Tsenquarter_loop_8x:
698*62d9e475SDanny Tsen	QT_loop_8x
699*62d9e475SDanny Tsen
700*62d9e475SDanny Tsen	bdnz	quarter_loop_8x
701*62d9e475SDanny Tsen
	# Blocks 0-3: add the counter offsets (vs30) to word 12 again, since
	# Add_state only adds the base state, then transpose to block order.
702*62d9e475SDanny Tsen	xxlor	0, 32+30, 32+30
703*62d9e475SDanny Tsen	xxlor	32+30, 30, 30
704*62d9e475SDanny Tsen	vadduwm	12, 12, 30
705*62d9e475SDanny Tsen	xxlor	32+30, 0, 0
706*62d9e475SDanny Tsen	TP_4x 0, 1, 2, 3
707*62d9e475SDanny Tsen	TP_4x 4, 5, 6, 7
708*62d9e475SDanny Tsen	TP_4x 8, 9, 10, 11
709*62d9e475SDanny Tsen	TP_4x 12, 13, 14, 15
710*62d9e475SDanny Tsen
	# Park vr16-vr19 (working state of blocks 4-7) in vs0-vs3, load the
	# input state rows into vr16-vr19 for Add_state 0, then swap back.
711*62d9e475SDanny Tsen	xxlor	0, 48, 48
712*62d9e475SDanny Tsen	xxlor	1, 49, 49
713*62d9e475SDanny Tsen	xxlor	2, 50, 50
714*62d9e475SDanny Tsen	xxlor	3, 51, 51
715*62d9e475SDanny Tsen	xxlor	48, 16, 16
716*62d9e475SDanny Tsen	xxlor	49, 17, 17
717*62d9e475SDanny Tsen	xxlor	50, 18, 18
718*62d9e475SDanny Tsen	xxlor	51, 19, 19
719*62d9e475SDanny Tsen	Add_state 0
720*62d9e475SDanny Tsen	xxlor	48, 0, 0
721*62d9e475SDanny Tsen	xxlor	49, 1, 1
722*62d9e475SDanny Tsen	xxlor	50, 2, 2
723*62d9e475SDanny Tsen	xxlor	51, 3, 3
724*62d9e475SDanny Tsen	Write_256 0
725*62d9e475SDanny Tsen	addi	14, 14, 256	# offset +=256
726*62d9e475SDanny Tsen	addi	15, 15, -256	# len -=256
727*62d9e475SDanny Tsen
	# Blocks 4-7: same sequence with the offsets from vs31; vs5 holds the
	# displaced vr31 meanwhile.
728*62d9e475SDanny Tsen	xxlor	5, 32+31, 32+31
729*62d9e475SDanny Tsen	xxlor	32+31, 31, 31
730*62d9e475SDanny Tsen	vadduwm	28, 28, 31
731*62d9e475SDanny Tsen	xxlor	32+31, 5, 5
732*62d9e475SDanny Tsen	TP_4x 16+0, 16+1, 16+2, 16+3
733*62d9e475SDanny Tsen	TP_4x 16+4, 16+5, 16+6, 16+7
734*62d9e475SDanny Tsen	TP_4x 16+8, 16+9, 16+10, 16+11
735*62d9e475SDanny Tsen	TP_4x 16+12, 16+13, 16+14, 16+15
736*62d9e475SDanny Tsen
	# vr0-vr3 are free now: load the input state rows for Add_state 16
737*62d9e475SDanny Tsen	xxlor	32, 16, 16
738*62d9e475SDanny Tsen	xxlor	33, 17, 17
739*62d9e475SDanny Tsen	xxlor	34, 18, 18
740*62d9e475SDanny Tsen	xxlor	35, 19, 19
741*62d9e475SDanny Tsen	Add_state 16
742*62d9e475SDanny Tsen	Write_256 16
743*62d9e475SDanny Tsen	addi	14, 14, 256	# offset +=256
744*62d9e475SDanny Tsen	addi	15, 15, -256	# len -=256
745*62d9e475SDanny Tsen
	# Advance the counter offsets by 8 blocks: vr30 = vs30 + 8,
	# vr31 = vr30 + 4, and store both back to vs30 and vs31.
746*62d9e475SDanny Tsen	xxlor	32+24, 24, 24
747*62d9e475SDanny Tsen	xxlor	32+25, 25, 25
748*62d9e475SDanny Tsen	xxlor	32+30, 30, 30
749*62d9e475SDanny Tsen	vadduwm	30, 30, 25
750*62d9e475SDanny Tsen	vadduwm	31, 30, 24
751*62d9e475SDanny Tsen	xxlor	30, 32+30, 32+30
752*62d9e475SDanny Tsen	xxlor	31, 32+31, 32+31
753*62d9e475SDanny Tsen
754*62d9e475SDanny Tsen	cmpdi	15, 0
755*62d9e475SDanny Tsen	beq	Out_loop
756*62d9e475SDanny Tsen
757*62d9e475SDanny Tsen	cmpdi	15, 512
758*62d9e475SDanny Tsen	blt	Loop_last
759*62d9e475SDanny Tsen
760*62d9e475SDanny Tsen	mtctr 8
761*62d9e475SDanny Tsen	b Loop_8x
762*62d9e475SDanny Tsen
763*62d9e475SDanny TsenLoop_last:
	# Fewer than 256 bytes left (r15): no whole chunk remains, so stop
	# here rather than let Write_256 run past the end of src and dst.
	cmpdi	15, 256
	blt	Out_loop

	# The 8x path overwrote vr16-vr23, so reload the state rows and the
	# rotate constants. vr30 still holds the current counter offsets.
764*62d9e475SDanny Tsen        lxvw4x	48, 0, 3		#  vr16, constants
765*62d9e475SDanny Tsen	lxvw4x	49, 17, 3		#  vr17, key 1
766*62d9e475SDanny Tsen	lxvw4x	50, 18, 3		#  vr18, key 2
767*62d9e475SDanny Tsen	lxvw4x	51, 19, 3		#  vr19, counter, nonce
768*62d9e475SDanny Tsen
769*62d9e475SDanny Tsen	vspltisw 21, 12
770*62d9e475SDanny Tsen	vspltisw 23, 7
771*62d9e475SDanny Tsen	addis	11, 2, permx@toc@ha
772*62d9e475SDanny Tsen	addi	11, 11, permx@toc@l
773*62d9e475SDanny Tsen	lxvw4x	32+20, 0, 11
774*62d9e475SDanny Tsen	lxvw4x	32+22, 17, 11
775*62d9e475SDanny Tsen
776*62d9e475SDanny Tsen	sradi	8, 7, 1
777*62d9e475SDanny Tsen	mtctr 8
778*62d9e475SDanny Tsen
779*62d9e475SDanny TsenLoop_4x:
	# Splat every state word across the four lanes of vr0-vr15
780*62d9e475SDanny Tsen	vspltw  0, 16, 0
781*62d9e475SDanny Tsen	vspltw  1, 16, 1
782*62d9e475SDanny Tsen	vspltw  2, 16, 2
783*62d9e475SDanny Tsen	vspltw  3, 16, 3
784*62d9e475SDanny Tsen
785*62d9e475SDanny Tsen	vspltw  4, 17, 0
786*62d9e475SDanny Tsen	vspltw  5, 17, 1
787*62d9e475SDanny Tsen	vspltw  6, 17, 2
788*62d9e475SDanny Tsen	vspltw  7, 17, 3
789*62d9e475SDanny Tsen	vspltw  8, 18, 0
790*62d9e475SDanny Tsen	vspltw  9, 18, 1
791*62d9e475SDanny Tsen	vspltw  10, 18, 2
792*62d9e475SDanny Tsen	vspltw  11, 18, 3
793*62d9e475SDanny Tsen	vspltw  12, 19, 0
794*62d9e475SDanny Tsen	vadduwm	12, 12, 30	# increase counter
795*62d9e475SDanny Tsen	vspltw  13, 19, 1
796*62d9e475SDanny Tsen	vspltw  14, 19, 2
797*62d9e475SDanny Tsen	vspltw  15, 19, 3
798*62d9e475SDanny Tsen
799*62d9e475SDanny Tsen.align 5
800*62d9e475SDanny Tsenquarter_loop:
801*62d9e475SDanny Tsen	QT_loop_4x
802*62d9e475SDanny Tsen
803*62d9e475SDanny Tsen	bdnz	quarter_loop
804*62d9e475SDanny Tsen
	# Add the counter offsets to word 12 again (Add_state only adds the
	# base state), then transpose to block order.
805*62d9e475SDanny Tsen	vadduwm	12, 12, 30
806*62d9e475SDanny Tsen	TP_4x 0, 1, 2, 3
807*62d9e475SDanny Tsen	TP_4x 4, 5, 6, 7
808*62d9e475SDanny Tsen	TP_4x 8, 9, 10, 11
809*62d9e475SDanny Tsen	TP_4x 12, 13, 14, 15
810*62d9e475SDanny Tsen
811*62d9e475SDanny Tsen	Add_state 0
812*62d9e475SDanny Tsen	Write_256 0
813*62d9e475SDanny Tsen	addi	14, 14, 256	# offset += 256
814*62d9e475SDanny Tsen	addi	15, 15, -256	# len -= 256
815*62d9e475SDanny Tsen
816*62d9e475SDanny Tsen	# Update state counter
817*62d9e475SDanny Tsen	vspltisw 25, 4
818*62d9e475SDanny Tsen	vadduwm	30, 30, 25
819*62d9e475SDanny Tsen
820*62d9e475SDanny Tsen	cmpdi	15, 0
821*62d9e475SDanny Tsen	beq	Out_loop
822*62d9e475SDanny Tsen	cmpdi	15, 256
823*62d9e475SDanny Tsen	blt	Out_loop
824*62d9e475SDanny Tsen
825*62d9e475SDanny Tsen	mtctr 8
826*62d9e475SDanny Tsen	b Loop_4x
827*62d9e475SDanny Tsen
828*62d9e475SDanny TsenOut_loop:
829*62d9e475SDanny Tsen	RESTORE_REGS
830*62d9e475SDanny Tsen	blr
831*62d9e475SDanny Tsen
832*62d9e475SDanny TsenOut_no_chacha:
833*62d9e475SDanny Tsen	li	3, 0
834*62d9e475SDanny Tsen	blr
835*62d9e475SDanny TsenSYM_FUNC_END(chacha_p10le_8x)
836*62d9e475SDanny Tsen
# Two 16-byte control vectors for vpermxor. The code addresses them
# through the lowercase label permx: the row at offset 0 is loaded into
# vr20 and the row at offset 16 into vr22. They drive the first and the
# third step of each quarter round, which the file header lists as the
# rotate by 16 and the rotate by 8.
# NOTE(review): the .align directive comes after SYM_DATA_START_LOCAL, so
# the PERMX symbol may sit before the alignment padding while permx sits
# after it - confirm this is intended.
837*62d9e475SDanny TsenSYM_DATA_START_LOCAL(PERMX)
838*62d9e475SDanny Tsen.align 5
839*62d9e475SDanny Tsenpermx:
840*62d9e475SDanny Tsen.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
841*62d9e475SDanny Tsen.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
842*62d9e475SDanny TsenSYM_DATA_END(PERMX)
843