/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Original implementation written by Andy Polyakov, @dot-asm.
 * This is an adaptation of the original code for kernel use.
 *
 * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/nospec-insn.h>
#include <asm/vx-insn.h>

#define SP	%r15
#define FRAME	(16 * 8 + 4 * 8)

.data
.align	32

.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
.long	1,0,0,0
.long	2,0,0,0
.long	3,0,0,0
.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c	# byte swap

.long	0,1,2,3
.long	0x61707865,0x61707865,0x61707865,0x61707865	# smashed sigma
.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
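
# .Lsigma layout (byte offsets from the label):
#   0x00  ChaCha constants, the ASCII string "expand 32-byte k"
#   0x10  block-counter increments +1, +2, +3 (16 bytes each)
#   0x40  byte-swap permute mask used with VPERM for little-endian output
#   0x50  per-lane block-counter offsets 0,1,2,3 for the 4x code
#   0x60  the same constants broadcast ("smashed") across all four lanes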

.previous

	GEN_BR_THUNK %r14

.text

#############################################################################
# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
#		      const u32 *key, const u32 *counter)

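# chacha20_vx_4x produces four 64-byte keystream blocks per call to the
# round loop.  The state is kept word-sliced: each XA*/XB*/XC*/XD* register
# carries one 32-bit state word for blocks 0..3 (key and counter words are
# broadcast with VREPF), and CTR adds the per-lane offsets 0,1,2,3 to the
# block counter.  The parameter registers defined below match the s390x
# calling convention (first arguments in %r2-%r6).
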
#define	OUT		%r2
#define	INP		%r3
#define	LEN		%r4
#define	KEY		%r5
#define	COUNTER		%r6

#define BEPERM		%v31
#define CTR		%v26

#define K0		%v16
#define K1		%v17
#define K2		%v18
#define K3		%v19

#define XA0		%v0
#define XA1		%v1
#define XA2		%v2
#define XA3		%v3

#define XB0		%v4
#define XB1		%v5
#define XB2		%v6
#define XB3		%v7

#define XC0		%v8
#define XC1		%v9
#define XC2		%v10
#define XC3		%v11

#define XD0		%v12
#define XD1		%v13
#define XD2		%v14
#define XD3		%v15

#define XT0		%v27
#define XT1		%v28
#define XT2		%v29
#define XT3		%v30

ENTRY(chacha20_vx_4x)
	stmg	%r6,%r7,6*8(SP)

	larl	%r7,.Lsigma
	lhi	%r0,10
	lhi	%r1,0

	VL	K0,0,,%r7		# load sigma
	VL	K1,0,,KEY		# load key
	VL	K2,16,,KEY
	VL	K3,0,,COUNTER		# load counter

	VL	BEPERM,0x40,,%r7
	VL	CTR,0x50,,%r7

	VLM	XA0,XA3,0x60,%r7,4	# load [smashed] sigma

	VREPF	XB0,K1,0		# smash the key
	VREPF	XB1,K1,1
	VREPF	XB2,K1,2
	VREPF	XB3,K1,3

	VREPF	XD0,K3,0
	VREPF	XD1,K3,1
	VREPF	XD2,K3,2
	VREPF	XD3,K3,3
	VAF	XD0,XD0,CTR

	VREPF	XC0,K2,0
	VREPF	XC1,K2,1
	VREPF	XC2,K2,2
	VREPF	XC3,K2,3

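# .Loop_4x runs %r0 = 10 double rounds (20 ChaCha rounds).  Each pass does a
# column round on (XAn,XBn,XCn,XDn) followed by a diagonal round on
# (XA0,XB1,XC2,XD3), (XA1,XB2,XC3,XD0), and so on.  As a reference sketch
# only (not part of this file; it assumes kernel u32 and rol32() from
# <linux/bitops.h>), the quarter-round applied per 32-bit lane is:
#
#	static inline void chacha_qr(u32 *a, u32 *b, u32 *c, u32 *d)
#	{
#		*a += *b; *d ^= *a; *d = rol32(*d, 16);
#		*c += *d; *b ^= *c; *b = rol32(*b, 12);
#		*a += *b; *d ^= *a; *d = rol32(*d, 8);
#		*c += *d; *b ^= *c; *b = rol32(*b, 7);
#	}
#
# The VERLLF rotate amounts 16/12/8/7 below correspond to those rol32 calls.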
.Loop_4x:
	VAF	XA0,XA0,XB0
	VX	XD0,XD0,XA0
	VERLLF	XD0,XD0,16

	VAF	XA1,XA1,XB1
	VX	XD1,XD1,XA1
	VERLLF	XD1,XD1,16

	VAF	XA2,XA2,XB2
	VX	XD2,XD2,XA2
	VERLLF	XD2,XD2,16

	VAF	XA3,XA3,XB3
	VX	XD3,XD3,XA3
	VERLLF	XD3,XD3,16

	VAF	XC0,XC0,XD0
	VX	XB0,XB0,XC0
	VERLLF	XB0,XB0,12

	VAF	XC1,XC1,XD1
	VX	XB1,XB1,XC1
	VERLLF	XB1,XB1,12

	VAF	XC2,XC2,XD2
	VX	XB2,XB2,XC2
	VERLLF	XB2,XB2,12

	VAF	XC3,XC3,XD3
	VX	XB3,XB3,XC3
	VERLLF	XB3,XB3,12

	VAF	XA0,XA0,XB0
	VX	XD0,XD0,XA0
	VERLLF	XD0,XD0,8

	VAF	XA1,XA1,XB1
	VX	XD1,XD1,XA1
	VERLLF	XD1,XD1,8

	VAF	XA2,XA2,XB2
	VX	XD2,XD2,XA2
	VERLLF	XD2,XD2,8

	VAF	XA3,XA3,XB3
	VX	XD3,XD3,XA3
	VERLLF	XD3,XD3,8

	VAF	XC0,XC0,XD0
	VX	XB0,XB0,XC0
	VERLLF	XB0,XB0,7

	VAF	XC1,XC1,XD1
	VX	XB1,XB1,XC1
	VERLLF	XB1,XB1,7

	VAF	XC2,XC2,XD2
	VX	XB2,XB2,XC2
	VERLLF	XB2,XB2,7

	VAF	XC3,XC3,XD3
	VX	XB3,XB3,XC3
	VERLLF	XB3,XB3,7

	VAF	XA0,XA0,XB1
	VX	XD3,XD3,XA0
	VERLLF	XD3,XD3,16

	VAF	XA1,XA1,XB2
	VX	XD0,XD0,XA1
	VERLLF	XD0,XD0,16

	VAF	XA2,XA2,XB3
	VX	XD1,XD1,XA2
	VERLLF	XD1,XD1,16

	VAF	XA3,XA3,XB0
	VX	XD2,XD2,XA3
	VERLLF	XD2,XD2,16

	VAF	XC2,XC2,XD3
	VX	XB1,XB1,XC2
	VERLLF	XB1,XB1,12

	VAF	XC3,XC3,XD0
	VX	XB2,XB2,XC3
	VERLLF	XB2,XB2,12

	VAF	XC0,XC0,XD1
	VX	XB3,XB3,XC0
	VERLLF	XB3,XB3,12

	VAF	XC1,XC1,XD2
	VX	XB0,XB0,XC1
	VERLLF	XB0,XB0,12

	VAF	XA0,XA0,XB1
	VX	XD3,XD3,XA0
	VERLLF	XD3,XD3,8

	VAF	XA1,XA1,XB2
	VX	XD0,XD0,XA1
	VERLLF	XD0,XD0,8

	VAF	XA2,XA2,XB3
	VX	XD1,XD1,XA2
	VERLLF	XD1,XD1,8

	VAF	XA3,XA3,XB0
	VX	XD2,XD2,XA3
	VERLLF	XD2,XD2,8

	VAF	XC2,XC2,XD3
	VX	XB1,XB1,XC2
	VERLLF	XB1,XB1,7

	VAF	XC3,XC3,XD0
	VX	XB2,XB2,XC3
	VERLLF	XB2,XB2,7

	VAF	XC0,XC0,XD1
	VX	XB3,XB3,XC0
	VERLLF	XB3,XB3,7

	VAF	XC1,XC1,XD2
	VX	XB0,XB0,XC1
	VERLLF	XB0,XB0,7
	brct	%r0,.Loop_4x

	VAF	XD0,XD0,CTR

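# After the rounds each register still holds one state word for all four
# blocks.  The merge/permute-doubleword sequence below transposes the 4x4
# word groups so that (XA0,XB0,XC0,XD0) through (XA3,XB3,XC3,XD3) each form
# one contiguous 64-byte block; the initial state (sigma, key, counter) is
# then added back and the words byte-swapped via BEPERM before being XORed
# into the input.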
	VMRHF	XT0,XA0,XA1		# transpose data
	VMRHF	XT1,XA2,XA3
	VMRLF	XT2,XA0,XA1
	VMRLF	XT3,XA2,XA3
	VPDI	XA0,XT0,XT1,0b0000
	VPDI	XA1,XT0,XT1,0b0101
	VPDI	XA2,XT2,XT3,0b0000
	VPDI	XA3,XT2,XT3,0b0101

	VMRHF	XT0,XB0,XB1
	VMRHF	XT1,XB2,XB3
	VMRLF	XT2,XB0,XB1
	VMRLF	XT3,XB2,XB3
	VPDI	XB0,XT0,XT1,0b0000
	VPDI	XB1,XT0,XT1,0b0101
	VPDI	XB2,XT2,XT3,0b0000
	VPDI	XB3,XT2,XT3,0b0101

	VMRHF	XT0,XC0,XC1
	VMRHF	XT1,XC2,XC3
	VMRLF	XT2,XC0,XC1
	VMRLF	XT3,XC2,XC3
	VPDI	XC0,XT0,XT1,0b0000
	VPDI	XC1,XT0,XT1,0b0101
	VPDI	XC2,XT2,XT3,0b0000
	VPDI	XC3,XT2,XT3,0b0101

	VMRHF	XT0,XD0,XD1
	VMRHF	XT1,XD2,XD3
	VMRLF	XT2,XD0,XD1
	VMRLF	XT3,XD2,XD3
	VPDI	XD0,XT0,XT1,0b0000
	VPDI	XD1,XT0,XT1,0b0101
	VPDI	XD2,XT2,XT3,0b0000
	VPDI	XD3,XT2,XT3,0b0101

	VAF	XA0,XA0,K0
	VAF	XB0,XB0,K1
	VAF	XC0,XC0,K2
	VAF	XD0,XD0,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40

	VAF	XA0,XA1,K0
	VAF	XB0,XB1,K1
	VAF	XC0,XC1,K2
	VAF	XD0,XD1,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_4x

	VAF	XA0,XA2,K0
	VAF	XB0,XB2,K1
	VAF	XC0,XC2,K2
	VAF	XD0,XD2,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_4x

	VAF	XA0,XA3,K0
	VAF	XB0,XB3,K1
	VAF	XC0,XC3,K2
	VAF	XD0,XD3,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

.Ldone_4x:
	lmg	%r6,%r7,6*8(SP)
	BR_EX	%r14

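# Fewer than 64 bytes are left: copy the prepared keystream block to the
# stack and XOR the remaining LEN bytes into the output one at a time.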
.Ltail_4x:
	VLR	XT0,XC0
	VLR	XT1,XD0

	VST	XA0,8*8+0x00,,SP
	VST	XB0,8*8+0x10,,SP
	VST	XT0,8*8+0x20,,SP
	VST	XT1,8*8+0x30,,SP

	lghi	%r1,0

.Loop_tail_4x:
	llgc	%r5,0(%r1,INP)
	llgc	%r6,8*8(%r1,SP)
	xr	%r6,%r5
	stc	%r6,0(%r1,OUT)
	la	%r1,1(%r1)
	brct	LEN,.Loop_tail_4x

	lmg	%r6,%r7,6*8(SP)
	BR_EX	%r14
ENDPROC(chacha20_vx_4x)

#undef	OUT
#undef	INP
#undef	LEN
#undef	KEY
#undef	COUNTER

#undef BEPERM

#undef K0
#undef K1
#undef K2
#undef K3


#############################################################################
# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
#		   const u32 *key, const u32 *counter)

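# chacha20_vx keeps six independent states in registers and emits up to six
# 64-byte blocks per outer iteration; lengths of 256 bytes or less are
# handed off to chacha20_vx_4x right at the entry point.  For reference
# only (a sketch, assuming kernel u32; key[] holds 8 words and counter[]
# supplies the block counter plus nonce words), each block starts from the
# usual 16-word ChaCha20 matrix:
#
#	u32 state[16] = {
#		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
#		key[0],     key[1],     key[2],     key[3],
#		key[4],     key[5],     key[6],     key[7],
#		counter[0], counter[1], counter[2], counter[3],
#	};
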
#define	OUT		%r2
#define	INP		%r3
#define	LEN		%r4
#define	KEY		%r5
#define	COUNTER		%r6

#define BEPERM		%v31

#define K0		%v27
#define K1		%v24
#define K2		%v25
#define K3		%v26

#define A0		%v0
#define B0		%v1
#define C0		%v2
#define D0		%v3

#define A1		%v4
#define B1		%v5
#define C1		%v6
#define D1		%v7

#define A2		%v8
#define B2		%v9
#define C2		%v10
#define D2		%v11

#define A3		%v12
#define B3		%v13
#define C3		%v14
#define D3		%v15

#define A4		%v16
#define B4		%v17
#define C4		%v18
#define D4		%v19

#define A5		%v20
#define B5		%v21
#define C5		%v22
#define D5		%v23

#define T0		%v27
#define T1		%v28
#define T2		%v29
#define T3		%v30

ENTRY(chacha20_vx)
	.insn	rilu,0xc20e00000000,LEN,256	# clgfi LEN,256
	jle	chacha20_vx_4x
	stmg	%r6,%r7,6*8(SP)

	lghi	%r1,-FRAME
	lgr	%r0,SP
	la	SP,0(%r1,SP)
	stg	%r0,0(SP)		# back-chain

	larl	%r7,.Lsigma
	lhi	%r0,10

	VLM	K1,K2,0,KEY,0		# load key
	VL	K3,0,,COUNTER		# load counter

	VLM	K0,BEPERM,0,%r7,4	# load sigma, increments, ...

.Loop_outer_vx:
	VLR	A0,K0
	VLR	B0,K1
	VLR	A1,K0
	VLR	B1,K1
	VLR	A2,K0
	VLR	B2,K1
	VLR	A3,K0
	VLR	B3,K1
	VLR	A4,K0
	VLR	B4,K1
	VLR	A5,K0
	VLR	B5,K1

	VLR	D0,K3
	VAF	D1,K3,T1		# K[3]+1
	VAF	D2,K3,T2		# K[3]+2
	VAF	D3,K3,T3		# K[3]+3
	VAF	D4,D2,T2		# K[3]+4
	VAF	D5,D2,T3		# K[3]+5

	VLR	C0,K2
	VLR	C1,K2
	VLR	C2,K2
	VLR	C3,K2
	VLR	C4,K2
	VLR	C5,K2

	VLR	T1,D1
	VLR	T2,D2
	VLR	T3,D3

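# .Loop_vx also runs %r0 = 10 double rounds, but here every block owns a
# full row-per-register state (An,Bn,Cn,Dn).  Instead of renaming registers
# for the diagonal round, the B/C/D rows are rotated with VSLDB before the
# second quarter-round and rotated back afterwards, so the same column-wise
# add/xor/rotate sequence serves both halves of the double round.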
.Loop_vx:
	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,16
	VERLLF	D1,D1,16
	VERLLF	D2,D2,16
	VERLLF	D3,D3,16
	VERLLF	D4,D4,16
	VERLLF	D5,D5,16

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,12
	VERLLF	B1,B1,12
	VERLLF	B2,B2,12
	VERLLF	B3,B3,12
	VERLLF	B4,B4,12
	VERLLF	B5,B5,12

	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,8
	VERLLF	D1,D1,8
	VERLLF	D2,D2,8
	VERLLF	D3,D3,8
	VERLLF	D4,D4,8
	VERLLF	D5,D5,8

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,7
	VERLLF	B1,B1,7
	VERLLF	B2,B2,7
	VERLLF	B3,B3,7
	VERLLF	B4,B4,7
	VERLLF	B5,B5,7

	VSLDB	C0,C0,C0,8
	VSLDB	C1,C1,C1,8
	VSLDB	C2,C2,C2,8
	VSLDB	C3,C3,C3,8
	VSLDB	C4,C4,C4,8
	VSLDB	C5,C5,C5,8
	VSLDB	B0,B0,B0,4
	VSLDB	B1,B1,B1,4
	VSLDB	B2,B2,B2,4
	VSLDB	B3,B3,B3,4
	VSLDB	B4,B4,B4,4
	VSLDB	B5,B5,B5,4
	VSLDB	D0,D0,D0,12
	VSLDB	D1,D1,D1,12
	VSLDB	D2,D2,D2,12
	VSLDB	D3,D3,D3,12
	VSLDB	D4,D4,D4,12
	VSLDB	D5,D5,D5,12

	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,16
	VERLLF	D1,D1,16
	VERLLF	D2,D2,16
	VERLLF	D3,D3,16
	VERLLF	D4,D4,16
	VERLLF	D5,D5,16

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,12
	VERLLF	B1,B1,12
	VERLLF	B2,B2,12
	VERLLF	B3,B3,12
	VERLLF	B4,B4,12
	VERLLF	B5,B5,12

	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,8
	VERLLF	D1,D1,8
	VERLLF	D2,D2,8
	VERLLF	D3,D3,8
	VERLLF	D4,D4,8
	VERLLF	D5,D5,8

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,7
	VERLLF	B1,B1,7
	VERLLF	B2,B2,7
	VERLLF	B3,B3,7
	VERLLF	B4,B4,7
	VERLLF	B5,B5,7

	VSLDB	C0,C0,C0,8
	VSLDB	C1,C1,C1,8
	VSLDB	C2,C2,C2,8
	VSLDB	C3,C3,C3,8
	VSLDB	C4,C4,C4,8
	VSLDB	C5,C5,C5,8
	VSLDB	B0,B0,B0,12
	VSLDB	B1,B1,B1,12
	VSLDB	B2,B2,B2,12
	VSLDB	B3,B3,B3,12
	VSLDB	B4,B4,B4,12
	VSLDB	B5,B5,B5,12
	VSLDB	D0,D0,D0,4
	VSLDB	D1,D1,D1,4
	VSLDB	D2,D2,D2,4
	VSLDB	D3,D3,D3,4
	VSLDB	D4,D4,D4,4
	VSLDB	D5,D5,D5,4
	brct	%r0,.Loop_vx

	VAF	A0,A0,K0
	VAF	B0,B0,K1
	VAF	C0,C0,K2
	VAF	D0,D0,K3
	VAF	A1,A1,K0
	VAF	D1,D1,T1		# +K[3]+1

	VPERM	A0,A0,A0,BEPERM
	VPERM	B0,B0,B0,BEPERM
	VPERM	C0,C0,C0,BEPERM
	VPERM	D0,D0,D0,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	VAF	D2,D2,T2		# +K[3]+2
	VAF	D3,D3,T3		# +K[3]+3
	VLM	T0,T3,0,INP,0

	VX	A0,A0,T0
	VX	B0,B0,T1
	VX	C0,C0,T2
	VX	D0,D0,T3

	VLM	K0,T3,0,%r7,4		# re-load sigma and increments

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	B1,B1,K1
	VAF	C1,C1,K2

	VPERM	A0,A1,A1,BEPERM
	VPERM	B0,B1,B1,BEPERM
	VPERM	C0,C1,C1,BEPERM
	VPERM	D0,D1,D1,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	A2,A2,K0
	VAF	B2,B2,K1
	VAF	C2,C2,K2

	VPERM	A0,A2,A2,BEPERM
	VPERM	B0,B2,B2,BEPERM
	VPERM	C0,C2,C2,BEPERM
	VPERM	D0,D2,D2,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	A3,A3,K0
	VAF	B3,B3,K1
	VAF	C3,C3,K2
	VAF	D2,K3,T3		# K[3]+3

	VPERM	A0,A3,A3,BEPERM
	VPERM	B0,B3,B3,BEPERM
	VPERM	C0,C3,C3,BEPERM
	VPERM	D0,D3,D3,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	VAF	D3,D2,T1		# K[3]+4
	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	A4,A4,K0
	VAF	B4,B4,K1
	VAF	C4,C4,K2
	VAF	D4,D4,D3		# +K[3]+4
	VAF	D3,D3,T1		# K[3]+5
	VAF	K3,D2,T3		# K[3]+=6

	VPERM	A0,A4,A4,BEPERM
	VPERM	B0,B4,B4,BEPERM
	VPERM	C0,C4,C4,BEPERM
	VPERM	D0,D4,D4,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	VAF	A5,A5,K0
	VAF	B5,B5,K1
	VAF	C5,C5,K2
	VAF	D5,D5,D3		# +K[3]+5

	VPERM	A0,A5,A5,BEPERM
	VPERM	B0,B5,B5,BEPERM
	VPERM	C0,C5,C5,BEPERM
	VPERM	D0,D5,D5,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	lhi	%r0,10
	aghi	LEN,-0x40
	jne	.Loop_outer_vx

.Ldone_vx:
	lmg	%r6,%r7,FRAME+6*8(SP)
	la	SP,FRAME(SP)
	BR_EX	%r14

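# Tail handling mirrors .Ltail_4x: store the prepared A0..D0 keystream
# block to the stack frame and XOR the remaining bytes one at a time.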
.Ltail_vx:
	VSTM	A0,D0,8*8,SP,3
	lghi	%r1,0

.Loop_tail_vx:
	llgc	%r5,0(%r1,INP)
	llgc	%r6,8*8(%r1,SP)
	xr	%r6,%r5
	stc	%r6,0(%r1,OUT)
	la	%r1,1(%r1)
	brct	LEN,.Loop_tail_vx

	lmg	%r6,%r7,FRAME+6*8(SP)
	la	SP,FRAME(SP)
	BR_EX	%r14
ENDPROC(chacha20_vx)

.previous