/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
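/*
 * For reference, each iteration of .Ldoubleround below computes one ChaCha
 * double round.  A minimal C sketch of the same operation on the 16-word
 * state x[] (each of v0-v3 holds one four-word row, so the four column or
 * diagonal quarter rounds run in parallel across the vector lanes; rol32()
 * denotes a 32-bit rotate left):
 *
 *	#define QR(a, b, c, d) do {			\
 *		a += b; d = rol32(d ^ a, 16);		\
 *		c += d; b = rol32(b ^ c, 12);		\
 *		a += b; d = rol32(d ^ a,  8);		\
 *		c += d; b = rol32(b ^ c,  7);		\
 *	} while (0)
 *
 *	// column round
 *	QR(x[0], x[4], x[ 8], x[12]);	QR(x[1], x[5], x[ 9], x[13]);
 *	QR(x[2], x[6], x[10], x[14]);	QR(x[3], x[7], x[11], x[15]);
 *	// diagonal round
 *	QR(x[0], x[5], x[10], x[15]);	QR(x[1], x[6], x[11], x[12]);
 *	QR(x[2], x[7], x[ 8], x[13]);	QR(x[3], x[4], x[ 9], x[14]);
 *
 * The ext instructions between the two halves rotate rows 1-3 so that the
 * diagonal quarter rounds line up in the same lanes as the column ones.
 */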
chacha_permute:

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]
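
	// NEON has no 32-bit vector rotate, so the four rotate amounts are
	// implemented differently below: rotl32 by 16 is a halfword swap
	// (rev32 on .8h), rotl32 by 8 is a byte permutation driven by the
	// ROT8 table just loaded into v12, and rotl32 by 12 and 7 use a
	// shift left (shl) into a temporary combined with a
	// shift-right-and-insert (sri).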

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds
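	//
	// Roughly the C prototype this routine backs (names and types are
	// assumed from the argument list above, not taken from the C glue):
	//
	//	void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
	//				   int nrounds);
	//
	// i.e. it XORs one 64-byte block of 'src' with the keystream block
	// generated from 'state' and writes the result to 'dst'.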

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds
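	//
	// Note: unlike the full block function above, HChaCha returns only
	// words 0-3 and 12-15 of the permuted state and skips the final
	// addition of the input state, which is why just v0 and v3 are
	// stored below and no copy of the initial state is kept.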

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.16b}, [x1], #16
	st1		{v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(hchacha_block_neon)

	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
ENTRY(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

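	// x10/x11 are set up to point into the .Lpermute byte-index table,
	// offset by the byte count modulo 64.  They are only used by the
	// partial-block handlers at the end of this function (labels 0: to
	// 3:), which use them as tbl/tbx indexes when the last block is not
	// a full 64 bytes.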
	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
	add		x11, x10, #64

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, and hence requires no word shuffling. For the final XOR
	// step we transpose the matrix by interleaving 32-bit and then
	// 64-bit words, which allows us to do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
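	// A single call therefore produces up to five 64-byte blocks
	// (320 bytes): the scalar block uses the current counter value and
	// the four NEON lanes use counter + 1 .. counter + 4 (see CTRINC).
	// The scalar block covers the first 64 bytes of the in/output and
	// the NEON blocks the remainder; byte counts that are not a multiple
	// of 64 are handled by the partial-block code at the end.
	//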
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0..3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	  ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	  ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	  ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	  ror		a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	  ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	  ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	  ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	  ror		a4, a4, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	  mov		w6, v16.s[0]
	  mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	  mov		w8, v18.s[0]
	  mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	  add		a0, a0, w6
	  add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	  add		a2, a2, w8
	  add		a3, a3, w9

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	  mov		w6, v20.s[0]
	  mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	  mov		w8, v22.s[0]
	  mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	  add		a4, a4, w6
	  add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	  add		a6, a6, w8
	  add		a7, a7, w9

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	  mov		w6, v24.s[0]
	  mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	  mov		w8, v26.s[0]
	  mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	  add		a8, a8, w6
	  add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	  add		a10, a10, w8
	  add		a11, a11, w9

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	  mov		w6, v28.s[0]
	  mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	  mov		w8, v30.s[0]
	  mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	  add		a12, a12, w6
	  add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	  add		a14, a14, w8
	  add		a15, a15, w9

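	// To get the four NEON blocks back into memory order, the 4x4 word
	// transpose is done in two passes: first the 32-bit lanes of adjacent
	// state words are interleaved (zip1/zip2 .4s into v16-v31), then the
	// resulting 64-bit halves are interleaved (zip1/zip2 .2d back into
	// v0-v15).  Afterwards v0-v3, v4-v7, v8-v11 and v12-v15 hold the
	// keystream of NEON blocks 1 to 4 in byte order, ready to be XORed
	// with the input loaded into v16-v31.  The scalar block's 64 bytes
	// are XORed and stored via the a0-a15 registers, interleaved with
	// the NEON code below.
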
	// interleave 32-bit words in state n, n+1
	  ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	  ldp		w8, w9, [x2, #-56]
	  eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	  eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	  eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	  eor		a3, a3, w9
	  ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	  ldp		w8, w9, [x2, #-40]
	  eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	  eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	  eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	  eor		a7, a7, w9
	  ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	  ldp		w8, w9, [x2, #-24]
	  eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	  eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	  eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	  eor		a11, a11, w9
	  ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	  ldp		w8, w9, [x2, #-8]
	  eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	  eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	  eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	  eor		a15, a15, w9

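	// Tail bookkeeping: x5 = byte count - 128.  While at least 128 bytes
	// of in/output remain, x3 stays 64 so the post-indexed input loads
	// below advance normally.  Otherwise x3 is cleared and x2 is rewound
	// so that the next 64-byte load reads the final 64 bytes of the
	// input rather than running past the end of the buffer; the
	// ccmp/csel pairs below repeat the same test for the 192, 256 and
	// 320 byte boundaries (x6, x7, x8).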
	mov		x3, #64
	subs		x5, x4, #128
	add		x6, x5, x2
	csel		x3, x3, xzr, ge
	csel		x2, x2, x6, ge

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	  stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	  stp		a2, a3, [x1, #-56]
	ld1		{v16.16b-v19.16b}, [x2], x3

	subs		x6, x4, #192
	ccmp		x3, xzr, #4, lt
	add		x7, x6, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x7, eq

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	  stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	  stp		a6, a7, [x1, #-40]
	ld1		{v20.16b-v23.16b}, [x2], x3

	subs		x7, x4, #256
	ccmp		x3, xzr, #4, lt
	add		x8, x7, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x8, eq

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	  stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	  stp		a10, a11, [x1, #-24]
	ld1		{v24.16b-v27.16b}, [x2], x3

	subs		x8, x4, #320
	ccmp		x3, xzr, #4, lt
	add		x9, x8, x2
	csel		x2, x2, x9, eq

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	  stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	  stp		a14, a15, [x1, #-8]
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	tbnz		x5, #63, 0f
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	cbz		x5, .Lout

	tbnz		x6, #63, 1f
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	cbz		x6, .Lout

	tbnz		x7, #63, 2f
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	cbz		x7, .Lout

	tbnz		x8, #63, 3f
	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

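	// The four handlers below produce the final, partial block.  Each
	// one recomputes the full 64 bytes that end exactly at the end of
	// the buffer: tbl gathers the keystream bytes into their shifted
	// positions (out-of-range indexes return 0), tbx overlays the bytes
	// that have already been written onto the rewound input so the
	// overlapping region is preserved, and the result is XORed and
	// stored at output + byte count - 64.  The index vectors are the
	// 16-byte windows of .Lpermute selected via x10/x11 at entry.
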
	// fewer than 128 bytes of in/output
0:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	sub		x2, x1, #64
	add		x1, x1, x5
	ld1		{v16.16b-v19.16b}, [x2]
	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 192 bytes of in/output
1:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	add		x1, x1, x6
	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v0.16b
	eor		v21.16b, v21.16b, v1.16b
	eor		v22.16b, v22.16b, v2.16b
	eor		v23.16b, v23.16b, v3.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 256 bytes of in/output
2:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x7
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b

	eor		v24.16b, v24.16b, v0.16b
	eor		v25.16b, v25.16b, v1.16b
	eor		v26.16b, v26.16b, v2.16b
	eor		v27.16b, v27.16b, v3.16b
	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
3:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x8
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x1]
	b		.Lout
ENDPROC(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
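/*
 * .Lpermute holds the byte values -64 .. 127 in ascending order.  The
 * partial-block code above loads 16-byte windows from it, at offsets
 * (byte count mod 64) and (byte count mod 64) + 64, for use as tbl/tbx
 * indexes.
 */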
.Lpermute:
	.set		.Li, 0
	.rept		192
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

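/*
 * CTRINC holds the block counter increments for the four NEON lanes (the
 * fifth, scalar block uses the unincremented counter).  ROT8 is the byte
 * permutation, consumed by tbl, that rotates each 32-bit word left by 8.
 */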
CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f