/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
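//
// For reference, an equivalent scalar C sketch of the double round implemented
// below (illustrative only, not part of the build; QR() is the standard ChaCha
// quarter round and rol32() is the kernel helper from <linux/bitops.h>).
// Here v0-v3 hold x[0..3], x[4..7], x[8..11] and x[12..15] respectively, and
// the ext instructions rotate rows 1-3 by one, two and three lanes so the
// column-round code can be reused for the diagonal round:
//
//	static inline void QR(u32 *a, u32 *b, u32 *c, u32 *d)
//	{
//		*a += *b; *d = rol32(*d ^ *a, 16);
//		*c += *d; *b = rol32(*b ^ *c, 12);
//		*a += *b; *d = rol32(*d ^ *a,  8);
//		*c += *d; *b = rol32(*b ^ *c,  7);
//	}
//
//	for (i = 0; i < nrounds; i += 2) {
//		/* column round */
//		QR(&x[0], &x[4], &x[ 8], &x[12]);
//		QR(&x[1], &x[5], &x[ 9], &x[13]);
//		QR(&x[2], &x[6], &x[10], &x[14]);
//		QR(&x[3], &x[7], &x[11], &x[15]);
//		/* diagonal round */
//		QR(&x[0], &x[5], &x[10], &x[15]);
//		QR(&x[1], &x[6], &x[11], &x[12]);
//		QR(&x[2], &x[7], &x[ 8], &x[13]);
//		QR(&x[3], &x[4], &x[ 9], &x[14]);
//	}
//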
SYM_FUNC_START_LOCAL(chacha_permute)

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]
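
	// v12 holds the ROT8 byte-shuffle mask: NEON has no rotate instruction,
	// so rotl32(x, 8) is done with tbl, rotl32(x, 16) with rev32, and the
	// 12- and 7-bit rotates with a shl/sri pair.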

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds
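	//
	// In C terms (illustrative sketch only), with x[] starting as a copy
	// of s[] and chacha_permute() run on that copy:
	//
	//	for (n = 0; n < 16; n++)
	//		o[n] = i[n] ^ (x[n] + s[n]);
	//
	// v8-v11 keep the saved copy of s used for the final addition.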

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)

SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds
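	//
	// HChaCha: only the first and last rows (words 0-3 and 12-15) of the
	// permuted state are written out, with no feed-forward addition; this
	// is the construction XChaCha uses to derive its subkey.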

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)

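	// Scalar register aliases for the fifth ChaCha block that
	// chacha_4block_xor_neon processes in general-purpose registers
	// alongside the four NEON blocks.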
	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
SYM_FUNC_START(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
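	// x10 now points (byte count % 64) bytes into the 128-byte .Lpermute
	// table of byte values -64..63, giving the window of tbl indices that
	// the .Lt* tail paths below use to shift the final partial block of
	// keystream into position for a full-width, overlapping store.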

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, and hence requires no word shuffling. For the final XOR
	// step, we transpose the matrix by interleaving 32- and then 64-bit
	// words, which allows us to do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
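	// Register layout (illustrative): vN.s[b] holds word N of NEON block b,
	// i.e. the state is kept "column major" across blocks, so for instance
	// "add v0.4s, v0.4s, v4.4s" performs x0 += x4 for all four blocks at
	// once.  After the rounds, the zip1/zip2 transposes below turn this
	// back into four contiguous 64-byte keystream blocks.
	//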
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]
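	// v30 = CTRINC = { 1, 2, 3, 4 }, v31 = ROT8 byte-shuffle mask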

	// x0..15[0-3] = s0..3[0..3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s
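	// (a12 was copied from lane 0 before this add, so the scalar block
	// keeps the unincremented counter and becomes the first of the five
	// output blocks; the NEON lanes use counter + 1..4.)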

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	  ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	  ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	  ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	  ror		a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	  ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	  ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	  ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	  ror		a4, a4, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	  mov		w6, v16.s[0]
	  mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	  mov		w8, v18.s[0]
	  mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	  add		a0, a0, w6
	  add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	  add		a2, a2, w8
	  add		a3, a3, w9
CPU_BE(	  rev		a0, a0		)
CPU_BE(	  rev		a1, a1		)
CPU_BE(	  rev		a2, a2		)
CPU_BE(	  rev		a3, a3		)

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	  mov		w6, v20.s[0]
	  mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	  mov		w8, v22.s[0]
	  mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	  add		a4, a4, w6
	  add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	  add		a6, a6, w8
	  add		a7, a7, w9
CPU_BE(	  rev		a4, a4		)
CPU_BE(	  rev		a5, a5		)
CPU_BE(	  rev		a6, a6		)
CPU_BE(	  rev		a7, a7		)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	  mov		w6, v24.s[0]
	  mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	  mov		w8, v26.s[0]
	  mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	  add		a8, a8, w6
	  add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	  add		a10, a10, w8
	  add		a11, a11, w9
CPU_BE(	  rev		a8, a8		)
CPU_BE(	  rev		a9, a9		)
CPU_BE(	  rev		a10, a10	)
CPU_BE(	  rev		a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	  mov		w6, v28.s[0]
	  mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	  mov		w8, v30.s[0]
	  mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	  add		a12, a12, w6
	  add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	  add		a14, a14, w8
	  add		a15, a15, w9
CPU_BE(	  rev		a12, a12	)
CPU_BE(	  rev		a13, a13	)
CPU_BE(	  rev		a14, a14	)
CPU_BE(	  rev		a15, a15	)

	// interleave 32-bit words in state n, n+1
	  ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	  ldp		w8, w9, [x2, #-56]
	  eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	  eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	  eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	  eor		a3, a3, w9
	  ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	  ldp		w8, w9, [x2, #-40]
	  eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	  eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	  eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	  eor		a7, a7, w9
	  ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	  ldp		w8, w9, [x2, #-24]
	  eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	  eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	  eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	  eor		a11, a11, w9
	  ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	  ldp		w8, w9, [x2, #-8]
	  eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	  eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	  eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	  eor		a15, a15, w9

	add		x3, x2, x4
	sub		x3, x3, #128		// start of last block

	subs		x5, x4, #128
	csel		x2, x2, x3, ge
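	// If fewer than 128 bytes remain, x2 is clamped to the start of the
	// last 64 input bytes so the full-block loads below never read past
	// the end of the input; the same subs/csel pattern repeats for each
	// further 64-byte block, and the .Lt* paths fix up the stores.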

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	  stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	  stp		a2, a3, [x1, #-56]

	subs		x6, x4, #192
	ld1		{v16.16b-v19.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	  stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	  stp		a6, a7, [x1, #-40]

	subs		x7, x4, #256
	ld1		{v20.16b-v23.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	  stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	  stp		a10, a11, [x1, #-24]

	subs		x8, x4, #320
	ld1		{v24.16b-v27.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	  stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	  stp		a14, a15, [x1, #-8]

	tbnz		x5, #63, .Lt128
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b

	tbnz		x6, #63, .Lt192

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b

	st1		{v16.16b-v19.16b}, [x1], #64
	tbnz		x7, #63, .Lt256

	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b

	st1		{v20.16b-v23.16b}, [x1], #64
	tbnz		x8, #63, .Lt320

	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b

	st1		{v24.16b-v27.16b}, [x1], #64
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

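	// Tail handling: when the byte count is not a multiple of 64, the
	// matching .Lt* path below uses the tbl index window set up from
	// .Lpermute at function entry to shift the keystream for the final
	// partial block, writes it with a full 64-byte store that overlaps the
	// preceding block, and then stores (or re-stores) that preceding block
	// so the overlapped bytes end up correct.
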
	// fewer than 192 bytes of in/output
.Lt192:	cbz		x5, 1f				// exactly 128 bytes?
	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b

0:	eor		v20.16b, v20.16b, v28.16b
	eor		v21.16b, v21.16b, v29.16b
	eor		v22.16b, v22.16b, v30.16b
	eor		v23.16b, v23.16b, v31.16b
	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
1:	st1		{v16.16b-v19.16b}, [x1]
	b		.Lout

	// fewer than 128 bytes of in/output
.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	sub		x1, x1, #64
	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
	b		0b

	// fewer than 256 bytes of in/output
.Lt256:	cbz		x6, 2f				// exactly 192 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x6, x6, x1
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
2:	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
.Lt320:	cbz		x7, 3f				// exactly 256 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x7, x7, x1
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
3:	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout
SYM_FUNC_END(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
.Lpermute:
	.set		.Li, 0
	.rept		128
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
806