/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
chacha_permute:

	adr		x10, ROT8
	ld1		{v12.4s}, [x10]

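	// Each rotation below uses the cheapest available NEON idiom:
	// rotate by 16 is rev32 on 16-bit lanes, rotates by 12 and 7 use a
	// shl/sri pair, and rotate by 8 uses tbl with the ROT8 byte-shuffle
	// table preloaded into v12.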
.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
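	// keep a second copy of the input state in v8-v11 for the
	// feed-forward addition after the permutation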
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

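	// the HChaCha output is rows 0 and 3 of the permuted state (words
	// 0-3 and 12-15); unlike ChaCha, the input state is not added back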
	st1		{v0.16b}, [x1], #16
	st1		{v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(hchacha_block_neon)

	.align		6
ENTRY(chacha_4block_xor_neon)
	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence it requires no word shuffling. For the final XOR step
	// we transpose the matrix by interleaving 32- and then 64-bit words,
	// which allows us to do XOR in NEON registers.
	//
	adr		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]
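	// v30 = CTRINC (counter increments 0-3), v31 = ROT8 byte-shuffle
	// table; the two constants are adjacent in memory so one ld1 loads
	// both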

	// x0..15[0-3] = s0..3[0..3]
	mov		x4, x0
	ld4r		{ v0.4s- v3.4s}, [x4], #16
	ld4r		{ v4.4s- v7.4s}, [x4], #16
	ld4r		{ v8.4s-v11.4s}, [x4], #16
	ld4r		{v12.4s-v15.4s}, [x4]
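	// each ld4r replicates one state word across all four lanes of its
	// register, so vN now holds word N of the state for all four blocks
	// (one block per lane)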

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h
	rev32		v15.8h, v15.8h

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	sri		v5.4s, v17.4s, #20
	sri		v6.4s, v18.4s, #20
	sri		v7.4s, v19.4s, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b
	tbl		v15.16b, {v15.16b}, v31.16b

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	sri		v5.4s, v17.4s, #25
	sri		v6.4s, v18.4s, #25
	sri		v7.4s, v19.4s, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	rev32		v15.8h, v15.8h
	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	sri		v6.4s, v17.4s, #20
	sri		v7.4s, v18.4s, #20
	sri		v4.4s, v19.4s, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	tbl		v15.16b, {v15.16b}, v31.16b
	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	sri		v6.4s, v17.4s, #25
	sri		v7.4s, v18.4s, #25
	sri		v4.4s, v19.4s, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

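	// reload the input state, again word-replicated, for the feed-forward
	// addition of s0..3 into the permuted words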
	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	add		v1.4s, v1.4s, v17.4s
	add		v2.4s, v2.4s, v18.4s
	add		v3.4s, v3.4s, v19.4s

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	add		v5.4s, v5.4s, v21.4s
	add		v6.4s, v6.4s, v22.4s
	add		v7.4s, v7.4s, v23.4s

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	add		v9.4s, v9.4s, v25.4s
	add		v10.4s, v10.4s, v26.4s
	add		v11.4s, v11.4s, v27.4s

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	add		v13.4s, v13.4s, v29.4s
	add		v14.4s, v14.4s, v30.4s
	add		v15.4s, v15.4s, v31.4s

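	// transpose from word-sliced to block order: vN currently holds word
	// N of all four blocks, one block per lane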
	// interleave 32-bit words in state n, n+1
	zip1		v16.4s, v0.4s, v1.4s
	zip2		v17.4s, v0.4s, v1.4s
	zip1		v18.4s, v2.4s, v3.4s
	zip2		v19.4s, v2.4s, v3.4s
	zip1		v20.4s, v4.4s, v5.4s
	zip2		v21.4s, v4.4s, v5.4s
	zip1		v22.4s, v6.4s, v7.4s
	zip2		v23.4s, v6.4s, v7.4s
	zip1		v24.4s, v8.4s, v9.4s
	zip2		v25.4s, v8.4s, v9.4s
	zip1		v26.4s, v10.4s, v11.4s
	zip2		v27.4s, v10.4s, v11.4s
	zip1		v28.4s, v12.4s, v13.4s
	zip2		v29.4s, v12.4s, v13.4s
	zip1		v30.4s, v14.4s, v15.4s
	zip2		v31.4s, v14.4s, v15.4s

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	ld1		{v16.16b-v19.16b}, [x2], #64

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	ld1		{v20.16b-v23.16b}, [x2], #64

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	ld1		{v24.16b-v27.16b}, [x2], #64

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	ld1		{v28.16b-v31.16b}, [x2]

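	// v0-v3 now hold keystream block 0, v4-v7 block 1, v8-v11 block 2 and
	// v12-v15 block 3, while v16-v31 hold the 256 bytes of input loaded
	// above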
	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	eor		v28.16b, v28.16b, v12.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

	ret
ENDPROC(chacha_4block_xor_neon)

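// counter increments added to the per-block counter word (x12), so the four
// lanes encrypt blocks n, n+1, n+2 and n+3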
CTRINC:	.word		0, 1, 2, 3
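// tbl indices that rotate each 32-bit word left by 8 bits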
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f