1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * AES-NI + SSE2 implementation of AEGIS-128
4 *
5 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
6 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
7 */
8
9#include <linux/linkage.h>
10#include <asm/frame.h>
11
/*
 * Register aliases.
 *
 * STATE0-STATE4 hold the five 128-bit words of the cipher state.
 * KEY and MSG are both %xmm5: in this file KEY is only used by
 * crypto_aegis128_aesni_init, which does not use MSG.
 * T0 is overwritten by every aegis128_update; T1 is a second temporary.
 */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4
#define KEY	%xmm5
#define MSG	%xmm5
#define T0	%xmm6
#define T1	%xmm7

/*
 * Integer registers carrying the first four arguments of the exported
 * functions.  Note that LEN names the full 64-bit %rsi even though the
 * prototypes in this file declare the length as unsigned int.
 */
#define STATEP	%rdi
#define LEN	%rsi
#define SRC	%rdx
#define DST	%rcx
26
/*
 * Initialization constants.  Read back to back, the two 16-byte blocks are
 * the first 32 terms of the Fibonacci sequence reduced modulo 256.  Both
 * blocks live in one mergeable section whose entity size (32) spans the pair.
 */
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte indices 0..15.  crypto_aegis128_aesni_dec_tail compares these against
 * the byte count to build a mask covering the first LEN bytes of a block.
 */
.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
.align 16
.Laegis128_counter:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
41
42.text
43
/*
 * aegis128_update - one state-update round, without the message xor
 *
 * "aesenc X, Y" replaces Y with AESRound(Y) ^ X, so every state word is put
 * through one AES round and xored with the pre-update value of another word.
 * Results are left in place, which rotates the logical state by one register.
 * If the logical words 0..4 are in STATE0..STATE4 on entry, then on exit:
 *
 *   word 0 is in STATE4, word 1 in STATE0, word 2 in STATE1,
 *   word 3 in STATE2, word 4 in STATE3.
 *
 * Callers xor the message block into whichever register now holds word 0.
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state (shifted positions)
 * changed:
 *   T0
 */
.macro aegis128_update
	/* STATE4 is overwritten first but is still needed for the last step */
	movdqa STATE4, T0
	aesenc STATE0, STATE4
	aesenc STATE1, STATE0
	aesenc STATE2, STATE1
	aesenc STATE3, STATE2
	aesenc T0,     STATE3
.endm
61
/*
 * __load_partial: internal ABI
 *
 * Assembles a zero-padded block from the (LEN & 0xF) bytes that start at
 * SRC + (LEN & 0x10).  The bytes are fetched as at most one 1-, 2-, 4- and
 * 8-byte load, selected by bits 0-3 of LEN, so nothing outside that range
 * is read.  Only the low five bits of LEN are examined.
 *
 * input:
 *   LEN - bytes
 *   SRC - src
 * output:
 *   MSG  - message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 *   flags
 */
SYM_FUNC_START_LOCAL(__load_partial)
	xor %r9d, %r9d
	pxor MSG, MSG

	/* bit 0: a single odd byte, located after all larger pieces */
	test $0x1, LEN
	jz .Lld_partial_1

	mov LEN, %r8
	and $0x1E, %r8
	mov (SRC, %r8), %r9b

.Lld_partial_1:
	/* bit 1: a 2-byte piece; make room for it below what was gathered */
	test $0x2, LEN
	jz .Lld_partial_2

	mov LEN, %r8
	and $0x1C, %r8
	shl $0x10, %r9
	mov (SRC, %r8), %r9w

.Lld_partial_2:
	/* bit 2: a 4-byte piece */
	test $0x4, LEN
	jz .Lld_partial_4

	mov LEN, %r8
	and $0x18, %r8
	shl $32, %r9
	mov (SRC, %r8), %r8d	/* 32-bit load zero-extends into %r8 */
	xor %r8, %r9

.Lld_partial_4:
	/* up to 7 bytes gathered so far become the low qword of MSG */
	movq %r9, MSG

	/* bit 3: an 8-byte piece goes below them */
	test $0x8, LEN
	jz .Lld_partial_8

	mov LEN, %r8
	and $0x10, %r8
	pslldq $8, MSG
	movq (SRC, %r8), T0
	pxor T0, MSG

.Lld_partial_8:
	ret
SYM_FUNC_END(__load_partial)
127
/*
 * __store_partial: internal ABI
 *
 * Writes the leading bytes of T0 to DST as at most one 8-, 4-, 2- and 1-byte
 * store (so never more than 15 bytes).  LEN and DST themselves are left
 * untouched; working copies are kept in %r8 and %r9.
 *
 * Only the low 32 bits of LEN are significant: the exported functions take
 * the length as an unsigned int, so the upper half of the 64-bit register is
 * not guaranteed to be zero and must not take part in the comparisons.
 *
 * input:
 *   LEN - bytes
 *   DST - dst
 *   T0  - message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 *   %r10
 *   flags
 */
SYM_FUNC_START_LOCAL(__store_partial)
	mov LEN, %r8
	mov %r8d, %r8d		/* zero-extend the 32-bit byte count */
	mov DST, %r9

	/* %r10 always holds the next (up to) 8 bytes still to be written */
	movq T0, %r10

	cmp $8, %r8
	jb .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0
	movq T0, %r10

	sub $8, %r8
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $0x10, %r10

	sub $2, %r8
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8
	jb .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
SYM_FUNC_END(__store_partial)
185
/*
 * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
 *
 * Builds the initial 80-byte state at STATEP from the 16-byte key at %rsi
 * and the 16-byte IV at %rdx:
 *
 *   word 0 = key ^ iv      word 1 = const_1          word 2 = const_0
 *   word 3 = key ^ const_0 word 4 = key ^ const_1
 *
 * followed by ten update rounds that alternately absorb key and key ^ iv.
 *
 * The key is read with movdqa and therefore has to be 16-byte aligned; the
 * IV and the state buffer may be unaligned.
 */
SYM_FUNC_START(crypto_aegis128_aesni_init)
	FRAME_BEGIN

	/* load IV: */
	movdqu (%rdx), T1

	/* load key: */
	movdqa (%rsi), KEY
	pxor KEY, T1			/* T1 = key ^ iv from here on */
	movdqa T1, STATE0
	movdqa KEY, STATE3
	movdqa KEY, STATE4

	/*
	 * load the constants (RIP-relative, so the code carries no absolute
	 * address that would need a relocation fixup):
	 */
	movdqa .Laegis128_const_0(%rip), STATE2
	movdqa .Laegis128_const_1(%rip), STATE1
	pxor STATE2, STATE3
	pxor STATE1, STATE4

	/*
	 * update 10 times with KEY / KEY xor IV.  Each update rotates the
	 * state by one register, so the xor target steps backwards; after
	 * ten rounds the words are back in STATE0..STATE4 order.
	 */
	aegis128_update; pxor KEY, STATE4
	aegis128_update; pxor T1,  STATE3
	aegis128_update; pxor KEY, STATE2
	aegis128_update; pxor T1,  STATE1
	aegis128_update; pxor KEY, STATE0
	aegis128_update; pxor T1,  STATE4
	aegis128_update; pxor KEY, STATE3
	aegis128_update; pxor T1,  STATE2
	aegis128_update; pxor KEY, STATE1
	aegis128_update; pxor T1,  STATE0

	/* store the state: */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)

	FRAME_END
	ret
SYM_FUNC_END(crypto_aegis128_aesni_init)
230
/*
 * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
 *                               const void *data);
 *
 * Absorbs every complete 16-byte block of associated data at SRC into the
 * state at STATEP.  A trailing partial block (length % 16 bytes) is ignored,
 * and when length < 16 the state is not touched at all.
 *
 * The loop is unrolled five times because each update rotates the state by
 * one register; the .Lad_out_<n> exits write the registers back in the order
 * that corresponds to the number of updates performed modulo 5.
 */
SYM_FUNC_START(crypto_aegis128_aesni_ad)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so the upper half of %rsi (LEN) is
	 * not guaranteed to be zero on entry.  Zero-extend it once so that
	 * the 64-bit arithmetic on LEN below is well defined.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lad_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* use the aligned-load loop only when SRC is 16-byte aligned */
	mov SRC, %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 8
.Lad_a_loop:
	movdqa 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_1

	movdqa 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_2

	movdqa 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_3

	movdqa 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_4

	movdqa 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_0

	add $0x50, SRC
	jmp .Lad_a_loop

.align 8
.Lad_u_loop:
	movdqu 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_1

	movdqu 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_2

	movdqu 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_3

	movdqu 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_4

	movdqu 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_0

	add $0x50, SRC
	jmp .Lad_u_loop

	/* store the state (five updates: registers are back in order): */
.Lad_out_0:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

	/* one update since the last full rotation: word 0 is in STATE4 */
.Lad_out_1:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

	/* two updates: word 0 is in STATE3 */
.Lad_out_2:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

	/* three updates: word 0 is in STATE2 */
.Lad_out_3:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

	/* four updates: word 0 is in STATE1 */
.Lad_out_4:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

	/* fewer than 16 bytes on entry: nothing absorbed, state untouched */
.Lad_out:
	FRAME_END
	ret
SYM_FUNC_END(crypto_aegis128_aesni_ad)
382
/*
 * encrypt_block - encrypt one full 16-byte block and absorb the plaintext
 *
 * a      'a' or 'u': selects movdqa or movdqu for the SRC and DST accesses
 * s0-s4  registers holding logical state words 0-4 before this block
 *        (s0 is accepted for symmetry but not referenced by the body)
 * i      position of the block inside the unrolled loop body; selects the
 *        SRC/DST offset (i * 16) and the exit label .Lenc_out_<i>, which the
 *        user of the macro has to provide
 *
 * The ciphertext written to DST is MSG ^ s1 ^ s4 ^ (s2 & s3).  After the
 * state update the plaintext is xored into s4, the register that holds
 * word 0 once aegis128_update has rotated the state.
 *
 * Changes MSG, T0, T1 and the flags, and subtracts 16 from LEN.  Leaves the
 * loop through .Lenc_out_<i> as soon as fewer than 16 bytes remain; the
 * comparison is signed, so LEN is expected to be non-negative as a 64-bit
 * value.
 */
.macro encrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	movdqa MSG, T0
	pxor \s1, T0
	pxor \s4, T0
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, T0
	movdq\a T0, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lenc_out_\i
.endm
400
/*
 * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Encrypts every complete 16-byte block from SRC to DST and updates the
 * state at STATEP.  A trailing partial block (length % 16 bytes) is left
 * alone, and when length < 16 neither DST nor the state is written.
 *
 * The loop body is unrolled five times with the state-register arguments
 * rotated by one per block; .Lenc_out_<i> writes the registers back in the
 * order that applies after block i of the body.
 */
SYM_FUNC_START(crypto_aegis128_aesni_enc)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so the upper half of %rsi (LEN) is
	 * not guaranteed to be zero on entry.  Zero-extend it once so that
	 * the 64-bit compares here and in encrypt_block are well defined.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lenc_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* the aligned loop needs both SRC and DST to be 16-byte aligned */
	mov  SRC,  %r8
	or   DST,  %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 8
.Lenc_a_loop:
	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_a_loop

.align 8
.Lenc_u_loop:
	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_u_loop

	/* store the state (exit after block 0: word 0 is in STATE4): */
.Lenc_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

	/* exit after block 1: word 0 is in STATE3 */
.Lenc_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

	/* exit after block 2: word 0 is in STATE2 */
.Lenc_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

	/* exit after block 3: word 0 is in STATE1 */
.Lenc_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

	/* exit after block 4: the registers are back in order */
.Lenc_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

	/* fewer than 16 bytes on entry: nothing to do */
.Lenc_out:
	FRAME_END
	ret
SYM_FUNC_END(crypto_aegis128_aesni_enc)
497
/*
 * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Encrypts one final partial block.  The input bytes are gathered into a
 * zero-padded block by __load_partial, the keystream is xored in, and only
 * the leading bytes of the result are written to DST by __store_partial.
 * The zero-padded plaintext block is then absorbed into the state, which is
 * written back to STATEP.
 */
SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* encrypt message: */
	call __load_partial

	/* T0 = MSG ^ STATE1 ^ STATE4 ^ (STATE2 & STATE3); MSG is kept intact */
	movdqa MSG, T0
	pxor STATE1, T0
	pxor STATE4, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0

	call __store_partial

	/* absorb the padded plaintext; the update clobbers T0 */
	aegis128_update
	pxor MSG, STATE4

	/* store the state (one update performed: word 0 is in STATE4): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
537
/*
 * decrypt_block - decrypt one full 16-byte block and absorb the plaintext
 *
 * a      'a' or 'u': selects movdqa or movdqu for the SRC and DST accesses
 * s0-s4  registers holding logical state words 0-4 before this block
 *        (s0 is accepted for symmetry but not referenced by the body)
 * i      position of the block inside the unrolled loop body; selects the
 *        SRC/DST offset (i * 16) and the exit label .Ldec_out_<i>, which the
 *        user of the macro has to provide
 *
 * The ciphertext is decrypted in place in MSG as
 * MSG ^ s1 ^ s4 ^ (s2 & s3), written to DST, and then xored into s4, the
 * register that holds word 0 once aegis128_update has rotated the state.
 *
 * Changes MSG, T0 (via aegis128_update), T1 and the flags, and subtracts 16
 * from LEN.  Leaves the loop through .Ldec_out_<i> as soon as fewer than 16
 * bytes remain; the comparison is signed, so LEN is expected to be
 * non-negative as a 64-bit value.
 */
.macro decrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	pxor \s1, MSG
	pxor \s4, MSG
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, MSG
	movdq\a MSG, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Ldec_out_\i
.endm
554
/*
 * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Decrypts every complete 16-byte block from SRC to DST and updates the
 * state at STATEP.  A trailing partial block (length % 16 bytes) is left
 * alone, and when length < 16 neither DST nor the state is written.
 *
 * The loop body is unrolled five times with the state-register arguments
 * rotated by one per block; .Ldec_out_<i> writes the registers back in the
 * order that applies after block i of the body.
 */
SYM_FUNC_START(crypto_aegis128_aesni_dec)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so the upper half of %rsi (LEN) is
	 * not guaranteed to be zero on entry.  Zero-extend it once so that
	 * the 64-bit compares here and in decrypt_block are well defined.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Ldec_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* the aligned loop needs both SRC and DST to be 16-byte aligned */
	mov  SRC, %r8
	or   DST, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 8
.Ldec_a_loop:
	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_a_loop

.align 8
.Ldec_u_loop:
	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_u_loop

	/* store the state (exit after block 0: word 0 is in STATE4): */
.Ldec_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

	/* exit after block 1: word 0 is in STATE3 */
.Ldec_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

	/* exit after block 2: word 0 is in STATE2 */
.Ldec_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

	/* exit after block 3: word 0 is in STATE1 */
.Ldec_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

	/* exit after block 4: the registers are back in order */
.Ldec_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

	/* fewer than 16 bytes on entry: nothing to do */
.Ldec_out:
	FRAME_END
	ret
SYM_FUNC_END(crypto_aegis128_aesni_dec)
651
/*
 * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Decrypts one final partial block.  The ciphertext bytes are gathered into
 * a zero-padded block by __load_partial, the keystream is xored in, and the
 * leading bytes of the result are written to DST by __store_partial.
 *
 * Unlike the encrypt direction, the bytes past the message end in MSG are
 * not zero after decryption (zero padding xor keystream), so they are masked
 * off before the block is absorbed into the state.
 */
SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* decrypt message: */
	call __load_partial

	pxor STATE1, MSG
	pxor STATE4, MSG
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, MSG

	/* __store_partial consumes T0, so hand it a copy and keep MSG */
	movdqa MSG, T0
	call __store_partial

	/*
	 * mask with byte count: four self-interleaves replicate the low
	 * byte of LEN into all 16 bytes of T0; comparing against the byte
	 * indices 0..15 then yields 0xff exactly where index < LEN.
	 */
	movq LEN, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	/* RIP-relative load: no absolute address, no relocation fixup */
	movdqa .Laegis128_counter(%rip), T1
	pcmpgtb T1, T0
	pand T0, MSG

	aegis128_update
	pxor MSG, STATE4

	/* store the state (one update performed: word 0 is in STATE4): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
701
/*
 * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Finishes the computation: mixes a block made of the two bit lengths into
 * the state for seven rounds, then xors all five state words into the
 * 16-byte buffer at tag_xor.  The state at STATEP is read but not written
 * back.
 */
SYM_FUNC_START(crypto_aegis128_aesni_final)
	FRAME_BEGIN

	/* length block: low qword = assoclen, high qword = cryptlen ... */
	movq %rdx, MSG
	movq %rcx, T0
	punpcklqdq T0, MSG
	/* ... each converted from a byte count to a bit count */
	psllq $3, MSG

	/* bring in the state */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* the block absorbed below is (lengths ^ state word 3) */
	pxor STATE3, MSG

	/* seven rounds, each absorbing the same block into the new word 0 */
	aegis128_update
	pxor MSG, STATE4
	aegis128_update
	pxor MSG, STATE3
	aegis128_update
	pxor MSG, STATE2
	aegis128_update
	pxor MSG, STATE1
	aegis128_update
	pxor MSG, STATE0
	aegis128_update
	pxor MSG, STATE4
	aegis128_update
	pxor MSG, STATE3

	/* collapse the five state words into one ... */
	pxor STATE0, STATE4
	pxor STATE1, STATE4
	pxor STATE2, STATE4
	pxor STATE3, STATE4

	/* ... and fold it into the caller's tag buffer in place */
	movdqu (%rsi), T0
	pxor STATE4, T0
	movdqu T0, (%rsi)

	FRAME_END
	ret
SYM_FUNC_END(crypto_aegis128_aesni_final)
748