1/*
2 * AES-NI + SSE2 implementation of AEGIS-128
3 *
4 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
5 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 */
11
12#include <linux/linkage.h>
13#include <asm/frame.h>
14
/* The five 128-bit words of the AEGIS-128 state, one XMM register each. */
#define STATE0		%xmm0
#define STATE1		%xmm1
#define STATE2		%xmm2
#define STATE3		%xmm3
#define STATE4		%xmm4

/*
 * KEY and MSG share %xmm5: KEY is referenced only by the init routine,
 * MSG only by the routines that process data.
 */
#define KEY		%xmm5
#define MSG		%xmm5

/* Scratch vector registers. */
#define T0		%xmm6
#define T1		%xmm7

/*
 * Integer argument registers, named after the role they play in the
 * (state, length, src, dst) prototypes used below.
 */
#define STATEP		%rdi
#define LEN		%rsi
#define SRC		%rdx
#define DST		%rcx
29
/*
 * Initialisation constants: 32 consecutive Fibonacci numbers reduced
 * modulo 256 (0, 1, 1, 2, 3, 5, 8, ...), split into two 16-byte words.
 */
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
	.byte 0x00, 0x01, 0x01, 0x02
	.byte 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59
	.byte 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55
	.byte 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42
	.byte 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte i of this vector holds the value i. It is compared against a
 * broadcast byte count to build a "first N bytes" mask.
 */
.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
.align 16
.Laegis128_counter:
	.byte 0x00, 0x01, 0x02, 0x03
	.byte 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b
	.byte 0x0c, 0x0d, 0x0e, 0x0f
44
45.text
46
/*
 * aegis128_update - one state-update round, without the message XOR
 *
 * Each state register goes through one AES round (aesenc) with a
 * neighbouring register as the round key. The registers are updated in
 * place, so the logical state words end up rotated by one register:
 *
 *   logical word:   0       1       2       3       4
 *   before:       STATE0  STATE1  STATE2  STATE3  STATE4
 *   after:        STATE4  STATE0  STATE1  STATE2  STATE3
 *
 * Callers XOR the message block into whichever register now holds
 * logical word 0 and track the rotation themselves.
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state (rotated as shown above)
 * changed:
 *   T0
 */
.macro aegis128_update
	movdqa STATE4, T0		/* keep the old STATE4 for the last step */
	aesenc STATE0, STATE4		/* STATE4 = round(STATE4) ^ STATE0 */
	aesenc STATE1, STATE0		/* STATE0 = round(STATE0) ^ STATE1 */
	aesenc STATE2, STATE1		/* STATE1 = round(STATE1) ^ STATE2 */
	aesenc STATE3, STATE2		/* STATE2 = round(STATE2) ^ STATE3 */
	aesenc T0,     STATE3		/* STATE3 = round(STATE3) ^ old STATE4 */
.endm
64
/*
 * __load_partial: internal ABI
 *
 * Gathers the trailing (LEN & 0xf) bytes of a buffer into MSG, zero-padded
 * at the top. Each set bit of LEN selects one power-of-two sized piece:
 * bit 3 an 8-byte piece, bit 2 a 4-byte piece, bit 1 a 2-byte piece and
 * bit 0 a single byte. A piece sits at the offset obtained by clearing its
 * own bit and all lower bits of LEN. Bit 4 of LEN is kept in that offset,
 * so the pieces are taken from after the first 16 bytes when it is set.
 *
 * input:
 *   LEN - bytes
 *   SRC - src
 * output:
 *   MSG  - message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 */
__load_partial:
	pxor MSG, MSG
	xor %r9d, %r9d			/* %r9 collects the pieces below 8 bytes */

	/* single byte at offset (LEN & 0x1E) */
	test $0x1, LEN
	jz .Lld_partial_word

	mov LEN, %r8
	and $0x1E, %r8
	mov (SRC, %r8), %r9b

.Lld_partial_word:
	/* two bytes at offset (LEN & 0x1C), placed below the byte above */
	test $0x2, LEN
	jz .Lld_partial_dword

	mov LEN, %r8
	and $0x1C, %r8
	shl $0x10, %r9
	mov (SRC, %r8), %r9w

.Lld_partial_dword:
	/* four bytes at offset (LEN & 0x18), placed below what we have so far */
	test $0x4, LEN
	jz .Lld_partial_qword

	mov LEN, %r8
	and $0x18, %r8
	shl $32, %r9
	mov (SRC, %r8), %r8d		/* 32-bit load zero-extends into %r8 */
	xor %r8, %r9

.Lld_partial_qword:
	movq %r9, MSG

	/* eight bytes at offset (LEN & 0x10) become the low half of MSG */
	test $0x8, LEN
	jz .Lld_partial_done

	mov LEN, %r8
	and $0x10, %r8
	pslldq $8, MSG			/* move the small pieces to the high half */
	movq (SRC, %r8), T0
	pxor T0, MSG

.Lld_partial_done:
	ret
ENDPROC(__load_partial)
130
/*
 * __store_partial: internal ABI
 *
 * Writes the low bytes of T0 to DST in pieces of 8, 4, 2 and 1 bytes,
 * so at most 15 bytes are written.
 *
 * The byte count comes from callers whose C prototype declares it as
 * 'unsigned int', so only the low 32 bits of LEN are defined. The count is
 * therefore read as a 32-bit value (which zero-extends) and compared
 * unsigned.
 *
 * input:
 *   LEN - bytes (low 32 bits)
 *   DST - dst
 *   T0  - message block to store
 * changed:
 *   T0
 *   %r8
 *   %r9
 *   %r10
 */
__store_partial:
	mov %esi, %r8d			/* low 32 bits of LEN */
	mov DST, %r9

	movq T0, %r10			/* %r10 = low 8 bytes of the block */

	cmp $8, %r8d
	jb .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0			/* bring the high 8 bytes down */
	movq T0, %r10

	sub $8, %r8d
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8d
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8d
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8d
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $0x10, %r10

	sub $2, %r8d
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8d
	jb .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
188
/*
 * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
 *
 * Builds the initial state from key and IV and writes it to 'state':
 *
 *   STATE0 = key ^ iv
 *   STATE1 = const_1
 *   STATE2 = const_0
 *   STATE3 = key ^ const_0
 *   STATE4 = key ^ const_1
 *
 * followed by ten update rounds that alternately absorb key and key ^ iv.
 *
 * The key is read with an aligned load, so 'key' must be 16-byte aligned.
 * 'iv' and 'state' are accessed with unaligned moves.
 *
 * The constants are addressed RIP-relative so the code does not depend on
 * absolute addresses.
 */
ENTRY(crypto_aegis128_aesni_init)
	FRAME_BEGIN

	/* load IV: */
	movdqu (%rdx), T1

	/* load key: */
	movdqa (%rsi), KEY
	pxor KEY, T1			/* T1 = key ^ iv */
	movdqa T1, STATE0
	movdqa KEY, STATE3
	movdqa KEY, STATE4

	/* load the constants: */
	movdqa .Laegis128_const_0(%rip), STATE2
	movdqa .Laegis128_const_1(%rip), STATE1
	pxor STATE2, STATE3
	pxor STATE1, STATE4

	/*
	 * update 10 times with KEY / KEY xor IV. The target register steps
	 * backwards because each update rotates the state by one register.
	 */
	aegis128_update; pxor KEY, STATE4
	aegis128_update; pxor T1,  STATE3
	aegis128_update; pxor KEY, STATE2
	aegis128_update; pxor T1,  STATE1
	aegis128_update; pxor KEY, STATE0
	aegis128_update; pxor T1,  STATE4
	aegis128_update; pxor KEY, STATE3
	aegis128_update; pxor T1,  STATE2
	aegis128_update; pxor KEY, STATE1
	aegis128_update; pxor T1,  STATE0

	/* store the state (ten rotations bring it back to natural order): */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_init)
233
/*
 * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
 *                               const void *data);
 *
 * Absorbs every complete 16-byte block of 'data' into the state. Any
 * trailing (length % 16) bytes are ignored here. If length is below 16
 * the state is neither loaded nor stored.
 *
 * 'length' is a 32-bit argument, so the upper half of its register is
 * undefined on entry. It is zero-extended before the 64-bit LEN register
 * is used, and the remaining byte count is compared unsigned.
 *
 * The loop is unrolled five times, one step per rotation of the state.
 * Each exit label stores the registers in the order that matches the
 * number of blocks processed in the current group.
 */
ENTRY(crypto_aegis128_aesni_ad)
	FRAME_BEGIN

	mov %esi, %esi			/* clear the undefined upper half of LEN */

	cmp $0x10, LEN
	jb .Lad_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* take the aligned loop only if SRC is 16-byte aligned */
	mov SRC, %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 8
.Lad_a_loop:
	movdqa 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_1

	movdqa 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_2

	movdqa 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_3

	movdqa 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_4

	movdqa 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_0

	add $0x50, SRC
	jmp .Lad_a_loop

.align 8
.Lad_u_loop:
	movdqu 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_1

	movdqu 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_2

	movdqu 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_3

	movdqu 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_4

	movdqu 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_0

	add $0x50, SRC
	jmp .Lad_u_loop

	/* store the state; .Lad_out_N = N blocks done in the current group */
.Lad_out_0:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_1:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_2:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_3:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_4:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_ad)
385
/*
 * encrypt_block - encrypt one 16-byte block and absorb the plaintext
 *
 *   a      - 'a' or 'u': selects movdqa or movdqu for the SRC/DST accesses
 *   s0-s4  - registers currently holding logical state words 0-4
 *            (s0 is accepted for symmetry; the body does not use it)
 *   i      - index of the block within the unrolled group; selects the
 *            SRC/DST offset and the .Lenc_out_<i> exit label
 *
 * The keystream is s1 ^ s4 ^ (s2 & s3). After the update the register
 * that held word 4 holds the new word 0, so the plaintext is XORed there.
 * Leaves through .Lenc_out_<i> once fewer than 16 bytes remain.
 *
 * changed: MSG, T0, T1, LEN, flags
 */
.macro encrypt_block a s0 s1 s2 s3 s4 i
	/* T1 = keystream */
	movdqa \s2, T1
	pand \s3, T1
	pxor \s1, T1
	pxor \s4, T1

	/* ciphertext = plaintext ^ keystream; MSG keeps the plaintext */
	movdq\a (\i * 0x10)(SRC), MSG
	movdqa MSG, T0
	pxor T1, T0
	movdq\a T0, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lenc_out_\i
.endm
403
/*
 * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Encrypts every complete 16-byte block from 'src' to 'dst' and absorbs
 * the plaintext into the state. Any trailing (length % 16) bytes are not
 * processed here. If length is below 16 the state is neither loaded nor
 * stored.
 *
 * 'length' is a 32-bit argument, so the upper half of its register is
 * undefined on entry. It is zero-extended before the 64-bit LEN register
 * is used.
 *
 * Each encrypt_block invocation leaves through .Lenc_out_<i> when fewer
 * than 16 bytes remain. The exit labels store the registers in the order
 * that matches the rotation after block <i> of the group.
 */
ENTRY(crypto_aegis128_aesni_enc)
	FRAME_BEGIN

	mov %esi, %esi			/* clear the undefined upper half of LEN */

	cmp $0x10, LEN
	jb .Lenc_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* take the aligned loop only if both SRC and DST are 16-byte aligned */
	mov  SRC,  %r8
	or   DST,  %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 8
.Lenc_a_loop:
	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_a_loop

.align 8
.Lenc_u_loop:
	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_u_loop

	/* store the state: */
.Lenc_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc)
500
/*
 * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Encrypts a final partial block. The input bytes are gathered into a
 * zero-padded block by __load_partial, XORed with the keystream
 * STATE1 ^ STATE4 ^ (STATE2 & STATE3), and 'length' bytes of the result
 * are written by __store_partial. The zero-padded plaintext is then
 * absorbed into the state.
 *
 * 'length' is a 32-bit argument, so the upper half of its register is
 * undefined on entry. It is zero-extended before the helpers read the
 * 64-bit LEN register.
 */
ENTRY(crypto_aegis128_aesni_enc_tail)
	FRAME_BEGIN

	mov %esi, %esi			/* clear the undefined upper half of LEN */

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* encrypt message: */
	call __load_partial

	movdqa MSG, T0
	pxor STATE1, T0
	pxor STATE4, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0			/* T0 = ciphertext block */

	call __store_partial

	/* absorb the zero-padded plaintext still held in MSG */
	aegis128_update
	pxor MSG, STATE4

	/* store the state (rotated by one update): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc_tail)
540
/*
 * decrypt_block - decrypt one 16-byte block and absorb the plaintext
 *
 *   a      - 'a' or 'u': selects movdqa or movdqu for the SRC/DST accesses
 *   s0-s4  - registers currently holding logical state words 0-4
 *            (s0 is accepted for symmetry; the body does not use it)
 *   i      - index of the block within the unrolled group; selects the
 *            SRC/DST offset and the .Ldec_out_<i> exit label
 *
 * The keystream is s1 ^ s4 ^ (s2 & s3). After the update the register
 * that held word 4 holds the new word 0, so the recovered plaintext is
 * XORed there. Leaves through .Ldec_out_<i> once fewer than 16 bytes
 * remain.
 *
 * changed: MSG, T0, T1, LEN, flags
 */
.macro decrypt_block a s0 s1 s2 s3 s4 i
	/* T1 = keystream */
	movdqa \s2, T1
	pand \s3, T1
	pxor \s1, T1
	pxor \s4, T1

	/* plaintext = ciphertext ^ keystream */
	movdq\a (\i * 0x10)(SRC), MSG
	pxor T1, MSG
	movdq\a MSG, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Ldec_out_\i
.endm
557
/*
 * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Decrypts every complete 16-byte block from 'src' to 'dst' and absorbs
 * the recovered plaintext into the state. Any trailing (length % 16)
 * bytes are not processed here. If length is below 16 the state is
 * neither loaded nor stored.
 *
 * 'length' is a 32-bit argument, so the upper half of its register is
 * undefined on entry. It is zero-extended before the 64-bit LEN register
 * is used.
 *
 * Each decrypt_block invocation leaves through .Ldec_out_<i> when fewer
 * than 16 bytes remain. The exit labels store the registers in the order
 * that matches the rotation after block <i> of the group.
 */
ENTRY(crypto_aegis128_aesni_dec)
	FRAME_BEGIN

	mov %esi, %esi			/* clear the undefined upper half of LEN */

	cmp $0x10, LEN
	jb .Ldec_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* take the aligned loop only if both SRC and DST are 16-byte aligned */
	mov  SRC, %r8
	or   DST, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 8
.Ldec_a_loop:
	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_a_loop

.align 8
.Ldec_u_loop:
	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_u_loop

	/* store the state: */
.Ldec_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec)
654
/*
 * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Decrypts a final partial block. The input bytes are gathered into a
 * zero-padded block by __load_partial and XORed with the keystream
 * STATE1 ^ STATE4 ^ (STATE2 & STATE3); 'length' bytes of the result are
 * written by __store_partial. Bytes of the block beyond 'length' then hold
 * raw keystream, so they are masked to zero before the plaintext is
 * absorbed into the state.
 *
 * 'length' is a 32-bit argument, so the upper half of its register is
 * undefined on entry. It is zero-extended before the 64-bit LEN register
 * is used. The counter constant is addressed RIP-relative so the code does
 * not depend on absolute addresses.
 */
ENTRY(crypto_aegis128_aesni_dec_tail)
	FRAME_BEGIN

	mov %esi, %esi			/* clear the undefined upper half of LEN */

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* decrypt message: */
	call __load_partial

	pxor STATE1, MSG
	pxor STATE4, MSG
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, MSG

	/* __store_partial consumes T0, so hand it a copy and keep MSG */
	movdqa MSG, T0
	call __store_partial

	/*
	 * mask with byte count: four self-interleaves replicate the low
	 * byte of LEN into all 16 bytes of T0. The signed byte compare then
	 * yields 0xff where the byte index is below that value.
	 * NOTE(review): presumably length < 16 here; confirm against caller.
	 */
	movq LEN, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	movdqa .Laegis128_counter(%rip), T1
	pcmpgtb T1, T0
	pand T0, MSG

	aegis128_update
	pxor MSG, STATE4

	/* store the state (rotated by one update): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec_tail)
704
/*
 * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Runs the finalisation rounds and XORs the resulting tag into the
 * 16 bytes at 'tag_xor'. The state in memory is only read, never written
 * back.
 *
 * The length block carries assoclen * 8 in its low 64 bits and
 * cryptlen * 8 in its high 64 bits. It is XORed with STATE3 and the
 * result is absorbed in each of seven update rounds. The tag is the XOR
 * of all five state words.
 */
ENTRY(crypto_aegis128_aesni_final)
	FRAME_BEGIN

	/* prepare length block (needs only the integer arguments): */
	movq %rcx, T0
	pslldq $8, T0			/* cryptlen goes to the high half */
	movq %rdx, MSG			/* assoclen goes to the low half */
	pxor T0, MSG
	psllq $3, MSG			/* bytes to bits, per 64-bit half */

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	pxor STATE3, MSG

	/* update state, following the register rotation of each round: */
	aegis128_update
	pxor MSG, STATE4
	aegis128_update
	pxor MSG, STATE3
	aegis128_update
	pxor MSG, STATE2
	aegis128_update
	pxor MSG, STATE1
	aegis128_update
	pxor MSG, STATE0
	aegis128_update
	pxor MSG, STATE4
	aegis128_update
	pxor MSG, STATE3

	/* xor tag: fold every state word into the caller's buffer */
	movdqu (%rsi), MSG

	pxor STATE4, MSG
	pxor STATE3, MSG
	pxor STATE2, MSG
	pxor STATE1, MSG
	pxor STATE0, MSG

	movdqu MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_final)
751