1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ARIA Cipher 16-way parallel algorithm (AVX)
4 *
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6 *
7 */
8
9#include <linux/linkage.h>
10#include <linux/cfi_types.h>
11#include <asm/frame.h>
12
/* struct aria_ctx byte offsets — keep in sync with the C definition: */
#define enc_key 0
#define dec_key 272
#define rounds 544

/* register macros */
#define CTX %rdi
20
21
/*
 * BV8(a0..a7): pack eight 0/1 values into one byte, a0 being the
 * least-significant bit.  Lets the GF(2) bit-matrix rows below be
 * written in readable 0/1 form.
 */
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( (((a0) & 1) << 0) |				\
	  (((a1) & 1) << 1) |				\
	  (((a2) & 1) << 2) |				\
	  (((a3) & 1) << 3) |				\
	  (((a4) & 1) << 4) |				\
	  (((a5) & 1) << 5) |				\
	  (((a6) & 1) << 6) |				\
	  (((a7) & 1) << 7) )
31
/*
 * BM8X8(l0..l7): pack eight byte-sized matrix rows into a 64-bit
 * bit-matrix constant as consumed by the vgf2p8affine*qb
 * instructions; row l0 ends up in the most-significant byte.
 */
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l7) << (0 * 8)) |				\
	  ((l6) << (1 * 8)) |				\
	  ((l5) << (2 * 8)) |				\
	  ((l4) << (3 * 8)) |				\
	  ((l3) << (4 * 8)) |				\
	  ((l2) << (5 * 8)) |				\
	  ((l1) << (6 * 8)) |				\
	  ((l0) << (7 * 8)) )
41
/*
 * inc_le128(x, minus_one, tmp): add 1 to the little-endian 128-bit
 * integer in x.  minus_one must hold -1 in the low qword and 0 in the
 * high qword (see the setup in the CTR routine below): x - minus_one
 * adds 1 to the low qword, and the vpcmpeqq/vpslldq/vpsubq sequence
 * propagates the carry into the high qword when the low qword was
 * all-ones.  tmp is clobbered.
 */
#define inc_le128(x, minus_one, tmp)			\
	vpcmpeqq minus_one, x, tmp;			\
	vpsubq minus_one, x, x;				\
	vpslldq $8, tmp, tmp;				\
	vpsubq tmp, x, x;
47
/*
 * filter_8bit(x, lo_t, hi_t, mask4bit, tmp0): apply an 8-bit affine
 * transform to every byte of x via two nibble-indexed lookup tables:
 * lo_t is indexed by each byte's low nibble, hi_t by its high nibble,
 * and the two results are XORed.  mask4bit must hold 0x0f in every
 * byte; tmp0 is clobbered.
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;
56
/*
 * transpose_4x4(x0..x3, t1, t2): transpose the 4x4 matrix of 32-bit
 * words spread across registers x0..x3 (x0 holds row 0, etc.) using
 * unpack operations.  t1 and t2 are clobbered as scratch.
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	vpunpckhdq x1, x0, t2;				\
	vpunpckldq x1, x0, x0;				\
							\
	vpunpckldq x3, x2, t1;				\
	vpunpckhdq x3, x2, x2;				\
							\
	vpunpckhqdq t1, x0, x1;				\
	vpunpcklqdq t1, x0, x0;				\
							\
	vpunpckhqdq x2, t2, x3;				\
	vpunpcklqdq x2, t2, x2;
69
/*
 * byteslice_16x16b: transpose 16 x 16-byte blocks (held in the 16
 * register arguments) into byte-sliced form so that afterwards each
 * register holds one byte position of all 16 blocks.  st0/st1 are
 * 16-byte memory scratch slots.  The table load is RIP-relative so
 * the reference to local rodata also works in relocatable/PIE
 * kernels, matching the addressing style used elsewhere in this file.
 */
#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
121
/*
 * debyteslice_16x16b: inverse of byteslice_16x16b — convert the
 * byte-sliced state in the 16 register arguments back into 16
 * contiguous 16-byte blocks.  st0/st1 are 16-byte memory scratch
 * slots.  The table load is RIP-relative so the reference to local
 * rodata also works in relocatable/PIE kernels, matching the
 * addressing style used elsewhere in this file.
 */
#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
173
/* load 16 consecutive 16-byte blocks from memory (rio) into the
 * sixteen register arguments x0..x7/y0..y7 */
#define inpack16_pre(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     rio)				\
	vmovdqu (0 * 16)(rio), x0;			\
	vmovdqu (1 * 16)(rio), x1;			\
	vmovdqu (2 * 16)(rio), x2;			\
	vmovdqu (3 * 16)(rio), x3;			\
	vmovdqu (4 * 16)(rio), x4;			\
	vmovdqu (5 * 16)(rio), x5;			\
	vmovdqu (6 * 16)(rio), x6;			\
	vmovdqu (7 * 16)(rio), x7;			\
	vmovdqu (8 * 16)(rio), y0;			\
	vmovdqu (9 * 16)(rio), y1;			\
	vmovdqu (10 * 16)(rio), y2;			\
	vmovdqu (11 * 16)(rio), y3;			\
	vmovdqu (12 * 16)(rio), y4;			\
	vmovdqu (13 * 16)(rio), y5;			\
	vmovdqu (14 * 16)(rio), y6;			\
	vmovdqu (15 * 16)(rio), y7;
196
/* byteslice the 16 loaded blocks and spill the two 8-register halves
 * to the temporary areas mem_ab (x0..x7) and mem_cd (y0..y7) */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3,		\
			 x4, x5, x6, x7,		\
			 y0, y1, y2, y3,		\
			 y4, y5, y6, y7,		\
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu x0, 0 * 16(mem_ab);			\
	vmovdqu x1, 1 * 16(mem_ab);			\
	vmovdqu x2, 2 * 16(mem_ab);			\
	vmovdqu x3, 3 * 16(mem_ab);			\
	vmovdqu x4, 4 * 16(mem_ab);			\
	vmovdqu x5, 5 * 16(mem_ab);			\
	vmovdqu x6, 6 * 16(mem_ab);			\
	vmovdqu x7, 7 * 16(mem_ab);			\
	vmovdqu y0, 0 * 16(mem_cd);			\
	vmovdqu y1, 1 * 16(mem_cd);			\
	vmovdqu y2, 2 * 16(mem_cd);			\
	vmovdqu y3, 3 * 16(mem_cd);			\
	vmovdqu y4, 4 * 16(mem_cd);			\
	vmovdqu y5, 5 * 16(mem_cd);			\
	vmovdqu y6, 6 * 16(mem_cd);			\
	vmovdqu y7, 7 * 16(mem_cd);
225
/* store 16 output blocks from registers x0..x7/y0..y7 to memory (mem) */
#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu x0, 0 * 16(mem);			\
	vmovdqu x1, 1 * 16(mem);			\
	vmovdqu x2, 2 * 16(mem);			\
	vmovdqu x3, 3 * 16(mem);			\
	vmovdqu x4, 4 * 16(mem);			\
	vmovdqu x5, 5 * 16(mem);			\
	vmovdqu x6, 6 * 16(mem);			\
	vmovdqu x7, 7 * 16(mem);			\
	vmovdqu y0, 8 * 16(mem);			\
	vmovdqu y1, 9 * 16(mem);			\
	vmovdqu y2, 10 * 16(mem);			\
	vmovdqu y3, 11 * 16(mem);			\
	vmovdqu y4, 12 * 16(mem);			\
	vmovdqu y5, 13 * 16(mem);			\
	vmovdqu y6, 14 * 16(mem);			\
	vmovdqu y7, 15 * 16(mem);			\
247
/* spill an 8-register half of the byte-sliced state to mem_tmp;
 * idx (0 or 8) selects which group of eight 16-byte slots is used */
#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
259
/* reload an 8-register half of the byte-sliced state from mem_tmp;
 * idx (0 or 8) selects which group of eight 16-byte slots is read */
#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
271
/*
 * aria_ark_8way: AddRoundKey for eight byte-sliced state registers.
 * Each register is XORed with a single round-key byte broadcast to
 * all 16 lanes; (round * 16)(rk) addresses the 16-byte round key and
 * idx (0 or 8) selects its low or high half.  The 3,2,1,0 / 7,6,5,4
 * byte order matches the byte-sliced register layout.  t0 clobbered.
 * NOTE(review): vpbroadcastb with a memory operand is an AVX2
 * instruction, not plain AVX — confirm this "AVX" implementation is
 * only dispatched on AVX2-capable CPUs, or rework the broadcast.
 */
#define aria_ark_8way(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      t0, rk, idx, round)		\
	/* AddRoundKey */                               \
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
	vpxor t0, x0, x0;				\
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
	vpxor t0, x1, x1;				\
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
	vpxor t0, x2, x2;				\
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
	vpxor t0, x3, x3;				\
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
	vpxor t0, x4, x4;				\
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
	vpxor t0, x5, x5;				\
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
	vpxor t0, x6, x6;				\
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
	vpxor t0, x7, x7;
292
/*
 * aria_sbox_8way_gfni: apply ARIA's substitution layer (the four
 * S-boxes S1/S2 and their inverses) to eight byte-sliced registers
 * using GFNI affine instructions and the bit-matrix constants below.
 * t0..t4 are clobbered with the broadcast matrices; t5..t7 unused.
 * The matrix loads are RIP-relative so the references to local
 * rodata also work in relocatable/PIE kernels.
 * NOTE(review): vpbroadcastq with a memory operand is an AVX2
 * instruction, not plain AVX — confirm callers require AVX2+GFNI.
 */
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7
314
/*
 * aria_sbox_8way: ARIA's substitution layer built from AES-NI:
 * aesenclast/aesdeclast with an all-zero round key isolate the AES
 * (Inv)SubBytes step, the .L(inv_)shift_row shuffles undo the
 * accompanying rows permutation, and filter_8bit applies the extra
 * affine transforms for the S2 and X2 boxes.  t0..t7 are clobbered.
 * Table loads are RIP-relative so the references to local rodata
 * also work in relocatable/PIE kernels.
 * NOTE(review): vpbroadcastd with a memory operand is an AVX2
 * instruction, not plain AVX — confirm AVX2 is required by callers
 * (vbroadcastss would be the AVX-only alternative).
 */
#define aria_sbox_8way(x0, x1, x2, x3,            	\
		       x4, x5, x6, x7,			\
		       t0, t1, t2, t3,			\
		       t4, t5, t6, t7)			\
	vpxor t7, t7, t7;				\
	vmovdqa .Linv_shift_row(%rip), t0;		\
	vmovdqa .Lshift_row(%rip), t1;			\
	vpbroadcastd .L0f0f0f0f(%rip), t6;		\
	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
							\
	vaesenclast t7, x0, x0;				\
	vaesenclast t7, x4, x4;				\
	vaesenclast t7, x1, x1;				\
	vaesenclast t7, x5, x5;				\
	vaesdeclast t7, x2, x2;				\
	vaesdeclast t7, x6, x6;				\
							\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
	vaesdeclast t7, x3, x3;				\
	vaesdeclast t7, x7, x7;
356
/*
 * aria_diff_m: per-word diffusion over the byte-sliced registers
 * x0..x3 (one register per byte of a 32-bit word).  In sliced form
 * the byte rotations rotr32(X, 8) / rotr32(X, 16) become register
 * re-indexing, so the whole transform reduces to XORs plus one move.
 * t0..t3 are clobbered; the statement order is load-bearing.
 */
#define aria_diff_m(x0, x1, x2, x3,			\
		    t0, t1, t2, t3)			\
	/* T = rotr32(X, 8); */				\
	/* X ^= T */					\
	vpxor x0, x3, t0;				\
	vpxor x1, x0, t1;				\
	vpxor x2, x1, t2;				\
	vpxor x3, x2, t3;				\
	/* X = T ^ rotr(X, 16); */			\
	vpxor t2, x0, x0;				\
	vpxor x1, t3, t3;				\
	vpxor t0, x2, x2;				\
	vpxor t1, x3, x1;				\
	vmovdqu t3, x3;
371
/*
 * aria_diff_word: XOR-mixing between the four 32-bit state words
 * T0 = (x0..x3), T1 = (x4..x7), T2 = (y0..y3), T3 = (y4..y7), each
 * word spread over four byte-sliced registers.  The comment before
 * each group gives the word-level operation being performed.
 */
#define aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7)			\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;				\
							\
	/* t2 ^= t3; */					\
	vpxor y4, y0, y0;				\
	vpxor y5, y1, y1;				\
	vpxor y6, y2, y2;				\
	vpxor y7, y3, y3;				\
							\
	/* t0 ^= t1; */					\
	vpxor x4, x0, x0;				\
	vpxor x5, x1, x1;				\
	vpxor x6, x2, x2;				\
	vpxor x7, x3, x3;				\
							\
	/* t3 ^= t1; */					\
	vpxor x4, y4, y4;				\
	vpxor x5, y5, y5;				\
	vpxor x6, y6, y6;				\
	vpxor x7, y7, y7;				\
							\
	/* t2 ^= t0; */					\
	vpxor x0, y0, y0;				\
	vpxor x1, y1, y1;				\
	vpxor x2, y2, y2;				\
	vpxor x3, y3, y3;				\
							\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;
411
/*
 * aria_fe: one ARIA "FE"-type round over all 16 byte-sliced blocks.
 * The 16-register state is processed as two 8-register halves
 * spilled through mem_tmp (slots 8..15, then 0..7): each half gets
 * AddRoundKey, the S-layer (arguments permuted for the FE S-box
 * order) and the per-word diffusion aria_diff_m.  The two
 * aria_diff_word calls with permuted register arguments then
 * implement the word/byte diffusion described in the
 * aria_diff_byte() comment below.  y0..y7 are clobbered as scratch.
 */
#define aria_fe(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte() 				\
	 * T3 = ABCD -> BADC 				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
	 * T0 = ABCD -> CDAB 				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
	 * T1 = ABCD -> DCBA 				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
465
/*
 * aria_fo: one ARIA "FO"-type round over all 16 byte-sliced blocks.
 * Same two-half structure as aria_fe, but the S-layer is applied in
 * the FO S-box order (unpermuted arguments) and the diffusion uses
 * the FO byte permutation given in the aria_diff_byte() comment
 * below.  y0..y7 are clobbered as scratch.
 */
#define aria_fo(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte() 				\
	 * T1 = ABCD -> BADC 				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB 				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
	 * T3 = ABCD -> DCBA 				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
519
/*
 * aria_ff: final ARIA round — AddRoundKey, S-layer, then a second
 * AddRoundKey with the last (whitening) key in place of the
 * diffusion layer.  Both 8-register halves are processed through
 * mem_tmp as in aria_fe/aria_fo; y0..y7 are clobbered as scratch
 * and the second half is left loaded in y0..y7 on exit.
 */
#define aria_ff(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);
553
/*
 * aria_fe_gfni: GFNI variant of aria_fe — identical round structure
 * (two 8-register halves via mem_tmp, AddRoundKey, S-layer,
 * aria_diff_m, then the aria_diff_word/aria_diff_byte sequence) but
 * with the S-layer provided by aria_sbox_8way_gfni instead of the
 * AES-NI based aria_sbox_8way.  y0..y7 are clobbered as scratch.
 */
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3, 		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3, 		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte() 				\
	 * T3 = ABCD -> BADC 				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
	 * T0 = ABCD -> CDAB 				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
	 * T1 = ABCD -> DCBA 				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
611
/*
 * aria_fo_gfni: GFNI variant of aria_fo — identical round structure
 * but with the S-layer provided by aria_sbox_8way_gfni instead of
 * the AES-NI based aria_sbox_8way.  y0..y7 clobbered as scratch.
 */
#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3, 		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3, 		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte() 				\
	 * T1 = ABCD -> BADC 				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB 				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
	 * T3 = ABCD -> DCBA 				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);
669
/*
 * aria_ff_gfni: GFNI variant of the final round aria_ff —
 * AddRoundKey, GFNI S-layer, then AddRoundKey with the last
 * (whitening) key; no diffusion layer.  y0..y7 clobbered as scratch
 * and the second half is left loaded in y0..y7 on exit.
 */
#define aria_ff_gfni(x0, x1, x2, x3,			\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3, 		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3, 		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);
707
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

/* shuffle mask used by the 16x16 byte-slicing transpose macros */
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
/* forward shift-row permutation, used with AESDECLAST */
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* nibble lookup tables consumed by filter_8bit in aria_sbox_8way: */
/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

.section	.rodata.cst8, "aM", @progbits, 8
.align 8
/* 8x8 GF(2) bit-matrices (one byte per row, via BV8/BM8X8) and affine
 * constants for the GFNI-based S-layer (aria_sbox_8way_gfni): */
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask (0x0f in every byte lane) for filter_8bit's nibble split */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f
825
826.text
827
SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	*      %r9: rk (round-key array, pre-offset for enc or dec)
	*      %rsi: dst (also reused as 256-byte byte-slicing scratch)
	*      %rdx: src
	*      %xmm0..%xmm15: 16 loaded blocks (byte-sliced below)
	*/
	/* Core ARIA transform shared by encrypt and decrypt.  The
	 * round count in the context selects ARIA-128/192/256
	 * (12/14/16 rounds); each key size ends with the aria_ff
	 * final round using the last two key-schedule entries.
	 */

	FRAME_BEGIN

	movq %rsi, %rax;		/* %rax = scratch for sliced blocks 0..7 */
	leaq 8 * 16(%rax), %r8;		/* %r8  = scratch for sliced blocks 8..15 */

	/* transpose the 16 loaded blocks into byte-sliced form */
	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
	cmpl $12, rounds(CTX);		/* 12 rounds => ARIA-128 */
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, rounds(CTX);		/* 14 rounds => ARIA-192 */
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	/* convert back from byte-sliced form; the register argument
	 * order applies the final output permutation */
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
916
SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/
	/* Encrypt 16 x 16-byte blocks: load from src, run the shared
	 * core with the encryption key schedule, store to dst.
	 */

	FRAME_BEGIN

	leaq enc_key(CTX), %r9;		/* %r9 = encryption round keys */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	/* the core leaves %rax = dst; the register order here matches
	 * the permutation left by its final debyteslice step */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
941
SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/
	/* Decrypt 16 x 16-byte blocks: identical to the encrypt entry
	 * point except that the decryption key schedule is used.
	 */

	FRAME_BEGIN

	leaq dec_key(CTX), %r9;		/* %r9 = decryption round keys */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	/* the core leaves %rax = dst; the register order here matches
	 * the permutation left by its final debyteslice step */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
966
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* Build 16 big-endian CTR counter blocks: iv+0 .. iv+15.
	 * input:
	*      %rdi: ctx
	*      %rsi: dst
	*      %rdx: src
	*      %rcx: keystream
	*      %r8: iv (big endian, 128bit)
	 *
	 * output:
	 *      %xmm0..%xmm7:  counter blocks iv+0 .. iv+7
	 *      %xmm8..%xmm15: counter blocks iv+8 .. iv+15
	 *      keystream[0..7]: copies of blocks iv+0 .. iv+7
	 *      (%r8): advanced to iv+16 for the next call
	 * %rsi/%rdx are not touched here; they are passed through for the
	 * caller.
	*/

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	/* %xmm0 = { -1, 0 } = the 128-bit "+1" operand for inc_le128. */
	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs: %xmm3 counts in LE, each result is byteswapped
	 * back to BE.  First 8 blocks land in %xmm8..%xmm15 (iv itself is
	 * block 0). */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	/* Spill blocks iv+0..iv+7 to the keystream buffer so the register
	 * set is free for the next 8 counters. */
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	/* Blocks iv+8 .. iv+15 into %xmm8..%xmm15. */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	/* One more increment gives iv+16: write back as the next-call IV. */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	/* Reload blocks iv+0..iv+7 into %xmm0..%xmm7. */
	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
1042
SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* CTR-mode en/decryption of 16 blocks (AESNI/AVX variant):
	 * keystream = ARIA-encrypt(counter blocks); dst = src ^ keystream.
	 * input:
	*      %rdi: ctx
	*      %rsi: dst
	*      %rdx: src
	*      %rcx: keystream
	*      %r8: iv (big endian, 128bit)
	*/
	FRAME_BEGIN

	/* Fill %xmm0..%xmm15 (and keystream[0..7]) with counter blocks;
	 * advances the IV at (%r8) by 16. */
	call __aria_aesni_avx_ctr_gen_keystream_16way;

	/* The crypt core takes dst/src in %rsi/%rdx; park the real dst/src
	 * in %r10/%r11 and point both at the keystream scratch buffer. */
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	/* CTR always encrypts the counters, even when decrypting data. */
	leaq enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	/* XOR the encrypted counters with the 16 source blocks (%r11). */
	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	/* Write the 16 result blocks to the saved dst pointer. */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1086
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* Shared ARIA crypt core, GFNI variant.  Handles all key sizes by
	 * dispatching on rounds(CTX): 12 (128-bit key), 14 (192-bit) or
	 * 16 (256-bit).
	 * input:
	*      %r9: rk
	*      %rsi: dst
	*      %rdx: src
	*      %xmm0..%xmm15: 16 byte-sliced blocks
	 *      %rdi: ctx, CTX (implicit: rounds(CTX) is read below)
	*/

	FRAME_BEGIN

	/* Use the two 128-byte halves of dst as scratch areas for the
	 * byte-slicing transposes. */
	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	/* Rounds 1..11 are common to all key sizes; aria_fo/aria_fe
	 * alternate the odd/even round types, the last macro argument
	 * being the round-key index. */
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);
	/* 12 rounds => 128-bit key: finish with aria_ff (final round plus
	 * whitening, key indices 11 and 12). */
	cmpl $12, rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	/* 192/256-bit keys: two more rounds, then re-dispatch. */
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	/* 14 rounds => 192-bit key. */
	cmpl $14, rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	/* Otherwise 16 rounds => 256-bit key. */
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	/* Convert the state back from byte-sliced to block layout, again
	 * using the dst halves (%rax/%r8) as scratch. */
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1211
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* ARIA encryption of 16 blocks in parallel (GFNI variant).
	 * input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/

	FRAME_BEGIN

	/* %r9 = encryption round keys for the GFNI crypt core. */
	leaq enc_key(CTX), %r9;

	/* Load 16 plaintext blocks from src into %xmm0..%xmm15. */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	/* Store the 16 ciphertext blocks; the core left dst in %rax. */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1236
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* ARIA decryption of 16 blocks in parallel (GFNI variant).
	 * Identical to the GFNI encrypt entry point except that the
	 * decryption round-key schedule (dec_key) is selected.
	 * input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/

	FRAME_BEGIN

	/* %r9 = decryption round keys for the GFNI crypt core. */
	leaq dec_key(CTX), %r9;

	/* Load 16 ciphertext blocks from src into %xmm0..%xmm15. */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	/* Store the 16 plaintext blocks; the core left dst in %rax. */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1261
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* CTR-mode en/decryption of 16 blocks (GFNI variant):
	 * keystream = ARIA-encrypt(counter blocks); dst = src ^ keystream.
	 * input:
	*      %rdi: ctx
	*      %rsi: dst
	*      %rdx: src
	*      %rcx: keystream
	*      %r8: iv (big endian, 128bit)
	*/
	FRAME_BEGIN

	/* Fill %xmm0..%xmm15 (and keystream[0..7]) with counter blocks;
	 * advances the IV at (%r8) by 16.  (Added the trailing ';' for
	 * consistency with every other statement in this file.) */
	call __aria_aesni_avx_ctr_gen_keystream_16way;

	/* The crypt core takes dst/src in %rsi/%rdx; park the real dst/src
	 * in %r10/%r11 and point both at the keystream scratch buffer. */
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	/* CTR always encrypts the counters, even when decrypting data. */
	leaq enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	/* XOR the encrypted counters with the 16 source blocks (%r11). */
	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	/* Write the 16 result blocks to the saved dst pointer. */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1305