xref: /openbmc/linux/arch/x86/crypto/aria-aesni-avx-asm_64.S (revision c496daeb863093a046e0bb8db7265bf45d91775a)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ARIA Cipher 16-way parallel algorithm (AVX)
4 *
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6 *
7 */
8
9#include <linux/linkage.h>
10#include <linux/cfi_types.h>
11#include <asm/asm-offsets.h>
12#include <asm/frame.h>
13
14/* register macros */
/* CTX: the ctx pointer argument; the entry points in this file receive it in %rdi. */
15#define CTX %rdi
16
17
/*
 * BV8: pack eight single-bit values into one byte.
 * Only bit 0 of each argument is used; a0 lands in bit 0 (LSB) and a7 in
 * bit 7 (MSB) of the result.
 */
18#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
19	( (((a0) & 1) << 0) |				\
20	  (((a1) & 1) << 1) |				\
21	  (((a2) & 1) << 2) |				\
22	  (((a3) & 1) << 3) |				\
23	  (((a4) & 1) << 4) |				\
24	  (((a5) & 1) << 5) |				\
25	  (((a6) & 1) << 6) |				\
26	  (((a7) & 1) << 7) )
27
/*
 * BM8X8: pack eight row bytes into one 64-bit value.
 * l0 is placed in the most significant byte (bits 63..56) and l7 in the
 * least significant byte (bits 7..0). The arguments are not masked here, so
 * each one must already fit in 8 bits (e.g. a BV8() result).
 */
28#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
29	( ((l7) << (0 * 8)) |				\
30	  ((l6) << (1 * 8)) |				\
31	  ((l5) << (2 * 8)) |				\
32	  ((l4) << (3 * 8)) |				\
33	  ((l3) << (4 * 8)) |				\
34	  ((l2) << (5 * 8)) |				\
35	  ((l1) << (6 * 8)) |				\
36	  ((l0) << (7 * 8)) )
37
/*
 * inc_le128: update the 128-bit value in x, handled as two 64-bit lanes with
 * a carry propagated from the low lane into the high lane.
 *   x:         value, updated in place
 *   minus_one: subtracted from x lane-wise; a lane of x equal to the matching
 *              lane of minus_one is what triggers the carry. Presumably
 *              { low = -1, high = 0 }, making the net effect x += 1 -- the
 *              caller is not visible here, confirm.
 *   tmp:       scratch register (clobbered)
 * The compare result of the low lane is moved into the high lane by the
 * 8-byte left shift and then subtracted (all-ones == -1) from the high lane.
 */
38#define inc_le128(x, minus_one, tmp)			\
39	vpcmpeqq minus_one, x, tmp;			\
40	vpsubq minus_one, x, x;				\
41	vpslldq $8, tmp, tmp;				\
42	vpsubq tmp, x, x;
43
/*
 * filter_8bit: byte-wise transform of x through two 16-entry lookup tables.
 *   x:        input bytes, replaced by the result
 *   lo_t:     table indexed by the low nibble of each byte
 *   hi_t:     table indexed by the high nibble of each byte
 *   mask4bit: mask selecting the low nibble of every byte (callers in this
 *             file pass a broadcast of 0x0f0f0f0f)
 *   tmp0:     scratch register (clobbered)
 * Result per byte: lo_t[x & 0x0f] ^ hi_t[x >> 4].
 * The low nibbles are cleared before the 32-bit shift so that no bits leak
 * across byte boundaries.
 */
44#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
45	vpand x, mask4bit, tmp0;			\
46	vpandn x, mask4bit, x;				\
47	vpsrld $4, x, x;				\
48							\
49	vpshufb tmp0, lo_t, tmp0;			\
50	vpshufb x, hi_t, x;				\
51	vpxor tmp0, x, x;
52
/*
 * transpose_4x4: in-place transpose of a 4x4 matrix of 32-bit elements.
 *   x0..x3: the four rows on entry, the four columns on exit
 *   t1, t2: scratch registers (clobbered)
 * Done in two steps: interleave 32-bit elements of row pairs, then
 * interleave the resulting 64-bit halves.
 */
53#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
54	vpunpckhdq x1, x0, t2;				\
55	vpunpckldq x1, x0, x0;				\
56							\
57	vpunpckldq x3, x2, t1;				\
58	vpunpckhdq x3, x2, x2;				\
59							\
60	vpunpckhqdq t1, x0, x1;				\
61	vpunpcklqdq t1, x0, x0;				\
62							\
63	vpunpckhqdq x2, t2, x3;				\
64	vpunpcklqdq x2, t2, x2;
65
/*
 * byteslice_16x16b: rearrange the 16x16 byte matrix held in 16 registers
 * into byte-sliced form, in place.
 *   a0..d3:   the 16 vector registers (all are read and written)
 *   st0, st1: two 16-byte memory operands used as spill slots; their previous
 *             contents are overwritten
 * Three passes:
 *   1. transpose_4x4 on the register groups (a0..a3), (b0..b3), (c0..c3),
 *      (d0..d3);
 *   2. shuffle the bytes of every register with .Lshufb_16x16b;
 *   3. transpose_4x4 on the groups (a0,b0,c0,d0), (a1,b1,c1,d1),
 *      (a2,b2,c2,d2), (a3,b3,c3,d3).
 * Two registers are spilled to st0/st1 around each pass to provide the
 * scratch registers transpose_4x4 needs and the register that holds the
 * shuffle mask; the exact order of spills and reloads matters.
 */
66#define byteslice_16x16b(a0, b0, c0, d0,		\
67			 a1, b1, c1, d1,		\
68			 a2, b2, c2, d2,		\
69			 a3, b3, c3, d3,		\
70			 st0, st1)			\
71	vmovdqu d2, st0;				\
72	vmovdqu d3, st1;				\
73	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
74	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
75	vmovdqu st0, d2;				\
76	vmovdqu st1, d3;				\
77							\
78	vmovdqu a0, st0;				\
79	vmovdqu a1, st1;				\
80	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
81	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
82							\
83	vmovdqu .Lshufb_16x16b(%rip), a0;		\
84	vmovdqu st1, a1;				\
85	vpshufb a0, a2, a2;				\
86	vpshufb a0, a3, a3;				\
87	vpshufb a0, b0, b0;				\
88	vpshufb a0, b1, b1;				\
89	vpshufb a0, b2, b2;				\
90	vpshufb a0, b3, b3;				\
91	vpshufb a0, a1, a1;				\
92	vpshufb a0, c0, c0;				\
93	vpshufb a0, c1, c1;				\
94	vpshufb a0, c2, c2;				\
95	vpshufb a0, c3, c3;				\
96	vpshufb a0, d0, d0;				\
97	vpshufb a0, d1, d1;				\
98	vpshufb a0, d2, d2;				\
99	vpshufb a0, d3, d3;				\
100	vmovdqu d3, st1;				\
101	vmovdqu st0, d3;				\
102	vpshufb a0, d3, a0;				\
103	vmovdqu d2, st0;				\
104							\
105	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
106	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
107	vmovdqu st0, d2;				\
108	vmovdqu st1, d3;				\
109							\
110	vmovdqu b0, st0;				\
111	vmovdqu b1, st1;				\
112	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
113	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
114	vmovdqu st0, b0;				\
115	vmovdqu st1, b1;				\
116	/* does not adjust output bytes inside vectors */
117
/*
 * debyteslice_16x16b: counterpart of byteslice_16x16b, used on the final
 * state to leave the byte-sliced representation.
 *   a0..d3:   the 16 vector registers (all are read and written)
 *   st0, st1: two 16-byte memory operands used as spill slots; their previous
 *             contents are overwritten
 * The instruction sequence is the same as byteslice_16x16b except for the
 * last pass, where the four transpose_4x4 calls take their registers in
 * (c, d, a, b) order instead of (a, b, c, d).
 */
118#define debyteslice_16x16b(a0, b0, c0, d0,		\
119			   a1, b1, c1, d1,		\
120			   a2, b2, c2, d2,		\
121			   a3, b3, c3, d3,		\
122			   st0, st1)			\
123	vmovdqu d2, st0;				\
124	vmovdqu d3, st1;				\
125	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
126	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
127	vmovdqu st0, d2;				\
128	vmovdqu st1, d3;				\
129							\
130	vmovdqu a0, st0;				\
131	vmovdqu a1, st1;				\
132	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
133	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
134							\
135	vmovdqu .Lshufb_16x16b(%rip), a0;		\
136	vmovdqu st1, a1;				\
137	vpshufb a0, a2, a2;				\
138	vpshufb a0, a3, a3;				\
139	vpshufb a0, b0, b0;				\
140	vpshufb a0, b1, b1;				\
141	vpshufb a0, b2, b2;				\
142	vpshufb a0, b3, b3;				\
143	vpshufb a0, a1, a1;				\
144	vpshufb a0, c0, c0;				\
145	vpshufb a0, c1, c1;				\
146	vpshufb a0, c2, c2;				\
147	vpshufb a0, c3, c3;				\
148	vpshufb a0, d0, d0;				\
149	vpshufb a0, d1, d1;				\
150	vpshufb a0, d2, d2;				\
151	vpshufb a0, d3, d3;				\
152	vmovdqu d3, st1;				\
153	vmovdqu st0, d3;				\
154	vpshufb a0, d3, a0;				\
155	vmovdqu d2, st0;				\
156							\
157	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
158	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
159	vmovdqu st0, d2;				\
160	vmovdqu st1, d3;				\
161							\
162	vmovdqu b0, st0;				\
163	vmovdqu b1, st1;				\
164	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
165	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
166	vmovdqu st0, b0;				\
167	vmovdqu st1, b1;				\
168	/* does not adjust output bytes inside vectors */
169
170/* load 16 blocks into registers (plain unaligned loads, no whitening here) */
/*
 *   x0..x7, y0..y7: destination registers; block i (16 bytes at offset
 *                   i * 16 from rio) goes to the i-th register in this order
 *   rio:            base address of 256 bytes of input
 */
171#define inpack16_pre(x0, x1, x2, x3,			\
172		     x4, x5, x6, x7,			\
173		     y0, y1, y2, y3,			\
174		     y4, y5, y6, y7,			\
175		     rio)				\
176	vmovdqu (0 * 16)(rio), x0;			\
177	vmovdqu (1 * 16)(rio), x1;			\
178	vmovdqu (2 * 16)(rio), x2;			\
179	vmovdqu (3 * 16)(rio), x3;			\
180	vmovdqu (4 * 16)(rio), x4;			\
181	vmovdqu (5 * 16)(rio), x5;			\
182	vmovdqu (6 * 16)(rio), x6;			\
183	vmovdqu (7 * 16)(rio), x7;			\
184	vmovdqu (8 * 16)(rio), y0;			\
185	vmovdqu (9 * 16)(rio), y1;			\
186	vmovdqu (10 * 16)(rio), y2;			\
187	vmovdqu (11 * 16)(rio), y3;			\
188	vmovdqu (12 * 16)(rio), y4;			\
189	vmovdqu (13 * 16)(rio), y5;			\
190	vmovdqu (14 * 16)(rio), y6;			\
191	vmovdqu (15 * 16)(rio), y7;
192
193/* byteslice the loaded blocks and store them to temporary memory */
/*
 *   x0..x7, y0..y7: the 16 loaded blocks; byte-sliced in place
 *   mem_ab:         base of a 128-byte area that receives x0..x7
 *   mem_cd:         base of a 128-byte area that receives y0..y7
 * The first 16 bytes at mem_ab and at mem_cd are also handed to
 * byteslice_16x16b as its two spill slots; this is safe because both areas
 * are fully overwritten by the stores that follow.
 * All 16 registers still hold the byte-sliced state on exit.
 */
194#define inpack16_post(x0, x1, x2, x3,			\
195		      x4, x5, x6, x7,			\
196		      y0, y1, y2, y3,			\
197		      y4, y5, y6, y7,			\
198		      mem_ab, mem_cd)			\
199	byteslice_16x16b(x0, x1, x2, x3,		\
200			 x4, x5, x6, x7,		\
201			 y0, y1, y2, y3,		\
202			 y4, y5, y6, y7,		\
203			 (mem_ab), (mem_cd));		\
204							\
205	vmovdqu x0, 0 * 16(mem_ab);			\
206	vmovdqu x1, 1 * 16(mem_ab);			\
207	vmovdqu x2, 2 * 16(mem_ab);			\
208	vmovdqu x3, 3 * 16(mem_ab);			\
209	vmovdqu x4, 4 * 16(mem_ab);			\
210	vmovdqu x5, 5 * 16(mem_ab);			\
211	vmovdqu x6, 6 * 16(mem_ab);			\
212	vmovdqu x7, 7 * 16(mem_ab);			\
213	vmovdqu y0, 0 * 16(mem_cd);			\
214	vmovdqu y1, 1 * 16(mem_cd);			\
215	vmovdqu y2, 2 * 16(mem_cd);			\
216	vmovdqu y3, 3 * 16(mem_cd);			\
217	vmovdqu y4, 4 * 16(mem_cd);			\
218	vmovdqu y5, 5 * 16(mem_cd);			\
219	vmovdqu y6, 6 * 16(mem_cd);			\
220	vmovdqu y7, 7 * 16(mem_cd);
221
/*
 * write_output: store 16 registers to 16 consecutive 16-byte slots.
 *   x0..x7, y0..y7: registers to store; the i-th register in this order is
 *                   written at offset i * 16
 *   mem:            base address of the 256-byte destination
 * Unaligned stores are used, so mem needs no particular alignment.
 */
222#define write_output(x0, x1, x2, x3,			\
223		     x4, x5, x6, x7,			\
224		     y0, y1, y2, y3,			\
225		     y4, y5, y6, y7,			\
226		     mem)				\
227	vmovdqu x0, 0 * 16(mem);			\
228	vmovdqu x1, 1 * 16(mem);			\
229	vmovdqu x2, 2 * 16(mem);			\
230	vmovdqu x3, 3 * 16(mem);			\
231	vmovdqu x4, 4 * 16(mem);			\
232	vmovdqu x5, 5 * 16(mem);			\
233	vmovdqu x6, 6 * 16(mem);			\
234	vmovdqu x7, 7 * 16(mem);			\
235	vmovdqu y0, 8 * 16(mem);			\
236	vmovdqu y1, 9 * 16(mem);			\
237	vmovdqu y2, 10 * 16(mem);			\
238	vmovdqu y3, 11 * 16(mem);			\
239	vmovdqu y4, 12 * 16(mem);			\
240	vmovdqu y5, 13 * 16(mem);			\
241	vmovdqu y6, 14 * 16(mem);			\
242	vmovdqu y7, 15 * 16(mem);
243
/*
 * aria_store_state_8way: spill eight state registers to memory.
 *   x0..x7:  registers to store
 *   mem_tmp: base address of the state buffer (16-byte slots)
 *   idx:     first slot index; x0..x7 go to slots idx .. idx + 7
 */
244#define aria_store_state_8way(x0, x1, x2, x3,		\
245			      x4, x5, x6, x7,		\
246			      mem_tmp, idx)		\
247	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
248	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
249	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
250	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
251	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
252	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
253	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
254	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
255
/*
 * aria_load_state_8way: reload eight state registers from memory.
 *   x0..x7:  destination registers
 *   mem_tmp: base address of the state buffer (16-byte slots)
 *   idx:     first slot index; slots idx .. idx + 7 are loaded into x0..x7
 */
256#define aria_load_state_8way(x0, x1, x2, x3,		\
257			     x4, x5, x6, x7,		\
258			     mem_tmp, idx)		\
259	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
260	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
261	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
262	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
263	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
264	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
265	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
266	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
267
/*
 * aria_ark_8way: XOR eight round-key bytes into eight byte-sliced state
 * registers.
 *   x0..x7: state registers, updated in place
 *   t0:     scratch; receives the broadcast 32-bit round-key word
 *   t1:     vpshufb index vector used to replicate byte 0 of t2 across the
 *           register; it must be all-zero for that (every caller in this file
 *           clears it with vpxor immediately before the first use)
 *   t2:     scratch
 *   rk:     base address of the round keys
 *   idx:    byte offset inside the 16-byte round key (0 or 8 in this file)
 *   round:  round-key index; the key is read at rk + round * 16 + idx
 * Two 32-bit words are read (at +0 and +4). For each word the bytes are
 * applied from most significant to least significant:
 *   x0 ^= bits 31..24, x1 ^= bits 23..16, x2 ^= bits 15..8, x3 ^= bits 7..0
 * and likewise x4..x7 for the second word.
 */
268#define aria_ark_8way(x0, x1, x2, x3,			\
269		      x4, x5, x6, x7,			\
270		      t0, t1, t2, rk,			\
271		      idx, round)			\
272	/* AddRoundKey */                               \
273	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
274	vpsrld $24, t0, t2;				\
275	vpshufb t1, t2, t2;				\
276	vpxor t2, x0, x0;				\
277	vpsrld $16, t0, t2;				\
278	vpshufb t1, t2, t2;				\
279	vpxor t2, x1, x1;				\
280	vpsrld $8, t0, t2;				\
281	vpshufb t1, t2, t2;				\
282	vpxor t2, x2, x2;				\
283	vpshufb t1, t0, t2;				\
284	vpxor t2, x3, x3;				\
285	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
286	vpsrld $24, t0, t2;				\
287	vpshufb t1, t2, t2;				\
288	vpxor t2, x4, x4;				\
289	vpsrld $16, t0, t2;				\
290	vpshufb t1, t2, t2;				\
291	vpxor t2, x5, x5;				\
292	vpsrld $8, t0, t2;				\
293	vpshufb t1, t2, t2;				\
294	vpxor t2, x6, x6;				\
295	vpshufb t1, t0, t2;				\
296	vpxor t2, x7, x7;
297
298#ifdef CONFIG_AS_GFNI
/*
 * aria_sbox_8way_gfni: substitution layer on eight byte-sliced registers
 * using the GFNI affine instructions.
 *   x0..x7: state registers, updated in place
 *   t0..t4: scratch; loaded with the five bit-matrix constants
 *   t5..t7: accepted for signature parity with aria_sbox_8way; not touched
 * Per register pair:
 *   x0, x4: affine-of-inverse with the AES affine matrix / tf_aff_const
 *   x1, x5: affine-of-inverse with the S2 matrix / tf_s2_const
 *   x2, x6: affine with the AES inverse-affine matrix / tf_inv_const, then
 *           affine-of-inverse with the identity matrix and constant 0
 *   x3, x7: affine with the X2 matrix / tf_x2_const, then affine-of-inverse
 *           with the identity matrix and constant 0
 * The constants are loaded with vmovdqa, which requires each of them to be
 * 16-byte aligned in memory.
 * The last instruction deliberately has no trailing ';' -- the invocation
 * supplies it.
 */
299#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
300			    x4, x5, x6, x7,		\
301			    t0, t1, t2, t3,		\
302			    t4, t5, t6, t7)		\
303	vmovdqa .Ltf_s2_bitmatrix(%rip), t0;		\
304	vmovdqa .Ltf_inv_bitmatrix(%rip), t1;		\
305	vmovdqa .Ltf_id_bitmatrix(%rip), t2;		\
306	vmovdqa .Ltf_aff_bitmatrix(%rip), t3;		\
307	vmovdqa .Ltf_x2_bitmatrix(%rip), t4;		\
308	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
309	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
310	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
311	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
312	vgf2p8affineinvqb $0, t2, x2, x2;		\
313	vgf2p8affineinvqb $0, t2, x6, x6;		\
314	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
315	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
316	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
317	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
318	vgf2p8affineinvqb $0, t2, x3, x3;		\
319	vgf2p8affineinvqb $0, t2, x7, x7
320
321#endif /* CONFIG_AS_GFNI */
322
/*
 * aria_sbox_8way: substitution layer on eight byte-sliced registers built
 * from the AES-NI last-round instructions plus nibble table lookups.
 *   x0..x7: state registers, updated in place
 *   t0..t6: scratch; loaded with the shuffle masks, the four lookup tables
 *           and the 0x0f nibble mask
 *   t7:     used as the round-key operand of vaesenclast/vaesdeclast and
 *           never written here; it has to be all-zero so that no key material
 *           is XORed in (callers in this file clear it with vpxor first)
 * Per register pair:
 *   x0, x4: vaesenclast, then the .Linv_shift_row shuffle
 *   x1, x5: vaesenclast, the .Linv_shift_row shuffle, then filter_8bit with
 *           the inv_aff/s2 tables
 *   x2, x6: vaesdeclast, then the .Lshift_row shuffle
 *   x3, x7: the .Lshift_row shuffle and filter_8bit with the x2/fwd_aff
 *           tables first, then vaesdeclast
 * The byte shuffles cancel the row shifts that the AES instructions perform,
 * leaving only the byte substitution; for x3/x7 the shuffle is applied ahead
 * of vaesdeclast because that instruction comes last.
 */
323#define aria_sbox_8way(x0, x1, x2, x3,            	\
324		       x4, x5, x6, x7,			\
325		       t0, t1, t2, t3,			\
326		       t4, t5, t6, t7)			\
327	vmovdqa .Linv_shift_row(%rip), t0;		\
328	vmovdqa .Lshift_row(%rip), t1;			\
329	vbroadcastss .L0f0f0f0f(%rip), t6;		\
330	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
331	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
332	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
333	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
334							\
335	vaesenclast t7, x0, x0;				\
336	vaesenclast t7, x4, x4;				\
337	vaesenclast t7, x1, x1;				\
338	vaesenclast t7, x5, x5;				\
339	vaesdeclast t7, x2, x2;				\
340	vaesdeclast t7, x6, x6;				\
341							\
342	/* AES inverse shift rows */			\
343	vpshufb t0, x0, x0;				\
344	vpshufb t0, x4, x4;				\
345	vpshufb t0, x1, x1;				\
346	vpshufb t0, x5, x5;				\
347	vpshufb t1, x3, x3;				\
348	vpshufb t1, x7, x7;				\
349	vpshufb t1, x2, x2;				\
350	vpshufb t1, x6, x6;				\
351							\
352	/* affine transformation for S2 */		\
353	filter_8bit(x1, t2, t3, t6, t0);		\
354	/* affine transformation for S2 */		\
355	filter_8bit(x5, t2, t3, t6, t0);		\
356							\
357	/* affine transformation for X2 */		\
358	filter_8bit(x3, t4, t5, t6, t0);		\
359	/* affine transformation for X2 */		\
360	filter_8bit(x7, t4, t5, t6, t0);		\
361	vaesdeclast t7, x3, x3;				\
362	vaesdeclast t7, x7, x7;
363
/*
 * aria_diff_m: mix four byte-sliced registers so that each output is the
 * XOR of three of the four inputs.
 *   x0..x3: inputs, replaced by the outputs
 *   t0..t3: scratch registers (clobbered)
 * With a, b, c, d the values of x0, x1, x2, x3 on entry:
 *   x0 = a ^ b ^ c
 *   x1 = a ^ b ^ d
 *   x2 = a ^ c ^ d
 *   x3 = b ^ c ^ d
 * The inline comments give the equivalent word-level formulation.
 */
364#define aria_diff_m(x0, x1, x2, x3,			\
365		    t0, t1, t2, t3)			\
366	/* T = rotr32(X, 8); */				\
367	/* X ^= T */					\
368	vpxor x0, x3, t0;				\
369	vpxor x1, x0, t1;				\
370	vpxor x2, x1, t2;				\
371	vpxor x3, x2, t3;				\
372	/* X = T ^ rotr(X, 16); */			\
373	vpxor t2, x0, x0;				\
374	vpxor x1, t3, t3;				\
375	vpxor t0, x2, x2;				\
376	vpxor t1, x3, x1;				\
377	vmovdqu t3, x3;
378
/*
 * aria_diff_word: word-level mixing across the four register groups.
 *   x0..x3: group t0      x4..x7: group t1
 *   y0..y3: group t2      y4..y7: group t3
 * All 16 registers are updated in place; no scratch register is needed.
 * Each step XORs one whole group into another, register by register, in the
 * order given by the inline comments:
 *   t1 ^= t2; t2 ^= t3; t0 ^= t1; t3 ^= t1; t2 ^= t0; t1 ^= t2
 * Every step consumes the result of earlier ones, so the order is
 * significant.
 */
379#define aria_diff_word(x0, x1, x2, x3,			\
380		       x4, x5, x6, x7,			\
381		       y0, y1, y2, y3,			\
382		       y4, y5, y6, y7)			\
383	/* t1 ^= t2; */					\
384	vpxor y0, x4, x4;				\
385	vpxor y1, x5, x5;				\
386	vpxor y2, x6, x6;				\
387	vpxor y3, x7, x7;				\
388							\
389	/* t2 ^= t3; */					\
390	vpxor y4, y0, y0;				\
391	vpxor y5, y1, y1;				\
392	vpxor y6, y2, y2;				\
393	vpxor y7, y3, y3;				\
394							\
395	/* t0 ^= t1; */					\
396	vpxor x4, x0, x0;				\
397	vpxor x5, x1, x1;				\
398	vpxor x6, x2, x2;				\
399	vpxor x7, x3, x3;				\
400							\
401	/* t3 ^= t1; */					\
402	vpxor x4, y4, y4;				\
403	vpxor x5, y5, y5;				\
404	vpxor x6, y6, y6;				\
405	vpxor x7, y7, y7;				\
406							\
407	/* t2 ^= t0; */					\
408	vpxor x0, y0, y0;				\
409	vpxor x1, y1, y1;				\
410	vpxor x2, y2, y2;				\
411	vpxor x3, y3, y3;				\
412							\
413	/* t1 ^= t2; */					\
414	vpxor y0, x4, x4;				\
415	vpxor y1, x5, x5;				\
416	vpxor y2, x6, x6;				\
417	vpxor y3, x7, x7;
418
/*
 * aria_fe: one full round over the 16-register byte-sliced state, with the
 * substitution layer applied in the (x2, x3, x0, x1, x6, x7, x4, x5) register
 * order.
 *   x0..x7:  on entry, state slots 8..15 (the half keyed with round-key
 *            bytes 8..15)
 *   y0..y7:  scratch on entry; y7 is zeroed and serves as the zero vector
 *            for aria_ark_8way and aria_sbox_8way
 *   mem_tmp: 256-byte state buffer; slots 0..7 hold the other half on entry
 *   rk:      base address of the round keys
 *   round:   round-key index
 * Sequence:
 *   1. slots 8..15: add round key, substitute, aria_diff_m on both groups
 *      of four, spill to mem_tmp slots 8..15;
 *   2. slots 0..7: reload, then the same three steps, spill to slots 0..7;
 *   3. reload slots 8..15 into y0..y7 and run aria_diff_word twice, the
 *      second time with permuted register arguments, which realises the byte
 *      permutation without moving any data;
 *   4. spill the x registers to slots 0..7 in (x3, x2, x1, x0, x6, x7, x4, x5)
 *      order.
 * On exit slots 0..7 are in mem_tmp and the other half is live in y0..y7.
 */
419#define aria_fe(x0, x1, x2, x3,				\
420		x4, x5, x6, x7,				\
421		y0, y1, y2, y3,				\
422		y4, y5, y6, y7,				\
423		mem_tmp, rk, round)			\
424	vpxor y7, y7, y7;				\
425	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
426		      y0, y7, y2, rk, 8, round);	\
427							\
428	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
429		       y0, y1, y2, y3, y4, y5, y6, y7);	\
430							\
431	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
432	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
433	aria_store_state_8way(x0, x1, x2, x3,		\
434			      x4, x5, x6, x7,		\
435			      mem_tmp, 8);		\
436							\
437	aria_load_state_8way(x0, x1, x2, x3,		\
438			     x4, x5, x6, x7,		\
439			     mem_tmp, 0);		\
440	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
441		      y0, y7, y2, rk, 0, round);	\
442							\
443	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
444		       y0, y1, y2, y3, y4, y5, y6, y7);	\
445							\
446	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
447	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
448	aria_store_state_8way(x0, x1, x2, x3,		\
449			      x4, x5, x6, x7,		\
450			      mem_tmp, 0);		\
451	aria_load_state_8way(y0, y1, y2, y3,		\
452			     y4, y5, y6, y7,		\
453			     mem_tmp, 8);		\
454	aria_diff_word(x0, x1, x2, x3,			\
455		       x4, x5, x6, x7,			\
456		       y0, y1, y2, y3,			\
457		       y4, y5, y6, y7);			\
458	/* aria_diff_byte() 				\
459	 * T3 = ABCD -> BADC 				\
460	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
461	 * T0 = ABCD -> CDAB 				\
462	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
463	 * T1 = ABCD -> DCBA 				\
464	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
465	 */						\
466	aria_diff_word(x2, x3, x0, x1,			\
467		       x7, x6, x5, x4,			\
468		       y0, y1, y2, y3,			\
469		       y5, y4, y7, y6);			\
470	aria_store_state_8way(x3, x2, x1, x0,		\
471			      x6, x7, x4, x5,		\
472			      mem_tmp, 0);
473
/*
 * aria_fo: one full round over the 16-register byte-sliced state, with the
 * substitution layer applied in natural (x0..x7) register order.
 *   x0..x7:  on entry, state slots 8..15 (the half keyed with round-key
 *            bytes 8..15)
 *   y0..y7:  scratch on entry; y7 is zeroed and serves as the zero vector
 *            for aria_ark_8way and aria_sbox_8way
 *   mem_tmp: 256-byte state buffer; slots 0..7 hold the other half on entry
 *   rk:      base address of the round keys
 *   round:   round-key index
 * Same structure as aria_fe: add round key, substitute and aria_diff_m on
 * slots 8..15 and then on slots 0..7 (spilling through mem_tmp), followed
 * by two aria_diff_word passes. It differs from aria_fe in the register
 * order given to aria_sbox_8way and in the register permutation used for
 * the second aria_diff_word.
 * On exit slots 0..7 are in mem_tmp, stored in
 * (x3, x2, x1, x0, x6, x7, x4, x5) order, and the other half is live in
 * y0..y7.
 */
474#define aria_fo(x0, x1, x2, x3,				\
475		x4, x5, x6, x7,				\
476		y0, y1, y2, y3,				\
477		y4, y5, y6, y7,				\
478		mem_tmp, rk, round)			\
479	vpxor y7, y7, y7;				\
480	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
481		      y0, y7, y2, rk, 8, round);	\
482							\
483	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
484		       y0, y1, y2, y3, y4, y5, y6, y7);	\
485							\
486	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
487	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
488	aria_store_state_8way(x0, x1, x2, x3,		\
489			      x4, x5, x6, x7,		\
490			      mem_tmp, 8);		\
491							\
492	aria_load_state_8way(x0, x1, x2, x3,		\
493			     x4, x5, x6, x7,		\
494			     mem_tmp, 0);		\
495	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
496		      y0, y7, y2, rk, 0, round);	\
497							\
498	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
499		       y0, y1, y2, y3, y4, y5, y6, y7);	\
500							\
501	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
502	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
503	aria_store_state_8way(x0, x1, x2, x3,		\
504			      x4, x5, x6, x7,		\
505			      mem_tmp, 0);		\
506	aria_load_state_8way(y0, y1, y2, y3,		\
507			     y4, y5, y6, y7,		\
508			     mem_tmp, 8);		\
509	aria_diff_word(x0, x1, x2, x3,			\
510		       x4, x5, x6, x7,			\
511		       y0, y1, y2, y3,			\
512		       y4, y5, y6, y7);			\
513	/* aria_diff_byte() 				\
514	 * T1 = ABCD -> BADC 				\
515	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
516	 * T2 = ABCD -> CDAB 				\
517	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
518	 * T3 = ABCD -> DCBA 				\
519	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
520	 */						\
521	aria_diff_word(x0, x1, x2, x3,			\
522		       x5, x4, x7, x6,			\
523		       y2, y3, y0, y1,			\
524		       y7, y6, y5, y4);			\
525	aria_store_state_8way(x3, x2, x1, x0,		\
526			      x6, x7, x4, x5,		\
527			      mem_tmp, 0);
528
/*
 * aria_ff: final round -- add round key, substitute, add the last round
 * key; no diffusion step.
 *   x0..x7:     on entry, state slots 8..15
 *   y0..y7:     scratch on entry; y7 is zeroed and serves as the zero vector
 *               for aria_ark_8way and aria_sbox_8way
 *   mem_tmp:    256-byte state buffer; slots 0..7 hold the other half on
 *               entry
 *   rk:         base address of the round keys
 *   round:      index of the round key applied before the substitution
 *   last_round: index of the round key applied after the substitution
 * Slots 8..15 are processed first and spilled to mem_tmp, then slots 0..7
 * are reloaded and processed. The substitution uses the same register order
 * as aria_fe.
 * On exit the whole state is in registers: slots 0..7 in x0..x7 and slots
 * 8..15 reloaded into y0..y7.
 */
529#define aria_ff(x0, x1, x2, x3,				\
530		x4, x5, x6, x7,				\
531		y0, y1, y2, y3,				\
532		y4, y5, y6, y7,				\
533		mem_tmp, rk, round, last_round)		\
534	vpxor y7, y7, y7;				\
535	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
536		      y0, y7, y2, rk, 8, round);	\
537							\
538	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
539		       y0, y1, y2, y3, y4, y5, y6, y7);	\
540							\
541	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
542		      y0, y7, y2, rk, 8, last_round);	\
543							\
544	aria_store_state_8way(x0, x1, x2, x3,		\
545			      x4, x5, x6, x7,		\
546			      mem_tmp, 8);		\
547							\
548	aria_load_state_8way(x0, x1, x2, x3,		\
549			     x4, x5, x6, x7,		\
550			     mem_tmp, 0);		\
551	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
552		      y0, y7, y2, rk, 0, round);	\
553							\
554	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
555		       y0, y1, y2, y3, y4, y5, y6, y7);	\
556							\
557	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
558		      y0, y7, y2, rk, 0, last_round);	\
559							\
560	aria_load_state_8way(y0, y1, y2, y3,		\
561			     y4, y5, y6, y7,		\
562			     mem_tmp, 8);
563
564#ifdef CONFIG_AS_GFNI
/*
 * aria_fe_gfni: same round as aria_fe, but the substitution layer is
 * aria_sbox_8way_gfni instead of aria_sbox_8way.
 *   x0..x7:  on entry, state slots 8..15
 *   y0..y7:  scratch on entry; y7 is zeroed for aria_ark_8way
 *   mem_tmp: 256-byte state buffer; slots 0..7 hold the other half on entry
 *   rk:      base address of the round keys
 *   round:   round-key index
 * On exit slots 0..7 are in mem_tmp, stored in
 * (x3, x2, x1, x0, x6, x7, x4, x5) order, and the other half is live in
 * y0..y7.
 */
565#define aria_fe_gfni(x0, x1, x2, x3,			\
566		     x4, x5, x6, x7,			\
567		     y0, y1, y2, y3,			\
568		     y4, y5, y6, y7,			\
569		     mem_tmp, rk, round)		\
570	vpxor y7, y7, y7;				\
571	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
572		      y0, y7, y2, rk, 8, round);	\
573							\
574	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
575			    x6, x7, x4, x5,		\
576			    y0, y1, y2, y3, 		\
577			    y4, y5, y6, y7);		\
578							\
579	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
580	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
581	aria_store_state_8way(x0, x1, x2, x3,		\
582			      x4, x5, x6, x7,		\
583			      mem_tmp, 8);		\
584							\
585	aria_load_state_8way(x0, x1, x2, x3,		\
586			     x4, x5, x6, x7,		\
587			     mem_tmp, 0);		\
588	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
589		      y0, y7, y2, rk, 0, round);	\
590							\
591	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
592			    x6, x7, x4, x5,		\
593			    y0, y1, y2, y3, 		\
594			    y4, y5, y6, y7);		\
595							\
596	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
597	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
598	aria_store_state_8way(x0, x1, x2, x3,		\
599			      x4, x5, x6, x7,		\
600			      mem_tmp, 0);		\
601	aria_load_state_8way(y0, y1, y2, y3,		\
602			     y4, y5, y6, y7,		\
603			     mem_tmp, 8);		\
604	aria_diff_word(x0, x1, x2, x3,			\
605		       x4, x5, x6, x7,			\
606		       y0, y1, y2, y3,			\
607		       y4, y5, y6, y7);			\
608	/* aria_diff_byte() 				\
609	 * T3 = ABCD -> BADC 				\
610	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
611	 * T0 = ABCD -> CDAB 				\
612	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
613	 * T1 = ABCD -> DCBA 				\
614	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
615	 */						\
616	aria_diff_word(x2, x3, x0, x1,			\
617		       x7, x6, x5, x4,			\
618		       y0, y1, y2, y3,			\
619		       y5, y4, y7, y6);			\
620	aria_store_state_8way(x3, x2, x1, x0,		\
621			      x6, x7, x4, x5,		\
622			      mem_tmp, 0);
623
/*
 * aria_fo_gfni: same round as aria_fo, but the substitution layer is
 * aria_sbox_8way_gfni instead of aria_sbox_8way.
 *   x0..x7:  on entry, state slots 8..15
 *   y0..y7:  scratch on entry; y7 is zeroed for aria_ark_8way
 *   mem_tmp: 256-byte state buffer; slots 0..7 hold the other half on entry
 *   rk:      base address of the round keys
 *   round:   round-key index
 * On exit slots 0..7 are in mem_tmp, stored in
 * (x3, x2, x1, x0, x6, x7, x4, x5) order, and the other half is live in
 * y0..y7.
 */
624#define aria_fo_gfni(x0, x1, x2, x3,			\
625		     x4, x5, x6, x7,			\
626		     y0, y1, y2, y3,			\
627		     y4, y5, y6, y7,			\
628		     mem_tmp, rk, round)		\
629	vpxor y7, y7, y7;				\
630	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
631		      y0, y7, y2, rk, 8, round);	\
632							\
633	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
634			    x4, x5, x6, x7,		\
635			    y0, y1, y2, y3, 		\
636			    y4, y5, y6, y7);		\
637							\
638	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
639	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
640	aria_store_state_8way(x0, x1, x2, x3,		\
641			      x4, x5, x6, x7,		\
642			      mem_tmp, 8);		\
643							\
644	aria_load_state_8way(x0, x1, x2, x3,		\
645			     x4, x5, x6, x7,		\
646			     mem_tmp, 0);		\
647	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
648		      y0, y7, y2, rk, 0, round);	\
649							\
650	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
651			    x4, x5, x6, x7,		\
652			    y0, y1, y2, y3, 		\
653			    y4, y5, y6, y7);		\
654							\
655	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
656	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
657	aria_store_state_8way(x0, x1, x2, x3,		\
658			      x4, x5, x6, x7,		\
659			      mem_tmp, 0);		\
660	aria_load_state_8way(y0, y1, y2, y3,		\
661			     y4, y5, y6, y7,		\
662			     mem_tmp, 8);		\
663	aria_diff_word(x0, x1, x2, x3,			\
664		       x4, x5, x6, x7,			\
665		       y0, y1, y2, y3,			\
666		       y4, y5, y6, y7);			\
667	/* aria_diff_byte() 				\
668	 * T1 = ABCD -> BADC 				\
669	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
670	 * T2 = ABCD -> CDAB 				\
671	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
672	 * T3 = ABCD -> DCBA 				\
673	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
674	 */						\
675	aria_diff_word(x0, x1, x2, x3,			\
676		       x5, x4, x7, x6,			\
677		       y2, y3, y0, y1,			\
678		       y7, y6, y5, y4);			\
679	aria_store_state_8way(x3, x2, x1, x0,		\
680			      x6, x7, x4, x5,		\
681			      mem_tmp, 0);
682
/*
 * aria_ff_gfni: same final round as aria_ff, but the substitution layer is
 * aria_sbox_8way_gfni instead of aria_sbox_8way.
 *   x0..x7:     on entry, state slots 8..15
 *   y0..y7:     scratch on entry; y7 is zeroed for aria_ark_8way
 *   mem_tmp:    256-byte state buffer; slots 0..7 hold the other half on
 *               entry
 *   rk:         base address of the round keys
 *   round:      index of the round key applied before the substitution
 *   last_round: index of the round key applied after the substitution
 * On exit the whole state is in registers: slots 0..7 in x0..x7 and slots
 * 8..15 reloaded into y0..y7.
 */
683#define aria_ff_gfni(x0, x1, x2, x3,			\
684		x4, x5, x6, x7,				\
685		y0, y1, y2, y3,				\
686		y4, y5, y6, y7,				\
687		mem_tmp, rk, round, last_round)		\
688	vpxor y7, y7, y7;				\
689	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
690		      y0, y7, y2, rk, 8, round);	\
691							\
692	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
693			    x6, x7, x4, x5,		\
694			    y0, y1, y2, y3, 		\
695			    y4, y5, y6, y7);		\
696							\
697	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
698		      y0, y7, y2, rk, 8, last_round);	\
699							\
700	aria_store_state_8way(x0, x1, x2, x3,		\
701			      x4, x5, x6, x7,		\
702			      mem_tmp, 8);		\
703							\
704	aria_load_state_8way(x0, x1, x2, x3,		\
705			     x4, x5, x6, x7,		\
706			     mem_tmp, 0);		\
707	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
708		      y0, y7, y2, rk, 0, round);	\
709							\
710	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
711			    x6, x7, x4, x5,		\
712			    y0, y1, y2, y3, 		\
713			    y4, y5, y6, y7);		\
714							\
715	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
716		      y0, y7, y2, rk, 0, last_round);	\
717							\
718	aria_load_state_8way(y0, y1, y2, y3,		\
719			     y4, y5, y6, y7,		\
720			     mem_tmp, 8);
721
722#endif /* CONFIG_AS_GFNI */
723
724/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
725.section	.rodata.cst16, "aM", @progbits, 16
726.align 16
727
/* SHUFB_BYTES(idx): the four byte indices idx, idx + 4, idx + 8, idx + 12. */
728#define SHUFB_BYTES(idx) \
729	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
730
/*
 * vpshufb mask used by byteslice_16x16b/debyteslice_16x16b: gathers bytes
 * 0, 4, 8, 12, then 1, 5, 9, 13, and so on, i.e. it groups together the bytes
 * that occupy the same position within each 32-bit element.
 */
731.Lshufb_16x16b:
732	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
733/* For isolating SubBytes from AESENCLAST, inverse shift row */
734.Linv_shift_row:
735	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
736	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
/* Counterpart shuffle, applied to the registers that go through AESDECLAST */
737.Lshift_row:
738	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
739	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
740/* For CTR-mode IV byteswap */
741.Lbswap128_mask:
742	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
743	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
744
745/* AES inverse affine and S2 combined:
746 *      1 1 0 0 0 0 0 1     x0     0
747 *      0 1 0 0 1 0 0 0     x1     0
748 *      1 1 0 0 1 1 1 1     x2     0
749 *      0 1 1 0 1 0 0 1     x3     1
750 *      0 1 0 0 1 1 0 0  *  x4  +  0
751 *      0 1 0 1 1 0 0 0     x5     0
752 *      0 0 0 0 0 1 0 1     x6     0
753 *      1 1 1 0 0 1 1 1     x7     1
 *
 * Stored as two 16-entry nibble tables for filter_8bit: the "lo" table is
 * indexed by the low nibble of a byte and the "hi" table by the high nibble.
754 */
755.Ltf_lo__inv_aff__and__s2:
756	.octa 0x92172DA81A9FA520B2370D883ABF8500
757.Ltf_hi__inv_aff__and__s2:
758	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
759
760/* X2 and AES forward affine combined:
761 *      1 0 1 1 0 0 0 1     x0     0
762 *      0 1 1 1 1 0 1 1     x1     0
763 *      0 0 0 1 1 0 1 0     x2     1
764 *      0 1 0 0 0 1 0 0     x3     0
765 *      0 0 1 1 1 0 1 1  *  x4  +  0
766 *      0 1 0 0 1 0 0 0     x5     0
767 *      1 1 0 1 0 0 1 1     x6     0
768 *      0 1 0 0 1 0 1 0     x7     0
 *
 * Stored as two 16-entry nibble tables for filter_8bit, as above.
769 */
770.Ltf_lo__x2__and__fwd_aff:
771	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
772.Ltf_hi__x2__and__fwd_aff:
773	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
774
775#ifdef CONFIG_AS_GFNI
/*
 * GFNI bit-matrix constants.
 *
 * Each matrix is emitted as two identical quads (16 bytes in total) and is
 * loaded as a whole vector with vmovdqa by aria_sbox_8way_gfni(), so every
 * label below has to be 16-byte aligned. The constants therefore stay in the
 * 16-byte aligned, 16-byte mergeable .rodata.cst16 section opened above
 * instead of switching to an 8-byte aligned .rodata.cst8 section, where the
 * aligned loads could fault and the two equal quads of one matrix would be
 * separate 8-byte merge entities.
 *
 * The tf_*_const values are the 8-bit immediates (the constant vector of the
 * affine transform) given to vgf2p8affineqb/vgf2p8affineinvqb together with
 * the corresponding matrix.
 */
778/* AES affine: */
779#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
780.Ltf_aff_bitmatrix:
781	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
782		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
783		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
784		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
785		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
786		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
787		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
788		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
789	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
790		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
791		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
792		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
793		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
794		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
795		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
796		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
797
798/* AES inverse affine: */
799#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
800.Ltf_inv_bitmatrix:
801	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
802		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
803		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
804		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
805		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
806		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
807		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
808		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
809	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
810		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
811		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
812		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
813		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
814		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
815		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
816		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
817
818/* S2: */
819#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
820.Ltf_s2_bitmatrix:
821	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
822		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
823		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
824		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
825		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
826		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
827		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
828		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
829	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
830		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
831		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
832		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
833		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
834		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
835		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
836		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
837
838/* X2: */
839#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
840.Ltf_x2_bitmatrix:
841	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
842		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
843		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
844		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
845		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
846		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
847		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
848		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
849	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
850		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
851		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
852		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
853		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
854		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
855		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
856		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
857
858/* Identity matrix: */
859.Ltf_id_bitmatrix:
860	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
861		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
862		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
863		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
864		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
865		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
866		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
867		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
868	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
869		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
870		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
871		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
872		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
873		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
874		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
875		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
876#endif /* CONFIG_AS_GFNI */
877
878/* 4-bit mask */
/*
 * One 32-bit word with the low nibble of every byte set. aria_sbox_8way
 * broadcasts it with vbroadcastss and passes it to filter_8bit as mask4bit.
 * It lives in its own 4-byte mergeable section.
 */
879.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
880.align 4
881.L0f0f0f0f:
882	.long 0x0f0f0f0f
883
884.text
885
/*
 * Core 16-block transform shared by the encrypt and decrypt entry points.
 * The direction is decided only by which round-key array the caller passes
 * in %r9.
 */
886SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
887	/* input:
	*      %rdi: ctx, CTX (read for ARIA_CTX_rounds)
888	*      %r9: rk
889	*      %rsi: dst
890	*      %rdx: src
891	*      %xmm0..%xmm15: 16 byte-sliced blocks
	* output:
	*      %rax: dst
	*      %xmm0..%xmm15: 16 result blocks; the callers store them in the
	*      order %xmm1, %xmm0, %xmm3, %xmm2, %xmm4..%xmm15
	* clobbers:
	*      %r8, and the 256 bytes at dst, which serve as the working
	*      state buffer for the round macros
892	*/
893
894	FRAME_BEGIN
895
	/* dst doubles as scratch: %rax = slots 0..7, %r8 = slots 8..15 */
896	movq %rsi, %rax;
897	leaq 8 * 16(%rax), %r8;
898
899	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
900		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
901		      %xmm15, %rax, %r8);
	/*
	 * Rounds alternate aria_fo / aria_fe. Each macro leaves half of the
	 * state in its y registers, which become the x arguments of the next
	 * one, hence the two register sets swapping roles from call to call.
	 */
902	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
903		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
904		%rax, %r9, 0);
905	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
906		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
907		%xmm15, %rax, %r9, 1);
908	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
909		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
910		%rax, %r9, 2);
911	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
912		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
913		%xmm15, %rax, %r9, 3);
914	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
915		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
916		%rax, %r9, 4);
917	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
918		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
919		%xmm15, %rax, %r9, 5);
920	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
921		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
922		%rax, %r9, 6);
923	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
924		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
925		%xmm15, %rax, %r9, 7);
926	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
927		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
928		%rax, %r9, 8);
929	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
930		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
931		%xmm15, %rax, %r9, 9);
932	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
933		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
934		%rax, %r9, 10);
	/* 12 rounds: finish with round keys 11 and 12 */
935	cmpl $12, ARIA_CTX_rounds(CTX);
936	jne .Laria_192;
937	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
938		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
939		%xmm15, %rax, %r9, 11, 12);
940	jmp .Laria_end;
941.Laria_192:
942	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
943		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
944		%xmm15, %rax, %r9, 11);
945	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
946		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
947		%rax, %r9, 12);
	/* 14 rounds: finish with round keys 13 and 14 */
948	cmpl $14, ARIA_CTX_rounds(CTX);
949	jne .Laria_256;
950	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
951		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
952		%xmm15, %rax, %r9, 13, 14);
953	jmp .Laria_end;
	/* any other round count is handled as 16 rounds (keys 15 and 16 last) */
954.Laria_256:
955	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
956		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
957		%xmm15, %rax, %r9, 13);
958	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
959		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
960		%rax, %r9, 14);
961	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
962		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
963		%xmm15, %rax, %r9, 15, 16);
964.Laria_end:
	/* leave the byte-sliced form; (%rax) and (%r8) are the spill slots */
965	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
966			   %xmm9, %xmm13, %xmm0, %xmm5,
967			   %xmm10, %xmm14, %xmm3, %xmm6,
968			   %xmm11, %xmm15, %xmm2, %xmm7,
969			   (%rax), (%r8));
970
971	FRAME_END
972	RET;
973SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
974
/*
 * aria_aesni_avx_encrypt_16way - 16-way entry point that runs the AES-NI/AVX
 * core routine with the encryption key material of the context.
 *
 * Sequence: point %r9 at the enc_key field of ctx, hand %xmm0..%xmm15 and the
 * src pointer to inpack16_pre, call __aria_aesni_avx_crypt_16way, then pass
 * the sixteen result registers to write_output.
 */
975SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
976	/* input:
977	*      %rdi: ctx, CTX
978	*      %rsi: dst
979	*      %rdx: src
980	*/
981
982	FRAME_BEGIN
983
	/* %r9 = address of the enc_key field inside ctx */
984	leaq ARIA_CTX_enc_key(CTX), %r9;
985
	/*
	 * inpack16_pre is defined outside this excerpt; presumably it loads
	 * the 16 input blocks from (%rdx) into %xmm0..%xmm15 -- confirm.
	 */
986	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
987		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
988		     %xmm15, %rdx);
989
990	call __aria_aesni_avx_crypt_16way;
991
	/*
	 * The register list starts %xmm1, %xmm0, %xmm3, %xmm2: the same order
	 * the core routine uses for its final aria_ff round.
	 * NOTE(review): %rax is not set in this function; this relies on the
	 * core routine returning with the destination pointer in %rax --
	 * confirm against its prologue.
	 */
992	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
993		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
994		     %xmm15, %rax);
995
996	FRAME_END
997	RET;
998SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
999
/*
 * aria_aesni_avx_decrypt_16way - 16-way entry point that runs the AES-NI/AVX
 * core routine with the decryption key material of the context.
 *
 * Identical in shape to the encrypt entry point; the only difference is that
 * %r9 is pointed at the dec_key field of ctx instead of enc_key.
 */
1000SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
1001	/* input:
1002	*      %rdi: ctx, CTX
1003	*      %rsi: dst
1004	*      %rdx: src
1005	*/
1006
1007	FRAME_BEGIN
1008
	/* %r9 = address of the dec_key field inside ctx */
1009	leaq ARIA_CTX_dec_key(CTX), %r9;
1010
	/*
	 * inpack16_pre is defined outside this excerpt; presumably it loads
	 * the 16 input blocks from (%rdx) into %xmm0..%xmm15 -- confirm.
	 */
1011	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1012		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1013		     %xmm15, %rdx);
1014
1015	call __aria_aesni_avx_crypt_16way;
1016
	/*
	 * Same register order (%xmm1, %xmm0, %xmm3, %xmm2, ...) as the final
	 * aria_ff round of the core routine.
	 * NOTE(review): %rax is not set in this function; this relies on the
	 * core routine returning with the destination pointer in %rax --
	 * confirm against its prologue.
	 */
1017	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1018		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1019		     %xmm15, %rax);
1020
1021	FRAME_END
1022	RET;
1023SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
1024
/*
 * __aria_aesni_avx_ctr_gen_keystream_16way - build 16 consecutive counter
 * blocks from the big-endian 128-bit IV at (%r8).
 *
 * On return:
 *      %xmm0..%xmm15 hold IV+0 .. IV+15 (big endian, same byte order as the
 *      IV that was read),
 *      (0 * 16)(%rcx) .. (7 * 16)(%rcx) hold IV+0 .. IV+7 as well,
 *      (%r8) has been overwritten with IV+16.
 *
 * Only %rcx and %r8 are dereferenced here; %rdi, %rsi and %rdx are listed in
 * the input block below but are not referenced by this routine.
 *
 * Register roles while counting:
 *      %xmm0: constant {low qword = -1, high qword = 0} for inc_le128
 *      %xmm1: .Lbswap128_mask shuffle control (be <=> le)
 *      %xmm3: running little-endian counter
 *      %xmm5: scratch used by inc_le128 for the carry into the high qword
 */
1025SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
1026	/* input:
1027	*      %rdi: ctx
1028	*      %rsi: dst
1029	*      %rdx: src
1030	*      %rcx: keystream
1031	*      %r8: iv (big endian, 128bit)
1032	*/
1033
1034	FRAME_BEGIN
1035	/* load IV and byteswap */
1036	vmovdqu (%r8), %xmm8;
1037
1038	vmovdqa .Lbswap128_mask (%rip), %xmm1;
1039	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
1040
1041	vpcmpeqd %xmm0, %xmm0, %xmm0;
1042	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
1043
	/*
	 * %xmm8 already holds IV+0 unchanged. Each step below bumps the
	 * little-endian counter in %xmm3 and shuffles a big-endian copy into
	 * the next register, giving IV+1 .. IV+7 in %xmm9..%xmm15.
	 */
1044	/* construct IVs */
1045	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1046	vpshufb %xmm1, %xmm3, %xmm9;
1047	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1048	vpshufb %xmm1, %xmm3, %xmm10;
1049	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1050	vpshufb %xmm1, %xmm3, %xmm11;
1051	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1052	vpshufb %xmm1, %xmm3, %xmm12;
1053	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1054	vpshufb %xmm1, %xmm3, %xmm13;
1055	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1056	vpshufb %xmm1, %xmm3, %xmm14;
1057	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1058	vpshufb %xmm1, %xmm3, %xmm15;
	/*
	 * Park IV+0 .. IV+7 in the keystream buffer: %xmm0, %xmm1, %xmm3 and
	 * %xmm5 are still in use as constant/mask/counter/scratch, so these
	 * eight blocks cannot live in %xmm0..%xmm7 yet.
	 */
1059	vmovdqu %xmm8, (0 * 16)(%rcx);
1060	vmovdqu %xmm9, (1 * 16)(%rcx);
1061	vmovdqu %xmm10, (2 * 16)(%rcx);
1062	vmovdqu %xmm11, (3 * 16)(%rcx);
1063	vmovdqu %xmm12, (4 * 16)(%rcx);
1064	vmovdqu %xmm13, (5 * 16)(%rcx);
1065	vmovdqu %xmm14, (6 * 16)(%rcx);
1066	vmovdqu %xmm15, (7 * 16)(%rcx);
1067
	/* second half: IV+8 .. IV+15 go into %xmm8..%xmm15 */
1068	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1069	vpshufb %xmm1, %xmm3, %xmm8;
1070	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1071	vpshufb %xmm1, %xmm3, %xmm9;
1072	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1073	vpshufb %xmm1, %xmm3, %xmm10;
1074	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1075	vpshufb %xmm1, %xmm3, %xmm11;
1076	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1077	vpshufb %xmm1, %xmm3, %xmm12;
1078	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1079	vpshufb %xmm1, %xmm3, %xmm13;
1080	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1081	vpshufb %xmm1, %xmm3, %xmm14;
1082	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1083	vpshufb %xmm1, %xmm3, %xmm15;
	/* one more increment: IV+16 is written back as the next starting IV */
1084	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1085	vpshufb %xmm1, %xmm3, %xmm4;
1086	vmovdqu %xmm4, (%r8);
1087
	/* counting is done, so reload IV+0 .. IV+7 into %xmm0..%xmm7 */
1088	vmovdqu (0 * 16)(%rcx), %xmm0;
1089	vmovdqu (1 * 16)(%rcx), %xmm1;
1090	vmovdqu (2 * 16)(%rcx), %xmm2;
1091	vmovdqu (3 * 16)(%rcx), %xmm3;
1092	vmovdqu (4 * 16)(%rcx), %xmm4;
1093	vmovdqu (5 * 16)(%rcx), %xmm5;
1094	vmovdqu (6 * 16)(%rcx), %xmm6;
1095	vmovdqu (7 * 16)(%rcx), %xmm7;
1096
1097	FRAME_END
1098	RET;
1099SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
1100
SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/*
	 * CTR-mode processing of 16 blocks using the AES-NI/AVX core.
	 *
	 * input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	/* %xmm0..%xmm15 <- 16 counter blocks; the iv at (%r8) is advanced */
	call __aria_aesni_avx_ctr_gen_keystream_16way;

	/*
	 * Keep the caller's dst/src in %r10/%r11 and let the cipher core use
	 * the keystream buffer as both its dst and its src.
	 */
	movq %rsi, %r10;
	movq %rdx, %r11;
	movq %rcx, %rsi;
	movq %rcx, %rdx;

	/* the counter blocks are run through the core with enc_key */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	/*
	 * Combine the core's output with the 16 source blocks. Register
	 * order (%xmm1, %xmm0, %xmm3, %xmm2, ...) follows write_output below.
	 */
	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;

	/* hand the result registers and the caller's dst to write_output */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1144
1145#ifdef CONFIG_AS_GFNI
/*
 * __aria_aesni_avx_gfni_crypt_16way - shared round sequence for the GFNI
 * entry points, operating on the 16 blocks held in %xmm0..%xmm15.
 *
 * Besides the registers listed in the input block below, %rdi (CTX) must be
 * valid: ARIA_CTX_rounds(CTX) is read to pick the number of rounds.
 *
 * %rax is set to dst and %r8 to dst + 8 * 16; both are passed as memory
 * operands to inpack16_post, the round macros and debyteslice_16x16b, and
 * %rax still holds dst when this routine returns. Those macros are defined
 * outside this excerpt -- presumably they use the two areas as spill space;
 * confirm before relying on the contents of dst at any intermediate point.
 *
 * The trailing numeric argument(s) of each aria_*_gfni invocation run from 0
 * upwards; presumably they are round-key indices relative to %r9 -- confirm
 * against the macro definitions.
 */
1146SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1147	/* input:
1148	*      %r9: rk
1149	*      %rsi: dst
1150	*      %rdx: src
1151	*      %xmm0..%xmm15: 16 byte-sliced blocks
1152	*/
1153
1154	FRAME_BEGIN
1155
	/* %rax = dst, %r8 = dst + 128 (second half of the 16 x 16-byte area) */
1156	movq %rsi, %rax;
1157	leaq 8 * 16(%rax), %r8;
1158
1159	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1160		      %xmm4, %xmm5, %xmm6, %xmm7,
1161		      %xmm8, %xmm9, %xmm10, %xmm11,
1162		      %xmm12, %xmm13, %xmm14,
1163		      %xmm15, %rax, %r8);
	/* rounds common to every key size: fo/fe alternate for indices 0..10 */
1164	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1165		     %xmm12, %xmm13, %xmm14, %xmm15,
1166		     %xmm0, %xmm1, %xmm2, %xmm3,
1167		     %xmm4, %xmm5, %xmm6, %xmm7,
1168		     %rax, %r9, 0);
1169	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1170		     %xmm4, %xmm5, %xmm6, %xmm7,
1171		     %xmm8, %xmm9, %xmm10, %xmm11,
1172		     %xmm12, %xmm13, %xmm14,
1173		     %xmm15, %rax, %r9, 1);
1174	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1175		     %xmm12, %xmm13, %xmm14, %xmm15,
1176		     %xmm0, %xmm1, %xmm2, %xmm3,
1177		     %xmm4, %xmm5, %xmm6, %xmm7,
1178		     %rax, %r9, 2);
1179	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1180		     %xmm4, %xmm5, %xmm6, %xmm7,
1181		     %xmm8, %xmm9, %xmm10, %xmm11,
1182		     %xmm12, %xmm13, %xmm14,
1183		     %xmm15, %rax, %r9, 3);
1184	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1185		     %xmm12, %xmm13, %xmm14, %xmm15,
1186		     %xmm0, %xmm1, %xmm2, %xmm3,
1187		     %xmm4, %xmm5, %xmm6, %xmm7,
1188		     %rax, %r9, 4);
1189	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1190		     %xmm4, %xmm5, %xmm6, %xmm7,
1191		     %xmm8, %xmm9, %xmm10, %xmm11,
1192		     %xmm12, %xmm13, %xmm14,
1193		     %xmm15, %rax, %r9, 5);
1194	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1195		     %xmm12, %xmm13, %xmm14, %xmm15,
1196		     %xmm0, %xmm1, %xmm2, %xmm3,
1197		     %xmm4, %xmm5, %xmm6, %xmm7,
1198		     %rax, %r9, 6);
1199	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1200		     %xmm4, %xmm5, %xmm6, %xmm7,
1201		     %xmm8, %xmm9, %xmm10, %xmm11,
1202		     %xmm12, %xmm13, %xmm14,
1203		     %xmm15, %rax, %r9, 7);
1204	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1205		     %xmm12, %xmm13, %xmm14, %xmm15,
1206		     %xmm0, %xmm1, %xmm2, %xmm3,
1207		     %xmm4, %xmm5, %xmm6, %xmm7,
1208		     %rax, %r9, 8);
1209	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1210		     %xmm4, %xmm5, %xmm6, %xmm7,
1211		     %xmm8, %xmm9, %xmm10, %xmm11,
1212		     %xmm12, %xmm13, %xmm14,
1213		     %xmm15, %rax, %r9, 9);
1214	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1215		     %xmm12, %xmm13, %xmm14, %xmm15,
1216		     %xmm0, %xmm1, %xmm2, %xmm3,
1217		     %xmm4, %xmm5, %xmm6, %xmm7,
1218		     %rax, %r9, 10);
	/* rounds == 12: finish with aria_ff_gfni(11, 12); otherwise continue */
1219	cmpl $12, ARIA_CTX_rounds(CTX);
1220	jne .Laria_gfni_192;
1221	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1222		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1223		%xmm15, %rax, %r9, 11, 12);
1224	jmp .Laria_gfni_end;
	/* reached for every rounds value other than 12 */
1225.Laria_gfni_192:
1226	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1227		     %xmm4, %xmm5, %xmm6, %xmm7,
1228		     %xmm8, %xmm9, %xmm10, %xmm11,
1229		     %xmm12, %xmm13, %xmm14,
1230		     %xmm15, %rax, %r9, 11);
1231	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1232		     %xmm12, %xmm13, %xmm14, %xmm15,
1233		     %xmm0, %xmm1, %xmm2, %xmm3,
1234		     %xmm4, %xmm5, %xmm6, %xmm7,
1235		     %rax, %r9, 12);
	/* rounds == 14: finish with aria_ff_gfni(13, 14); otherwise continue */
1236	cmpl $14, ARIA_CTX_rounds(CTX);
1237	jne .Laria_gfni_256;
1238	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1239		     %xmm4, %xmm5, %xmm6, %xmm7,
1240		     %xmm8, %xmm9, %xmm10, %xmm11,
1241		     %xmm12, %xmm13, %xmm14,
1242		     %xmm15, %rax, %r9, 13, 14);
1243	jmp .Laria_gfni_end;
	/*
	 * reached for any rounds value other than 12 or 14; no further check
	 * is made before the final aria_ff_gfni(15, 16)
	 */
1244.Laria_gfni_256:
1245	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1246		     %xmm4, %xmm5, %xmm6, %xmm7,
1247		     %xmm8, %xmm9, %xmm10, %xmm11,
1248		     %xmm12, %xmm13, %xmm14,
1249		     %xmm15, %rax, %r9, 13);
1250	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1251		     %xmm12, %xmm13, %xmm14, %xmm15,
1252		     %xmm0, %xmm1, %xmm2, %xmm3,
1253		     %xmm4, %xmm5, %xmm6, %xmm7,
1254		     %rax, %r9, 14);
1255	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1256		     %xmm4, %xmm5, %xmm6, %xmm7,
1257		     %xmm8, %xmm9, %xmm10, %xmm11,
1258		     %xmm12, %xmm13, %xmm14,
1259		     %xmm15, %rax, %r9, 15, 16);
	/*
	 * All three paths join here. debyteslice_16x16b is defined outside
	 * this excerpt; presumably it undoes the byte-sliced layout named in
	 * the input block -- confirm.
	 */
1260.Laria_gfni_end:
1261	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1262			   %xmm9, %xmm13, %xmm0, %xmm5,
1263			   %xmm10, %xmm14, %xmm3, %xmm6,
1264			   %xmm11, %xmm15, %xmm2, %xmm7,
1265			   (%rax), (%r8));
1266
1267	FRAME_END
1268	RET;
1269SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1270
/*
 * aria_aesni_avx_gfni_encrypt_16way - 16-way entry point that runs the GFNI
 * core routine with the encryption key material of the context.
 *
 * Sequence: point %r9 at the enc_key field of ctx, hand %xmm0..%xmm15 and the
 * src pointer to inpack16_pre, call __aria_aesni_avx_gfni_crypt_16way, then
 * pass the sixteen result registers to write_output.
 */
1271SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1272	/* input:
1273	*      %rdi: ctx, CTX
1274	*      %rsi: dst
1275	*      %rdx: src
1276	*/
1277
1278	FRAME_BEGIN
1279
	/* %r9 = address of the enc_key field inside ctx ("rk" for the core) */
1280	leaq ARIA_CTX_enc_key(CTX), %r9;
1281
	/*
	 * inpack16_pre is defined outside this excerpt; presumably it loads
	 * the 16 input blocks from (%rdx) into %xmm0..%xmm15 -- confirm.
	 */
1282	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1283		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1284		     %xmm15, %rdx);
1285
1286	call __aria_aesni_avx_gfni_crypt_16way;
1287
	/*
	 * %rax was set to dst (%rsi) by the core routine. The register list
	 * starts %xmm1, %xmm0, %xmm3, %xmm2: the same order the core uses for
	 * its final aria_ff_gfni round.
	 */
1288	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1289		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1290		     %xmm15, %rax);
1291
1292	FRAME_END
1293	RET;
1294SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1295
/*
 * aria_aesni_avx_gfni_decrypt_16way - 16-way entry point that runs the GFNI
 * core routine with the decryption key material of the context.
 *
 * Identical in shape to the GFNI encrypt entry point; the only difference is
 * that %r9 is pointed at the dec_key field of ctx instead of enc_key.
 */
1296SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1297	/* input:
1298	*      %rdi: ctx, CTX
1299	*      %rsi: dst
1300	*      %rdx: src
1301	*/
1302
1303	FRAME_BEGIN
1304
	/* %r9 = address of the dec_key field inside ctx ("rk" for the core) */
1305	leaq ARIA_CTX_dec_key(CTX), %r9;
1306
	/*
	 * inpack16_pre is defined outside this excerpt; presumably it loads
	 * the 16 input blocks from (%rdx) into %xmm0..%xmm15 -- confirm.
	 */
1307	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1308		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1309		     %xmm15, %rdx);
1310
1311	call __aria_aesni_avx_gfni_crypt_16way;
1312
	/*
	 * %rax was set to dst (%rsi) by the core routine. The register list
	 * starts %xmm1, %xmm0, %xmm3, %xmm2: the same order the core uses for
	 * its final aria_ff_gfni round.
	 */
1313	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1314		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1315		     %xmm15, %rax);
1316
1317	FRAME_END
1318	RET;
1319SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1320
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/*
	 * CTR-mode processing of 16 blocks using the GFNI core.
	 *
	 * input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	/* %xmm0..%xmm15 <- 16 counter blocks; the iv at (%r8) is advanced */
	call __aria_aesni_avx_ctr_gen_keystream_16way;

	/*
	 * Keep the caller's dst/src in %r10/%r11 and let the cipher core use
	 * the keystream buffer as both its dst and its src.
	 */
	movq %rsi, %r10;
	movq %rdx, %r11;
	movq %rcx, %rsi;
	movq %rcx, %rdx;

	/* the counter blocks are run through the core with enc_key */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	/*
	 * Combine the core's output with the 16 source blocks. Register
	 * order (%xmm1, %xmm0, %xmm3, %xmm2, ...) follows write_output below.
	 */
	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;

	/* hand the result registers and the caller's dst to write_output */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1364#endif /* CONFIG_AS_GFNI */
1365