xref: /openbmc/linux/arch/x86/crypto/aria-aesni-avx-asm_64.S (revision 32bc7297d855608fcb13af62a95739a079b4f8e2)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ARIA Cipher 16-way parallel algorithm (AVX)
4 *
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6 *
7 */
8
9#include <linux/linkage.h>
10#include <linux/cfi_types.h>
11#include <asm/asm-offsets.h>
12#include <asm/frame.h>
13
14/* register macros */
15#define CTX %rdi
16
17
/*
 * BV8: pack eight single-bit values into one byte.
 * a0 lands in bit 0 (LSB) and a7 in bit 7 (MSB).  Every argument is masked
 * with 1, so only its lowest bit contributes.
 */
18#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
19	( (((a0) & 1) << 0) |				\
20	  (((a1) & 1) << 1) |				\
21	  (((a2) & 1) << 2) |				\
22	  (((a3) & 1) << 3) |				\
23	  (((a4) & 1) << 4) |				\
24	  (((a5) & 1) << 5) |				\
25	  (((a6) & 1) << 6) |				\
26	  (((a7) & 1) << 7) )
27
/*
 * BM8X8: pack eight row bytes into one 64-bit value.
 * l0 is placed in the most significant byte (shifted by 7 * 8) and l7 in
 * the least significant byte.  The arguments are not masked here, so each
 * one must already fit in 8 bits (BV8() results do).
 */
28#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
29	( ((l7) << (0 * 8)) |				\
30	  ((l6) << (1 * 8)) |				\
31	  ((l5) << (2 * 8)) |				\
32	  ((l4) << (3 * 8)) |				\
33	  ((l3) << (4 * 8)) |				\
34	  ((l2) << (5 * 8)) |				\
35	  ((l1) << (6 * 8)) |				\
36	  ((l0) << (7 * 8)) )
37
/*
 * inc_le128: increment the 128-bit value in x, with carry from the low
 * quadword into the high quadword.
 *  x:         value to increment, updated in place
 *  minus_one: subtracted from x quadword-wise; presumably holds -1 in the
 *             low quadword and 0 in the high quadword (the code that sets
 *             it up is not visible here -- TODO confirm)
 *  tmp:       scratch, clobbered
 * vpcmpeqq records which quadwords of x equal minus_one before the
 * subtraction.  vpslldq moves the low quadword's compare result into the
 * high quadword, and the second vpsubq subtracts it (all-ones == -1) from
 * x, which adds the carry to the high quadword.
 */
38#define inc_le128(x, minus_one, tmp)			\
39	vpcmpeqq minus_one, x, tmp;			\
40	vpsubq minus_one, x, x;				\
41	vpslldq $8, tmp, tmp;				\
42	vpsubq tmp, x, x;
43
/*
 * filter_8bit: per-byte table lookup split into two 4-bit halves.
 *  x:        input bytes, replaced by the result
 *  lo_t:     16-entry table indexed by the bits selected by mask4bit
 *  hi_t:     16-entry table indexed by the remaining bits shifted right by 4
 *  mask4bit: nibble mask (aria_sbox_8way passes 0x0f in every byte)
 *  tmp0:     scratch, clobbered
 * With the 0x0f mask each byte b of x becomes lo_t[b & 0xf] ^ hi_t[b >> 4].
 * The dword shift is safe because the low nibbles were cleared first, so
 * nothing leaks into the neighbouring byte.
 */
44#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
45	vpand x, mask4bit, tmp0;			\
46	vpandn x, mask4bit, x;				\
47	vpsrld $4, x, x;				\
48							\
49	vpshufb tmp0, lo_t, tmp0;			\
50	vpshufb x, hi_t, x;				\
51	vpxor tmp0, x, x;
52
/*
 * transpose_4x4: transpose a 4x4 matrix of 32-bit elements.
 *  x0..x3: the four rows on entry, the four columns on exit (in place)
 *  t1, t2: scratch registers, clobbered
 * The dword unpacks interleave row pairs (x0/x1 and x2/x3); the qword
 * unpacks then combine the two interleaved halves into the final rows.
 */
53#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
54	vpunpckhdq x1, x0, t2;				\
55	vpunpckldq x1, x0, x0;				\
56							\
57	vpunpckldq x3, x2, t1;				\
58	vpunpckhdq x3, x2, x2;				\
59							\
60	vpunpckhqdq t1, x0, x1;				\
61	vpunpcklqdq t1, x0, x0;				\
62							\
63	vpunpckhqdq x2, t2, x3;				\
64	vpunpcklqdq x2, t2, x2;
65
/*
 * byteslice_16x16b: reorder the 16x16 byte matrix held in sixteen registers.
 *  a0..d3:   the sixteen 128-bit registers, all rewritten in place
 *  st0, st1: two 16-byte spill slots (callers in this file pass memory
 *            operands); needed because all sixteen register arguments are
 *            live while transpose_4x4 wants two temporaries
 * Stages:
 *  1. transpose_4x4 within each letter group (a0..a3, b0..b3, c0..c3,
 *     d0..d3), borrowing d2/d3 and then a0/a1 as temporaries around spills
 *  2. vpshufb of every register with .Lshufb_16x16b (a0 holds the mask, so
 *     the original a0 is shuffled last, from its spill slot)
 *  3. transpose_4x4 across the groups (aN, bN, cN, dN) for N = 0..3
 */
66#define byteslice_16x16b(a0, b0, c0, d0,		\
67			 a1, b1, c1, d1,		\
68			 a2, b2, c2, d2,		\
69			 a3, b3, c3, d3,		\
70			 st0, st1)			\
71	vmovdqu d2, st0;				\
72	vmovdqu d3, st1;				\
73	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
74	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
75	vmovdqu st0, d2;				\
76	vmovdqu st1, d3;				\
77							\
78	vmovdqu a0, st0;				\
79	vmovdqu a1, st1;				\
80	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
81	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
82							\
83	vmovdqu .Lshufb_16x16b(%rip), a0;		\
84	vmovdqu st1, a1;				\
85	vpshufb a0, a2, a2;				\
86	vpshufb a0, a3, a3;				\
87	vpshufb a0, b0, b0;				\
88	vpshufb a0, b1, b1;				\
89	vpshufb a0, b2, b2;				\
90	vpshufb a0, b3, b3;				\
91	vpshufb a0, a1, a1;				\
92	vpshufb a0, c0, c0;				\
93	vpshufb a0, c1, c1;				\
94	vpshufb a0, c2, c2;				\
95	vpshufb a0, c3, c3;				\
96	vpshufb a0, d0, d0;				\
97	vpshufb a0, d1, d1;				\
98	vpshufb a0, d2, d2;				\
99	vpshufb a0, d3, d3;				\
100	vmovdqu d3, st1;				\
101	vmovdqu st0, d3;				\
102	vpshufb a0, d3, a0;				\
103	vmovdqu d2, st0;				\
104							\
105	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
106	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
107	vmovdqu st0, d2;				\
108	vmovdqu st1, d3;				\
109							\
110	vmovdqu b0, st0;				\
111	vmovdqu b1, st1;				\
112	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
113	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
114	vmovdqu st0, b0;				\
115	vmovdqu st1, b1;				\
116	/* does not adjust output bytes inside vectors */
117
/*
 * debyteslice_16x16b: counterpart of byteslice_16x16b used on the way out.
 *  a0..d3:   the sixteen 128-bit registers, all rewritten in place
 *  st0, st1: two 16-byte spill slots (the caller in this file passes memory
 *            operands)
 * The first two stages (per-group transpose_4x4, then vpshufb of every
 * register with .Lshufb_16x16b) are the same instruction sequence as in
 * byteslice_16x16b.  Only the final stage differs: the cross-group
 * transposes take their rows in the order (cN, dN, aN, bN) instead of
 * (aN, bN, cN, dN).
 */
118#define debyteslice_16x16b(a0, b0, c0, d0,		\
119			   a1, b1, c1, d1,		\
120			   a2, b2, c2, d2,		\
121			   a3, b3, c3, d3,		\
122			   st0, st1)			\
123	vmovdqu d2, st0;				\
124	vmovdqu d3, st1;				\
125	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
126	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
127	vmovdqu st0, d2;				\
128	vmovdqu st1, d3;				\
129							\
130	vmovdqu a0, st0;				\
131	vmovdqu a1, st1;				\
132	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
133	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
134							\
135	vmovdqu .Lshufb_16x16b(%rip), a0;		\
136	vmovdqu st1, a1;				\
137	vpshufb a0, a2, a2;				\
138	vpshufb a0, a3, a3;				\
139	vpshufb a0, b0, b0;				\
140	vpshufb a0, b1, b1;				\
141	vpshufb a0, b2, b2;				\
142	vpshufb a0, b3, b3;				\
143	vpshufb a0, a1, a1;				\
144	vpshufb a0, c0, c0;				\
145	vpshufb a0, c1, c1;				\
146	vpshufb a0, c2, c2;				\
147	vpshufb a0, c3, c3;				\
148	vpshufb a0, d0, d0;				\
149	vpshufb a0, d1, d1;				\
150	vpshufb a0, d2, d2;				\
151	vpshufb a0, d3, d3;				\
152	vmovdqu d3, st1;				\
153	vmovdqu st0, d3;				\
154	vpshufb a0, d3, a0;				\
155	vmovdqu d2, st0;				\
156							\
157	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
158	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
159	vmovdqu st0, d2;				\
160	vmovdqu st1, d3;				\
161							\
162	vmovdqu b0, st0;				\
163	vmovdqu b1, st1;				\
164	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
165	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
166	vmovdqu st0, b0;				\
167	vmovdqu st1, b1;				\
168	/* does not adjust output bytes inside vectors */
169
170/*
 * inpack16_pre: load sixteen consecutive 16-byte blocks from rio into
 * x0..x7 (blocks 0..7) and y0..y7 (blocks 8..15) with unaligned loads.
 * No key material is applied here; the macro only loads.
 */
171#define inpack16_pre(x0, x1, x2, x3,			\
172		     x4, x5, x6, x7,			\
173		     y0, y1, y2, y3,			\
174		     y4, y5, y6, y7,			\
175		     rio)				\
176	vmovdqu (0 * 16)(rio), x0;			\
177	vmovdqu (1 * 16)(rio), x1;			\
178	vmovdqu (2 * 16)(rio), x2;			\
179	vmovdqu (3 * 16)(rio), x3;			\
180	vmovdqu (4 * 16)(rio), x4;			\
181	vmovdqu (5 * 16)(rio), x5;			\
182	vmovdqu (6 * 16)(rio), x6;			\
183	vmovdqu (7 * 16)(rio), x7;			\
184	vmovdqu (8 * 16)(rio), y0;			\
185	vmovdqu (9 * 16)(rio), y1;			\
186	vmovdqu (10 * 16)(rio), y2;			\
187	vmovdqu (11 * 16)(rio), y3;			\
188	vmovdqu (12 * 16)(rio), y4;			\
189	vmovdqu (13 * 16)(rio), y5;			\
190	vmovdqu (14 * 16)(rio), y6;			\
191	vmovdqu (15 * 16)(rio), y7;
192
193/*
 * inpack16_post: byteslice the sixteen loaded blocks and store them to
 * temporary memory.
 *  x0..x7, y0..y7: the sixteen registers; byte-sliced in place, and still
 *                  holding the byte-sliced data after the stores
 *  mem_ab:         base register of a 8 * 16 byte area that receives x0..x7
 *  mem_cd:         base register of a 8 * 16 byte area that receives y0..y7
 * The first 16 bytes at mem_ab and at mem_cd double as the two spill slots
 * of byteslice_16x16b before the final stores overwrite them.
 */
194#define inpack16_post(x0, x1, x2, x3,			\
195		      x4, x5, x6, x7,			\
196		      y0, y1, y2, y3,			\
197		      y4, y5, y6, y7,			\
198		      mem_ab, mem_cd)			\
199	byteslice_16x16b(x0, x1, x2, x3,		\
200			 x4, x5, x6, x7,		\
201			 y0, y1, y2, y3,		\
202			 y4, y5, y6, y7,		\
203			 (mem_ab), (mem_cd));		\
204							\
205	vmovdqu x0, 0 * 16(mem_ab);			\
206	vmovdqu x1, 1 * 16(mem_ab);			\
207	vmovdqu x2, 2 * 16(mem_ab);			\
208	vmovdqu x3, 3 * 16(mem_ab);			\
209	vmovdqu x4, 4 * 16(mem_ab);			\
210	vmovdqu x5, 5 * 16(mem_ab);			\
211	vmovdqu x6, 6 * 16(mem_ab);			\
212	vmovdqu x7, 7 * 16(mem_ab);			\
213	vmovdqu y0, 0 * 16(mem_cd);			\
214	vmovdqu y1, 1 * 16(mem_cd);			\
215	vmovdqu y2, 2 * 16(mem_cd);			\
216	vmovdqu y3, 3 * 16(mem_cd);			\
217	vmovdqu y4, 4 * 16(mem_cd);			\
218	vmovdqu y5, 5 * 16(mem_cd);			\
219	vmovdqu y6, 6 * 16(mem_cd);			\
220	vmovdqu y7, 7 * 16(mem_cd);
221
/*
 * write_output: store sixteen 16-byte registers to consecutive slots at mem.
 *  x0..x7: written to slots 0..7
 *  y0..y7: written to slots 8..15
 *  mem:    base register of the 16 * 16 byte destination (unaligned stores)
 * The last statement deliberately carries no line continuation, so the
 * macro body ends here regardless of what follows the definition.
 */
222#define write_output(x0, x1, x2, x3,			\
223		     x4, x5, x6, x7,			\
224		     y0, y1, y2, y3,			\
225		     y4, y5, y6, y7,			\
226		     mem)				\
227	vmovdqu x0, 0 * 16(mem);			\
228	vmovdqu x1, 1 * 16(mem);			\
229	vmovdqu x2, 2 * 16(mem);			\
230	vmovdqu x3, 3 * 16(mem);			\
231	vmovdqu x4, 4 * 16(mem);			\
232	vmovdqu x5, 5 * 16(mem);			\
233	vmovdqu x6, 6 * 16(mem);			\
234	vmovdqu x7, 7 * 16(mem);			\
235	vmovdqu y0, 8 * 16(mem);			\
236	vmovdqu y1, 9 * 16(mem);			\
237	vmovdqu y2, 10 * 16(mem);			\
238	vmovdqu y3, 11 * 16(mem);			\
239	vmovdqu y4, 12 * 16(mem);			\
240	vmovdqu y5, 13 * 16(mem);			\
241	vmovdqu y6, 14 * 16(mem);			\
242	vmovdqu y7, 15 * 16(mem);
243
/*
 * aria_store_state_8way: spill eight registers to the scratch area.
 *  x0..x7:  registers to store (unchanged)
 *  mem_tmp: base register of the scratch area
 *  idx:     first 16-byte slot to write; xN goes to slot idx + N
 */
244#define aria_store_state_8way(x0, x1, x2, x3,		\
245			      x4, x5, x6, x7,		\
246			      mem_tmp, idx)		\
247	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
248	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
249	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
250	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
251	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
252	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
253	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
254	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
255
/*
 * aria_load_state_8way: reload eight registers from the scratch area.
 *  x0..x7:  destination registers
 *  mem_tmp: base register of the scratch area
 *  idx:     first 16-byte slot to read; xN is loaded from slot idx + N
 */
256#define aria_load_state_8way(x0, x1, x2, x3,		\
257			     x4, x5, x6, x7,		\
258			     mem_tmp, idx)		\
259	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
260	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
261	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
262	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
263	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
264	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
265	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
266	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
267
/*
 * aria_ark_8way: XOR eight round-key bytes into eight byte-sliced registers.
 *  x0..x7: state registers, updated in place
 *  t0:     scratch; receives the broadcast 32-bit key word
 *  t1:     vpshufb control; every caller in this file passes an all-zero
 *          register, which makes vpshufb replicate byte 0 of its source
 *          into all sixteen bytes
 *  t2:     scratch
 *  rk:     round-key base register
 *  idx:    byte offset inside the 16-byte round key (callers use 0 and 8)
 *  round:  round index; the key word is read at rk + round * 16 + idx
 * For each of the two 32-bit words (offsets idx + 0 and idx + 4) the byte
 * in bits 31..24 is applied to x0 / x4, bits 23..16 to x1 / x5, bits 15..8
 * to x2 / x6 and bits 7..0 to x3 / x7.
 * round and idx are expanded without parentheses, so pass plain constants.
 */
268#define aria_ark_8way(x0, x1, x2, x3,			\
269		      x4, x5, x6, x7,			\
270		      t0, t1, t2, rk,			\
271		      idx, round)			\
272	/* AddRoundKey */                               \
273	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
274	vpsrld $24, t0, t2;				\
275	vpshufb t1, t2, t2;				\
276	vpxor t2, x0, x0;				\
277	vpsrld $16, t0, t2;				\
278	vpshufb t1, t2, t2;				\
279	vpxor t2, x1, x1;				\
280	vpsrld $8, t0, t2;				\
281	vpshufb t1, t2, t2;				\
282	vpxor t2, x2, x2;				\
283	vpshufb t1, t0, t2;				\
284	vpxor t2, x3, x3;				\
285	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
286	vpsrld $24, t0, t2;				\
287	vpshufb t1, t2, t2;				\
288	vpxor t2, x4, x4;				\
289	vpsrld $16, t0, t2;				\
290	vpshufb t1, t2, t2;				\
291	vpxor t2, x5, x5;				\
292	vpsrld $8, t0, t2;				\
293	vpshufb t1, t2, t2;				\
294	vpxor t2, x6, x6;				\
295	vpshufb t1, t0, t2;				\
296	vpxor t2, x7, x7;
297
298#ifdef CONFIG_AS_GFNI
/*
 * aria_sbox_8way_gfni: substitution layer on eight byte-sliced registers
 * using GFNI affine instructions instead of AES-NI.
 *  x0, x4: vgf2p8affineinvqb with the AES affine matrix / tf_aff_const
 *  x1, x5: vgf2p8affineinvqb with the S2 matrix / tf_s2_const
 *  x2, x6: vgf2p8affineqb with the AES inverse-affine matrix /
 *          tf_inv_const, then vgf2p8affineinvqb with the identity matrix
 *          and constant 0
 *  x3, x7: vgf2p8affineqb with the X2 matrix / tf_x2_const, then
 *          vgf2p8affineinvqb with the identity matrix and constant 0
 *  t0..t4: scratch, loaded with the five bit matrices
 *  t5..t7: accepted for signature parity with aria_sbox_8way, not used
 * The matrices are fetched with vmovdqa, so the tables must stay 16-byte
 * aligned.  The last instruction has no trailing ';'; every invocation in
 * this file supplies one.
 */
299#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
300			    x4, x5, x6, x7,		\
301			    t0, t1, t2, t3,		\
302			    t4, t5, t6, t7)		\
303	vmovdqa .Ltf_s2_bitmatrix(%rip), t0;		\
304	vmovdqa .Ltf_inv_bitmatrix(%rip), t1;		\
305	vmovdqa .Ltf_id_bitmatrix(%rip), t2;		\
306	vmovdqa .Ltf_aff_bitmatrix(%rip), t3;		\
307	vmovdqa .Ltf_x2_bitmatrix(%rip), t4;		\
308	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
309	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
310	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
311	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
312	vgf2p8affineinvqb $0, t2, x2, x2;		\
313	vgf2p8affineinvqb $0, t2, x6, x6;		\
314	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
315	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
316	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
317	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
318	vgf2p8affineinvqb $0, t2, x3, x3;		\
319	vgf2p8affineinvqb $0, t2, x7, x7
320
321#endif /* CONFIG_AS_GFNI */
322
/*
 * aria_sbox_8way: substitution layer on eight byte-sliced registers built
 * from AES-NI last-round instructions plus nibble-table affine fixups.
 *  x0, x4: vaesenclast, then .Linv_shift_row shuffle
 *  x1, x5: vaesenclast, .Linv_shift_row shuffle, then filter_8bit with the
 *          inv_aff/S2 tables
 *  x2, x6: vaesdeclast, then .Lshift_row shuffle
 *  x3, x7: .Lshift_row shuffle, filter_8bit with the X2/fwd_aff tables,
 *          then vaesdeclast
 *  t0..t6: scratch (shuffle masks, nibble tables, 0x0f mask, filter temp)
 *  t7:     used as the round key of every vaesenclast / vaesdeclast; it is
 *          not written here.  All callers in this file zero it first
 *          (vpxor y7, y7, y7) so the key addition is a no-op.
 */
323#define aria_sbox_8way(x0, x1, x2, x3,            	\
324		       x4, x5, x6, x7,			\
325		       t0, t1, t2, t3,			\
326		       t4, t5, t6, t7)			\
327	vmovdqa .Linv_shift_row(%rip), t0;		\
328	vmovdqa .Lshift_row(%rip), t1;			\
329	vbroadcastss .L0f0f0f0f(%rip), t6;		\
330	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
331	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
332	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
333	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
334							\
335	vaesenclast t7, x0, x0;				\
336	vaesenclast t7, x4, x4;				\
337	vaesenclast t7, x1, x1;				\
338	vaesenclast t7, x5, x5;				\
339	vaesdeclast t7, x2, x2;				\
340	vaesdeclast t7, x6, x6;				\
341							\
342	/* undo AES (Inv)ShiftRows; pre-shift x3/x7 */	\
343	vpshufb t0, x0, x0;				\
344	vpshufb t0, x4, x4;				\
345	vpshufb t0, x1, x1;				\
346	vpshufb t0, x5, x5;				\
347	vpshufb t1, x3, x3;				\
348	vpshufb t1, x7, x7;				\
349	vpshufb t1, x2, x2;				\
350	vpshufb t1, x6, x6;				\
351							\
352	/* affine transformation for S2 */		\
353	filter_8bit(x1, t2, t3, t6, t0);		\
354	/* affine transformation for S2 */		\
355	filter_8bit(x5, t2, t3, t6, t0);		\
356							\
357	/* affine transformation for X2 */		\
358	filter_8bit(x3, t4, t5, t6, t0);		\
359	/* affine transformation for X2 */		\
360	filter_8bit(x7, t4, t5, t6, t0);		\
361	vaesdeclast t7, x3, x3;				\
362	vaesdeclast t7, x7, x7;
363
/*
 * aria_diff_m: mix four byte-sliced registers with XORs only.
 *  x0..x3: inputs, replaced by the result
 *  t0..t3: scratch, clobbered
 * Writing a, b, c, d for the incoming x0, x1, x2, x3, the result is
 *  x0 = a ^ b ^ c
 *  x1 = a ^ b ^ d
 *  x2 = a ^ c ^ d
 *  x3 = b ^ c ^ d
 * The statement order matters: x1 and x3 are read in their original form
 * after x0 has already been overwritten, and x3 is written last from t3.
 */
364#define aria_diff_m(x0, x1, x2, x3,			\
365		    t0, t1, t2, t3)			\
366	/* T = rotr32(X, 8); */				\
367	/* X ^= T */					\
368	vpxor x0, x3, t0;				\
369	vpxor x1, x0, t1;				\
370	vpxor x2, x1, t2;				\
371	vpxor x3, x2, t3;				\
372	/* X = T ^ rotr(X, 16); */			\
373	vpxor t2, x0, x0;				\
374	vpxor x1, t3, t3;				\
375	vpxor t0, x2, x2;				\
376	vpxor t1, x3, x1;				\
377	vmovdqu t3, x3;
378
/*
 * aria_diff_word: word-level XOR mixing of the four 4-register groups.
 * In the step comments below t0 = x0..x3, t1 = x4..x7, t2 = y0..y3 and
 * t3 = y4..y7; each step XORs the four registers of one group into the
 * matching registers of another.  All sixteen registers are updated in
 * place and no scratch register is needed.
 */
379#define aria_diff_word(x0, x1, x2, x3,			\
380		       x4, x5, x6, x7,			\
381		       y0, y1, y2, y3,			\
382		       y4, y5, y6, y7)			\
383	/* t1 ^= t2; */					\
384	vpxor y0, x4, x4;				\
385	vpxor y1, x5, x5;				\
386	vpxor y2, x6, x6;				\
387	vpxor y3, x7, x7;				\
388							\
389	/* t2 ^= t3; */					\
390	vpxor y4, y0, y0;				\
391	vpxor y5, y1, y1;				\
392	vpxor y6, y2, y2;				\
393	vpxor y7, y3, y3;				\
394							\
395	/* t0 ^= t1; */					\
396	vpxor x4, x0, x0;				\
397	vpxor x5, x1, x1;				\
398	vpxor x6, x2, x2;				\
399	vpxor x7, x3, x3;				\
400							\
401	/* t3 ^= t1; */					\
402	vpxor x4, y4, y4;				\
403	vpxor x5, y5, y5;				\
404	vpxor x6, y6, y6;				\
405	vpxor x7, y7, y7;				\
406							\
407	/* t2 ^= t0; */					\
408	vpxor x0, y0, y0;				\
409	vpxor x1, y1, y1;				\
410	vpxor x2, y2, y2;				\
411	vpxor x3, y3, y3;				\
412							\
413	/* t1 ^= t2; */					\
414	vpxor y0, x4, x4;				\
415	vpxor y1, x5, x5;				\
416	vpxor y2, x6, x6;				\
417	vpxor y3, x7, x7;
418
/*
 * aria_fe: one full round (key addition, substitution, diffusion) over 16
 * byte-sliced blocks; the variant __aria_aesni_avx_crypt_16way uses for
 * odd `round` indices.  It feeds aria_sbox_8way with the register pairs
 * swapped: (x2, x3, x0, x1, x6, x7, x4, x5).
 *  x0..x7:  on entry the half of the state that pairs with round-key
 *           bytes 8..15
 *  y0..y7:  scratch on entry; y7 is zeroed and serves as the zero shuffle
 *           control of aria_ark_8way and the zero key of aria_sbox_8way
 *  mem_tmp: base of the 16-slot scratch area; slots 0..7 must hold the
 *           other half of the state on entry
 *  rk, round: round-key base and index (key at rk + round * 16)
 * Flow: process x0..x7 with key bytes 8..15 and park them in slots 8..15,
 * reload slots 0..7 and process them with key bytes 0..7, then bring the
 * first half back into y0..y7 and run the two aria_diff_word passes.
 * On exit slots 0..7 hold x3, x2, x1, x0, x6, x7, x4, x5 (in that order)
 * and y0..y7 stay in registers.
 */
419#define aria_fe(x0, x1, x2, x3,				\
420		x4, x5, x6, x7,				\
421		y0, y1, y2, y3,				\
422		y4, y5, y6, y7,				\
423		mem_tmp, rk, round)			\
424	vpxor y7, y7, y7;				\
425	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
426		      y0, y7, y2, rk, 8, round);	\
427							\
428	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
429		       y0, y1, y2, y3, y4, y5, y6, y7);	\
430							\
431	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
432	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
433	aria_store_state_8way(x0, x1, x2, x3,		\
434			      x4, x5, x6, x7,		\
435			      mem_tmp, 8);		\
436							\
437	aria_load_state_8way(x0, x1, x2, x3,		\
438			     x4, x5, x6, x7,		\
439			     mem_tmp, 0);		\
440	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
441		      y0, y7, y2, rk, 0, round);	\
442							\
443	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
444		       y0, y1, y2, y3, y4, y5, y6, y7);	\
445							\
446	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
447	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
448	aria_store_state_8way(x0, x1, x2, x3,		\
449			      x4, x5, x6, x7,		\
450			      mem_tmp, 0);		\
451	aria_load_state_8way(y0, y1, y2, y3,		\
452			     y4, y5, y6, y7,		\
453			     mem_tmp, 8);		\
454	aria_diff_word(x0, x1, x2, x3,			\
455		       x4, x5, x6, x7,			\
456		       y0, y1, y2, y3,			\
457		       y4, y5, y6, y7);			\
458	/* aria_diff_byte() 				\
459	 * T3 = ABCD -> BADC 				\
460	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
461	 * T0 = ABCD -> CDAB 				\
462	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
463	 * T1 = ABCD -> DCBA 				\
464	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
465	 */						\
466	aria_diff_word(x2, x3, x0, x1,			\
467		       x7, x6, x5, x4,			\
468		       y0, y1, y2, y3,			\
469		       y5, y4, y7, y6);			\
470	aria_store_state_8way(x3, x2, x1, x0,		\
471			      x6, x7, x4, x5,		\
472			      mem_tmp, 0);
473
/*
 * aria_fo: one full round (key addition, substitution, diffusion) over 16
 * byte-sliced blocks; the variant __aria_aesni_avx_crypt_16way uses for
 * even `round` indices.  It feeds aria_sbox_8way with the registers in
 * natural order (x0..x7).
 *  x0..x7:  on entry the half of the state that pairs with round-key
 *           bytes 8..15
 *  y0..y7:  scratch on entry; y7 is zeroed and serves as the zero shuffle
 *           control of aria_ark_8way and the zero key of aria_sbox_8way
 *  mem_tmp: base of the 16-slot scratch area; slots 0..7 must hold the
 *           other half of the state on entry
 *  rk, round: round-key base and index (key at rk + round * 16)
 * Flow: process x0..x7 with key bytes 8..15 and park them in slots 8..15,
 * reload slots 0..7 and process them with key bytes 0..7, then bring the
 * first half back into y0..y7 and run the two aria_diff_word passes.
 * On exit slots 0..7 hold x3, x2, x1, x0, x6, x7, x4, x5 (in that order)
 * and y0..y7 stay in registers.
 */
474#define aria_fo(x0, x1, x2, x3,				\
475		x4, x5, x6, x7,				\
476		y0, y1, y2, y3,				\
477		y4, y5, y6, y7,				\
478		mem_tmp, rk, round)			\
479	vpxor y7, y7, y7;				\
480	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
481		      y0, y7, y2, rk, 8, round);	\
482							\
483	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
484		       y0, y1, y2, y3, y4, y5, y6, y7);	\
485							\
486	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
487	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
488	aria_store_state_8way(x0, x1, x2, x3,		\
489			      x4, x5, x6, x7,		\
490			      mem_tmp, 8);		\
491							\
492	aria_load_state_8way(x0, x1, x2, x3,		\
493			     x4, x5, x6, x7,		\
494			     mem_tmp, 0);		\
495	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
496		      y0, y7, y2, rk, 0, round);	\
497							\
498	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
499		       y0, y1, y2, y3, y4, y5, y6, y7);	\
500							\
501	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
502	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
503	aria_store_state_8way(x0, x1, x2, x3,		\
504			      x4, x5, x6, x7,		\
505			      mem_tmp, 0);		\
506	aria_load_state_8way(y0, y1, y2, y3,		\
507			     y4, y5, y6, y7,		\
508			     mem_tmp, 8);		\
509	aria_diff_word(x0, x1, x2, x3,			\
510		       x4, x5, x6, x7,			\
511		       y0, y1, y2, y3,			\
512		       y4, y5, y6, y7);			\
513	/* aria_diff_byte() 				\
514	 * T1 = ABCD -> BADC 				\
515	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
516	 * T2 = ABCD -> CDAB 				\
517	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
518	 * T3 = ABCD -> DCBA 				\
519	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
520	 */						\
521	aria_diff_word(x0, x1, x2, x3,			\
522		       x5, x4, x7, x6,			\
523		       y2, y3, y0, y1,			\
524		       y7, y6, y5, y4);			\
525	aria_store_state_8way(x3, x2, x1, x0,		\
526			      x6, x7, x4, x5,		\
527			      mem_tmp, 0);
528
/*
 * aria_ff: final round over 16 byte-sliced blocks.  Each half of the state
 * gets key addition with `round`, the substitution layer (register pairs
 * swapped as in aria_fe) and a second key addition with `last_round`; no
 * diffusion macros are invoked.
 *  x0..x7:  on entry the half that pairs with round-key bytes 8..15
 *  y0..y7:  scratch on entry; y7 is zeroed for aria_ark_8way /
 *           aria_sbox_8way
 *  mem_tmp: base of the 16-slot scratch area; slots 0..7 must hold the
 *           other half of the state on entry
 *  rk, round, last_round: round-key base and the two key indices
 * On exit x0..x7 hold the half processed with key bytes 0..7 and y0..y7
 * hold the half processed with key bytes 8..15 (reloaded from slots 8..15).
 */
529#define aria_ff(x0, x1, x2, x3,				\
530		x4, x5, x6, x7,				\
531		y0, y1, y2, y3,				\
532		y4, y5, y6, y7,				\
533		mem_tmp, rk, round, last_round)		\
534	vpxor y7, y7, y7;				\
535	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
536		      y0, y7, y2, rk, 8, round);	\
537							\
538	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
539		       y0, y1, y2, y3, y4, y5, y6, y7);	\
540							\
541	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
542		      y0, y7, y2, rk, 8, last_round);	\
543							\
544	aria_store_state_8way(x0, x1, x2, x3,		\
545			      x4, x5, x6, x7,		\
546			      mem_tmp, 8);		\
547							\
548	aria_load_state_8way(x0, x1, x2, x3,		\
549			     x4, x5, x6, x7,		\
550			     mem_tmp, 0);		\
551	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
552		      y0, y7, y2, rk, 0, round);	\
553							\
554	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
555		       y0, y1, y2, y3, y4, y5, y6, y7);	\
556							\
557	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
558		      y0, y7, y2, rk, 0, last_round);	\
559							\
560	aria_load_state_8way(y0, y1, y2, y3,		\
561			     y4, y5, y6, y7,		\
562			     mem_tmp, 8);
563
564#ifdef CONFIG_AS_GFNI
/*
 * aria_fe_gfni: same statement sequence as aria_fe, but the substitution
 * layer is aria_sbox_8way_gfni (register pairs swapped:
 * x2, x3, x0, x1, x6, x7, x4, x5).
 *  x0..x7:  on entry the half of the state that pairs with round-key
 *           bytes 8..15
 *  y0..y7:  scratch on entry; y7 is zeroed for aria_ark_8way
 *  mem_tmp: base of the 16-slot scratch area; slots 0..7 must hold the
 *           other half of the state on entry
 *  rk, round: round-key base and index (key at rk + round * 16)
 * On exit slots 0..7 hold x3, x2, x1, x0, x6, x7, x4, x5 (in that order)
 * and y0..y7 stay in registers.
 */
565#define aria_fe_gfni(x0, x1, x2, x3,			\
566		     x4, x5, x6, x7,			\
567		     y0, y1, y2, y3,			\
568		     y4, y5, y6, y7,			\
569		     mem_tmp, rk, round)		\
570	vpxor y7, y7, y7;				\
571	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
572		      y0, y7, y2, rk, 8, round);	\
573							\
574	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
575			    x6, x7, x4, x5,		\
576			    y0, y1, y2, y3, 		\
577			    y4, y5, y6, y7);		\
578							\
579	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
580	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
581	aria_store_state_8way(x0, x1, x2, x3,		\
582			      x4, x5, x6, x7,		\
583			      mem_tmp, 8);		\
584							\
585	aria_load_state_8way(x0, x1, x2, x3,		\
586			     x4, x5, x6, x7,		\
587			     mem_tmp, 0);		\
588	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
589		      y0, y7, y2, rk, 0, round);	\
590							\
591	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
592			    x6, x7, x4, x5,		\
593			    y0, y1, y2, y3, 		\
594			    y4, y5, y6, y7);		\
595							\
596	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
597	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
598	aria_store_state_8way(x0, x1, x2, x3,		\
599			      x4, x5, x6, x7,		\
600			      mem_tmp, 0);		\
601	aria_load_state_8way(y0, y1, y2, y3,		\
602			     y4, y5, y6, y7,		\
603			     mem_tmp, 8);		\
604	aria_diff_word(x0, x1, x2, x3,			\
605		       x4, x5, x6, x7,			\
606		       y0, y1, y2, y3,			\
607		       y4, y5, y6, y7);			\
608	/* aria_diff_byte() 				\
609	 * T3 = ABCD -> BADC 				\
610	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
611	 * T0 = ABCD -> CDAB 				\
612	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
613	 * T1 = ABCD -> DCBA 				\
614	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
615	 */						\
616	aria_diff_word(x2, x3, x0, x1,			\
617		       x7, x6, x5, x4,			\
618		       y0, y1, y2, y3,			\
619		       y5, y4, y7, y6);			\
620	aria_store_state_8way(x3, x2, x1, x0,		\
621			      x6, x7, x4, x5,		\
622			      mem_tmp, 0);
623
/*
 * aria_fo_gfni: same statement sequence as aria_fo, but the substitution
 * layer is aria_sbox_8way_gfni (registers in natural order x0..x7).
 *  x0..x7:  on entry the half of the state that pairs with round-key
 *           bytes 8..15
 *  y0..y7:  scratch on entry; y7 is zeroed for aria_ark_8way
 *  mem_tmp: base of the 16-slot scratch area; slots 0..7 must hold the
 *           other half of the state on entry
 *  rk, round: round-key base and index (key at rk + round * 16)
 * On exit slots 0..7 hold x3, x2, x1, x0, x6, x7, x4, x5 (in that order)
 * and y0..y7 stay in registers.
 */
624#define aria_fo_gfni(x0, x1, x2, x3,			\
625		     x4, x5, x6, x7,			\
626		     y0, y1, y2, y3,			\
627		     y4, y5, y6, y7,			\
628		     mem_tmp, rk, round)		\
629	vpxor y7, y7, y7;				\
630	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
631		      y0, y7, y2, rk, 8, round);	\
632							\
633	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
634			    x4, x5, x6, x7,		\
635			    y0, y1, y2, y3, 		\
636			    y4, y5, y6, y7);		\
637							\
638	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
639	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
640	aria_store_state_8way(x0, x1, x2, x3,		\
641			      x4, x5, x6, x7,		\
642			      mem_tmp, 8);		\
643							\
644	aria_load_state_8way(x0, x1, x2, x3,		\
645			     x4, x5, x6, x7,		\
646			     mem_tmp, 0);		\
647	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
648		      y0, y7, y2, rk, 0, round);	\
649							\
650	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
651			    x4, x5, x6, x7,		\
652			    y0, y1, y2, y3, 		\
653			    y4, y5, y6, y7);		\
654							\
655	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
656	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
657	aria_store_state_8way(x0, x1, x2, x3,		\
658			      x4, x5, x6, x7,		\
659			      mem_tmp, 0);		\
660	aria_load_state_8way(y0, y1, y2, y3,		\
661			     y4, y5, y6, y7,		\
662			     mem_tmp, 8);		\
663	aria_diff_word(x0, x1, x2, x3,			\
664		       x4, x5, x6, x7,			\
665		       y0, y1, y2, y3,			\
666		       y4, y5, y6, y7);			\
667	/* aria_diff_byte() 				\
668	 * T1 = ABCD -> BADC 				\
669	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
670	 * T2 = ABCD -> CDAB 				\
671	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
672	 * T3 = ABCD -> DCBA 				\
673	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
674	 */						\
675	aria_diff_word(x0, x1, x2, x3,			\
676		       x5, x4, x7, x6,			\
677		       y2, y3, y0, y1,			\
678		       y7, y6, y5, y4);			\
679	aria_store_state_8way(x3, x2, x1, x0,		\
680			      x6, x7, x4, x5,		\
681			      mem_tmp, 0);
682
/*
 * aria_ff_gfni: same statement sequence as aria_ff, but the substitution
 * layer is aria_sbox_8way_gfni (register pairs swapped).  Each half of the
 * state gets key addition with `round`, substitution, and key addition
 * with `last_round`; no diffusion macros are invoked.
 *  x0..x7:  on entry the half that pairs with round-key bytes 8..15
 *  y0..y7:  scratch on entry; y7 is zeroed for aria_ark_8way
 *  mem_tmp: base of the 16-slot scratch area; slots 0..7 must hold the
 *           other half of the state on entry
 *  rk, round, last_round: round-key base and the two key indices
 * On exit x0..x7 hold the half processed with key bytes 0..7 and y0..y7
 * hold the half processed with key bytes 8..15 (reloaded from slots 8..15).
 */
683#define aria_ff_gfni(x0, x1, x2, x3,			\
684		x4, x5, x6, x7,				\
685		y0, y1, y2, y3,				\
686		y4, y5, y6, y7,				\
687		mem_tmp, rk, round, last_round)		\
688	vpxor y7, y7, y7;				\
689	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
690		      y0, y7, y2, rk, 8, round);	\
691							\
692	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
693			    x6, x7, x4, x5,		\
694			    y0, y1, y2, y3, 		\
695			    y4, y5, y6, y7);		\
696							\
697	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
698		      y0, y7, y2, rk, 8, last_round);	\
699							\
700	aria_store_state_8way(x0, x1, x2, x3,		\
701			      x4, x5, x6, x7,		\
702			      mem_tmp, 8);		\
703							\
704	aria_load_state_8way(x0, x1, x2, x3,		\
705			     x4, x5, x6, x7,		\
706			     mem_tmp, 0);		\
707	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
708		      y0, y7, y2, rk, 0, round);	\
709							\
710	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
711			    x6, x7, x4, x5,		\
712			    y0, y1, y2, y3, 		\
713			    y4, y5, y6, y7);		\
714							\
715	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
716		      y0, y7, y2, rk, 0, last_round);	\
717							\
718	aria_load_state_8way(y0, y1, y2, y3,		\
719			     y4, y5, y6, y7,		\
720			     mem_tmp, 8);
721
722#endif /* CONFIG_AS_GFNI */
723
724/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
725.section	.rodata.cst16, "aM", @progbits, 16
726.align 16
727
/* SHUFB_BYTES(idx): the four byte indices idx, idx + 4, idx + 8, idx + 12 */
728#define SHUFB_BYTES(idx) \
729	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
730
/*
 * vpshufb control for (de)byteslice_16x16b: gathers byte 0 of every dword,
 * then byte 1 of every dword, and so on
 * (0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15).
 */
731.Lshufb_16x16b:
732	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
733/* For isolating SubBytes from AESENCLAST, inverse shift row */
734.Linv_shift_row:
735	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
736	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
/*
 * Forward shift row; aria_sbox_8way applies it after AESDECLAST (x2, x6)
 * and before AESDECLAST (x3, x7).
 */
737.Lshift_row:
738	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
739	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
740/* For CTR-mode IV byteswap */
741.Lbswap128_mask:
742	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
743	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
744
745/* AES inverse affine and S2 combined:
746 *      1 1 0 0 0 0 0 1     x0     0
747 *      0 1 0 0 1 0 0 0     x1     0
748 *      1 1 0 0 1 1 1 1     x2     0
749 *      0 1 1 0 1 0 0 1     x3     1
750 *      0 1 0 0 1 1 0 0  *  x4  +  0
751 *      0 1 0 1 1 0 0 0     x5     0
752 *      0 0 0 0 0 1 0 1     x6     0
753 *      1 1 1 0 0 1 1 1     x7     1
 *
 * Stored as two 16-entry nibble tables for filter_8bit (applied to x1 / x5
 * in aria_sbox_8way): the lo table is indexed by the low nibble, the hi
 * table by the high nibble.  Entry 0 of the lo table is 0x00 and entry 0
 * of the hi table is 0x88, i.e. the additive constant lives in the hi
 * table.
754 */
755.Ltf_lo__inv_aff__and__s2:
756	.octa 0x92172DA81A9FA520B2370D883ABF8500
757.Ltf_hi__inv_aff__and__s2:
758	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
759
760/* X2 and AES forward affine combined:
761 *      1 0 1 1 0 0 0 1     x0     0
762 *      0 1 1 1 1 0 1 1     x1     0
763 *      0 0 0 1 1 0 1 0     x2     1
764 *      0 1 0 0 0 1 0 0     x3     0
765 *      0 0 1 1 1 0 1 1  *  x4  +  0
766 *      0 1 0 0 1 0 0 0     x5     0
767 *      1 1 0 1 0 0 1 1     x6     0
768 *      0 1 0 0 1 0 1 0     x7     0
 *
 * Stored as two 16-entry nibble tables for filter_8bit (applied to x3 / x7
 * in aria_sbox_8way, before the final AESDECLAST): the lo table is indexed
 * by the low nibble, the hi table by the high nibble.  Entry 0 of the lo
 * table is 0x00 and entry 0 of the hi table is 0x04, i.e. the additive
 * constant lives in the hi table.
769 */
770.Ltf_lo__x2__and__fwd_aff:
771	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
772.Ltf_hi__x2__and__fwd_aff:
773	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
774
775#ifdef CONFIG_AS_GFNI
776/*
 * AES affine: bit matrix and additive constant used with vgf2p8affineinvqb
 * on x0 / x4 in aria_sbox_8way_gfni.  tf_aff_const evaluates to 0x63.
 * The same 64-bit matrix is emitted twice so that both halves of the
 * 128-bit operand carry it (and the entry stays a full 16-byte block).
 */
777#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
778.Ltf_aff_bitmatrix:
779	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
780		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
781		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
782		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
783		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
784		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
785		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
786		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
787	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
788		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
789		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
790		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
791		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
792		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
793		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
794		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
795
796/*
 * AES inverse affine: bit matrix and additive constant used with
 * vgf2p8affineqb on x2 / x6 in aria_sbox_8way_gfni.  tf_inv_const
 * evaluates to 0x05.  The same 64-bit matrix is emitted twice so that both
 * halves of the 128-bit operand carry it.
 */
797#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
798.Ltf_inv_bitmatrix:
799	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
800		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
801		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
802		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
803		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
804		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
805		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
806		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
807	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
808		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
809		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
810		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
811		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
812		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
813		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
814		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
815
816/*
 * S2: bit matrix and additive constant used with vgf2p8affineinvqb on
 * x1 / x5 in aria_sbox_8way_gfni.  tf_s2_const evaluates to 0xE2.
 * The same 64-bit matrix is emitted twice so that both halves of the
 * 128-bit operand carry it.
 */
817#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
818.Ltf_s2_bitmatrix:
819	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
820		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
821		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
822		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
823		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
824		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
825		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
826		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
827	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
828		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
829		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
830		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
831		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
832		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
833		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
834		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
835
836/*
 * X2: bit matrix and additive constant used with vgf2p8affineqb on
 * x3 / x7 in aria_sbox_8way_gfni.  tf_x2_const evaluates to 0x2C.
 * The same 64-bit matrix is emitted twice so that both halves of the
 * 128-bit operand carry it.
 */
837#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
838.Ltf_x2_bitmatrix:
839	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
840		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
841		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
842		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
843		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
844		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
845		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
846		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
847	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
848		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
849		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
850		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
851		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
852		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
853		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
854		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
855
856/*
 * Identity matrix: used with vgf2p8affineinvqb and constant 0 on x2 / x6
 * and x3 / x7 in aria_sbox_8way_gfni, after their vgf2p8affineqb step.
 * The same 64-bit matrix is emitted twice so that both halves of the
 * 128-bit operand carry it.
 */
857.Ltf_id_bitmatrix:
858	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
859		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
860		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
861		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
862		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
863		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
864		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
865		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
866	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
867		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
868		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
869		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
870		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
871		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
872		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
873		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
874#endif /* CONFIG_AS_GFNI */
875
876/*
 * 4-bit mask: 0x0f in every byte.  Kept in its own mergeable 4-byte
 * section; aria_sbox_8way loads it with vbroadcastss and passes it to
 * filter_8bit as mask4bit.
 */
877.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
878.align 4
879.L0f0f0f0f:
880	.long 0x0f0f0f0f
881
882.text
883
/*
 * Shared 16-block core used by the encrypt and decrypt entry points below.
 * The direction is decided solely by which key schedule the caller puts
 * in %r9.
 */
884SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
885	/* input:
	*      %rdi: ctx, CTX (ARIA_CTX_rounds is read from it)
886	*      %r9: rk
887	*      %rsi: dst (also used as 16 * 16 bytes of scratch)
888	*      %rdx: src (not referenced in this function)
889	*      %xmm0..%xmm15: 16 blocks, byte-sliced below by inpack16_post
	* output:
	*      %rax: dst
	*      %xmm0..%xmm15: 16 result blocks; the callers store them in
	*      the order xmm1, xmm0, xmm3, xmm2, xmm4..xmm15
	* clobbers: %r8, flags
890	*/
891
892	FRAME_BEGIN
893
	/* %rax = scratch slots 0..7 (dst), %r8 = scratch slots 8..15 */
894	movq %rsi, %rax;
895	leaq 8 * 16(%rax), %r8;
896
897	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
898		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
899		      %xmm15, %rax, %r8);
	/*
	 * Rounds alternate aria_fo (even index) and aria_fe (odd index).
	 * Each round leaves the registers permuted, which is why the x/y
	 * register lists differ from call to call.
	 */
900	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
901		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
902		%rax, %r9, 0);
903	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
904		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
905		%xmm15, %rax, %r9, 1);
906	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
907		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
908		%rax, %r9, 2);
909	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
910		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
911		%xmm15, %rax, %r9, 3);
912	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
913		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
914		%rax, %r9, 4);
915	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
916		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
917		%xmm15, %rax, %r9, 5);
918	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
919		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
920		%rax, %r9, 6);
921	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
922		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
923		%xmm15, %rax, %r9, 7);
924	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
925		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
926		%rax, %r9, 8);
927	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
928		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
929		%xmm15, %rax, %r9, 9);
930	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
931		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
932		%rax, %r9, 10);
	/* 12 rounds: finish with keys 11 and 12; otherwise keep going */
933	cmpl $12, ARIA_CTX_rounds(CTX);
934	jne .Laria_192;
935	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
936		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
937		%xmm15, %rax, %r9, 11, 12);
938	jmp .Laria_end;
939.Laria_192:
940	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
941		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
942		%xmm15, %rax, %r9, 11);
943	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
944		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
945		%rax, %r9, 12);
	/* 14 rounds: finish with keys 13 and 14; otherwise keep going */
946	cmpl $14, ARIA_CTX_rounds(CTX);
947	jne .Laria_256;
948	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
949		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
950		%xmm15, %rax, %r9, 13, 14);
951	jmp .Laria_end;
952.Laria_256:
	/* any other round count is treated as 16 rounds (keys 15 and 16) */
953	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
954		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
955		%xmm15, %rax, %r9, 13);
956	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
957		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
958		%rax, %r9, 14);
959	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
960		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
961		%xmm15, %rax, %r9, 15, 16);
962.Laria_end:
	/* the first 16 bytes at %rax and at %r8 serve as the two spill slots */
963	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
964			   %xmm9, %xmm13, %xmm0, %xmm5,
965			   %xmm10, %xmm14, %xmm3, %xmm6,
966			   %xmm11, %xmm15, %xmm2, %xmm7,
967			   (%rax), (%r8));
968
969	FRAME_END
970	RET;
971SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
972
SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/*
	 * 16-way ARIA encryption entry point (AVX/AES-NI flavour).
	 *
	 * input:
	 *	%rdi: ctx (referred to as CTX)
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %r9 is pointed at the encryption round keys, the source is handed
	 * to inpack16_pre, and the shared core
	 * __aria_aesni_avx_crypt_16way does the rest.
	 */
	FRAME_BEGIN

	/* %r9 = CTX + ARIA_CTX_enc_key: round keys for the shared core */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	/* %rdx (src) is the only memory operand; targets are %xmm0..%xmm15 */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rdx);

	call __aria_aesni_avx_crypt_16way;

	/*
	 * The result is stored through %rax, which this function never
	 * sets itself.
	 * NOTE(review): presumably the core leaves the dst pointer in %rax
	 * (its GFNI sibling starts with "movq %rsi, %rax") - confirm.
	 * The first four registers are passed as %xmm1, %xmm0, %xmm3, %xmm2.
	 */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
997
SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/*
	 * 16-way ARIA decryption entry point (AVX/AES-NI flavour).
	 *
	 * input:
	 *	%rdi: ctx (referred to as CTX)
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Identical to the encryption entry point except that %r9 is
	 * pointed at the decryption round keys.
	 */
	FRAME_BEGIN

	/* %r9 = CTX + ARIA_CTX_dec_key: round keys for the shared core */
	leaq ARIA_CTX_dec_key(CTX), %r9;

	/* %rdx (src) is the only memory operand; targets are %xmm0..%xmm15 */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rdx);

	call __aria_aesni_avx_crypt_16way;

	/*
	 * The result is stored through %rax, which this function never
	 * sets itself.
	 * NOTE(review): presumably the core leaves the dst pointer in %rax
	 * (its GFNI sibling starts with "movq %rsi, %rax") - confirm.
	 * The first four registers are passed as %xmm1, %xmm0, %xmm3, %xmm2.
	 */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
1022
/*
 * Advance the little-endian counter kept in %xmm3 by one and write its
 * byte-swapped form to out_be.  Only meaningful inside
 * __aria_aesni_avx_ctr_gen_keystream_16way, which sets up the fixed
 * registers used here:
 *	%xmm0: { low qword: -1, high qword: 0 } increment constant
 *	%xmm1: .Lbswap128_mask shuffle control
 *	%xmm3: running counter, little endian
 *	%xmm5: scratch (carry mask inside inc_le128)
 */
#define aria_ctr_next_be(out_be)			\
	inc_le128(%xmm3, %xmm0, %xmm5);			\
	vpshufb %xmm1, %xmm3, out_be;

SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/*
	 * Expand the IV into 16 consecutive big-endian counter blocks and
	 * advance the IV by 16.
	 *
	 * input:
	 *	%rdi: ctx	(not referenced here)
	 *	%rsi: dst	(not referenced here)
	 *	%rdx: src	(not referenced here)
	 *	%rcx: keystream	(receives counter blocks 0..7)
	 *	%r8: iv (big endian, 128bit)
	 *
	 * output:
	 *	%xmm0..%xmm7:	counter blocks 0..7, reloaded from (%rcx)
	 *	%xmm8..%xmm15:	counter blocks 8..15
	 *	(%r8):		original IV + 16, big endian
	 */
	FRAME_BEGIN

	/* counter block 0 is the IV exactly as supplied */
	vmovdqu (%r8), %xmm8;

	/* %xmm3 = IV, be => le, so that inc_le128 can step it */
	vmovdqa .Lbswap128_mask(%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3;

	/* %xmm0 = { low: -1, high: 0 } */
	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0;

	/* counter blocks 1..7 (IV + 1 .. IV + 7) */
	aria_ctr_next_be(%xmm9);
	aria_ctr_next_be(%xmm10);
	aria_ctr_next_be(%xmm11);
	aria_ctr_next_be(%xmm12);
	aria_ctr_next_be(%xmm13);
	aria_ctr_next_be(%xmm14);
	aria_ctr_next_be(%xmm15);

	/* park blocks 0..7 in the keystream buffer to free %xmm8..%xmm15 */
	vmovdqu %xmm8,  (0 * 16)(%rcx);
	vmovdqu %xmm9,  (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	/* counter blocks 8..15 (IV + 8 .. IV + 15) stay in registers */
	aria_ctr_next_be(%xmm8);
	aria_ctr_next_be(%xmm9);
	aria_ctr_next_be(%xmm10);
	aria_ctr_next_be(%xmm11);
	aria_ctr_next_be(%xmm12);
	aria_ctr_next_be(%xmm13);
	aria_ctr_next_be(%xmm14);
	aria_ctr_next_be(%xmm15);

	/* IV + 16 is written back over the caller's IV */
	aria_ctr_next_be(%xmm4);
	vmovdqu %xmm4, (%r8);

	/* bring blocks 0..7 back; this overwrites the %xmm0..%xmm5 helpers */
	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

#undef aria_ctr_next_be
1098
SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/*
	 * 16-way ARIA CTR entry point (AVX/AES-NI flavour).
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 *
	 * The counter blocks are generated first, run through the shared
	 * core with the encryption round keys, XORed with src and written
	 * to dst.
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	/*
	 * Keep the real dst/src in %r10/%r11 and hand the keystream
	 * buffer to the core as both its dst and its src.
	 */
	movq %rsi, %r10;
	movq %rdx, %r11;
	movq %rcx, %rsi;
	movq %rcx, %rdx;
	/* %r9 = CTX + ARIA_CTX_enc_key: round keys for the shared core */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	/*
	 * XOR with the source.  Offsets 0 and 1 pair with %xmm1 and %xmm0,
	 * offsets 2 and 3 with %xmm3 and %xmm2; the remaining registers
	 * are in plain order.
	 */
	vpxor (0 * 16)(%r11),  %xmm1,  %xmm1;
	vpxor (1 * 16)(%r11),  %xmm0,  %xmm0;
	vpxor (2 * 16)(%r11),  %xmm3,  %xmm3;
	vpxor (3 * 16)(%r11),  %xmm2,  %xmm2;
	vpxor (4 * 16)(%r11),  %xmm4,  %xmm4;
	vpxor (5 * 16)(%r11),  %xmm5,  %xmm5;
	vpxor (6 * 16)(%r11),  %xmm6,  %xmm6;
	vpxor (7 * 16)(%r11),  %xmm7,  %xmm7;
	vpxor (8 * 16)(%r11),  %xmm8,  %xmm8;
	vpxor (9 * 16)(%r11),  %xmm9,  %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;

	/* %r10 still holds the caller's dst */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1142
1143#ifdef CONFIG_AS_GFNI
/*
 * Shorthands for the round macros as __aria_aesni_avx_gfni_crypt_16way
 * invokes them from round-key index 1 onwards: the register lists, the
 * memory operand (%rax) and the round-key base (%r9) never change, only
 * the round-key index does.
 */
#define aria_gfni_fo_round(idx)					\
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,		\
		     %xmm12, %xmm13, %xmm14, %xmm15,		\
		     %xmm0, %xmm1, %xmm2, %xmm3,		\
		     %xmm4, %xmm5, %xmm6, %xmm7,		\
		     %rax, %r9, idx)

#define aria_gfni_fe_round(idx)					\
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,		\
		     %xmm4, %xmm5, %xmm6, %xmm7,		\
		     %xmm8, %xmm9, %xmm10, %xmm11,		\
		     %xmm12, %xmm13, %xmm14, %xmm15,		\
		     %rax, %r9, idx)

#define aria_gfni_ff_round(idx, last_idx)			\
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,		\
		     %xmm4, %xmm5, %xmm6, %xmm7,		\
		     %xmm8, %xmm9, %xmm10, %xmm11,		\
		     %xmm12, %xmm13, %xmm14, %xmm15,		\
		     %rax, %r9, idx, last_idx)

SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/*
	 * Shared GFNI round sequence for encryption, decryption and CTR.
	 *
	 * input:
	 *	%rdi: ctx (CTX); only ARIA_CTX_rounds(CTX) is read here
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src (not passed to any macro in this function)
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 *
	 * %rax (= %rsi) and %r8 (= %rsi + 8 * 16) are set up here and used
	 * as the memory operands of inpack16_post, the round macros and
	 * debyteslice_16x16b.
	 */
	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14, %xmm15,
		      %rax, %r8);

	/* round-key index 0 is the only one with %xmm8..%xmm11 in plain order */
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);

	/* round-key indices 1..10 run for every value of ARIA_CTX_rounds */
	aria_gfni_fe_round(1);
	aria_gfni_fo_round(2);
	aria_gfni_fe_round(3);
	aria_gfni_fo_round(4);
	aria_gfni_fe_round(5);
	aria_gfni_fo_round(6);
	aria_gfni_fe_round(7);
	aria_gfni_fo_round(8);
	aria_gfni_fe_round(9);
	aria_gfni_fo_round(10);

	/* rounds == 12: finish with round-key indices 11 and 12 */
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_gfni_ff_round(11, 12);
	jmp .Laria_gfni_end;

.Laria_gfni_192:
	/* rounds != 12: two more rounds, then finish here if rounds == 14 */
	aria_gfni_fe_round(11);
	aria_gfni_fo_round(12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_gfni_ff_round(13, 14);
	jmp .Laria_gfni_end;

.Laria_gfni_256:
	/* any other round count: two more rounds, finish with 15 and 16 */
	aria_gfni_fe_round(13);
	aria_gfni_fo_round(14);
	aria_gfni_ff_round(15, 16);

.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

#undef aria_gfni_fo_round
#undef aria_gfni_fe_round
#undef aria_gfni_ff_round
1268
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/*
	 * 16-way ARIA encryption entry point (GFNI flavour).
	 *
	 * input:
	 *	%rdi: ctx (referred to as CTX)
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %r9 is pointed at the encryption round keys, the source is handed
	 * to inpack16_pre, and __aria_aesni_avx_gfni_crypt_16way does the
	 * rest.
	 */
	FRAME_BEGIN

	/* %r9 = CTX + ARIA_CTX_enc_key: round keys for the GFNI core */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	/* %rdx (src) is the only memory operand; targets are %xmm0..%xmm15 */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	/*
	 * The core copies %rsi (dst) into %rax on entry; the result is
	 * stored through that register.  The first four registers are
	 * passed as %xmm1, %xmm0, %xmm3, %xmm2.
	 */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1293
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/*
	 * 16-way ARIA decryption entry point (GFNI flavour).
	 *
	 * input:
	 *	%rdi: ctx (referred to as CTX)
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Identical to the GFNI encryption entry point except that %r9 is
	 * pointed at the decryption round keys.
	 */
	FRAME_BEGIN

	/* %r9 = CTX + ARIA_CTX_dec_key: round keys for the GFNI core */
	leaq ARIA_CTX_dec_key(CTX), %r9;

	/* %rdx (src) is the only memory operand; targets are %xmm0..%xmm15 */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	/*
	 * The core copies %rsi (dst) into %rax on entry; the result is
	 * stored through that register.  The first four registers are
	 * passed as %xmm1, %xmm0, %xmm3, %xmm2.
	 */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1318
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/*
	 * 16-way ARIA CTR entry point (GFNI flavour).
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 *
	 * The counter blocks are generated first, run through the GFNI
	 * core with the encryption round keys, XORed with src and written
	 * to dst.
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	/*
	 * Keep the real dst/src in %r10/%r11 and hand the keystream
	 * buffer to the core as both its dst and its src.
	 */
	movq %rsi, %r10;
	movq %rdx, %r11;
	movq %rcx, %rsi;
	movq %rcx, %rdx;
	/* %r9 = CTX + ARIA_CTX_enc_key: round keys for the GFNI core */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	/*
	 * XOR with the source.  Offsets 0 and 1 pair with %xmm1 and %xmm0,
	 * offsets 2 and 3 with %xmm3 and %xmm2; the remaining registers
	 * are in plain order.
	 */
	vpxor (0 * 16)(%r11),  %xmm1,  %xmm1;
	vpxor (1 * 16)(%r11),  %xmm0,  %xmm0;
	vpxor (2 * 16)(%r11),  %xmm3,  %xmm3;
	vpxor (3 * 16)(%r11),  %xmm2,  %xmm2;
	vpxor (4 * 16)(%r11),  %xmm4,  %xmm4;
	vpxor (5 * 16)(%r11),  %xmm5,  %xmm5;
	vpxor (6 * 16)(%r11),  %xmm6,  %xmm6;
	vpxor (7 * 16)(%r11),  %xmm7,  %xmm7;
	vpxor (8 * 16)(%r11),  %xmm8,  %xmm8;
	vpxor (9 * 16)(%r11),  %xmm9,  %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;

	/* %r10 still holds the caller's dst */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1362#endif /* CONFIG_AS_GFNI */
1363