xref: /openbmc/linux/arch/x86/crypto/aria-gfni-avx512-asm_64.S (revision 724ba6751532055db75992fc6ae21c3e322e94a7)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ARIA Cipher 64-way parallel algorithm (AVX512)
4 *
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6 *
7 */
8
9#include <linux/linkage.h>
10#include <asm/frame.h>
11#include <asm/asm-offsets.h>
12#include <linux/cfi_types.h>
13
14/* register macros */
/* CTX: register carrying the ctx argument; used as the base for the ARIA_CTX_* offsets */
15#define CTX %rdi
16
17
/*
 * BV8 - build one byte from eight bit values.
 *
 * a0 supplies bit 0 (the least significant bit) and a7 supplies bit 7.
 * Only the lowest bit of each argument is used.
 */
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( ((a0) & 1)        |				\
	  (((a1) & 1) << 1) |				\
	  (((a2) & 1) << 2) |				\
	  (((a3) & 1) << 3) |				\
	  (((a4) & 1) << 4) |				\
	  (((a5) & 1) << 5) |				\
	  (((a6) & 1) << 6) |				\
	  (((a7) & 1) << 7) )
27
/*
 * BM8X8 - build a 64-bit value from eight row bytes.
 *
 * l0 lands in the most significant byte (bits 63..56) and l7 in the
 * least significant byte (bits 7..0).
 */
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l0) << 56) |				\
	  ((l1) << 48) |				\
	  ((l2) << 40) |				\
	  ((l3) << 32) |				\
	  ((l4) << 24) |				\
	  ((l5) << 16) |				\
	  ((l6) <<  8) |				\
	  (l7) )
37
/*
 * add_le128 - packed 64-bit add with a carry fix-up into the neighbouring
 * qword.
 *
 *   out  = in + lo_inc                 (packed qword add)
 *   %k1  = qwords where out < lo_inc   (unsigned; the sum wrapped around)
 *   %k1 += %k1                         (moves every flag up by one qword)
 *   out += hi_carry                    (only in the qwords selected by %k1)
 *
 * Clobbers %k1.
 */
#define add_le128(out, in, lo_inc, hi_carry)		\
	vpaddq lo_inc, in, out;				\
	vpcmpuq $1, lo_inc, out, %k1;			\
	kaddb %k1, %k1, %k1;				\
	vpaddq hi_carry, out, out{%k1};
43
/*
 * filter_8bit - byte-wise lookup through two 16-entry vpshufb tables.
 *
 *   tmp0 = x & mask4bit
 *   x    = (x & ~mask4bit) >> 4        (32-bit element shift)
 *   x    = vpshufb(lo_t, tmp0) ^ vpshufb(hi_t, x)
 *
 * x is both input and output; tmp0 is clobbered.
 * NOTE(review): mask4bit presumably holds 0x0f in every byte - no caller
 * is visible in this file, confirm before use.
 *
 * The and-not step uses vpandnq; the previous spelling "vpandqn" is not
 * an instruction mnemonic and would fail to assemble as soon as the macro
 * was expanded.
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpandq x, mask4bit, tmp0;			\
	vpandnq x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxorq tmp0, x, x;
52
/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit elements held in
 * x0..x3 (one row per register), done independently in each 128-bit lane.
 * Dwords are interleaved first, then qwords. t1 and t2 are clobbered.
 */
53#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	/* interleave the dwords of rows 0 and 1 */	\
54	vpunpckhdq x1, x0, t2;				\
55	vpunpckldq x1, x0, x0;				\
56							\
	/* interleave the dwords of rows 2 and 3 */	\
57	vpunpckldq x3, x2, t1;				\
58	vpunpckhdq x3, x2, x2;				\
59							\
	/* join qword halves: x1 = column 1, x0 = column 0 */ \
60	vpunpckhqdq t1, x0, x1;				\
61	vpunpcklqdq t1, x0, x0;				\
62							\
	/* x3 = column 3, x2 = column 2 */		\
63	vpunpckhqdq x2, t2, x3;				\
64	vpunpcklqdq x2, t2, x2;
65
/*
 * byteslice_16x16b - rearrange sixteen vector registers in three passes:
 *   1. transpose_4x4 on (a0..a3), (b0..b3), (c0..c3), (d0..d3)
 *   2. vpshufb of every register with the .Lshufb_16x16b mask
 *   3. transpose_4x4 on (a0,b0,c0,d0), (a1,b1,c1,d1), (a2,b2,c2,d2),
 *      (a3,b3,c3,d3)
 *
 * All sixteen registers are live and transpose_4x4 needs two temporaries,
 * so st0/st1 (two memory operands) are used as spill slots and their
 * previous contents are overwritten.
 */
66#define byteslice_16x16b(a0, b0, c0, d0,		\
67			 a1, b1, c1, d1,		\
68			 a2, b2, c2, d2,		\
69			 a3, b3, c3, d3,		\
70			 st0, st1)			\
	/* spill d2/d3 so they can serve as transpose temporaries */ \
71	vmovdqu64 d2, st0;				\
72	vmovdqu64 d3, st1;				\
73	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
74	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
75	vmovdqu64 st0, d2;				\
76	vmovdqu64 st1, d3;				\
77							\
	/* spill a0/a1 and use them as temporaries for the c and d groups */ \
78	vmovdqu64 a0, st0;				\
79	vmovdqu64 a1, st1;				\
80	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
81	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
82							\
	/* a0 now carries the shuffle mask; a0's data stays parked in st0 */ \
83	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;	\
84	vmovdqu64 st1, a1;				\
85	vpshufb a0, a2, a2;				\
86	vpshufb a0, a3, a3;				\
87	vpshufb a0, b0, b0;				\
88	vpshufb a0, b1, b1;				\
89	vpshufb a0, b2, b2;				\
90	vpshufb a0, b3, b3;				\
91	vpshufb a0, a1, a1;				\
92	vpshufb a0, c0, c0;				\
93	vpshufb a0, c1, c1;				\
94	vpshufb a0, c2, c2;				\
95	vpshufb a0, c3, c3;				\
96	vpshufb a0, d0, d0;				\
97	vpshufb a0, d1, d1;				\
98	vpshufb a0, d2, d2;				\
99	vpshufb a0, d3, d3;				\
	/* park d3, pull a0's data through d3 and shuffle it into a0 */ \
	/* (this consumes the mask); then park d2 as well */ \
100	vmovdqu64 d3, st1;				\
101	vmovdqu64 st0, d3;				\
102	vpshufb a0, d3, a0;				\
103	vmovdqu64 d2, st0;				\
104							\
	/* d2/d3 are temporaries again; restored right after */ \
105	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
106	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
107	vmovdqu64 st0, d2;				\
108	vmovdqu64 st1, d3;				\
109							\
	/* b0/b1 are finished: spill them and use them as temporaries */ \
110	vmovdqu64 b0, st0;				\
111	vmovdqu64 b1, st1;				\
112	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
113	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
114	vmovdqu64 st0, b0;				\
115	vmovdqu64 st1, b1;				\
116	/* does not adjust output bytes inside vectors */
117
/*
 * debyteslice_16x16b - counterpart of byteslice_16x16b. The first two
 * passes are the same (transpose_4x4 per letter group, then vpshufb with
 * .Lshufb_16x16b); the final transposes take the groups in the order
 * (c, d, a, b) instead of (a, b, c, d).
 *
 * st0/st1 are two memory operands used as spill slots; their previous
 * contents are overwritten.
 */
118#define debyteslice_16x16b(a0, b0, c0, d0,		\
119			   a1, b1, c1, d1,		\
120			   a2, b2, c2, d2,		\
121			   a3, b3, c3, d3,		\
122			   st0, st1)			\
	/* spill d2/d3 so they can serve as transpose temporaries */ \
123	vmovdqu64 d2, st0;				\
124	vmovdqu64 d3, st1;				\
125	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
126	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
127	vmovdqu64 st0, d2;				\
128	vmovdqu64 st1, d3;				\
129							\
	/* spill a0/a1 and use them as temporaries for the c and d groups */ \
130	vmovdqu64 a0, st0;				\
131	vmovdqu64 a1, st1;				\
132	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
133	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
134							\
	/* a0 now carries the shuffle mask; a0's data stays parked in st0 */ \
135	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;	\
136	vmovdqu64 st1, a1;				\
137	vpshufb a0, a2, a2;				\
138	vpshufb a0, a3, a3;				\
139	vpshufb a0, b0, b0;				\
140	vpshufb a0, b1, b1;				\
141	vpshufb a0, b2, b2;				\
142	vpshufb a0, b3, b3;				\
143	vpshufb a0, a1, a1;				\
144	vpshufb a0, c0, c0;				\
145	vpshufb a0, c1, c1;				\
146	vpshufb a0, c2, c2;				\
147	vpshufb a0, c3, c3;				\
148	vpshufb a0, d0, d0;				\
149	vpshufb a0, d1, d1;				\
150	vpshufb a0, d2, d2;				\
151	vpshufb a0, d3, d3;				\
	/* park d3, pull a0's data through d3 and shuffle it into a0 */ \
	/* (this consumes the mask); then park d2 as well */ \
152	vmovdqu64 d3, st1;				\
153	vmovdqu64 st0, d3;				\
154	vpshufb a0, d3, a0;				\
155	vmovdqu64 d2, st0;				\
156							\
	/* final transposes use the (c, d, a, b) group order */ \
157	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
158	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
159	vmovdqu64 st0, d2;				\
160	vmovdqu64 st1, d3;				\
161							\
	/* b0/b1 are finished: spill them and use them as temporaries */ \
162	vmovdqu64 b0, st0;				\
163	vmovdqu64 b1, st1;				\
164	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
165	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
166	vmovdqu64 st0, b0;				\
167	vmovdqu64 st1, b1;				\
168	/* does not adjust output bytes inside vectors */
169
/*
 * inpack16_pre - load sixteen consecutive 64-byte vectors from src into
 * x0..x7, y0..y7 (in that order) with unaligned loads. No other
 * processing is applied to the data here.
 */
#define inpack16_pre(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     src)				\
	vmovdqu64 (0 * 64)(src), x0;			\
	vmovdqu64 (1 * 64)(src), x1;			\
	vmovdqu64 (2 * 64)(src), x2;			\
	vmovdqu64 (3 * 64)(src), x3;			\
	vmovdqu64 (4 * 64)(src), x4;			\
	vmovdqu64 (5 * 64)(src), x5;			\
	vmovdqu64 (6 * 64)(src), x6;			\
	vmovdqu64 (7 * 64)(src), x7;			\
	vmovdqu64 (8 * 64)(src), y0;			\
	vmovdqu64 (9 * 64)(src), y1;			\
	vmovdqu64 (10 * 64)(src), y2;			\
	vmovdqu64 (11 * 64)(src), y3;			\
	vmovdqu64 (12 * 64)(src), y4;			\
	vmovdqu64 (13 * 64)(src), y5;			\
	vmovdqu64 (14 * 64)(src), y6;			\
	vmovdqu64 (15 * 64)(src), y7;
192
/*
 * inpack16_post - byteslice the sixteen registers and save the result.
 *
 * The first 64 bytes at mem_ab and at mem_cd act as the two spill slots
 * of byteslice_16x16b. Afterwards x0..x7 are stored to the eight 64-byte
 * slots at mem_ab and y0..y7 to the eight slots at mem_cd, which
 * overwrites the spill data.
 */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3,		\
			 x4, x5, x6, x7,		\
			 y0, y1, y2, y3,		\
			 y4, y5, y6, y7,		\
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu64 x0, (0 * 64)(mem_ab);			\
	vmovdqu64 x1, (1 * 64)(mem_ab);			\
	vmovdqu64 x2, (2 * 64)(mem_ab);			\
	vmovdqu64 x3, (3 * 64)(mem_ab);			\
	vmovdqu64 x4, (4 * 64)(mem_ab);			\
	vmovdqu64 x5, (5 * 64)(mem_ab);			\
	vmovdqu64 x6, (6 * 64)(mem_ab);			\
	vmovdqu64 x7, (7 * 64)(mem_ab);			\
	vmovdqu64 y0, (0 * 64)(mem_cd);			\
	vmovdqu64 y1, (1 * 64)(mem_cd);			\
	vmovdqu64 y2, (2 * 64)(mem_cd);			\
	vmovdqu64 y3, (3 * 64)(mem_cd);			\
	vmovdqu64 y4, (4 * 64)(mem_cd);			\
	vmovdqu64 y5, (5 * 64)(mem_cd);			\
	vmovdqu64 y6, (6 * 64)(mem_cd);			\
	vmovdqu64 y7, (7 * 64)(mem_cd);
221
/*
 * write_output - store x0..x7, y0..y7 (in that order) to sixteen
 * consecutive 64-byte slots starting at mem, using unaligned stores.
 *
 * The last statement deliberately carries no line continuation: a
 * trailing backslash there would make the macro absorb whatever line
 * follows the definition.
 */
#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu64 x0, (0 * 64)(mem);			\
	vmovdqu64 x1, (1 * 64)(mem);			\
	vmovdqu64 x2, (2 * 64)(mem);			\
	vmovdqu64 x3, (3 * 64)(mem);			\
	vmovdqu64 x4, (4 * 64)(mem);			\
	vmovdqu64 x5, (5 * 64)(mem);			\
	vmovdqu64 x6, (6 * 64)(mem);			\
	vmovdqu64 x7, (7 * 64)(mem);			\
	vmovdqu64 y0, (8 * 64)(mem);			\
	vmovdqu64 y1, (9 * 64)(mem);			\
	vmovdqu64 y2, (10 * 64)(mem);			\
	vmovdqu64 y3, (11 * 64)(mem);			\
	vmovdqu64 y4, (12 * 64)(mem);			\
	vmovdqu64 y5, (13 * 64)(mem);			\
	vmovdqu64 y6, (14 * 64)(mem);			\
	vmovdqu64 y7, (15 * 64)(mem);
243
/*
 * aria_store_state_8way - store x0..x7 to eight consecutive 64-byte slots
 * of mem_tmp, starting at slot number idx.
 *
 * idx is parenthesised so that an expression argument (for example
 * "base + 8") is evaluated as a whole before the slot arithmetic.
 */
#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu64 x0, (((idx) + 0) * 64)(mem_tmp);	\
	vmovdqu64 x1, (((idx) + 1) * 64)(mem_tmp);	\
	vmovdqu64 x2, (((idx) + 2) * 64)(mem_tmp);	\
	vmovdqu64 x3, (((idx) + 3) * 64)(mem_tmp);	\
	vmovdqu64 x4, (((idx) + 4) * 64)(mem_tmp);	\
	vmovdqu64 x5, (((idx) + 5) * 64)(mem_tmp);	\
	vmovdqu64 x6, (((idx) + 6) * 64)(mem_tmp);	\
	vmovdqu64 x7, (((idx) + 7) * 64)(mem_tmp);
255
/*
 * aria_load_state_8way - load x0..x7 from eight consecutive 64-byte slots
 * of mem_tmp, starting at slot number idx.
 *
 * idx is parenthesised so that an expression argument (for example
 * "base + 8") is evaluated as a whole before the slot arithmetic.
 */
#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu64 (((idx) + 0) * 64)(mem_tmp), x0;	\
	vmovdqu64 (((idx) + 1) * 64)(mem_tmp), x1;	\
	vmovdqu64 (((idx) + 2) * 64)(mem_tmp), x2;	\
	vmovdqu64 (((idx) + 3) * 64)(mem_tmp), x3;	\
	vmovdqu64 (((idx) + 4) * 64)(mem_tmp), x4;	\
	vmovdqu64 (((idx) + 5) * 64)(mem_tmp), x5;	\
	vmovdqu64 (((idx) + 6) * 64)(mem_tmp), x6;	\
	vmovdqu64 (((idx) + 7) * 64)(mem_tmp), x7;
267
/*
 * __aria_ark_byte - broadcast the round-key byte at offset
 * round * 16 + byte from rk into t0 and XOR it into x.
 * round and byte are parenthesised so expression arguments are safe.
 */
#define __aria_ark_byte(x, t0, rk, round, byte)		\
	vpbroadcastb (((round) * 16) + (byte))(rk), t0;	\
	vpxorq t0, x, x;

/*
 * aria_ark_16way - AddRoundKey for sixteen byte-sliced state registers.
 *
 * Each register is XORed with one byte of the 16-byte round key at
 * rk + round * 16, broadcast to every byte position. The key bytes are
 * taken in reversed order within each group of four: x0..x3 use bytes
 * 3..0, x4..x7 use 7..4, y0..y3 use 11..8 and y4..y7 use 15..12.
 *
 * t0 is clobbered.
 */
#define aria_ark_16way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7,			\
		       t0, rk, round)			\
	/* AddRoundKey */				\
	__aria_ark_byte(x0, t0, rk, round, 3)		\
	__aria_ark_byte(x1, t0, rk, round, 2)		\
	__aria_ark_byte(x2, t0, rk, round, 1)		\
	__aria_ark_byte(x3, t0, rk, round, 0)		\
	__aria_ark_byte(x4, t0, rk, round, 7)		\
	__aria_ark_byte(x5, t0, rk, round, 6)		\
	__aria_ark_byte(x6, t0, rk, round, 5)		\
	__aria_ark_byte(x7, t0, rk, round, 4)		\
	__aria_ark_byte(y0, t0, rk, round, 11)		\
	__aria_ark_byte(y1, t0, rk, round, 10)		\
	__aria_ark_byte(y2, t0, rk, round, 9)		\
	__aria_ark_byte(y3, t0, rk, round, 8)		\
	__aria_ark_byte(y4, t0, rk, round, 15)		\
	__aria_ark_byte(y5, t0, rk, round, 14)		\
	__aria_ark_byte(y6, t0, rk, round, 13)		\
	__aria_ark_byte(y7, t0, rk, round, 12)
306
/*
 * aria_sbox_8way_gfni - substitution layer for eight byte-sliced
 * registers, implemented with GFNI affine instructions.
 *
 *   x0, x4: vgf2p8affineinvqb with the "AES affine" matrix and
 *           tf_aff_const
 *   x1, x5: vgf2p8affineinvqb with the "S2" matrix and tf_s2_const
 *   x2, x6: vgf2p8affineqb with the "AES inverse affine" matrix and
 *           tf_inv_const, then vgf2p8affineinvqb with the identity matrix
 *   x3, x7: vgf2p8affineqb with the "X2" matrix and tf_x2_const, then
 *           vgf2p8affineinvqb with the identity matrix
 *
 * The first five temporaries receive the broadcast bit matrices; the
 * last three are accepted but not used.
 */
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    m_s2, m_inv, m_id, m_aff,	\
			    m_x2, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), m_s2;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), m_inv;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), m_id;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), m_aff;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), m_x2;	\
	vgf2p8affineinvqb $(tf_s2_const), m_s2, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), m_s2, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), m_inv, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), m_inv, x6, x6;	\
	vgf2p8affineinvqb $0, m_id, x2, x2;		\
	vgf2p8affineinvqb $0, m_id, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), m_aff, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), m_aff, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), m_x2, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), m_x2, x7, x7;	\
	vgf2p8affineinvqb $0, m_id, x3, x3;		\
	vgf2p8affineinvqb $0, m_id, x7, x7;
328
/*
 * aria_sbox_16way_gfni - substitution layer for sixteen byte-sliced
 * registers, implemented with GFNI affine instructions. The x and y
 * halves get the same per-position treatment:
 *
 *   x0, x4, y0, y4: vgf2p8affineinvqb with the "AES affine" matrix and
 *                   tf_aff_const
 *   x1, x5, y1, y5: vgf2p8affineinvqb with the "S2" matrix and
 *                   tf_s2_const
 *   x2, x6, y2, y6: vgf2p8affineqb with the "AES inverse affine" matrix
 *                   and tf_inv_const, then vgf2p8affineinvqb with the
 *                   identity matrix
 *   x3, x7, y3, y7: vgf2p8affineqb with the "X2" matrix and tf_x2_const,
 *                   then vgf2p8affineinvqb with the identity matrix
 *
 * The first five temporaries receive the broadcast bit matrices; the
 * last three are accepted but not used.
 */
#define aria_sbox_16way_gfni(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     m_s2, m_inv, m_id, m_aff,	\
			     m_x2, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), m_s2;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), m_inv;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), m_id;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), m_aff;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), m_x2;	\
	/* x half */					\
	vgf2p8affineinvqb $(tf_s2_const), m_s2, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), m_s2, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), m_inv, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), m_inv, x6, x6;	\
	vgf2p8affineinvqb $0, m_id, x2, x2;		\
	vgf2p8affineinvqb $0, m_id, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), m_aff, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), m_aff, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), m_x2, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), m_x2, x7, x7;	\
	vgf2p8affineinvqb $0, m_id, x3, x3;		\
	vgf2p8affineinvqb $0, m_id, x7, x7;		\
	/* y half */					\
	vgf2p8affineinvqb $(tf_s2_const), m_s2, y1, y1;	\
	vgf2p8affineinvqb $(tf_s2_const), m_s2, y5, y5;	\
	vgf2p8affineqb $(tf_inv_const), m_inv, y2, y2;	\
	vgf2p8affineqb $(tf_inv_const), m_inv, y6, y6;	\
	vgf2p8affineinvqb $0, m_id, y2, y2;		\
	vgf2p8affineinvqb $0, m_id, y6, y6;		\
	vgf2p8affineinvqb $(tf_aff_const), m_aff, y0, y0;	\
	vgf2p8affineinvqb $(tf_aff_const), m_aff, y4, y4;	\
	vgf2p8affineqb $(tf_x2_const), m_x2, y3, y3;	\
	vgf2p8affineqb $(tf_x2_const), m_x2, y7, y7;	\
	vgf2p8affineinvqb $0, m_id, y3, y3;		\
	vgf2p8affineinvqb $0, m_id, y7, y7;
364
365
/*
 * aria_diff_m - mix the four byte-sliced registers of one 32-bit word.
 *
 * The original comments describe this as T = rotr32(X, 8); X ^= T;
 * X = T ^ rotr(X, 16). In byte-sliced form the rotations are just a
 * different choice of register, so only XORs are needed. Result:
 *
 *   x0' = x0 ^ x1 ^ x2
 *   x1' = x0 ^ x1 ^ x3
 *   x2' = x0 ^ x2 ^ x3
 *   x3' = x1 ^ x2 ^ x3
 *
 * All four temporaries are clobbered.
 */
#define aria_diff_m(x0, x1, x2, x3,			\
		    t30, t01, t12, t23)			\
	/* pairwise sums of neighbouring bytes */	\
	vpxorq x0, x3, t30;				\
	vpxorq x1, x0, t01;				\
	vpxorq x2, x1, t12;				\
	vpxorq x3, x2, t23;				\
	/* fold in a third byte for every output */	\
	vpxorq t12, x0, x0;				\
	vpxorq x1, t23, t23;				\
	vpxorq t30, x2, x2;				\
	vpxorq t01, x3, x1;				\
	vmovdqu64 t23, x3;
380
/*
 * __aria_xor_word - d ^= s for one byte-sliced word, i.e. four registers
 * at a time (d0 ^= s0, ..., d3 ^= s3).
 */
#define __aria_xor_word(s0, s1, s2, s3,			\
			d0, d1, d2, d3)			\
	vpxorq s0, d0, d0;				\
	vpxorq s1, d1, d1;				\
	vpxorq s2, d2, d2;				\
	vpxorq s3, d3, d3;

/*
 * aria_diff_word - word-level mixing of the four byte-sliced words
 * T0 = x0..x3, T1 = x4..x7, T2 = y0..y3, T3 = y4..y7, done in place as a
 * fixed sequence of six word XORs. No temporaries are needed.
 */
#define aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7)			\
	/* T1 ^= T2 */					\
	__aria_xor_word(y0, y1, y2, y3, x4, x5, x6, x7)	\
	/* T2 ^= T3 */					\
	__aria_xor_word(y4, y5, y6, y7, y0, y1, y2, y3)	\
	/* T0 ^= T1 */					\
	__aria_xor_word(x4, x5, x6, x7, x0, x1, x2, x3)	\
	/* T3 ^= T1 */					\
	__aria_xor_word(x4, x5, x6, x7, y4, y5, y6, y7)	\
	/* T2 ^= T0 */					\
	__aria_xor_word(x0, x1, x2, x3, y0, y1, y2, y3)	\
	/* T1 ^= T2 */					\
	__aria_xor_word(y0, y1, y2, y3, x4, x5, x6, x7)
420
/*
 * aria_fe_gfni - one full round on sixteen byte-sliced registers:
 *   1. aria_ark_16way with round-key index "round" (z0 is the scratch)
 *   2. aria_sbox_16way_gfni with the registers of every group passed as
 *      (2, 3, 0, 1), which swaps the roles the S-box macro gives to each
 *      pair compared with aria_fo_gfni
 *   3. aria_diff_m on each of the four words (z0..z3 are scratch)
 *   4. aria_diff_word, a byte permutation expressed purely through the
 *      argument order, and aria_diff_word again
 *
 * mem_tmp is accepted for symmetry with the callers but is not used.
 * The final statement carries no line continuation, so the macro ends
 * exactly here and cannot absorb the line that follows it.
 */
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     z0, z1, z2, z3,			\
		     z4, z5, z6, z7,			\
		     mem_tmp, rk, round)		\
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7,	\
		       z0, rk, round);			\
							\
	aria_sbox_16way_gfni(x2, x3, x0, x1,		\
			     x6, x7, x4, x5,		\
			     y2, y3, y0, y1,		\
			     y6, y7, y4, y5,		\
			     z0, z1, z2, z3,		\
			     z4, z5, z6, z7);		\
							\
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);	\
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);	\
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);	\
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);	\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);
459
460
/*
 * aria_fo_gfni - one full round on sixteen byte-sliced registers:
 *   1. aria_ark_16way with round-key index "round" (z0 is the scratch)
 *   2. aria_sbox_16way_gfni with the registers in their natural order
 *   3. aria_diff_m on each of the four words (z0..z3 are scratch)
 *   4. aria_diff_word, a byte permutation expressed purely through the
 *      argument order, and aria_diff_word again
 *
 * mem_tmp is accepted for symmetry with the callers but is not used.
 */
#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     z0, z1, z2, z3,			\
		     z4, z5, z6, z7,			\
		     mem_tmp, rk, round)		\
	aria_ark_16way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7,			\
		       z0, rk, round);			\
							\
	aria_sbox_16way_gfni(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     z0, z1, z2, z3,		\
			     z4, z5, z6, z7);		\
							\
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);	\
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);	\
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);	\
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);	\
							\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte() via argument order:		\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);
499
/*
 * aria_ff_gfni - final round on sixteen byte-sliced registers:
 *   1. aria_ark_16way with round-key index "round"
 *   2. aria_sbox_16way_gfni with the registers of every group passed as
 *      (2, 3, 0, 1), as in aria_fe_gfni
 *   3. aria_ark_16way with round-key index "last_round"
 *
 * There is no diffusion step. z0 is scratch for both key additions;
 * mem_tmp is accepted but not used.
 */
#define aria_ff_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     z0, z1, z2, z3,			\
		     z4, z5, z6, z7,			\
		     mem_tmp, rk, round, last_round)	\
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7,	\
		       z0, rk, round);			\
							\
	aria_sbox_16way_gfni(x2, x3, x0, x1,		\
			     x6, x7, x4, x5,		\
			     y2, y3, y0, y1,		\
			     y6, y7, y4, y5,		\
			     z0, z1, z2, z3,		\
			     z4, z5, z6, z7);		\
							\
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7,	\
		       z0, rk, last_round);
523
524
/*
 * Four 128-bit values 0, 1, 2, 3 (low qword first, high qword zero), one
 * per 128-bit lane of a zmm register. The CTR code adds this to the
 * broadcast counter to form counters +0, +1, +2, +3. Loaded with an
 * aligned 64-byte load, hence the 64-byte alignment.
 */
525.section        .rodata.cst64, "aM", @progbits, 64
526.align 64
527.Lcounter0123_lo:
528	.quad 0, 0
529	.quad 1, 0
530	.quad 2, 0
531	.quad 3, 0
532
/*
 * vpshufb mask used by byteslice_16x16b / debyteslice_16x16b.
 * SHUFB_BYTES(idx) expands to the four indices idx, idx + 4, idx + 8,
 * idx + 12, so one 16-byte row is 0,4,8,12, 1,5,9,13, 2,6,10,14,
 * 3,7,11,15. The row is stored twice (32 bytes); the users in this file
 * broadcast a single 16-byte row with vbroadcasti64x2.
 */
533.section        .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
534.align 32
535#define SHUFB_BYTES(idx) \
536	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
537.Lshufb_16x16b:
538	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
539	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
540
/*
 * 16-byte constants for CTR mode; each is broadcast to all four 128-bit
 * lanes with vbroadcasti64x2. The *_lo values are counter increments in
 * the low qword (4, 8, 16); .Lcounter1111_hi is a 1 in the high qword,
 * used as the carry term of add_le128.
 */
541.section	.rodata.cst16, "aM", @progbits, 16
542.align 16
543
544.Lcounter4444_lo:
545	.quad 4, 0
546.Lcounter8888_lo:
547	.quad 8, 0
548.Lcounter16161616_lo:
549	.quad 16, 0
550.Lcounter1111_hi:
551	.quad 0, 1
552
553/* For CTR-mode IV byteswap */
/* vpshufb mask that reverses the 16 bytes of every 128-bit lane */
554.Lbswap128_mask:
555	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
556	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
557
/*
 * 8x8 bit matrices (one .quad each) and 8-bit constants for the GFNI
 * affine instructions used by aria_sbox_*_gfni. Every BV8() row lists
 * bit 0 first; BM8X8() places its first row in the most significant byte
 * of the quadword. Each matrix is broadcast with vpbroadcastq.
 */
558.section	.rodata.cst8, "aM", @progbits, 8
559.align 8
560/* AES affine: */
/* used with vgf2p8affineinvqb on x0/x4 (and y0/y4) in aria_sbox_*_gfni */
561#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
562.Ltf_aff_bitmatrix:
563	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
564		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
565		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
566		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
567		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
568		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
569		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
570		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
571
572/* AES inverse affine: */
/*
 * used with vgf2p8affineqb on x2/x6 (and y2/y6) in aria_sbox_*_gfni,
 * followed by vgf2p8affineinvqb with the identity matrix
 */
573#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
574.Ltf_inv_bitmatrix:
575	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
576		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
577		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
578		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
579		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
580		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
581		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
582		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
583
584/* S2: */
/* used with vgf2p8affineinvqb on x1/x5 (and y1/y5) in aria_sbox_*_gfni */
585#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
586.Ltf_s2_bitmatrix:
587	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
588		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
589		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
590		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
591		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
592		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
593		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
594		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
595
596/* X2: */
/*
 * used with vgf2p8affineqb on x3/x7 (and y3/y7) in aria_sbox_*_gfni,
 * followed by vgf2p8affineinvqb with the identity matrix
 */
597#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
598.Ltf_x2_bitmatrix:
599	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
600		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
601		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
602		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
603		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
604		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
605		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
606		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
607
608/* Identity matrix: */
/*
 * used with vgf2p8affineinvqb and an immediate of 0 as the second step on
 * the x2/x6 and x3/x7 (y2/y6, y3/y7) registers in aria_sbox_*_gfni
 */
609.Ltf_id_bitmatrix:
610	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
611		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
612		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
613		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
614		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
615		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
616		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
617		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
618
/*
 * __aria_gfni_avx512_crypt_64way - run the ARIA rounds over the blocks
 * held in %zmm0..%zmm15, using the round keys at %r9.
 *
 * The registers are byte-sliced by inpack16_post, which also stores the
 * sliced state to dst (%rax) and dst + 8 * 64 (%r8). Rounds then
 * alternate between aria_fo_gfni (round-key indices 0, 2, 4, ...) and
 * aria_fe_gfni (1, 3, 5, ...). ARIA_CTX_rounds(CTX) decides where the
 * closing aria_ff_gfni happens:
 *   rounds == 12: ff with round keys 11 and 12
 *   rounds == 14: ff with round keys 13 and 14
 *   otherwise:    ff with round keys 15 and 16
 * Finally debyteslice_16x16b converts the state back, using the first
 * 64 bytes at %rax and at %r8 as scratch.
 *
 * The result stays in registers; the callers in this file store it in the
 * order %zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5, %zmm9,
 * %zmm8, %zmm11, %zmm10, %zmm12..%zmm15. On return %rax holds dst.
 * %zmm24..%zmm31 are handed to the round macros as temporaries.
 */
619.text
620SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
621	/* input:
	 *      %rdi: ctx, CTX (only ARIA_CTX_rounds is read here)
622	 *      %r9: rk
623	 *      %rsi: dst
624	 *      %rdx: src
	 *            (not read by this routine)
625	 *      %zmm0..%zmm15: byte-sliced blocks
	 *            (as passed by the caller; the byte-slicing itself is
	 *             done below by inpack16_post)
626	 */
627
628	FRAME_BEGIN
629
	/* %rax = dst, %r8 = second half of dst (eight vectors further) */
630	movq %rsi, %rax;
631	leaq 8 * 64(%rax), %r8;
632
	/* byteslice the state; dst doubles as scratch/backing storage */
633	inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
634		      %zmm4, %zmm5, %zmm6, %zmm7,
635		      %zmm8, %zmm9, %zmm10, %zmm11,
636		      %zmm12, %zmm13, %zmm14,
637		      %zmm15, %rax, %r8);
	/* round-key indices 0..10: fo on even indices, fe on odd ones */
638	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
639		     %zmm4, %zmm5, %zmm6, %zmm7,
640		     %zmm8, %zmm9, %zmm10, %zmm11,
641		     %zmm12, %zmm13, %zmm14, %zmm15,
642		     %zmm24, %zmm25, %zmm26, %zmm27,
643		     %zmm28, %zmm29, %zmm30, %zmm31,
644		     %rax, %r9, 0);
645	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
646		     %zmm6, %zmm7, %zmm4, %zmm5,
647		     %zmm9, %zmm8, %zmm11, %zmm10,
648		     %zmm12, %zmm13, %zmm14, %zmm15,
649		     %zmm24, %zmm25, %zmm26, %zmm27,
650		     %zmm28, %zmm29, %zmm30, %zmm31,
651		     %rax, %r9, 1);
652	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
653		     %zmm4, %zmm5, %zmm6, %zmm7,
654		     %zmm8, %zmm9, %zmm10, %zmm11,
655		     %zmm12, %zmm13, %zmm14, %zmm15,
656		     %zmm24, %zmm25, %zmm26, %zmm27,
657		     %zmm28, %zmm29, %zmm30, %zmm31,
658		     %rax, %r9, 2);
659	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
660		     %zmm6, %zmm7, %zmm4, %zmm5,
661		     %zmm9, %zmm8, %zmm11, %zmm10,
662		     %zmm12, %zmm13, %zmm14, %zmm15,
663		     %zmm24, %zmm25, %zmm26, %zmm27,
664		     %zmm28, %zmm29, %zmm30, %zmm31,
665		     %rax, %r9, 3);
666	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
667		     %zmm4, %zmm5, %zmm6, %zmm7,
668		     %zmm8, %zmm9, %zmm10, %zmm11,
669		     %zmm12, %zmm13, %zmm14, %zmm15,
670		     %zmm24, %zmm25, %zmm26, %zmm27,
671		     %zmm28, %zmm29, %zmm30, %zmm31,
672		     %rax, %r9, 4);
673	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
674		     %zmm6, %zmm7, %zmm4, %zmm5,
675		     %zmm9, %zmm8, %zmm11, %zmm10,
676		     %zmm12, %zmm13, %zmm14, %zmm15,
677		     %zmm24, %zmm25, %zmm26, %zmm27,
678		     %zmm28, %zmm29, %zmm30, %zmm31,
679		     %rax, %r9, 5);
680	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
681		     %zmm4, %zmm5, %zmm6, %zmm7,
682		     %zmm8, %zmm9, %zmm10, %zmm11,
683		     %zmm12, %zmm13, %zmm14, %zmm15,
684		     %zmm24, %zmm25, %zmm26, %zmm27,
685		     %zmm28, %zmm29, %zmm30, %zmm31,
686		     %rax, %r9, 6);
687	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
688		     %zmm6, %zmm7, %zmm4, %zmm5,
689		     %zmm9, %zmm8, %zmm11, %zmm10,
690		     %zmm12, %zmm13, %zmm14, %zmm15,
691		     %zmm24, %zmm25, %zmm26, %zmm27,
692		     %zmm28, %zmm29, %zmm30, %zmm31,
693		     %rax, %r9, 7);
694	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
695		     %zmm4, %zmm5, %zmm6, %zmm7,
696		     %zmm8, %zmm9, %zmm10, %zmm11,
697		     %zmm12, %zmm13, %zmm14, %zmm15,
698		     %zmm24, %zmm25, %zmm26, %zmm27,
699		     %zmm28, %zmm29, %zmm30, %zmm31,
700		     %rax, %r9, 8);
701	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
702		     %zmm6, %zmm7, %zmm4, %zmm5,
703		     %zmm9, %zmm8, %zmm11, %zmm10,
704		     %zmm12, %zmm13, %zmm14, %zmm15,
705		     %zmm24, %zmm25, %zmm26, %zmm27,
706		     %zmm28, %zmm29, %zmm30, %zmm31,
707		     %rax, %r9, 9);
708	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
709		     %zmm4, %zmm5, %zmm6, %zmm7,
710		     %zmm8, %zmm9, %zmm10, %zmm11,
711		     %zmm12, %zmm13, %zmm14, %zmm15,
712		     %zmm24, %zmm25, %zmm26, %zmm27,
713		     %zmm28, %zmm29, %zmm30, %zmm31,
714		     %rax, %r9, 10);
	/* 12 rounds: finish with round keys 11 and 12, else keep going */
715	cmpl $12, ARIA_CTX_rounds(CTX);
716	jne .Laria_gfni_192;
717	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
718		     %zmm6, %zmm7, %zmm4, %zmm5,
719		     %zmm9, %zmm8, %zmm11, %zmm10,
720		     %zmm12, %zmm13, %zmm14, %zmm15,
721		     %zmm24, %zmm25, %zmm26, %zmm27,
722		     %zmm28, %zmm29, %zmm30, %zmm31,
723		     %rax, %r9, 11, 12);
724	jmp .Laria_gfni_end;
725.Laria_gfni_192:
726	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
727		     %zmm6, %zmm7, %zmm4, %zmm5,
728		     %zmm9, %zmm8, %zmm11, %zmm10,
729		     %zmm12, %zmm13, %zmm14, %zmm15,
730		     %zmm24, %zmm25, %zmm26, %zmm27,
731		     %zmm28, %zmm29, %zmm30, %zmm31,
732		     %rax, %r9, 11);
733	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
734		     %zmm4, %zmm5, %zmm6, %zmm7,
735		     %zmm8, %zmm9, %zmm10, %zmm11,
736		     %zmm12, %zmm13, %zmm14, %zmm15,
737		     %zmm24, %zmm25, %zmm26, %zmm27,
738		     %zmm28, %zmm29, %zmm30, %zmm31,
739		     %rax, %r9, 12);
	/* 14 rounds: finish with round keys 13 and 14, else keep going */
740	cmpl $14, ARIA_CTX_rounds(CTX);
741	jne .Laria_gfni_256;
742	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
743		     %zmm6, %zmm7, %zmm4, %zmm5,
744		     %zmm9, %zmm8, %zmm11, %zmm10,
745		     %zmm12, %zmm13, %zmm14, %zmm15,
746		     %zmm24, %zmm25, %zmm26, %zmm27,
747		     %zmm28, %zmm29, %zmm30, %zmm31,
748		     %rax, %r9, 13, 14);
749	jmp .Laria_gfni_end;
750.Laria_gfni_256:
751	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
752		     %zmm6, %zmm7, %zmm4, %zmm5,
753		     %zmm9, %zmm8, %zmm11, %zmm10,
754		     %zmm12, %zmm13, %zmm14, %zmm15,
755		     %zmm24, %zmm25, %zmm26, %zmm27,
756		     %zmm28, %zmm29, %zmm30, %zmm31,
757		     %rax, %r9, 13);
758	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
759		     %zmm4, %zmm5, %zmm6, %zmm7,
760		     %zmm8, %zmm9, %zmm10, %zmm11,
761		     %zmm12, %zmm13, %zmm14, %zmm15,
762		     %zmm24, %zmm25, %zmm26, %zmm27,
763		     %zmm28, %zmm29, %zmm30, %zmm31,
764		     %rax, %r9, 14);
	/* remaining case: finish with round keys 15 and 16 */
765	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
766		     %zmm6, %zmm7, %zmm4, %zmm5,
767		     %zmm9, %zmm8, %zmm11, %zmm10,
768		     %zmm12, %zmm13, %zmm14, %zmm15,
769		     %zmm24, %zmm25, %zmm26, %zmm27,
770		     %zmm28, %zmm29, %zmm30, %zmm31,
771		     %rax, %r9, 15, 16);
772.Laria_gfni_end:
	/* undo the byte-slicing; (%rax) and (%r8) are the two spill slots */
773	debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
774			   %zmm8, %zmm13, %zmm2, %zmm7,
775			   %zmm11, %zmm14, %zmm1, %zmm4,
776			   %zmm10, %zmm15, %zmm0, %zmm5,
777			   (%rax), (%r8));
778	FRAME_END
779	RET;
780SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)
781
/*
 * aria_gfni_avx512_encrypt_64way - encrypt sixteen 64-byte vectors of
 * input from src into dst with the encryption round keys of ctx.
 *
 * input:
 *      %rdi: ctx, CTX
 *      %rsi: dst
 *      %rdx: src
 */
SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
	FRAME_BEGIN

	/* load the input into %zmm0..%zmm15 */
	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %rdx);

	/* %r9 = encryption round keys for the common round routine */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_gfni_avx512_crypt_64way;

	/* the callee leaves dst in %rax and the result in this order */
	write_output(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)
806
/*
 * aria_gfni_avx512_decrypt_64way - decrypt sixteen 64-byte vectors of
 * input from src into dst with the decryption round keys of ctx.
 *
 * input:
 *      %rdi: ctx, CTX
 *      %rsi: dst
 *      %rdx: src
 */
SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
	FRAME_BEGIN

	/* load the input into %zmm0..%zmm15 */
	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %rdx);

	/* %r9 = decryption round keys for the common round routine */
	leaq ARIA_CTX_dec_key(CTX), %r9;

	call __aria_gfni_avx512_crypt_64way;

	/* the callee leaves dst in %rax and the result in this order */
	write_output(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)
831
/*
 * __aria_gfni_avx512_ctr_gen_keystream_64way - build 64 consecutive
 * 128-bit counter blocks from the big-endian IV at %r8.
 *
 * On return %zmm0..%zmm15 hold counters IV+0 .. IV+63 (four per register,
 * byte-swapped back to the IV's byte order) and the IV in memory has been
 * advanced by 64. Of the registers listed below only %r8 is read here;
 * %r10, %r11, %k1 and %zmm19..%zmm25 are used as scratch.
 */
832SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
833	/* input:
834	 *      %rdi: ctx
835	 *      %rsi: dst
836	 *      %rdx: src
837	 *      %rcx: keystream
838	 *      %r8: iv (big endian, 128bit)
839	 */
840
841	FRAME_BEGIN
842
	/* %zmm19 = byte-swap mask, %zmm21 = +0..+3, %zmm22/23/24 = +4/+8/+16, */
	/* %zmm25 = carry into the high qword */
843	vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
844	vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
845	vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
846	vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
847	vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
848	vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;
849
850	/* load IV and byteswap */
	/* scalar copy: %r11 = low 64 bits, %r10 = high 64 bits of the counter; */
	/* vector copy: %zmm20 = IV in all four lanes, byte-reversed */
851	movq 8(%r8), %r11;
852	movq (%r8), %r10;
853	bswapq %r11;
854	bswapq %r10;
855	vbroadcasti64x2 (%r8), %zmm20;
856	vpshufb %zmm19, %zmm20, %zmm20;
857
858	/* check need for handling 64-bit overflow and carry */
	/* the slow path is taken when the low qword is above 2^64 - 1 - 64 */
859	cmpq $(0xffffffffffffffff - 64), %r11;
860	ja .Lload_ctr_carry;
861
862	/* construct IVs */
	/* fast path: plain qword adds, no carry can reach the high qword */
863	vpaddq %zmm21, %zmm20, %zmm0;  /* +0:+1:+2:+3 */
864	vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
865	vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
866	vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
867	vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
868	vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
869	vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
870	vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
871	vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
872	vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
873	vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
874	vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
875	vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
876	vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
877	vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
878	vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
879	jmp .Lload_ctr_done;
880
881.Lload_ctr_carry:
882	/* construct IVs */
	/* slow path: same additions, with add_le128 propagating the carry */
883	add_le128(%zmm0, %zmm20, %zmm21, %zmm25);  /* +0:+1:+2:+3 */
884	add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
885	add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
886	add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
887	add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
888	add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
889	add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
890	add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
891	add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
892	add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
893	add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
894	add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
895	add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
896	add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
897	add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
898	add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */
899
900.Lload_ctr_done:
901	/* Byte-swap IVs and update counter. */
	/* scalar counter += 64 with carry into the high half; the scalar */
	/* byte swaps are interleaved with the vector ones */
902	addq $64, %r11;
903	adcq $0, %r10;
904	vpshufb %zmm19, %zmm15, %zmm15;
905	vpshufb %zmm19, %zmm14, %zmm14;
906	vpshufb %zmm19, %zmm13, %zmm13;
907	vpshufb %zmm19, %zmm12, %zmm12;
908	vpshufb %zmm19, %zmm11, %zmm11;
909	vpshufb %zmm19, %zmm10, %zmm10;
910	vpshufb %zmm19, %zmm9, %zmm9;
911	vpshufb %zmm19, %zmm8, %zmm8;
912	bswapq %r11;
913	bswapq %r10;
914	vpshufb %zmm19, %zmm7, %zmm7;
915	vpshufb %zmm19, %zmm6, %zmm6;
916	vpshufb %zmm19, %zmm5, %zmm5;
917	vpshufb %zmm19, %zmm4, %zmm4;
918	vpshufb %zmm19, %zmm3, %zmm3;
919	vpshufb %zmm19, %zmm2, %zmm2;
920	vpshufb %zmm19, %zmm1, %zmm1;
921	vpshufb %zmm19, %zmm0, %zmm0;
	/* write the advanced IV back in its original byte order */
922	movq %r11, 8(%r8);
923	movq %r10, (%r8);
924
925	FRAME_END
926	RET;
927SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)
928
/*
 * aria_gfni_avx512_ctr_crypt_64way - CTR mode over sixteen 64-byte
 * vectors: generate the counter blocks, encrypt them with the encryption
 * round keys and XOR the result with src into dst. The IV at %r8 is
 * advanced by the keystream generator.
 *
 * input:
 *      %rdi: ctx
 *      %rsi: dst
 *      %rdx: src
 *      %rcx: keystream
 *      %r8: iv (big endian, 128bit)
 */
SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
	FRAME_BEGIN

	/* %zmm0..%zmm15 = counter blocks; uses %r10/%r11 as scratch */
	call __aria_gfni_avx512_ctr_gen_keystream_64way

	/*
	 * Keep the real dst/src in %r10/%r11 and point the round routine's
	 * dst and src at the keystream buffer, which it uses as its
	 * temporary storage.
	 */
	movq %rsi, %r10;
	movq %rdx, %r11;
	movq %rcx, %rsi;
	movq %rcx, %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_gfni_avx512_crypt_64way;

	/* XOR the encrypted counters with the source data */
	vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
	vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
	vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
	vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
	vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
	vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
	vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
	vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
	vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
	vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
	vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
	vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
	vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
	vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
	vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
	vpxorq (15 * 64)(%r11), %zmm15, %zmm15;

	/* store to the caller's dst saved in %r10 */
	write_output(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)
972