/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized,
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define rRIP         (%rip)

#define RX0          %xmm0
#define RX1          %xmm1
#define MASK_4BIT    %xmm2
#define RTMP0        %xmm3
#define RTMP1        %xmm4
#define RTMP2        %xmm5
#define RTMP3        %xmm6
#define RTMP4        %xmm7

#define RA0          %xmm8
#define RA1          %xmm9
#define RA2          %xmm10
#define RA3          %xmm11

#define RB0          %xmm12
#define RB1          %xmm13
#define RB2          %xmm14
#define RB3          %xmm15

#define RNOT         %xmm0
#define RBSWAP       %xmm1


/* Transpose four 32-bit words between 128-bit vectors. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2;                \
	vpunpckldq x1, x0, x0;                \
	                                      \
	vpunpckldq x3, x2, t1;                \
	vpunpckhdq x3, x2, x2;                \
	                                      \
	vpunpckhqdq t1, x0, x1;               \
	vpunpcklqdq t1, x0, x0;               \
	                                      \
	vpunpckhqdq x2, t2, x3;               \
	vpunpcklqdq x2, t2, x2;
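
/*
 * Reference sketch (not part of the build): transpose_4x4 above turns
 * four registers that each hold one 128-bit block into four registers
 * that each hold the same 32-bit word of all four blocks, so one SM4
 * word operation can be applied to four blocks at once.  Roughly, in C:
 *
 *	// in[i][j] is word j of block i; out[j][i] is the transposed view
 *	static void transpose_4x4_ref(const u32 in[4][4], u32 out[4][4])
 *	{
 *		int i, j;
 *
 *		for (i = 0; i < 4; i++)
 *			for (j = 0; j < 4; j++)
 *				out[j][i] = in[i][j];
 *	}
 */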

/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0;                     \
	vpandn x, mask4bit, x;                       \
	vpsrld $4, x, x;                             \
	                                             \
	vpshufb tmp0, lo_t, tmp0;                    \
	vpshufb x, hi_t, x;                          \
	vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has already been XOR'ed with mask4bit
 * by the 'vaesenclast' instruction.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandn mask4bit, x, tmp0;                     \
	vpsrld $4, x, x;                              \
	vpand x, mask4bit, x;                         \
	                                              \
	vpshufb tmp0, lo_t, tmp0;                     \
	vpshufb x, hi_t, x;                           \
	vpxor tmp0, x, x;
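
/*
 * Reference sketch (not part of the build): transform_pre/transform_post
 * apply a byte-wise affine transform by splitting every byte into its
 * low and high nibble and using vpshufb as two 16-entry table lookups
 * whose results are XOR'ed together.  For one byte this is roughly:
 *
 *	static u8 nibble_lut_ref(u8 x, const u8 lo_t[16], const u8 hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 *
 * transform_post additionally has to account for x already being XOR'ed
 * with mask4bit by vaesenclast, hence the vpandn on the low nibble.
 */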


.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * The following four affine transform look-up tables are from the work
 * of Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow the SM4 S-Box to be computed with the AES SubBytes
 * operation, by mapping between the SM4 and AES finite fields.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
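
/*
 * Note (added for clarity): AESENCLAST performs ShiftRows before
 * SubBytes, so the tables above are vpshufb index vectors that undo
 * that ShiftRows afterwards.  The *_rol_* variants fold an extra
 * rotation of every 32-bit word into the same shuffle, since rotating a
 * word by a multiple of 8 bits is itself a fixed byte permutation,
 * e.g. rol32(x, 8) = (x << 8) | (x >> 24) only moves whole bytes.  This
 * is how the rotations needed by SM4's linear transform are obtained
 * without separate shift instructions for rol(x,8)/rol(x,16)/rol(x,24).
 */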

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef


.text
.align 16

/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt4)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..4 blocks)
	 *	%rdx: src (1..4 blocks)
	 *	%rcx: num blocks (1..4)
	 */
	FRAME_BEGIN

	vmovdqu 0*16(%rdx), RA0;
	vmovdqa RA0, RA1;
	vmovdqa RA0, RA2;
	vmovdqa RA0, RA3;
	cmpq $2, %rcx;
	jb .Lblk4_load_input_done;
	vmovdqu 1*16(%rdx), RA1;
	je .Lblk4_load_input_done;
	vmovdqu 2*16(%rdx), RA2;
	cmpq $3, %rcx;
	je .Lblk4_load_input_done;
	vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
	vmovdqa .Lpre_tf_hi_s rRIP, RB0;
	vmovdqa .Lpost_tf_lo_s rRIP, RB1;
	vmovdqa .Lpost_tf_hi_s rRIP, RB2;
	vmovdqa .Linv_shift_row rRIP, RB3;
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3)                                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	                                                            \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0);           \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0);            \
	                                                            \
	/* linear part */                                           \
	vpshufb RB3, RX0, RTMP0;                                    \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP2, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP3, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1;            \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0;
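
/*
 * Reference sketch (not part of the build): the ROUND macro above is one
 * SM4 round, vectorized over four blocks.  For a single 32-bit lane it
 * corresponds roughly to the following, where sbox() stands for the
 * byte-wise SM4 S-Box that the AES-NI path derives from vaesenclast and
 * rol32() is a 32-bit rotate left:
 *
 *	static u32 sm4_round_ref(u32 s0, u32 s1, u32 s2, u32 s3, u32 rk)
 *	{
 *		u32 x = s1 ^ s2 ^ s3 ^ rk;
 *
 *		x = sbox(x);				// non-linear part
 *		return s0 ^ x ^ rol32(x, 2) ^		// linear transform L
 *		       rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 *	}
 */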

	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk4:
	ROUND(0, RA0, RA1, RA2, RA3);
	ROUND(1, RA1, RA2, RA3, RA0);
	ROUND(2, RA2, RA3, RA0, RA1);
	ROUND(3, RA3, RA0, RA1, RA2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk4;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vmovdqu RA0, 0*16(%rsi);
	cmpq $2, %rcx;
	jb .Lblk4_store_output_done;
	vmovdqu RA1, 1*16(%rsi);
	je .Lblk4_store_output_done;
	vmovdqu RA2, 2*16(%rsi);
	cmpq $3, %rcx;
	je .Lblk4_store_output_done;
	vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)
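
/*
 * Usage sketch (assumption, not taken from the actual kernel glue code):
 * a caller would typically drive the variable-count entry point above
 * roughly like
 *
 *	while (nblocks) {
 *		int n = nblocks > 4 ? 4 : nblocks;
 *
 *		sm4_aesni_avx_crypt4(rk, dst, src, n);
 *		dst += n * 16;		// 16 = SM4 block size
 *		src += n * 16;
 *		nblocks -= n;
 *	}
 *
 * where rk is the expanded 32-word round key schedule.
 */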

.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						ciphertext blocks
	 */
	FRAME_BEGIN

	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;                          \
	vmovdqa .Lpre_tf_hi_s rRIP, RTMP1;                          \
	vmovdqa RX0, RX1;                                           \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	vmovdqa .Lpost_tf_lo_s rRIP, RTMP2;                         \
	vmovdqa .Lpost_tf_hi_s rRIP, RTMP3;                         \
	vpxor r1, RX1, RX1;                                         \
	vpxor r2, RX1, RX1;                                         \
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */                 \
                                                                    \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	vmovdqa .Linv_shift_row rRIP, RTMP4;                        \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	vaesenclast MASK_4BIT, RX1, RX1;                            \
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
                                                                    \
	/* linear part */                                           \
	vpshufb RTMP4, RX0, RTMP0;                                  \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP4, RX1, RTMP2;                                  \
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4;                  \
	vpxor RTMP2, r0, r0; /* r0 ^ x */                           \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	vpxor RTMP1, s0, s0;                                        \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */               \
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP2, RTMP3;                                    \
	vpsrld $30, RTMP2, RTMP2;                                   \
	vpxor RTMP2, r0, r0;                                        \
	vpxor RTMP3, r0, r0;

	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk8)

/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..8 blocks)
	 *	%rdx: src (1..8 blocks)
	 *	%rcx: num blocks (1..8)
	 */
	cmpq $5, %rcx;
	jb sm4_aesni_avx_crypt4;

	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqa RB0, RB1;
	vmovdqa RB0, RB2;
	vmovdqa RB0, RB3;
	je .Lblk8_load_input_done;
	vmovdqu (5 * 16)(%rdx), RB1;
	cmpq $7, %rcx;
	jb .Lblk8_load_input_done;
	vmovdqu (6 * 16)(%rdx), RB2;
	je .Lblk8_load_input_done;
	vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
	call __sm4_crypt_blk8;

	cmpq $6, %rcx;
	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	jb .Lblk8_store_output_done;
	vmovdqu RB1, (5 * 16)(%rsi);
	je .Lblk8_store_output_done;
	vmovdqu RB2, (6 * 16)(%rsi);
	cmpq $7, %rcx;
	je .Lblk8_store_output_done;
	vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)

/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	/* load IV and byteswap */
	vmovdqu (%rcx), RA0;

	vmovdqa .Lbswap128_mask rRIP, RBSWAP;
	vpshufb RBSWAP, RA0, RTMP0; /* be => le */

	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp;  \
	vpsubq minus_one, x, x;      \
	vpslldq $8, tmp, tmp;        \
	vpsubq tmp, x, x;
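
/*
 * Reference sketch (not part of the build): inc_le128 adds one to a
 * 128-bit little-endian value held in an xmm register.  The first
 * vpsubq subtracts the {-1, 0} constant, i.e. adds one to the low
 * 64-bit half; the vpcmpeqq/vpslldq/vpsubq sequence then adds the carry
 * to the high half only when the low half was all-ones.  In C terms:
 *
 *	static void inc_le128_ref(u64 *lo, u64 *hi)
 *	{
 *		if (++(*lo) == 0)	// low half wrapped, carry out
 *			++(*hi);
 *	}
 */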

	/* construct IVs */
	inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
	vpshufb RBSWAP, RTMP0, RA1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
	vpshufb RBSWAP, RTMP0, RA2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
	vpshufb RBSWAP, RTMP0, RA3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
	vpshufb RBSWAP, RTMP0, RB0;
	inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
	vpshufb RBSWAP, RTMP0, RB1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
	vpshufb RBSWAP, RTMP0, RB2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
	vpshufb RBSWAP, RTMP0, RB3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
	vpshufb RBSWAP, RTMP0, RTMP1;

	/* store new IV */
	vmovdqu RTMP1, (%rcx);

	call __sm4_crypt_blk8;

	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
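
/*
 * Reference note (added for clarity): per block, the CTR routine above
 * computes
 *
 *	dst[i] = src[i] ^ E_k(ctr + i)		for i = 0..7
 *
 * where ctr is the 128-bit big-endian counter passed in as the IV, and
 * ctr + 8 is written back so the next call continues the keystream.
 */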

/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqu (5 * 16)(%rdx), RB1;
	vmovdqu (6 * 16)(%rdx), RB2;
	vmovdqu (7 * 16)(%rdx), RB3;

	call __sm4_crypt_blk8;

	vmovdqu (7 * 16)(%rdx), RNOT;
	vpxor (%rcx), RA0, RA0;
	vpxor (0 * 16)(%rdx), RA1, RA1;
	vpxor (1 * 16)(%rdx), RA2, RA2;
	vpxor (2 * 16)(%rdx), RA3, RA3;
	vpxor (3 * 16)(%rdx), RB0, RB0;
	vpxor (4 * 16)(%rdx), RB1, RB1;
	vpxor (5 * 16)(%rdx), RB2, RB2;
	vpxor (6 * 16)(%rdx), RB3, RB3;
	vmovdqu RNOT, (%rcx); /* store new IV */

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
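
/*
 * Reference note (added for clarity): CBC decryption of block i is
 *
 *	P[i] = D_k(C[i]) ^ C[i-1],  with C[-1] = IV,
 *
 * which is why the routine above decrypts all eight ciphertext blocks
 * first and only then XORs each result with the preceding ciphertext
 * block, storing C[7] as the IV for the next call.  SM4 decryption is
 * the encryption routine run with the round keys in reverse order, so
 * __sm4_crypt_blk8 serves both directions depending on the key schedule
 * passed in %rdi.
 */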

/*
 * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	/* Load input */
	vmovdqu (%rcx), RA0;
	vmovdqu 0 * 16(%rdx), RA1;
	vmovdqu 1 * 16(%rdx), RA2;
	vmovdqu 2 * 16(%rdx), RA3;
	vmovdqu 3 * 16(%rdx), RB0;
	vmovdqu 4 * 16(%rdx), RB1;
	vmovdqu 5 * 16(%rdx), RB2;
	vmovdqu 6 * 16(%rdx), RB3;

	/* Update IV */
	vmovdqu 7 * 16(%rdx), RNOT;
	vmovdqu RNOT, (%rcx);

	call __sm4_crypt_blk8;

	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)
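
/*
 * Reference note (added for clarity): CFB decryption of block i is
 *
 *	P[i] = E_k(C[i-1]) ^ C[i],  with C[-1] = IV,
 *
 * so the routine above runs the IV and C[0..6] through the block
 * encryption path, XORs the results with C[0..7], and stores C[7] as
 * the IV for the next call.
 */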