/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized,
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define rRIP         (%rip)

#define RX0          %xmm0
#define RX1          %xmm1
#define MASK_4BIT    %xmm2
#define RTMP0        %xmm3
#define RTMP1        %xmm4
#define RTMP2        %xmm5
#define RTMP3        %xmm6
#define RTMP4        %xmm7

#define RA0          %xmm8
#define RA1          %xmm9
#define RA2          %xmm10
#define RA3          %xmm11

#define RB0          %xmm12
#define RB1          %xmm13
#define RB2          %xmm14
#define RB3          %xmm15

#define RNOT         %xmm0
#define RBSWAP       %xmm1


/* Transpose four 32-bit words between 128-bit vectors. */
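/*
 * The transpose turns four registers that each hold one 128-bit block
 * (four 32-bit words) into four registers that each hold the same word
 * position of all four blocks, so the rounds below can process the four
 * blocks in parallel with plain word-wide SIMD operations.
 */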
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2;                \
	vpunpckldq x1, x0, x0;                \
	                                      \
	vpunpckldq x3, x2, t1;                \
	vpunpckhdq x3, x2, x2;                \
	                                      \
	vpunpckhqdq t1, x0, x1;               \
	vpunpcklqdq t1, x0, x0;               \
	                                      \
	vpunpckhqdq x2, t2, x3;               \
	vpunpcklqdq x2, t2, x2;

/* pre-SubByte transform. */
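/*
 * transform_pre/transform_post apply a per-byte affine transform using two
 * 16-entry vpshufb look-up tables: the low nibble of each byte indexes the
 * lo_t table, the high nibble (shifted down) indexes the hi_t table, and the
 * two results are XORed together.
 */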
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0;                     \
	vpandn x, mask4bit, x;                       \
	vpsrld $4, x, x;                             \
	                                             \
	vpshufb tmp0, lo_t, tmp0;                    \
	vpshufb x, hi_t, x;                          \
	vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
 * the 'vaesenclast' instruction.
 */
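/*
 * AESENCLAST performs ShiftRows, SubBytes and AddRoundKey.  MASK_4BIT is
 * passed as the round key, so the AddRoundKey step XORs 0x0f into every
 * byte; that is the XOR which transform_post compensates for.  The ShiftRows
 * byte permutation is undone later by the .Linv_shift_row* vpshufb masks.
 */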
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandn mask4bit, x, tmp0;                     \
	vpsrld $4, x, x;                              \
	vpand x, mask4bit, x;                         \
	                                              \
	vpshufb tmp0, lo_t, tmp0;                     \
	vpshufb x, hi_t, x;                           \
	vpxor tmp0, x, x;


.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * The following four affine transform look-up tables are from the work of
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * They allow the SM4 S-box to be computed with the AES SubBytes operation.
 */
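
/*
 * Both the SM4 and AES S-boxes are built from inversion in GF(2^8) combined
 * with affine transforms, so SM4_sbox(x) = post_affine(AES_sbox(pre_affine(x)))
 * for suitable affine maps.  The pre_tf tables move the input into the AES
 * field representation and the post_tf tables map the AES S-box output back.
 */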

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* Inverse ShiftRows, for isolating SubBytes from AESENCLAST. */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
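
/*
 * The _rol_* masks fold the inverse ShiftRows permutation into the 8/16/24-bit
 * rotations required by SM4's linear transform, so a single vpshufb per term
 * both undoes AESENCLAST's ShiftRows and rotates each 32-bit word.
 */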

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef


.text

/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
SYM_FUNC_START(sm4_aesni_avx_crypt4)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..4 blocks)
	 *	%rdx: src (1..4 blocks)
	 *	%rcx: num blocks (1..4)
	 */
	FRAME_BEGIN

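	/*
	 * Load 1..4 blocks.  When fewer than four blocks are given, block 0
	 * is replicated into the unused registers so the 4-wide code below
	 * always operates on defined data; only the first nblocks results
	 * are stored at the end.
	 */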
	vmovdqu 0*16(%rdx), RA0;
	vmovdqa RA0, RA1;
	vmovdqa RA0, RA2;
	vmovdqa RA0, RA3;
	cmpq $2, %rcx;
	jb .Lblk4_load_input_done;
	vmovdqu 1*16(%rdx), RA1;
	je .Lblk4_load_input_done;
	vmovdqu 2*16(%rdx), RA2;
	cmpq $3, %rcx;
	je .Lblk4_load_input_done;
	vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
	vmovdqa .Lpre_tf_hi_s rRIP, RB0;
	vmovdqa .Lpost_tf_lo_s rRIP, RB1;
	vmovdqa .Lpost_tf_hi_s rRIP, RB2;
	vmovdqa .Linv_shift_row rRIP, RB3;
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);

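/*
 * One SM4 round on the four blocks in parallel:
 *   s0 ^= L(sbox(s1 ^ s2 ^ s3 ^ rk))
 * with L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24).  The
 * rol(x,8/16/24) terms come from the combined inverse-ShiftRows/rotate
 * shuffles; rotating their accumulated sum left by 2 yields the remaining
 * rol(x,2/10/18) terms.
 */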
#define ROUND(round, s0, s1, s2, s3)                                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	                                                            \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0);           \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0);            \
	                                                            \
	/* linear part */                                           \
	vpshufb RB3, RX0, RTMP0;                                    \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP2, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP3, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1;            \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0;

	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk4:
	ROUND(0, RA0, RA1, RA2, RA3);
	ROUND(1, RA1, RA2, RA3, RA0);
	ROUND(2, RA2, RA3, RA0, RA1);
	ROUND(3, RA3, RA0, RA1, RA2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk4;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vmovdqu RA0, 0*16(%rsi);
	cmpq $2, %rcx;
	jb .Lblk4_store_output_done;
	vmovdqu RA1, 1*16(%rsi);
	je .Lblk4_store_output_done;
	vmovdqu RA2, 2*16(%rsi);
	cmpq $3, %rcx;
	je .Lblk4_store_output_done;
	vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)

SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						ciphertext blocks
	 */
	FRAME_BEGIN

	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

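/*
 * Same SM4 round as in the 4-block path, applied to two groups of four
 * blocks (RA* and RB*) at once.  The look-up tables are reloaded inside the
 * round because all sixteen xmm registers are needed for block data and
 * temporaries.
 */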
#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;                          \
	vmovdqa .Lpre_tf_hi_s rRIP, RTMP1;                          \
	vmovdqa RX0, RX1;                                           \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	vmovdqa .Lpost_tf_lo_s rRIP, RTMP2;                         \
	vmovdqa .Lpost_tf_hi_s rRIP, RTMP3;                         \
	vpxor r1, RX1, RX1;                                         \
	vpxor r2, RX1, RX1;                                         \
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */                 \
                                                                    \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	vmovdqa .Linv_shift_row rRIP, RTMP4;                        \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	vaesenclast MASK_4BIT, RX1, RX1;                            \
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
                                                                    \
	/* linear part */                                           \
	vpshufb RTMP4, RX0, RTMP0;                                  \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP4, RX1, RTMP2;                                  \
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4;                  \
	vpxor RTMP2, r0, r0; /* r0 ^ x */                           \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	vpxor RTMP1, s0, s0;                                        \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */               \
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP2, RTMP3;                                    \
	vpsrld $30, RTMP2, RTMP2;                                   \
	vpxor RTMP2, r0, r0;                                        \
	vpxor RTMP3, r0, r0;

	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk8)

/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
SYM_FUNC_START(sm4_aesni_avx_crypt8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..8 blocks)
	 *	%rdx: src (1..8 blocks)
	 *	%rcx: num blocks (1..8)
	 */
	cmpq $5, %rcx;
	jb sm4_aesni_avx_crypt4;

	FRAME_BEGIN

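	/*
	 * At least five blocks: load the first five unconditionally,
	 * replicate block 4 into the remaining registers, then load blocks
	 * 5..7 only if nblocks asks for them.
	 */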
	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqa RB0, RB1;
	vmovdqa RB0, RB2;
	vmovdqa RB0, RB3;
	je .Lblk8_load_input_done;
	vmovdqu (5 * 16)(%rdx), RB1;
	cmpq $7, %rcx;
	jb .Lblk8_load_input_done;
	vmovdqu (6 * 16)(%rdx), RB2;
	je .Lblk8_load_input_done;
	vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
	call __sm4_crypt_blk8;

	cmpq $6, %rcx;
	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	jb .Lblk8_store_output_done;
	vmovdqu RB1, (5 * 16)(%rsi);
	je .Lblk8_store_output_done;
	vmovdqu RB2, (6 * 16)(%rsi);
	cmpq $7, %rcx;
	je .Lblk8_store_output_done;
	vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)

/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	/* load IV and byteswap */
	vmovdqu (%rcx), RA0;

	vmovdqa .Lbswap128_mask rRIP, RBSWAP;
	vpshufb RBSWAP, RA0, RTMP0; /* be => le */

	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */

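/*
 * Add 1 to a 128-bit little-endian counter held in an xmm register: the low
 * qword is incremented (x - (-1)), and if it was all-ones the carry is
 * propagated into the high qword via the shifted vpcmpeqq result.
 */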
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp;  \
	vpsubq minus_one, x, x;      \
	vpslldq $8, tmp, tmp;        \
	vpsubq tmp, x, x;

	/* construct IVs */
	inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
	vpshufb RBSWAP, RTMP0, RA1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
	vpshufb RBSWAP, RTMP0, RA2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
	vpshufb RBSWAP, RTMP0, RA3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
	vpshufb RBSWAP, RTMP0, RB0;
	inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
	vpshufb RBSWAP, RTMP0, RB1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
	vpshufb RBSWAP, RTMP0, RB2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
	vpshufb RBSWAP, RTMP0, RB3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
	vpshufb RBSWAP, RTMP0, RTMP1;

	/* store new IV */
	vmovdqu RTMP1, (%rcx);

	call __sm4_crypt_blk8;

	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)

/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqu (5 * 16)(%rdx), RB1;
	vmovdqu (6 * 16)(%rdx), RB2;
	vmovdqu (7 * 16)(%rdx), RB3;

	call __sm4_crypt_blk8;

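	/*
	 * CBC decryption: P[i] = D(C[i]) ^ C[i-1], with the IV as C[-1].  The
	 * last ciphertext block is reloaded from memory (the xmm registers
	 * were clobbered above) and becomes the chaining value for the next
	 * call.
	 */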
	vmovdqu (7 * 16)(%rdx), RNOT;
	vpxor (%rcx), RA0, RA0;
	vpxor (0 * 16)(%rdx), RA1, RA1;
	vpxor (1 * 16)(%rdx), RA2, RA2;
	vpxor (2 * 16)(%rdx), RA3, RA3;
	vpxor (3 * 16)(%rdx), RB0, RB0;
	vpxor (4 * 16)(%rdx), RB1, RB1;
	vpxor (5 * 16)(%rdx), RB2, RB2;
	vpxor (6 * 16)(%rdx), RB3, RB3;
	vmovdqu RNOT, (%rcx); /* store new IV */

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)

/*
 * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

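	/*
	 * CFB decryption: P[i] = E(C[i-1]) ^ C[i], so the cipher input is the
	 * IV followed by ciphertext blocks 0..6, and the last ciphertext
	 * block becomes the next IV.
	 */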
	/* Load input */
	vmovdqu (%rcx), RA0;
	vmovdqu 0 * 16(%rdx), RA1;
	vmovdqu 1 * 16(%rdx), RA2;
	vmovdqu 2 * 16(%rdx), RA3;
	vmovdqu 3 * 16(%rdx), RB0;
	vmovdqu 4 * 16(%rdx), RB1;
	vmovdqu 5 * 16(%rdx), RB2;
	vmovdqu 6 * 16(%rdx), RB3;

	/* Update IV */
	vmovdqu 7 * 16(%rdx), RNOT;
	vmovdqu RNOT, (%rcx);

	call __sm4_crypt_blk8;

	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)