/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized,
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 * https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define rRIP        (%rip)

#define RX0         %xmm0
#define RX1         %xmm1
#define MASK_4BIT   %xmm2
#define RTMP0       %xmm3
#define RTMP1       %xmm4
#define RTMP2       %xmm5
#define RTMP3       %xmm6
#define RTMP4       %xmm7

#define RA0         %xmm8
#define RA1         %xmm9
#define RA2         %xmm10
#define RA3         %xmm11

#define RB0         %xmm12
#define RB1         %xmm13
#define RB2         %xmm14
#define RB3         %xmm15

#define RNOT        %xmm0
#define RBSWAP      %xmm1


/* Transpose four 32-bit words between 128-bit vectors. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;
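
/*
 * Illustrative sketch (not from the original comments): viewing x0..x3
 * as the rows of a 4x4 matrix of 32-bit words, transpose_4x4 computes
 *
 *      x0 = (a0, a1, a2, a3)           x0 = (a0, b0, c0, d0)
 *      x1 = (b0, b1, b2, b3)    ==>    x1 = (a1, b1, c1, d1)
 *      x2 = (c0, c1, c2, c3)           x2 = (a2, b2, c2, d2)
 *      x3 = (d0, d1, d2, d3)           x3 = (a3, b3, c3, d3)
 *
 * After the transpose, each register holds one SM4 state word for all
 * four blocks, so a round can be evaluated across the blocks in
 * parallel.
 */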

/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0; \
        vpandn x, mask4bit, x; \
        vpsrld $4, x, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
 * the 'vaesenclast' instruction.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
        vpandn mask4bit, x, tmp0; \
        vpsrld $4, x, x; \
        vpand x, mask4bit, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;
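
/*
 * Per-byte effect of the two transforms above (illustrative pseudo-C,
 * not from the original comments):
 *
 *      transform(x) = lo_t[x & 0x0f] ^ hi_t[x >> 4]
 *
 * vpshufb acts as sixteen parallel 16-entry table lookups, and an
 * affine transform can be split across the two nibbles this way
 * because it is linear up to a constant folded into the tables.
 * In transform_post, the initial vpandn both extracts the low nibble
 * and cancels the XOR with mask4bit left behind by 'vaesenclast'.
 */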

.section        .rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * The following four affine transform look-up tables are from work by
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing the SM4 S-Box via the AES SubBytes operation.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
        .quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
        .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
        .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
        .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
        .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
        .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
        .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
        .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
        .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
        .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
        .long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
        .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef


.text

/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
SYM_FUNC_START(sm4_aesni_avx_crypt4)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (1..4 blocks)
         *      %rdx: src (1..4 blocks)
         *      %rcx: num blocks (1..4)
         */
        FRAME_BEGIN

        vmovdqu 0*16(%rdx), RA0;
        vmovdqa RA0, RA1;
        vmovdqa RA0, RA2;
        vmovdqa RA0, RA3;
        cmpq $2, %rcx;
        jb .Lblk4_load_input_done;
        vmovdqu 1*16(%rdx), RA1;
        je .Lblk4_load_input_done;
        vmovdqu 2*16(%rdx), RA2;
        cmpq $3, %rcx;
        je .Lblk4_load_input_done;
        vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

        vmovdqa .Lbswap32_mask rRIP, RTMP2;
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;

        vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
        vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
        vmovdqa .Lpre_tf_hi_s rRIP, RB0;
        vmovdqa .Lpost_tf_lo_s rRIP, RB1;
        vmovdqa .Lpost_tf_hi_s rRIP, RB2;
        vmovdqa .Linv_shift_row rRIP, RB3;
        vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
        vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3) \
        vbroadcastss (4*(round))(%rdi), RX0; \
        vpxor s1, RX0, RX0; \
        vpxor s2, RX0, RX0; \
        vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
        \
        /* sbox, non-linear part */ \
        transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
        vaesenclast MASK_4BIT, RX0, RX0; \
        transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
        \
        /* linear part */ \
        vpshufb RB3, RX0, RTMP0; \
        vpxor RTMP0, s0, s0; /* s0 ^ x */ \
        vpshufb RTMP2, RX0, RTMP1; \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
        vpshufb RTMP3, RX0, RTMP1; \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \
        vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
        vpslld $2, RTMP0, RTMP1; \
        vpsrld $30, RTMP0, RTMP0; \
        vpxor RTMP0, s0, s0; \
        /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpxor RTMP1, s0, s0;
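
/*
 * One ROUND invocation above computes, per 32-bit state word, the SM4
 * round function X[i+4] = X[i] ^ L(S(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i]))
 * (illustrative pseudo-C summary, not from the original comments):
 *
 *      x   = s1 ^ s2 ^ s3 ^ rk;    // vbroadcastss + three vpxor
 *      x   = sbox(x);              // transform_pre/vaesenclast/transform_post
 *      s0 ^= x ^ rol32(x, 24);
 *      t   = x ^ rol32(x, 8) ^ rol32(x, 16);
 *      s0 ^= rol32(t, 2);          // == rol(x,2) ^ rol(x,10) ^ rol(x,18)
 *
 * so s0 accumulates x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24),
 * the linear transform L of the SM4 specification.
 */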

        leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk4:
        ROUND(0, RA0, RA1, RA2, RA3);
        ROUND(1, RA1, RA2, RA3, RA0);
        ROUND(2, RA2, RA3, RA0, RA1);
        ROUND(3, RA3, RA0, RA1, RA2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
        jne .Lroundloop_blk4;

#undef ROUND

        vmovdqa .Lbswap128_mask rRIP, RTMP2;

        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;

        vmovdqu RA0, 0*16(%rsi);
        cmpq $2, %rcx;
        jb .Lblk4_store_output_done;
        vmovdqu RA1, 1*16(%rsi);
        je .Lblk4_store_output_done;
        vmovdqu RA2, 2*16(%rsi);
        cmpq $3, %rcx;
        je .Lblk4_store_output_done;
        vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)

SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
         *                                              plaintext blocks
         * output:
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
         *                                              ciphertext blocks
         */
        FRAME_BEGIN

        vmovdqa .Lbswap32_mask rRIP, RTMP2;
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
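
/*
 * The eight-block ROUND below interleaves two independent round
 * computations, RX0 over the RA registers and RX1 over the RB
 * registers, presumably so that their vaesenclast/vpshufb latencies
 * can overlap.  Unlike the four-block variant, all sixteen XMM
 * registers hold live state here, so the lookup tables are reloaded
 * from .rodata inside every round.
 */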

#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
        vbroadcastss (4*(round))(%rdi), RX0; \
        vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \
        vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \
        vmovdqa RX0, RX1; \
        vpxor s1, RX0, RX0; \
        vpxor s2, RX0, RX0; \
        vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
        vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \
        vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \
        vpxor r1, RX1, RX1; \
        vpxor r2, RX1, RX1; \
        vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
        \
        /* sbox, non-linear part */ \
        transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
        transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
        vmovdqa .Linv_shift_row rRIP, RTMP4; \
        vaesenclast MASK_4BIT, RX0, RX0; \
        vaesenclast MASK_4BIT, RX1, RX1; \
        transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
        transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
        \
        /* linear part */ \
        vpshufb RTMP4, RX0, RTMP0; \
        vpxor RTMP0, s0, s0; /* s0 ^ x */ \
        vpshufb RTMP4, RX1, RTMP2; \
        vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \
        vpxor RTMP2, r0, r0; /* r0 ^ x */ \
        vpshufb RTMP4, RX0, RTMP1; \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
        vpshufb RTMP4, RX1, RTMP3; \
        vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
        vpshufb RTMP4, RX0, RTMP1; \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX1, RTMP3; \
        vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX0, RTMP1; \
        vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
        /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpslld $2, RTMP0, RTMP1; \
        vpsrld $30, RTMP0, RTMP0; \
        vpxor RTMP0, s0, s0; \
        vpxor RTMP1, s0, s0; \
        vpshufb RTMP4, RX1, RTMP3; \
        vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
        /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpslld $2, RTMP2, RTMP3; \
        vpsrld $30, RTMP2, RTMP2; \
        vpxor RTMP2, r0, r0; \
        vpxor RTMP3, r0, r0;

        leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
        ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
        ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
        ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
        ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
        jne .Lroundloop_blk8;

#undef ROUND
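
/*
 * SM4 emits the last four state words in reverse order (X35, X34, X33,
 * X32), each stored big endian.  Reversing the word order and
 * byte-swapping every word is equivalent to reversing all 16 bytes,
 * which is why a single vpshufb with .Lbswap128_mask after the reverse
 * transpose suffices below.
 */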

        vmovdqa .Lbswap128_mask rRIP, RTMP2;

        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        FRAME_END
        RET;
SYM_FUNC_END(__sm4_crypt_blk8)
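
/*
 * sm4_aesni_avx_crypt8 handles 1..8 blocks: requests of up to four
 * blocks are forwarded as a tail call to sm4_aesni_avx_crypt4, and
 * larger ones run through __sm4_crypt_blk8 with any missing source
 * blocks replaced by copies of block 4 (RB0) so that all eight lanes
 * carry valid data; the corresponding stores are skipped on the way
 * out.
 */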

/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
SYM_FUNC_START(sm4_aesni_avx_crypt8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (1..8 blocks)
         *      %rdx: src (1..8 blocks)
         *      %rcx: num blocks (1..8)
         */
        cmpq $5, %rcx;
        jb sm4_aesni_avx_crypt4;

        FRAME_BEGIN

        vmovdqu (0 * 16)(%rdx), RA0;
        vmovdqu (1 * 16)(%rdx), RA1;
        vmovdqu (2 * 16)(%rdx), RA2;
        vmovdqu (3 * 16)(%rdx), RA3;
        vmovdqu (4 * 16)(%rdx), RB0;
        vmovdqa RB0, RB1;
        vmovdqa RB0, RB2;
        vmovdqa RB0, RB3;
        je .Lblk8_load_input_done;
        vmovdqu (5 * 16)(%rdx), RB1;
        cmpq $7, %rcx;
        jb .Lblk8_load_input_done;
        vmovdqu (6 * 16)(%rdx), RB2;
        je .Lblk8_load_input_done;
        vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
        call __sm4_crypt_blk8;

        cmpq $6, %rcx;
        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        jb .Lblk8_store_output_done;
        vmovdqu RB1, (5 * 16)(%rsi);
        je .Lblk8_store_output_done;
        vmovdqu RB2, (6 * 16)(%rsi);
        cmpq $7, %rcx;
        je .Lblk8_store_output_done;
        vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)

/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        /* load IV and byteswap */
        vmovdqu (%rcx), RA0;

        vmovdqa .Lbswap128_mask rRIP, RBSWAP;
        vpshufb RBSWAP, RA0, RTMP0; /* be => le */

        vpcmpeqd RNOT, RNOT, RNOT;
        vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */

#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
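
/*
 * Counter math sketch (illustrative): with the IV byte-reversed, the
 * 128-bit big-endian counter becomes a little-endian integer split
 * into two 64-bit lanes.  inc_le128 adds one to the low lane by
 * subtracting minus_one, and the vpcmpeqq mask, computed before the
 * add and shifted into the high lane, propagates the carry:
 *
 *      lo' = lo + 1;
 *      hi' = hi + (lo == ~0ULL);
 */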

        /* construct IVs */
        inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
        vpshufb RBSWAP, RTMP0, RA1;
        inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
        vpshufb RBSWAP, RTMP0, RA2;
        inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
        vpshufb RBSWAP, RTMP0, RA3;
        inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
        vpshufb RBSWAP, RTMP0, RB0;
        inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
        vpshufb RBSWAP, RTMP0, RB1;
        inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
        vpshufb RBSWAP, RTMP0, RB2;
        inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
        vpshufb RBSWAP, RTMP0, RB3;
        inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
        vpshufb RBSWAP, RTMP0, RTMP1;

        /* store new IV */
        vmovdqu RTMP1, (%rcx);

        call __sm4_crypt_blk8;

        vpxor (0 * 16)(%rdx), RA0, RA0;
        vpxor (1 * 16)(%rdx), RA1, RA1;
        vpxor (2 * 16)(%rdx), RA2, RA2;
        vpxor (3 * 16)(%rdx), RA3, RA3;
        vpxor (4 * 16)(%rdx), RB0, RB0;
        vpxor (5 * 16)(%rdx), RB1, RB1;
        vpxor (6 * 16)(%rdx), RB2, RB2;
        vpxor (7 * 16)(%rdx), RB3, RB3;

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)

/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv
         */
        FRAME_BEGIN

        vmovdqu (0 * 16)(%rdx), RA0;
        vmovdqu (1 * 16)(%rdx), RA1;
        vmovdqu (2 * 16)(%rdx), RA2;
        vmovdqu (3 * 16)(%rdx), RA3;
        vmovdqu (4 * 16)(%rdx), RB0;
        vmovdqu (5 * 16)(%rdx), RB1;
        vmovdqu (6 * 16)(%rdx), RB2;
        vmovdqu (7 * 16)(%rdx), RB3;

        call __sm4_crypt_blk8;

        vmovdqu (7 * 16)(%rdx), RNOT;
        vpxor (%rcx), RA0, RA0;
        vpxor (0 * 16)(%rdx), RA1, RA1;
        vpxor (1 * 16)(%rdx), RA2, RA2;
        vpxor (2 * 16)(%rdx), RA3, RA3;
        vpxor (3 * 16)(%rdx), RB0, RB0;
        vpxor (4 * 16)(%rdx), RB1, RB1;
        vpxor (5 * 16)(%rdx), RB2, RB2;
        vpxor (6 * 16)(%rdx), RB3, RB3;
        vmovdqu RNOT, (%rcx); /* store new IV */

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
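
/*
 * CFB decryption sketch: the keystream for block i is the encryption
 * of the previous ciphertext block (or of the IV for block 0), so the
 * function below encrypts (IV, C0, ..., C6) in parallel, XORs the
 * result with (C0, ..., C7), and saves C7 as the IV for the next call.
 */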

/*
 * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv
         */
        FRAME_BEGIN

        /* Load input */
        vmovdqu (%rcx), RA0;
        vmovdqu 0 * 16(%rdx), RA1;
        vmovdqu 1 * 16(%rdx), RA2;
        vmovdqu 2 * 16(%rdx), RA3;
        vmovdqu 3 * 16(%rdx), RB0;
        vmovdqu 4 * 16(%rdx), RB1;
        vmovdqu 5 * 16(%rdx), RB2;
        vmovdqu 6 * 16(%rdx), RB3;

        /* Update IV */
        vmovdqu 7 * 16(%rdx), RNOT;
        vmovdqu RNOT, (%rcx);

        call __sm4_crypt_blk8;

        vpxor (0 * 16)(%rdx), RA0, RA0;
        vpxor (1 * 16)(%rdx), RA1, RA1;
        vpxor (2 * 16)(%rdx), RA2, RA2;
        vpxor (3 * 16)(%rdx), RA3, RA3;
        vpxor (4 * 16)(%rdx), RB0, RB0;
        vpxor (5 * 16)(%rdx), RB1, RB1;
        vpxor (6 * 16)(%rdx), RB2, RB2;
        vpxor (7 * 16)(%rdx), RB3, RB3;

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)