/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized,
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 * https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define rRIP         (%rip)

#define RX0          %xmm0
#define RX1          %xmm1
#define MASK_4BIT    %xmm2
#define RTMP0        %xmm3
#define RTMP1        %xmm4
#define RTMP2        %xmm5
#define RTMP3        %xmm6
#define RTMP4        %xmm7

#define RA0          %xmm8
#define RA1          %xmm9
#define RA2          %xmm10
#define RA3          %xmm11

#define RB0          %xmm12
#define RB1          %xmm13
#define RB2          %xmm14
#define RB3          %xmm15

#define RNOT         %xmm0
#define RBSWAP       %xmm1


/* Transpose a 4x4 block of 32-bit words spread across four 128-bit vectors. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)   \
        vpunpckhdq x1, x0, t2;                  \
        vpunpckldq x1, x0, x0;                  \
                                                \
        vpunpckldq x3, x2, t1;                  \
        vpunpckhdq x3, x2, x2;                  \
                                                \
        vpunpckhqdq t1, x0, x1;                 \
        vpunpcklqdq t1, x0, x0;                 \
                                                \
        vpunpckhqdq x2, t2, x3;                 \
        vpunpcklqdq x2, t2, x2;

/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0)    \
        vpand x, mask4bit, tmp0;                        \
        vpandn x, mask4bit, x;                          \
        vpsrld $4, x, x;                                \
                                                        \
        vpshufb tmp0, lo_t, tmp0;                       \
        vpshufb x, hi_t, x;                             \
        vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XORed with mask4bit by the
 * 'vaesenclast' instruction.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0)   \
        vpandn mask4bit, x, tmp0;                       \
        vpsrld $4, x, x;                                \
        vpand x, mask4bit, x;                           \
                                                        \
        vpshufb tmp0, lo_t, tmp0;                       \
        vpshufb x, hi_t, x;                             \
        vpxor tmp0, x, x;

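/*
 * How transform_pre/transform_post work, as an illustrative C sketch
 * (not part of the build; affine_byte() is a hypothetical helper name):
 * each vpshufb pair looks up the low and the high nibble of every byte
 * in a 16-entry table and XORs the two results, which evaluates a fixed
 * affine map over GF(2) per byte:
 *
 *	static u8 affine_byte(const u8 lo_t[16], const u8 hi_t[16], u8 x)
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 */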

.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * The following four affine transform look-up tables are from the work
 * by Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow the SM4 S-Box to be computed via the AES SubBytes
 * operation.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
        .quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
        .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
        .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
        .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse ShiftRows */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse ShiftRows + rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
        .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
        .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse ShiftRows + rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
        .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
        .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse ShiftRows + rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
        .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
        .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
        .long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
        .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef


.text
.align 16

/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt4)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (1..4 blocks)
         *      %rdx: src (1..4 blocks)
         *      %rcx: num blocks (1..4)
         */
        FRAME_BEGIN

        /* Load 1..4 blocks; unused registers are filled with copies of
         * block 0 so that all four lanes hold defined data. */
        vmovdqu 0*16(%rdx), RA0;
        vmovdqa RA0, RA1;
        vmovdqa RA0, RA2;
        vmovdqa RA0, RA3;
        cmpq $2, %rcx;
        jb .Lblk4_load_input_done;
        vmovdqu 1*16(%rdx), RA1;
        je .Lblk4_load_input_done;
        vmovdqu 2*16(%rdx), RA2;
        cmpq $3, %rcx;
        je .Lblk4_load_input_done;
        vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

        vmovdqa .Lbswap32_mask rRIP, RTMP2;
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;

        vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
        vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
        vmovdqa .Lpre_tf_hi_s rRIP, RB0;
        vmovdqa .Lpost_tf_lo_s rRIP, RB1;
        vmovdqa .Lpost_tf_hi_s rRIP, RB2;
        vmovdqa .Linv_shift_row rRIP, RB3;
        vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
        vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
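
/*
 * One SM4 round on a single 32-bit state word, as an illustrative C
 * sketch (not part of the build; tau() stands for the byte-wise S-box
 * computed above via the AES-NI affine trick, rol32() for a 32-bit
 * left rotate):
 *
 *	static u32 sm4_round(u32 s0, u32 s1, u32 s2, u32 s3, u32 rk)
 *	{
 *		u32 x = tau(s1 ^ s2 ^ s3 ^ rk);
 *
 *		return s0 ^ x ^ rol32(x, 2) ^ rol32(x, 10) ^
 *		       rol32(x, 18) ^ rol32(x, 24);
 *	}
 *
 * The ROUND macro below computes this for four blocks at once, holding
 * one state word of each block per 32-bit vector lane.
 */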

#define ROUND(round, s0, s1, s2, s3)                            \
        vbroadcastss (4*(round))(%rdi), RX0;                    \
        vpxor s1, RX0, RX0;                                     \
        vpxor s2, RX0, RX0;                                     \
        vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */             \
                                                                \
        /* sbox, non-linear part */                             \
        transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0);       \
        vaesenclast MASK_4BIT, RX0, RX0;                        \
        transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0);        \
                                                                \
        /* linear part */                                       \
        vpshufb RB3, RX0, RTMP0;                                \
        vpxor RTMP0, s0, s0; /* s0 ^ x */                       \
        vpshufb RTMP2, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */           \
        vpshufb RTMP3, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1;        \
        vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */           \
        vpslld $2, RTMP0, RTMP1;                                \
        vpsrld $30, RTMP0, RTMP0;                               \
        vpxor RTMP0, s0, s0;                                    \
        /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpxor RTMP1, s0, s0;

        leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk4:
        ROUND(0, RA0, RA1, RA2, RA3);
        ROUND(1, RA1, RA2, RA3, RA0);
        ROUND(2, RA2, RA3, RA0, RA1);
        ROUND(3, RA3, RA0, RA1, RA2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
        jne .Lroundloop_blk4;

#undef ROUND

        vmovdqa .Lbswap128_mask rRIP, RTMP2;

        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;

        vmovdqu RA0, 0*16(%rsi);
        cmpq $2, %rcx;
        jb .Lblk4_store_output_done;
        vmovdqu RA1, 1*16(%rsi);
        je .Lblk4_store_output_done;
        vmovdqu RA2, 2*16(%rsi);
        cmpq $3, %rcx;
        je .Lblk4_store_output_done;
        vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)

.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
         *                                              plaintext blocks
         * output:
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
         *                                              ciphertext blocks
         */
        FRAME_BEGIN

        vmovdqa .Lbswap32_mask rRIP, RTMP2;
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
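
/*
 * The ROUND macro below is the same round function as in
 * sm4_aesni_avx_crypt4 above, but applied to two independent groups of
 * four blocks (RA0..RA3 and RB0..RB3). Interleaving the two groups
 * keeps more independent work in flight and helps hide the latency of
 * vaesenclast and the vpshufb table lookups.
 */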

#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)            \
        vbroadcastss (4*(round))(%rdi), RX0;                    \
        vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;                      \
        vmovdqa .Lpre_tf_hi_s rRIP, RTMP1;                      \
        vmovdqa RX0, RX1;                                       \
        vpxor s1, RX0, RX0;                                     \
        vpxor s2, RX0, RX0;                                     \
        vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */             \
        vmovdqa .Lpost_tf_lo_s rRIP, RTMP2;                     \
        vmovdqa .Lpost_tf_hi_s rRIP, RTMP3;                     \
        vpxor r1, RX1, RX1;                                     \
        vpxor r2, RX1, RX1;                                     \
        vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */             \
                                                                \
        /* sbox, non-linear part */                             \
        transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);     \
        transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);     \
        vmovdqa .Linv_shift_row rRIP, RTMP4;                    \
        vaesenclast MASK_4BIT, RX0, RX0;                        \
        vaesenclast MASK_4BIT, RX1, RX1;                        \
        transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);    \
        transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);    \
                                                                \
        /* linear part */                                       \
        vpshufb RTMP4, RX0, RTMP0;                              \
        vpxor RTMP0, s0, s0; /* s0 ^ x */                       \
        vpshufb RTMP4, RX1, RTMP2;                              \
        vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4;              \
        vpxor RTMP2, r0, r0; /* r0 ^ x */                       \
        vpshufb RTMP4, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */           \
        vpshufb RTMP4, RX1, RTMP3;                              \
        vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4;             \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */           \
        vpshufb RTMP4, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX1, RTMP3;                              \
        vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4;             \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX0, RTMP1;                              \
        vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */           \
        /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpslld $2, RTMP0, RTMP1;                                \
        vpsrld $30, RTMP0, RTMP0;                               \
        vpxor RTMP0, s0, s0;                                    \
        vpxor RTMP1, s0, s0;                                    \
        vpshufb RTMP4, RX1, RTMP3;                              \
        vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */           \
        /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpslld $2, RTMP2, RTMP3;                                \
        vpsrld $30, RTMP2, RTMP2;                               \
        vpxor RTMP2, r0, r0;                                    \
        vpxor RTMP3, r0, r0;

        leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
        ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
        ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
        ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
        ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
        jne .Lroundloop_blk8;

#undef ROUND

        vmovdqa .Lbswap128_mask rRIP, RTMP2;

        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        FRAME_END
        RET;
SYM_FUNC_END(__sm4_crypt_blk8)

/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (1..8 blocks)
         *      %rdx: src (1..8 blocks)
         *      %rcx: num blocks (1..8)
         */
        cmpq $5, %rcx;
        jb sm4_aesni_avx_crypt4;

        FRAME_BEGIN

        /* Blocks 0..4 are always loaded; RB1..RB3 default to copies of
         * block 4 when fewer than 8 blocks are present. */
        vmovdqu (0 * 16)(%rdx), RA0;
        vmovdqu (1 * 16)(%rdx), RA1;
        vmovdqu (2 * 16)(%rdx), RA2;
        vmovdqu (3 * 16)(%rdx), RA3;
        vmovdqu (4 * 16)(%rdx), RB0;
        vmovdqa RB0, RB1;
        vmovdqa RB0, RB2;
        vmovdqa RB0, RB3;
        je .Lblk8_load_input_done; /* nblocks == 5 */
        vmovdqu (5 * 16)(%rdx), RB1;
        cmpq $7, %rcx;
        jb .Lblk8_load_input_done;
        vmovdqu (6 * 16)(%rdx), RB2;
        je .Lblk8_load_input_done;
        vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
        call __sm4_crypt_blk8;

        cmpq $6, %rcx; /* flags are preserved across the vmovdqu stores below */
        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        jb .Lblk8_store_output_done;
        vmovdqu RB1, (5 * 16)(%rsi);
        je .Lblk8_store_output_done;
        vmovdqu RB2, (6 * 16)(%rsi);
        cmpq $7, %rcx;
        je .Lblk8_store_output_done;
        vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)

/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        /* load IV and byteswap */
        vmovdqu (%rcx), RA0;

        vmovdqa .Lbswap128_mask rRIP, RBSWAP;
        vpshufb RBSWAP, RA0, RTMP0; /* be => le */

        vpcmpeqd RNOT, RNOT, RNOT;
        vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */

#define inc_le128(x, minus_one, tmp)    \
        vpcmpeqq minus_one, x, tmp;     \
        vpsubq minus_one, x, x;         \
        vpslldq $8, tmp, tmp;           \
        vpsubq tmp, x, x;
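
/*
 * inc_le128 adds 1 to a 128-bit little-endian counter, propagating the
 * carry from the low into the high 64-bit half. An equivalent C sketch
 * (illustrative only):
 *
 *	static void inc_le128(u64 ctr[2])
 *	{
 *		if (++ctr[0] == 0)
 *			ctr[1]++;
 *	}
 *
 * The IV is kept byte-swapped (little endian) in RTMP0 while the
 * counters below are generated, and each counter is swapped back to
 * big endian before being used as a block input.
 */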

        /* construct IVs */
        inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
        vpshufb RBSWAP, RTMP0, RA1;
        inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
        vpshufb RBSWAP, RTMP0, RA2;
        inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
        vpshufb RBSWAP, RTMP0, RA3;
        inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
        vpshufb RBSWAP, RTMP0, RB0;
        inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
        vpshufb RBSWAP, RTMP0, RB1;
        inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
        vpshufb RBSWAP, RTMP0, RB2;
        inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
        vpshufb RBSWAP, RTMP0, RB3;
        inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
        vpshufb RBSWAP, RTMP0, RTMP1;

        /* store new IV */
        vmovdqu RTMP1, (%rcx);

        call __sm4_crypt_blk8;

        vpxor (0 * 16)(%rdx), RA0, RA0;
        vpxor (1 * 16)(%rdx), RA1, RA1;
        vpxor (2 * 16)(%rdx), RA2, RA2;
        vpxor (3 * 16)(%rdx), RA3, RA3;
        vpxor (4 * 16)(%rdx), RB0, RB0;
        vpxor (5 * 16)(%rdx), RB1, RB1;
        vpxor (6 * 16)(%rdx), RB2, RB2;
        vpxor (7 * 16)(%rdx), RB3, RB3;

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)

/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv
         */
        FRAME_BEGIN

        vmovdqu (0 * 16)(%rdx), RA0;
        vmovdqu (1 * 16)(%rdx), RA1;
        vmovdqu (2 * 16)(%rdx), RA2;
        vmovdqu (3 * 16)(%rdx), RA3;
        vmovdqu (4 * 16)(%rdx), RB0;
        vmovdqu (5 * 16)(%rdx), RB1;
        vmovdqu (6 * 16)(%rdx), RB2;
        vmovdqu (7 * 16)(%rdx), RB3;

        call __sm4_crypt_blk8;

        vmovdqu (7 * 16)(%rdx), RNOT;
        vpxor (%rcx), RA0, RA0;
        vpxor (0 * 16)(%rdx), RA1, RA1;
        vpxor (1 * 16)(%rdx), RA2, RA2;
        vpxor (2 * 16)(%rdx), RA3, RA3;
        vpxor (3 * 16)(%rdx), RB0, RB0;
        vpxor (4 * 16)(%rdx), RB1, RB1;
        vpxor (5 * 16)(%rdx), RB2, RB2;
        vpxor (6 * 16)(%rdx), RB3, RB3;
        vmovdqu RNOT, (%rcx); /* store new IV */

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
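
/*
 * CBC decryption as implemented above, per block (C[-1] is the IV):
 *
 *	P[i] = D(C[i]) ^ C[i - 1]
 *
 * All eight ciphertext blocks are decrypted first and the chaining XOR
 * is applied afterwards; the last ciphertext block is copied to RNOT
 * and written back as the new IV, and every read from src happens
 * before any write to dst, so in-place (dst == src) operation is safe.
 */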

/*
 * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
.align 8
SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv
         */
        FRAME_BEGIN

        /* Load input */
        vmovdqu (%rcx), RA0;
        vmovdqu 0 * 16(%rdx), RA1;
        vmovdqu 1 * 16(%rdx), RA2;
        vmovdqu 2 * 16(%rdx), RA3;
        vmovdqu 3 * 16(%rdx), RB0;
        vmovdqu 4 * 16(%rdx), RB1;
        vmovdqu 5 * 16(%rdx), RB2;
        vmovdqu 6 * 16(%rdx), RB3;

        /* Update IV */
        vmovdqu 7 * 16(%rdx), RNOT;
        vmovdqu RNOT, (%rcx);

        call __sm4_crypt_blk8;

        vpxor (0 * 16)(%rdx), RA0, RA0;
        vpxor (1 * 16)(%rdx), RA1, RA1;
        vpxor (2 * 16)(%rdx), RA2, RA2;
        vpxor (3 * 16)(%rdx), RA3, RA3;
        vpxor (4 * 16)(%rdx), RB0, RB0;
        vpxor (5 * 16)(%rdx), RB1, RB1;
        vpxor (6 * 16)(%rdx), RB2, RB2;
        vpxor (7 * 16)(%rdx), RB3, RB3;

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)
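
/*
 * CFB decryption as implemented above, per block (C[-1] is the IV):
 *
 *	P[i] = E(C[i - 1]) ^ C[i]
 *
 * Only the block cipher's encryption direction is needed, which is why
 * __sm4_crypt_blk8 is reused here for decryption.
 */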