/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */

#define SM4_PREPARE()						\
	adr_l		x5, crypto_sm4_sbox;			\
	ld1		{v16.16b-v19.16b}, [x5], #64;		\
	ld1		{v20.16b-v23.16b}, [x5], #64;		\
	ld1		{v24.16b-v27.16b}, [x5], #64;		\
	ld1		{v28.16b-v31.16b}, [x5];

#define transpose_4x4(s0, s1, s2, s3)				\
	zip1		RTMP0.4s, s0.4s, s1.4s;			\
	zip1		RTMP1.4s, s2.4s, s3.4s;			\
	zip2		RTMP2.4s, s0.4s, s1.4s;			\
	zip2		RTMP3.4s, s2.4s, s3.4s;			\
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;		\
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;		\
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;		\
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;

#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)	\
	zip1		RTMP0.4s, s0.4s, s1.4s;			\
	zip1		RTMP1.4s, s2.4s, s3.4s;			\
	zip2		RTMP2.4s, s0.4s, s1.4s;			\
	zip2		RTMP3.4s, s2.4s, s3.4s;			\
	zip1		RTMP4.4s, s4.4s, s5.4s;			\
	zip1		RTMP5.4s, s6.4s, s7.4s;			\
	zip2		RTMP6.4s, s4.4s, s5.4s;			\
	zip2		RTMP7.4s, s6.4s, s7.4s;			\
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;		\
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;		\
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;		\
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;		\
	zip1		s4.2d, RTMP4.2d, RTMP5.2d;		\
	zip2		s5.2d, RTMP4.2d, RTMP5.2d;		\
	zip1		s6.2d, RTMP6.2d, RTMP7.2d;		\
	zip2		s7.2d, RTMP6.2d, RTMP7.2d;

#define rotate_clockwise_4x4(s0, s1, s2, s3)			\
	zip1		RTMP0.4s, s1.4s, s0.4s;			\
	zip2		RTMP1.4s, s1.4s, s0.4s;			\
	zip1		RTMP2.4s, s3.4s, s2.4s;			\
	zip2		RTMP3.4s, s3.4s, s2.4s;			\
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;		\
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;		\
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;		\
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;

#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)	\
	zip1		RTMP0.4s, s1.4s, s0.4s;			\
	zip1		RTMP2.4s, s3.4s, s2.4s;			\
	zip2		RTMP1.4s, s1.4s, s0.4s;			\
	zip2		RTMP3.4s, s3.4s, s2.4s;			\
	zip1		RTMP4.4s, s5.4s, s4.4s;			\
	zip1		RTMP6.4s, s7.4s, s6.4s;			\
	zip2		RTMP5.4s, s5.4s, s4.4s;			\
	zip2		RTMP7.4s, s7.4s, s6.4s;			\
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;		\
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;		\
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;		\
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;		\
	zip1		s4.2d, RTMP6.2d, RTMP4.2d;		\
	zip2		s5.2d, RTMP6.2d, RTMP4.2d;		\
	zip1		s6.2d, RTMP7.2d, RTMP5.2d;		\
	zip2		s7.2d, RTMP7.2d, RTMP5.2d;
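
/*
 * Note on the round macros below: each SM4 round computes
 * s0 ^= L(tau(rk ^ s1 ^ s2 ^ s3)), where tau applies the 8-bit S-box to
 * every byte and L(x) = x ^ rol32(x,2) ^ rol32(x,10) ^ rol32(x,18) ^
 * rol32(x,24).  The 256-byte S-box does not fit a single tbl lookup, so it
 * is kept in v16-v31 (loaded by SM4_PREPARE) and walked in four 64-byte
 * steps: tbl covers indices 0..63 and each following tbx picks up the next
 * 64 indices after the index vector has been reduced by 64.
 *
 * A scalar C model of one round, illustrative only: sbox[] stands for the
 * same crypto_sm4_sbox table loaded above, and rol32() is the usual kernel
 * rotate helper.  The macros below compute L(t) as
 * rol32(t ^ rol32(t,8) ^ rol32(t,16), 2) ^ t ^ rol32(t,24), which expands to
 * the same expression:
 *
 *	static u32 sm4_round(u32 s0, u32 s1, u32 s2, u32 s3, u32 rk)
 *	{
 *		u32 x = rk ^ s1 ^ s2 ^ s3;
 *		u32 t = 0;
 *		int i;
 *
 *		for (i = 0; i < 4; i++)		// tau: bytewise S-box
 *			t |= (u32)sbox[(x >> (8 * i)) & 0xff] << (8 * i);
 *
 *		return s0 ^ t ^ rol32(t, 2) ^ rol32(t, 10) ^
 *		       rol32(t, 18) ^ rol32(t, 24);
 *	}
 */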

#define ROUND4(round, s0, s1, s2, s3)				\
	dup		RX0.4s, RKEY.s[round];			\
	/* rk ^ s1 ^ s2 ^ s3 */					\
	eor		RTMP1.16b, s2.16b, s3.16b;		\
	eor		RX0.16b, RX0.16b, s1.16b;		\
	eor		RX0.16b, RX0.16b, RTMP1.16b;		\
								\
	/* sbox, non-linear part */				\
	movi		RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
								\
	/* linear part */					\
	shl		RTMP1.4s, RTMP0.4s, #8;			\
	shl		RTMP2.4s, RTMP0.4s, #16;		\
	shl		RTMP3.4s, RTMP0.4s, #24;		\
	sri		RTMP1.4s, RTMP0.4s, #(32-8);		\
	sri		RTMP2.4s, RTMP0.4s, #(32-16);		\
	sri		RTMP3.4s, RTMP0.4s, #(32-24);		\
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */		\
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;	\
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;	\
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */	\
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;	\
	shl		RTMP2.4s, RTMP1.4s, #2;			\
	sri		RTMP2.4s, RTMP1.4s, #(32-2);		\
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;	\
	/* s0 ^= RTMP3 */					\
	eor		s0.16b, s0.16b, RTMP3.16b;

#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)			\
	mov		x6, #8;					\
4:								\
	ld1		{RKEY.4s}, [x0], #16;			\
	subs		x6, x6, #1;				\
								\
	ROUND4(0, b0, b1, b2, b3);				\
	ROUND4(1, b1, b2, b3, b0);				\
	ROUND4(2, b2, b3, b0, b1);				\
	ROUND4(3, b3, b0, b1, b2);				\
								\
	bne		4b;					\
								\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;				\
								\
	rotate_clockwise_4x4(b0, b1, b2, b3);			\
								\
	/* repoint to rkey */					\
	sub		x0, x0, #128;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)				\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;				\
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)		\
	/* rk ^ s1 ^ s2 ^ s3 */					\
	dup		RX0.4s, RKEY.s[round];			\
	eor		RTMP0.16b, s2.16b, s3.16b;		\
	mov		RX1.16b, RX0.16b;			\
	eor		RTMP1.16b, t2.16b, t3.16b;		\
	eor		RX0.16b, RX0.16b, s1.16b;		\
	eor		RX1.16b, RX1.16b, t1.16b;		\
	eor		RX0.16b, RX0.16b, RTMP0.16b;		\
	eor		RX1.16b, RX1.16b, RTMP1.16b;		\
								\
	/* sbox, non-linear part */				\
	movi		RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	sub		RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	sub		RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;	\
	sub		RX0.16b, RX0.16b, RTMP3.16b;		\
	sub		RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;	\
								\
	/* linear part */					\
	shl		RX0.4s, RTMP0.4s, #8;			\
	shl		RX1.4s, RTMP1.4s, #8;			\
	shl		RTMP2.4s, RTMP0.4s, #16;		\
	shl		RTMP3.4s, RTMP1.4s, #16;		\
	sri		RX0.4s, RTMP0.4s, #(32 - 8);		\
	sri		RX1.4s, RTMP1.4s, #(32 - 8);		\
	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);		\
	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);		\
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */		\
	eor		RX0.16b, RX0.16b, RTMP0.16b;		\
	eor		RX1.16b, RX1.16b, RTMP1.16b;		\
	eor		RX0.16b, RX0.16b, RTMP2.16b;		\
	eor		RX1.16b, RX1.16b, RTMP3.16b;		\
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */	\
	shl		RTMP2.4s, RTMP0.4s, #24;		\
	shl		RTMP3.4s, RTMP1.4s, #24;		\
	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);		\
	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);		\
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	shl		RTMP2.4s, RX0.4s, #2;			\
	shl		RTMP3.4s, RX1.4s, #2;			\
	sri		RTMP2.4s, RX0.4s, #(32 - 2);		\
	sri		RTMP3.4s, RX1.4s, #(32 - 2);		\
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	/* s0/t0 ^= RTMP0/1 */					\
	eor		s0.16b, s0.16b, RTMP0.16b;		\
	eor		t0.16b, t0.16b, RTMP1.16b;

#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7)	\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;				\
	rev32		b4.16b, b4.16b;				\
	rev32		b5.16b, b5.16b;				\
	rev32		b6.16b, b6.16b;				\
	rev32		b7.16b, b7.16b;				\
								\
	mov		x6, #8;					\
8:								\
	ld1		{RKEY.4s}, [x0], #16;			\
	subs		x6, x6, #1;				\
								\
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);		\
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);		\
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);		\
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);		\
								\
	bne		8b;					\
								\
	rev32		b0.16b, b0.16b;				\
	rev32		b1.16b, b1.16b;				\
	rev32		b2.16b, b2.16b;				\
	rev32		b3.16b, b3.16b;				\
	rev32		b4.16b, b4.16b;				\
	rev32		b5.16b, b5.16b;				\
	rev32		b6.16b, b6.16b;				\
	rev32		b7.16b, b7.16b;				\
								\
	/* repoint to rkey */					\
	sub		x0, x0, #128;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)		\
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);	\
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);
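
/*
 * Data layout note for the functions below: ld4 {v0.4s-v3.4s} loads four
 * 16-byte blocks word-deinterleaved, i.e. v0 holds word 0 of blocks 0..3,
 * v1 holds word 1, and so on, so the four-block round macros work on one
 * 32-bit word of each block at a time.  After the 32 rounds the SM4 output
 * words come out in reversed order (the cipher defines the output as
 * (X35, X34, X33, X32)), which is why rotate_clockwise_4x4 (a transpose
 * that also reverses the word order) rather than a plain transpose is used
 * to return to per-block byte layout before st1.  The eight-block path
 * simply interleaves two independent four-block groups in ROUND8 so that
 * the tbl/tbx and eor dependency chains of the two halves can overlap.
 */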

.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_4x

	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_8x

.Lcrypt_4x:
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_tail

	sub		w3, w3, #4

	ld4		{v0.4s-v3.4s}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail:
	cmp		w3, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcrypt_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcrypt_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp		w3, #2
	st1		{v0.16b}, [x1], #16
	blt		.Lcrypt_end
	st1		{v1.16b}, [x1], #16
	beq		.Lcrypt_end
	st1		{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)
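
/*
 * CBC decryption below follows P[i] = D(C[i]) ^ C[i-1], with the IV standing
 * in for C[-1]; the ciphertext is read a second time for the XOR chain after
 * the bulk decryption, and RIV always carries the last ciphertext block of
 * the batch as the next IV.  A scalar C model of what one call computes
 * (illustrative only; sm4_do_crypt() stands for the per-block decryption
 * implemented by the macros above):
 *
 *	const u8 *prev = iv;			// iv plays the role of C[-1]
 *	u8 tmp[SM4_BLOCK_SIZE];
 *	unsigned int i;
 *
 *	for (i = 0; i < nblocks; i++) {
 *		sm4_do_crypt(rkey_dec, tmp, src + i * 16);	// D(C[i])
 *		crypto_xor_cpy(dst + i * 16, tmp, prev, 16);	// ^ C[i-1]
 *		prev = src + i * 16;
 *	}
 *	memcpy(iv, prev, SM4_BLOCK_SIZE);	// write back the new IV
 */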

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Avoid overwriting the RIV register */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub		x2, x2, #64

	eor		v0.16b, v0.16b, RIV.16b

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	eor		v4.16b, v4.16b, RTMP3.16b
	eor		v5.16b, v5.16b, RTMP4.16b
	eor		v6.16b, v6.16b, RTMP5.16b
	eor		v7.16b, v7.16b, RTMP6.16b

	mov		RIV.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_tail

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b
	rev32		v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor		v4.16b, v4.16b, RIV.16b
	eor		v5.16b, v5.16b, v0.16b
	eor		v6.16b, v6.16b, v1.16b
	eor		v7.16b, v7.16b, v2.16b

	mov		RIV.16b, v3.16b

	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	cmp		w4, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcbc_dec_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcbc_dec_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	cmp		w4, #2
	eor		v4.16b, v4.16b, RIV.16b
	mov		RIV.16b, v0.16b
	st1		{v4.16b}, [x1], #16
	blt		.Lcbc_dec_end

	eor		v5.16b, v5.16b, v0.16b
	mov		RIV.16b, v1.16b
	st1		{v5.16b}, [x1], #16
	beq		.Lcbc_dec_end

	eor		v6.16b, v6.16b, v1.16b
	mov		RIV.16b, v2.16b
	st1		{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cbc_dec)

.align 3
SYM_FUNC_START(sm4_neon_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ld1		{v0.16b}, [x3]

.Lcfb_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_4x

	ld1		{v1.16b-v3.16b}, [x2], #48
	ld4		{v4.4s-v7.4s}, [x2]

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	sub		x2, x2, #48
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	mov		v0.16b, RTMP7.16b

	cbz		w4, .Lcfb_dec_end
	b		.Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_tail

	sub		w4, w4, #4

	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v0.16b, v0.16b		/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b
	rev32		v3.16b, v6.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	mov		v0.16b, v7.16b

	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_tail:
	cmp		w4, #2
	ld1		{v4.16b}, [x2], #16
	blt		.Lcfb_dec_tail_load_done
	ld1		{v5.16b}, [x2], #16
	beq		.Lcfb_dec_tail_load_done
	ld1		{v6.16b}, [x2], #16

.Lcfb_dec_tail_load_done:
	rev32		v0.16b, v0.16b		/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	cmp		w4, #2
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	mov		v0.16b, v4.16b
	blt		.Lcfb_dec_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	mov		v0.16b, v5.16b
	beq		.Lcfb_dec_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16
	mov		v0.16b, v6.16b

.Lcfb_dec_end:
	/* store new IV */
	st1		{v0.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cfb_dec)
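
/*
 * CTR mode below: the keystream is E(counter), and the counter is the
 * 128-bit big-endian value at [x3].  It is held in x7 (high half) and
 * x8 (low half) in CPU-native byte order so that a plain adds/adc pair can
 * increment it; inc_le128 materialises the pre-increment value in a vector
 * and rev64 swaps it back to big-endian block layout.  Roughly, in C
 * (illustrative only):
 *
 *	static void ctr128_inc(u64 ctr[2])	// [0]=high, [1]=low, native order
 *	{
 *		if (++ctr[1] == 0)		// adds  x8, x8, #1
 *			++ctr[0];		// adc   x7, x7, xzr
 *	}
 */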

.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_crypt_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_crypt_4x

#define inc_le128(vctr)					\
	mov		vctr.d[1], x8;			\
	mov		vctr.d[0], x7;			\
	adds		x8, x8, #1;			\
	rev64		vctr.16b, vctr.16b;		\
	adc		x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end
	b		.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_crypt_tail

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/* inc_le128 will change the sign bit */
	ld1		{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp		w4, #2
	blt		.Lctr_crypt_tail_load_done

	ld1		{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp		w4, #2
	beq		.Lctr_crypt_tail_load_done

	ld1		{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp		w4, #2

	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	blt		.Lctr_crypt_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	beq		.Lctr_crypt_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)
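
/*
 * For reference, the C side is expected to declare these entry points
 * roughly as below and to call them with the NEON unit enabled
 * (kernel_neon_begin()/kernel_neon_end()).  The exact declarations live in
 * the accompanying glue code, so treat this as an illustrative sketch only:
 *
 *	asmlinkage void sm4_neon_crypt(const u32 *rkey, u8 *dst, const u8 *src,
 *				       unsigned int nblocks);
 *	asmlinkage void sm4_neon_cbc_dec(const u32 *rkey_dec, u8 *dst,
 *					 const u8 *src, u8 *iv,
 *					 unsigned int nblocks);
 *	asmlinkage void sm4_neon_cfb_dec(const u32 *rkey_enc, u8 *dst,
 *					 const u8 *src, u8 *iv,
 *					 unsigned int nblocks);
 *	asmlinkage void sm4_neon_ctr_crypt(const u32 *rkey_enc, u8 *dst,
 *					   const u8 *src, u8 *ctr,
 *					   unsigned int nblocks);
 */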