/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
	20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

#define RIV	v20
#define RMAC	v20
#define RMASK	v21


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
	ld1		{v0.16b}, [x0];
	rev32		v0.16b, v0.16b;
	ld1		{v1.16b}, [x3];
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64;
	ld1		{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b;

	sm4ekey		v0.4s, v0.4s, v24.4s;
	sm4ekey		v1.4s, v0.4s, v25.4s;
	sm4ekey		v2.4s, v1.4s, v26.4s;
	sm4ekey		v3.4s, v2.4s, v27.4s;
	sm4ekey		v4.4s, v3.4s, v28.4s;
	sm4ekey		v5.4s, v4.4s, v29.4s;
	sm4ekey		v6.4s, v5.4s, v30.4s;
	sm4ekey		v7.4s, v6.4s, v31.4s;

	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1];

	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE(x0)

.Lcrypt_loop_blk:
	sub		w3, w3, #8;
	tbnz		w3, #31, .Lcrypt_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;
	b		.Lcrypt_loop_blk;

.Lcrypt_tail8:
	add		w3, w3, #8;
	cmp		w3, #4;
	blt		.Lcrypt_tail4;

	sub		w3, w3, #4;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub		w3, w3, #1;

	ld1		{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1], #16;

	cbnz		w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

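	/*
	 * CBC encryption is inherently serial: each ciphertext block becomes
	 * the IV for the next plaintext block.  Blocks are therefore loaded
	 * four at a time below, but still encrypted one after another.
	 */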
.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_enc_end
	b		.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	mov		RIV.16b, v7.16b

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	ld1		{v0.16b}, [x2]
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

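	/*
	 * v1 now holds the ciphertext of the zero-padded final partial block
	 * and v0 holds Cn taken from En-1.  The overlapping stores below
	 * write v1 at the start of dst and leave the truncated Cn in the
	 * tail, i.e. the last two ciphertext blocks in stolen order.
	 */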
	/* overlapping stores */
	add		x5, x1, x5
	st1		{v0.16b}, [x5]
	st1		{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	ld1		{v0.16b}, [x2], x5
	ld1		{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcfb_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcfb_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v0.16b, v0.16b, v8.16b

	rev32		v8.16b, v0.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v1.16b, v1.16b, v8.16b

	rev32		v8.16b, v1.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v2.16b, v2.16b, v8.16b

	rev32		v8.16b, v2.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v3.16b, v3.16b, v8.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbz		w4, .Lcfb_enc_end
	b		.Lcfb_enc_loop_4x

.Lcfb_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)
	eor		RIV.16b, RIV.16b, v0.16b

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcfb_enc_loop_1x

.Lcfb_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_enc)

.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcfb_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b
	rev32		v12.16b, v3.16b
	rev32		v13.16b, v4.16b
	rev32		v14.16b, v5.16b
	rev32		v15.16b, v6.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	mov		RIV.16b, v7.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcfb_dec_end
	b		.Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	mov		RIV.16b, v3.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)

	eor		RIV.16b, RIV.16b, v0.16b
	st1		{RIV.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcfb_dec_loop_1x

.Lcfb_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x

#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_8x

.Lctr_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_loop_1x:
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)


#define tweak_next(vt, vin, RTMP)				\
		sshr		RTMP.2d, vin.2d, #63;		\
		and		RTMP.16b, RTMP.16b, RMASK.16b;	\
		add		vt.2d, vin.2d, vin.2d;		\
		ext		RTMP.16b, RTMP.16b, RTMP.16b, #8; \
		eor		vt.16b, vt.16b, RTMP.16b;

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s
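	/*
	 * w4 = number of blocks for the bulk loops; when nbytes has a
	 * partial tail, one full block is held back for ciphertext stealing
	 * at .Lxts_enc_cts and x5 holds the tail length.
	 *
	 * RMASK = { 0x1, 0x87 } (64-bit lanes).  tweak_next doubles the
	 * tweak in GF(2^128): each lane is shifted left by one, the carry
	 * out of bit 63 propagates into the high lane, and a carry out of
	 * bit 127 is folded back into the low lane as 0x87, the XTS
	 * reduction polynomial x^128 + x^7 + x^2 + x + 1.
	 */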

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_enc_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_enc_cts
	b		.Lxts_enc_loop_8x

.Lxts_enc_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_enc_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz		x5, .Lxts_enc_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

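	/*
	 * Same bookkeeping as the encryption path: w4 = blocks for the bulk
	 * loops (one held back for ciphertext stealing when a partial tail
	 * exists), x5 = tail length, and RMASK below is the GF(2^128)
	 * doubling constant used by tweak_next.
	 */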
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_dec_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_dec_cts
	b		.Lxts_dec_loop_8x

.Lxts_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_dec_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz		x5, .Lxts_dec_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: digest
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before
	 *   w5: enc_after
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz		w3, .Lmac_ret

	sub		w6, w3, #1
	cmp		w5, wzr
	csel		w3, w3, w6, ne

	cbz		w3, .Lmac_end

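	/*
	 * CBC-MAC update: the running MAC is XORed with each message block
	 * and re-encrypted, so the chain is serial even though four blocks
	 * are loaded per iteration.  When enc_after is zero, the final
	 * block is only XORed in at .Lmac_end, leaving its encryption to
	 * the caller.
	 */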
.Lmac_loop_4x:
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz		w3, .Lmac_end
	b		.Lmac_loop_4x

.Lmac_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x

.Lmac_end:
	cbnz		w5, .Lmac_ret

	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)


	.section	".rodata", "a"
	.align 4
.Lbswap128_mask:
	.byte	0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte	0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

.Lcts_permute_table:
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte	 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff