/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

/* Used for both encryption and decryption */
#define RHASH	v21
#define RRCONST	v22
#define RZERO	v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128-bits in r0, high in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r1.16b, r1.16b, T0.16b;

#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,			\
			r2, r3, m2, m3, T2, T3,			\
			r4, r5, m4, m5, T4, T5,			\
			r6, r7, m6, m7, T6, T7)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
		ext		T6.16b, m7.16b, m7.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
		pmull		r6.1q, m6.1d, m7.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
		pmull		T7.1q, m6.1d, T6.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
		pmull2		T6.1q, m6.2d, T6.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
		pmull2		r7.1q, m6.2d, m7.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
		eor		T6.16b, T6.16b, T7.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
		ext		T7.16b, RZERO.16b, T6.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
		ext		T6.16b, T6.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
		eor		r6.16b, r6.16b, T7.16b;		\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
		eor		r7.16b, r7.16b, T6.16b;

/*
 * input: r0:r1 (low 128-bits in r0, high in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)			\
		pmull2		T0.1q, r1.2d, rconst.2d;	\
		ext		T1.16b, T0.16b, RZERO.16b, #8;	\
		ext		T0.16b, RZERO.16b, T0.16b, #8;	\
		eor		r1.16b, r1.16b, T1.16b;		\
		eor		r0.16b, r0.16b, T0.16b;		\
		pmull		T0.1q, r1.1d, rconst.1d;	\
		eor		a.16b, r0.16b, T0.16b;
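
/*
 * For reference, a rough C sketch (not part of the build) of what
 * PMUL_128x128 + REDUCTION compute, using ACLE NEON intrinsics.  Names
 * below are illustrative only; operands are assumed to already be in the
 * bit-reflected form used throughout this file.
 *
 *	#include <arm_neon.h>
 *
 *	// 256-bit carry-less product r1:r0 = a * b, schoolbook over
 *	// 64-bit halves (four PMULL/PMULL2 multiplies, as in PMUL_128x128)
 *	static void clmul_128x128(uint64x2_t a, uint64x2_t b,
 *				  uint64x2_t *r0, uint64x2_t *r1)
 *	{
 *		poly64_t a0 = (poly64_t)vgetq_lane_u64(a, 0);
 *		poly64_t a1 = (poly64_t)vgetq_lane_u64(a, 1);
 *		poly64_t b0 = (poly64_t)vgetq_lane_u64(b, 0);
 *		poly64_t b1 = (poly64_t)vgetq_lane_u64(b, 1);
 *		uint64x2_t zero = vdupq_n_u64(0);
 *
 *		uint64x2_t lo  = vreinterpretq_u64_p128(vmull_p64(a0, b0));
 *		uint64x2_t hi  = vreinterpretq_u64_p128(vmull_p64(a1, b1));
 *		uint64x2_t mid = veorq_u64(
 *			vreinterpretq_u64_p128(vmull_p64(a0, b1)),
 *			vreinterpretq_u64_p128(vmull_p64(a1, b0)));
 *
 *		// fold the middle product across the two result halves
 *		*r0 = veorq_u64(lo, vextq_u64(zero, mid, 1));
 *		*r1 = veorq_u64(hi, vextq_u64(mid, zero, 1));
 *	}
 *
 *	// fold r1:r0 back to 128 bits, as in REDUCTION (rconst = 0x87)
 *	static uint64x2_t ghash_reduce(uint64x2_t r0, uint64x2_t r1)
 *	{
 *		const poly64_t rc = (poly64_t)0x87;
 *		uint64x2_t zero = vdupq_n_u64(0);
 *		uint64x2_t t = vreinterpretq_u64_p128(
 *			vmull_p64((poly64_t)vgetq_lane_u64(r1, 1), rc));
 *
 *		r1 = veorq_u64(r1, vextq_u64(t, zero, 1));
 *		r0 = veorq_u64(r0, vextq_u64(zero, t, 1));
 *		t = vreinterpretq_u64_p128(
 *			vmull_p64((poly64_t)vgetq_lane_u64(r1, 0), rc));
 *		return veorq_u64(r0, t);
 *	}
 */
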
#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)	\
		rev32		b0.16b, b0.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		sm4e		b0.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
		sm4e		b0.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
		sm4e		b0.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		sm4e		b0.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		sm4e		b0.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
		sm4e		b0.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		sm4e		b0.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		sm4e		b0.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
		rev64		b0.4s, b0.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
		ext		b0.16b, b0.16b, b0.16b, #8;	\
		rev32		b0.16b, b0.16b;

#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,			\
				    r0, r1, m0, m1, T0, T1,	\
				    r2, r3, m2, m3, T2, T3,	\
				    r4, r5, m4, m5, T4, T5)	\
		rev32		b0.16b, b0.16b;			\
		rev32		b1.16b, b1.16b;			\
		rev32		b2.16b, b2.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
		sm4e		b0.4s, v24.4s;			\
		sm4e		b1.4s, v24.4s;			\
		sm4e		b2.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
		sm4e		b0.4s, v25.4s;			\
		sm4e		b1.4s, v25.4s;			\
		sm4e		b2.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
		sm4e		b0.4s, v26.4s;			\
		sm4e		b1.4s, v26.4s;			\
		sm4e		b2.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
		sm4e		b0.4s, v27.4s;			\
		sm4e		b1.4s, v27.4s;			\
		sm4e		b2.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
		sm4e		b0.4s, v28.4s;			\
		sm4e		b1.4s, v28.4s;			\
		sm4e		b2.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
		sm4e		b0.4s, v29.4s;			\
		sm4e		b1.4s, v29.4s;			\
		sm4e		b2.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
		sm4e		b0.4s, v30.4s;			\
		sm4e		b1.4s, v30.4s;			\
		sm4e		b2.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
		sm4e		b0.4s, v31.4s;			\
		sm4e		b1.4s, v31.4s;			\
		sm4e		b2.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
		rev64		b0.4s, b0.4s;			\
		rev64		b1.4s, b1.4s;			\
		rev64		b2.4s, b2.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
		ext		b0.16b, b0.16b, b0.16b, #8;	\
		ext		b1.16b, b1.16b, b1.16b, #8;	\
		ext		b2.16b, b2.16b, b2.16b, #8;	\
		eor		r0.16b, r0.16b, r2.16b;		\
		eor		r1.16b, r1.16b, r3.16b;		\
		rev32		b0.16b, b0.16b;			\
		rev32		b1.16b, b1.16b;			\
		rev32		b2.16b, b2.16b;			\
		eor		r0.16b, r0.16b, r4.16b;		\
		eor		r1.16b, r1.16b, r5.16b;

#define inc32_le128(vctr)					\
		mov		vctr.d[1], x9;			\
		add		w6, w9, #1;			\
		mov		vctr.d[0], x8;			\
		bfi		x9, x6, #0, #32;		\
		rev64		vctr.16b, vctr.16b;

#define GTAG_HASH_LENGTHS(vctr0, vlen)					\
		ld1		{vlen.16b}, [x7];			\
		/* construct CTR0 */					\
		/* the lower 32 bits of the initial IV are always be32(1) */ \
		mov		x6, #0x1;				\
		bfi		x9, x6, #0, #32;			\
		mov		vctr0.d[0], x8;				\
		mov		vctr0.d[1], x9;				\
		rbit		vlen.16b, vlen.16b;			\
		rev64		vctr0.16b, vctr0.16b;			\
		/* authtag = GCTR(CTR0, GHASH) */			\
		eor		RHASH.16b, RHASH.16b, vlen.16b;		\
		SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,	\
					   RTMP0, RTMP1);		\
		REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);	\
		rbit		RHASH.16b, RHASH.16b;			\
		eor		RHASH.16b, RHASH.16b, vctr0.16b;
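
/*
 * Counter handling used by the GCM entry points below: the 128-bit
 * big-endian counter block is kept in x8:x9 (loaded with ldp + rev), and
 * inc32_le128 emits the current counter block into a vector register and
 * then bumps only the low 32 bits for the next block, as required by
 * GCM's inc32().  A plain C sketch of that increment (illustrative only,
 * not part of the build):
 *
 *	static void gcm_inc32(unsigned char ctr[16])
 *	{
 *		unsigned int i, c = 1;
 *
 *		// big-endian add of 1, restricted to the last 32-bit word
 *		for (i = 16; i > 12 && c; i--) {
 *			c += ctr[i - 1];
 *			ctr[i - 1] = (unsigned char)c;
 *			c >>= 8;
 *		}
 *	}
 *
 * GTAG_HASH_LENGTHS folds the lengths block (loaded from x7) into GHASH
 * and encrypts counter block CTR0 (the IV with its low word forced to
 * be32(1)), producing the authentication tag as GHASH ^ E(K, CTR0).
 */
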
/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define RR1	v0
#define RR3	v1
#define RR5	v2
#define RR7	v3

#define RR0	v4
#define RR2	v5
#define RR4	v6
#define RR6	v7

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RH1	v16
#define RH2	v17
#define RH3	v18
#define RH4	v19

.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ghash table
	 */
	SM4_PREPARE(x0)

	adr_l		x2, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x2]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32		v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit		RH1.16b, v0.16b

	/* H ^ 2 */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)

.align 3
SYM_FUNC_START(pmull_ghash_update)
	/* input:
	 *   x0: ghash table
	 *   x1: ghash result
	 *   x2: src
	 *   w3: nblocks
	 */
	ld1		{RH1.16b-RH4.16b}, [x0]

	ld1		{RHASH.16b}, [x1]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x4, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x4]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
	cmp		w3, #4
	blt		.Lghash_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w3, .Lghash_end
	b		.Lghash_loop_4x

.Lghash_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz		w3, .Lghash_loop_1x

.Lghash_end:
	rbit		RHASH.16b, RHASH.16b
	st1		{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(pmull_ghash_update)
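
/*
 * The 4x loop above (and the bulk loops in the GCM functions below) uses
 * the usual GHASH aggregation trick: instead of four dependent
 * X = (X ^ Ci) * H steps, the blocks are combined with precomputed key
 * powers and folded with a single reduction:
 *
 *	X' = (X ^ C0)*H^4 ^ C1*H^3 ^ C2*H^2 ^ C3*H
 *
 * which is why sm4_ce_pmull_ghash_setup stores H^1..H^4 in the ghash
 * table.
 */
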
.align 3
SYM_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH4.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lgcm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1		{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get top keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* collect the ciphertext bytes in the high end of v0 */
	ins		v0.b[15], w6

	subs		w4, w4, #1
	bne		.Lgcm_enc_tail_loop

	/* pad the last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	cbz		x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
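
/*
 * A note on structure: for decryption the GHASH input is the ciphertext,
 * which is available before any SM4 work is done, so the 3x loop in
 * sm4_ce_pmull_gcm_dec below fuses the cipher rounds and the pmull-based
 * hashing into SM4_CRYPT_PMUL_128x128_BLK3.  For encryption the
 * ciphertext only exists after the CTR pass, hence the separate
 * encrypt-then-hash steps in sm4_ce_pmull_gcm_enc above.  Roughly:
 *
 *	enc:	C = P ^ E(K, CTR);  X = GHASH(X, C)	(two steps)
 *	dec:	X = GHASH(X, C) and P = C ^ E(K, CTR)	(interleaved)
 */
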
#undef RR1
#undef RR3
#undef RR5
#undef RR7
#undef RR0
#undef RR2
#undef RR4
#undef RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef RH1
#undef RH2
#undef RH3
#undef RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define RR1	v6
#define RR3	v7
#define RR5	v8

#define RR0	v9
#define RR2	v10
#define RR4	v11

#define RTMP0	v12
#define RTMP1	v13
#define RTMP2	v14
#define RTMP3	v15
#define RTMP4	v16
#define RTMP5	v17

#define RH1	v18
#define RH2	v19
#define RH3	v20

.align 3
SYM_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH3.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
	cmp		w4, #(3 * 16)
	blt		.Lgcm_dec_loop_1x

	sub		w4, w4, #(3 * 16)

	ld1		{v3.16b-v5.16b}, [x2], #(3 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	rbit		v6.16b, v3.16b
	inc32_le128(v1)			/* +1 */
	rbit		v7.16b, v4.16b
	inc32_le128(v2)			/* +2 */
	rbit		v8.16b, v5.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	/* decrypt & ghash update */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	eor		v0.16b, v0.16b, v3.16b
	eor		v1.16b, v1.16b, v4.16b
	eor		v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1		{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_dec_tail

	sub		w4, w4, #16

	ld1		{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit		v6.16b, v3.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor		v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{v0.16b}, [x1], #16

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_1x

.Lgcm_dec_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get top keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* collect the ciphertext (input) bytes in the high end of v0 */
	ins		v0.b[15], w0

	subs		w4, w4, #1
	bne		.Lgcm_dec_tail_loop

	/* pad the last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
	cbz		x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)
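
/*
 * For reference, the register comments above correspond to C prototypes
 * along the lines of the following (a sketch only; the authoritative
 * declarations live in the C glue code, and the parameter names here are
 * illustrative):
 *
 *	asmlinkage void sm4_ce_pmull_ghash_setup(const u32 *rkey_enc,
 *						 u8 *ghash_table);
 *	asmlinkage void pmull_ghash_update(const u8 *ghash_table, u8 *ghash,
 *					   const u8 *src,
 *					   unsigned int nblocks);
 *	asmlinkage void sm4_ce_pmull_gcm_enc(const u32 *rkey_enc, u8 *dst,
 *					     const u8 *src, u8 *iv,
 *					     unsigned int nbytes, u8 *ghash,
 *					     const u8 *ghash_table,
 *					     const u8 *lengths);
 *	asmlinkage void sm4_ce_pmull_gcm_dec(const u32 *rkey_enc, u8 *dst,
 *					     const u8 *src, u8 *iv,
 *					     unsigned int nbytes, u8 *ghash,
 *					     const u8 *ghash_table,
 *					     const u8 *lengths);
 */
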
	.section	".rodata", "a"
	.align 4
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

.Lghash_rconst:
	.quad		0x87
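
/*
 * Notes on the constants above:
 *
 * .Lcts_permute_table: the tail code loads 16 bytes at offset
 * (32 - nbytes), which yields a tbl mask that moves the nbytes ciphertext
 * bytes collected in the high end of v0 down to the start of the block
 * and zeroes the remainder, i.e. it zero-pads the final partial block
 * before the GHASH update.
 *
 * .Lghash_rconst: 0x87 encodes x^7 + x^2 + x + 1, the low part of the
 * GHASH polynomial x^128 + x^7 + x^2 + x + 1; REDUCTION multiplies by it
 * to fold the 256-bit PMULL product back to 128 bits.  All GHASH operands
 * are bit-reversed with rbit so that the reflected bit order used by
 * GHASH becomes plain carry-less arithmetic for pmull/pmull2.
 */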