/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

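	//
	// The __pmull_p8 helpers above synthesize the 64x64 -> 128 bit
	// carryless multiply out of 8x8 bit PMULL operations, for cores
	// that only implement the baseline NEON polynomial multiply: the
	// product is assembled from multiplications of byte-rotated copies
	// of both operands (A1-A3 and B1-B4 in the comments), whose partial
	// results are masked, shifted back into place and combined.
	//
	// __pmull_pre_p64 expects x3 to point at the hash key H, followed at
	// offset #16 by the precomputed powers used below as H^2, H^3 and
	// H^4 (loaded into HH, HH3 and HH4), and sets up MASK with the
	// constant used by the PMULL based reduction, derived from the GHASH
	// polynomial x^128 + x^7 + x^2 + x + 1. __pmull_pre_p8 instead
	// prepares the permutation vectors and the rotated copies of
	// SHASH/SHASH2 that the 8-bit fallback uses as loop invariants.
	//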
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

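	//
	// Aggregated p64 path: whenever the remaining block count is a
	// multiple of four, fold four blocks per iteration using the
	// precomputed powers of H, so a single reduction covers all four.
	// With X the current accumulator, I0..I3 the next four input
	// blocks, + denoting XOR and . carryless multiplication in
	// GF(2^128), each pass of the loop at 1: below corresponds to
	//
	//	X := ((X + I0).H^4 + I1.H^3 + I2.H^2 + I3.H) mod P
	//
	// with P the GHASH polynomial x^128 + x^7 + x^2 + x + 1. Leading
	// blocks are consumed one at a time (label 2:) until the remaining
	// count is a round multiple of four.
	//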
0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

	cbnz		x6, 4f

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

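	//
	// Main combined loop: each pass generates two blocks of AES-CTR
	// keystream (KS0/KS1) and folds two ciphertext blocks into the
	// GHASH accumulator using H and H^2 (SHASH/HH), so only a single
	// reduction is needed per pair. The aese/aesmc rounds are
	// interleaved with the pmull arithmetic to keep both kinds of
	// execution units busy; the extra initial rounds for AES-192/256
	// are handled at labels 2:/3: below, which branch back into this
	// loop.
	//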
1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)