/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p64
	add		x8, x3, #16
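	// NOTE: [x8], i.e. offset #16 into the key structure passed in x3,
	// is expected to hold the precomputed powers H^2, H^3 and H^4 of
	// the hash key, which the 4-way aggregated code path below
	// multiplies by (HH, HH3 and HH4).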
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

	cbnz		x6, 4f

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
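	// x9/x11 hold the lower halves of the next two counter blocks;
	// KS0/KS1 are assembled from them below and AES-encrypted
	// interleaved with the GHASH update starting at 1: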
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)