/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	// GHASH state / scratch registers shared by all code paths.
	// Note IN1 deliberately aliases XH (v7): the two are never live
	// at the same time.
	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	// Registers below are used only by the p8 (8-bit polynomial
	// multiply) fallback path.
	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	// Aliases for the 4-way p64 path; these overlap v8-v19 above,
	// which is safe because the p8 and 4-way p64 paths never run in
	// the same invocation.
	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	// 64x64 -> 128 bit carry-less multiply of the low 64-bit lanes
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	// 64x64 -> 128 bit carry-less multiply of the high 64-bit lanes
	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	// Synthesize a 64x64 carry-less multiply from 8-bit PMULL:
	// set up byte-rotated copies of the operand (A1..A3), then defer
	// to the per-\bd tail macro for the multiply/recombine steps.
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	// Same as __pmull_p8 but operating on the high halves, using the
	// precomputed permute vectors instead of ext
	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b	// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b	// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b	// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	// The sh1..sh4 / ss1..ss4 loop invariants (rotated copies of
	// SHASH / SHASH2) are precomputed by __pmull_pre_p8 below.
	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	// Multiply-and-recombine tail: 8 narrow polynomial multiplies
	// are combined into the 128-bit product in \rq.
	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	// shift the partial products into place and fold them in
	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	// Preload H^2..H^4 (at x3 + 16) and fold the halves of the hash
	// keys into SHASH2/HH34 for the Karatsuba middle terms.
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	// Precompute masks, permute vectors and rotated hash-key copies
	// for the p8 fallback path. Clobbers x5.
	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	// multiply by x^63 + x^62 + x^57 using shifts and eors
	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	// GHASH update loop. Register interface (see the C prototype
	// further down): x0 = #blocks, x1 = digest (dg), x2 = src,
	// x3 = key (H, with H^2..H^4 following for p64), x4 = optional
	// head block (NULL if none).
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

	// 4-way aggregated processing: multiply the four blocks by
	// H^4..H^1 respectively and fold into a single accumulator
1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

	/* single-block path (and odd leading blocks for p64) */
2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

325 ext T2.16b, XL.16b, XL.16b, #8 326 ext IN1.16b, T1.16b, T1.16b, #8 327 eor T1.16b, T1.16b, T2.16b 328 eor XL.16b, XL.16b, IN1.16b 329 330 __pmull2_\pn XH, XL, SHASH // a1 * b1 331 eor T1.16b, T1.16b, XL.16b 332 __pmull_\pn XL, XL, SHASH // a0 * b0 333 __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0) 334 3354: eor T2.16b, XL.16b, XH.16b 336 ext T1.16b, XL.16b, XH.16b, #8 337 eor XM.16b, XM.16b, T2.16b 338 339 __pmull_reduce_\pn 340 341 eor T2.16b, T2.16b, XH.16b 342 eor XL.16b, XL.16b, T2.16b 343 344 cbnz w0, 0b 345 3465: st1 {XL.2d}, [x1] 347 ret 348 .endm 349 350 /* 351 * void pmull_ghash_update(int blocks, u64 dg[], const char *src, 352 * struct ghash_key const *k, const char *head) 353 */ 354SYM_TYPED_FUNC_START(pmull_ghash_update_p64) 355 __pmull_ghash p64 356SYM_FUNC_END(pmull_ghash_update_p64) 357 358SYM_TYPED_FUNC_START(pmull_ghash_update_p8) 359 __pmull_ghash p8 360SYM_FUNC_END(pmull_ghash_update_p8) 361 362 KS0 .req v8 363 KS1 .req v9 364 KS2 .req v10 365 KS3 .req v11 366 367 INP0 .req v21 368 INP1 .req v22 369 INP2 .req v23 370 INP3 .req v24 371 372 K0 .req v25 373 K1 .req v26 374 K2 .req v27 375 K3 .req v28 376 K4 .req v12 377 K5 .req v13 378 K6 .req v4 379 K7 .req v5 380 K8 .req v14 381 K9 .req v15 382 KK .req v29 383 KL .req v30 384 KM .req v31 385 386 .macro load_round_keys, rounds, rk, tmp 387 add \tmp, \rk, #64 388 ld1 {K0.4s-K3.4s}, [\rk] 389 ld1 {K4.4s-K5.4s}, [\tmp] 390 add \tmp, \rk, \rounds, lsl #4 391 sub \tmp, \tmp, #32 392 ld1 {KK.4s-KM.4s}, [\tmp] 393 .endm 394 395 .macro enc_round, state, key 396 aese \state\().16b, \key\().16b 397 aesmc \state\().16b, \state\().16b 398 .endm 399 400 .macro enc_qround, s0, s1, s2, s3, key 401 enc_round \s0, \key 402 enc_round \s1, \key 403 enc_round \s2, \key 404 enc_round \s3, \key 405 .endm 406 407 .macro enc_block, state, rounds, rk, tmp 408 add \tmp, \rk, #96 409 ld1 {K6.4s-K7.4s}, [\tmp], #32 410 .irp key, K0, K1, K2, K3, K4 K5 411 enc_round \state, \key 412 .endr 413 414 tbnz \rounds, #2, 
.Lnot128_\@ 415.Lout256_\@: 416 enc_round \state, K6 417 enc_round \state, K7 418 419.Lout192_\@: 420 enc_round \state, KK 421 aese \state\().16b, KL.16b 422 eor \state\().16b, \state\().16b, KM.16b 423 424 .subsection 1 425.Lnot128_\@: 426 ld1 {K8.4s-K9.4s}, [\tmp], #32 427 enc_round \state, K6 428 enc_round \state, K7 429 ld1 {K6.4s-K7.4s}, [\tmp] 430 enc_round \state, K8 431 enc_round \state, K9 432 tbz \rounds, #1, .Lout192_\@ 433 b .Lout256_\@ 434 .previous 435 .endm 436 437 .align 6 438 .macro pmull_gcm_do_crypt, enc 439 stp x29, x30, [sp, #-32]! 440 mov x29, sp 441 str x19, [sp, #24] 442 443 load_round_keys x7, x6, x8 444 445 ld1 {SHASH.2d}, [x3], #16 446 ld1 {HH.2d-HH4.2d}, [x3] 447 448 trn1 SHASH2.2d, SHASH.2d, HH.2d 449 trn2 T1.2d, SHASH.2d, HH.2d 450 eor SHASH2.16b, SHASH2.16b, T1.16b 451 452 trn1 HH34.2d, HH3.2d, HH4.2d 453 trn2 T1.2d, HH3.2d, HH4.2d 454 eor HH34.16b, HH34.16b, T1.16b 455 456 ld1 {XL.2d}, [x4] 457 458 cbz x0, 3f // tag only? 459 460 ldr w8, [x5, #12] // load lower counter 461CPU_LE( rev w8, w8 ) 462 4630: mov w9, #4 // max blocks per round 464 add x10, x0, #0xf 465 lsr x10, x10, #4 // remaining blocks 466 467 subs x0, x0, #64 468 csel w9, w10, w9, mi 469 add w8, w8, w9 470 471 bmi 1f 472 ld1 {INP0.16b-INP3.16b}, [x2], #64 473 .subsection 1 474 /* 475 * Populate the four input registers right to left with up to 63 bytes 476 * of data, using overlapping loads to avoid branches. 477 * 478 * INP0 INP1 INP2 INP3 479 * 1 byte | | | |x | 480 * 16 bytes | | | |xxxxxxxx| 481 * 17 bytes | | |xxxxxxxx|x | 482 * 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx | 483 * etc etc 484 * 485 * Note that this code may read up to 15 bytes before the start of 486 * the input. 
It is up to the calling code to ensure this is safe if 487 * this happens in the first iteration of the loop (i.e., when the 488 * input size is < 16 bytes) 489 */ 4901: mov x15, #16 491 ands x19, x0, #0xf 492 csel x19, x19, x15, ne 493 adr_l x17, .Lpermute_table + 16 494 495 sub x11, x15, x19 496 add x12, x17, x11 497 sub x17, x17, x11 498 ld1 {T1.16b}, [x12] 499 sub x10, x1, x11 500 sub x11, x2, x11 501 502 cmp x0, #-16 503 csel x14, x15, xzr, gt 504 cmp x0, #-32 505 csel x15, x15, xzr, gt 506 cmp x0, #-48 507 csel x16, x19, xzr, gt 508 csel x1, x1, x10, gt 509 csel x2, x2, x11, gt 510 511 ld1 {INP0.16b}, [x2], x14 512 ld1 {INP1.16b}, [x2], x15 513 ld1 {INP2.16b}, [x2], x16 514 ld1 {INP3.16b}, [x2] 515 tbl INP3.16b, {INP3.16b}, T1.16b 516 b 2f 517 .previous 518 5192: .if \enc == 0 520 bl pmull_gcm_ghash_4x 521 .endif 522 523 bl pmull_gcm_enc_4x 524 525 tbnz x0, #63, 6f 526 st1 {INP0.16b-INP3.16b}, [x1], #64 527 .if \enc == 1 528 bl pmull_gcm_ghash_4x 529 .endif 530 bne 0b 531 5323: ldp x19, x10, [sp, #24] 533 cbz x10, 5f // output tag? 

	// Finalize: GHASH the lengths block, encrypt the initial counter
	// (J0, with BE 32-bit '1' in the low word) and XOR to get the tag.
	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	// Decrypt: compare the computed tag against the supplied one in
	// constant time, returning 0 on match and negative on mismatch.
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	ldp		x29, x30, [sp], #32
	ret

	// No tag requested: persist the counter and GHASH state instead
5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

	// Store a partial trailing block: x19 holds the number of valid
	// bytes in the final block (set on the load side above)
6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

	// Fold up to four blocks (INP0-INP3, count in w9) into the GHASH
	// accumulator XL, using H^4..H for the aggregated reduction.
	// Data is populated right to left, so a single block lives in
	// INP3. Clobbers T1/T2/TT3/TT4 and the XL2/XM2/XH2 bank.
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
	// <4 blocks: zero the partial accumulators, then dispatch on the
	// low bits of w9 (3 -> fall through, 2 -> 1f, 1 -> 2f) to enter
	// the ladder below at the right rung
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	// 4-block path: first block is multiplied by H^4
	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

	// Generate four keystream blocks from the counter (upper part at
	// [x5], lower 32-bit counter derived from w8) and XOR them onto
	// INP0-INP3. x6 = round keys, x7 = #rounds (bits #2/#1 select
	// AES-192/256 extra rounds).
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	// AES-192/256: run the extra rounds with K6-K9, reloading K6/K7
	// with the keys needed by the shared tail
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	// final round: no MixColumns, then AddRoundKey via eor
	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	// tbl/tbx permute vectors for partial-block handling: indexing
	// into this table at varying offsets selects or masks the valid
	// byte positions (0xff entries produce zero bytes with tbl).
	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous