/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	/*
	 * Register aliases for the GHASH routines.
	 *
	 * Several vector registers carry more than one name: XH and IN1 are
	 * both v7, and v8-v19 are shared between the names used by the 8-bit
	 * polynomial multiply fallback (k00_16 ... perm3) and the names used
	 * by the 4-way aggregated 64-bit code (XL2 ... HH34).
	 */
	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	/*
	 * rd := carryless product of the low 64-bit lanes of rn and rm,
	 * using the native 64x64->128 PMULL instruction.
	 */
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	/* As __pmull_p64, but multiplies the high 64-bit lanes. */
	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	/*
	 * 64x64->128 carryless multiply of the low lanes built from 8-bit
	 * PMULL.  Byte-rotated copies of operand ad are placed in t3/t5/t7
	 * and the remainder is delegated to the macro selected by bd (SHASH
	 * or SHASH2), which supplies the matching precomputed rotations of
	 * the other operand.  Clobbers t3-t9.
	 */
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	/*
	 * High lane counterpart of __pmull_p8.  The rotations are produced
	 * with tbl and the perm1-perm3 vectors set up by __pmull_pre_p8.
	 * Only a SHASH variant of the delegate macro is defined in this file.
	 */
	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	/* Low lane multiply by SHASH, using its rotations sh1-sh4. */
	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	/* Low lane multiply by SHASH2, using its rotations ss1-ss4. */
	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	/* High lane multiply by SHASH, using its rotations sh1-sh4. */
	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	/*
	 * Common tail of the 8-bit PMULL based multiply.
	 *
	 *   rq      - destination register (name only, no arrangement)
	 *   ad      - operand A, including its arrangement specifier
	 *   bd      - operand B, including its arrangement specifier
	 *   nb      - arrangement used for the rotated operands (8b or 16b)
	 *   t       - suffix appended to the pmull mnemonic (empty or 2)
	 *   b1-b4   - copies of B rotated by one to four bytes
	 *
	 * Expects t3/t5/t7 to hold A rotated by one to three bytes and
	 * k00_16/k32_48 to hold the masks built by __pmull_pre_p8.
	 * Clobbers t3-t9.
	 */
	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	/*
	 * Loop invariant setup for the 64-bit PMULL flavour.
	 *
	 * Loads HH, HH3 and HH4 from offset 16 of the key material at x3
	 * (presumably higher powers of the hash key - confirm against the
	 * code that fills in the key structure), folds the two halves of
	 * SHASH/HH into SHASH2 and of HH3/HH4 into HH34 for the Karatsuba
	 * middle products, and sets each 64-bit lane of MASK to 0xe1 << 57.
	 * Clobbers x8 and T1.
	 */
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	/*
	 * Loop invariant setup for the 8-bit PMULL flavour.
	 *
	 * Folds the two halves of SHASH into SHASH2, builds the k00_16 and
	 * k32_48 masks and the perm1-perm3 byte rotation vectors, and
	 * precomputes the rotated copies sh1-sh4 of SHASH and ss1-ss4 of
	 * SHASH2.  Clobbers x5 and T1.
	 */
	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	// Reads XL, XM, XH, T1 and MASK; writes XL, XM, XH and T2. Every
	// caller in this file follows it with two eor instructions that
	// fold T2 and XH into XL.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	// Same register contract as the p64 variant, except that MASK is
	// not used and T1 is clobbered as well.
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	/*
	 * Body of pmull_ghash_update_p64 and pmull_ghash_update_p8; the pn
	 * argument (p64 or p8) selects the multiply and reduce macros.
	 *
	 * Register usage, as read by the code below:
	 *   w0 - number of 16 byte blocks still to be read from x2
	 *   x1 - address of the digest: loaded into XL on entry and
	 *        stored back at label 5
	 *   x2 - source pointer, post-incremented as blocks are consumed
	 *   x3 - address of the key material (SHASH at offset 0; the p64
	 *        variant also loads HH..HH4 from offset 16)
	 *   x4 - optional head block, processed first when non-zero
	 *
	 * The p64 variant handles single blocks until the remaining count
	 * is a multiple of four and then consumes four blocks per iteration.
	 *
	 * NOTE(review): nothing here checks for w0 == 0 when no head block
	 * is supplied; the first decrement would then wrap the counter.
	 * The caller is presumably expected to rule that case out.
	 */
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

	/*
	 * Register aliases for the AES-GCM routines.  These overlap with
	 * the GHASH aliases above: KS0-KS2 share v8-v10 with XL2/XM2/XH2,
	 * K6/K7 share v4/v5 with MASK/XM, and K8/K9 share v14/v15 with
	 * TT3/TT4.  pmull_gcm_ghash_4x overwrites all of those, which is
	 * why enc_block and pmull_gcm_enc_4x load K6-K9 from memory on
	 * every invocation instead of relying on load_round_keys.
	 */
	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	/*
	 * Load the round keys that stay resident for the whole operation:
	 * K0-K3 from rk, K4-K5 from rk + 64, and the last three round keys
	 * KK, KL, KM from rk + 16 * rounds - 32.  Clobbers tmp.
	 */
	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	/* One full AES round (aese followed by aesmc) on a single state. */
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	/* One full AES round on four states, all using the same round key. */
	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	/*
	 * Encrypt the single block held in the state register.
	 *
	 *   state  - vector register name (no arrangement), updated in place
	 *   rounds - general purpose register holding the round count
	 *   rk     - address of the round keys
	 *   tmp    - scratch general purpose register, clobbered
	 *
	 * Expects K0-K5 and KK/KL/KM to have been set up by
	 * load_round_keys; K6-K9 are loaded here and thus clobbered.
	 *
	 * The round count is tested bit-wise.  With bit 2 clear, K0-K7 are
	 * followed directly by the final KK/KL/KM sequence.  With bit 2
	 * set, the out of line path runs K6-K9, loads the next two keys
	 * into K6/K7 and rejoins at the 192 label (bit 1 clear) or at the
	 * 256 label (bit 1 set).  For round counts of 10, 12 and 14 this
	 * gives 10, 12 and 14 rounds respectively, so the fall-through path
	 * through the label named out256 also serves the 10 round case.
	 */
	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

	/*
	 * Body of pmull_gcm_encrypt (enc == 1) and pmull_gcm_decrypt
	 * (enc == 0).
	 *
	 * Register usage, as read by the code below:
	 *   x0 - number of bytes to process; zero means tag handling only
	 *   x1 - destination pointer
	 *   x2 - source pointer
	 *   x3 - address of the GHASH key material (SHASH, then HH..HH4)
	 *   x4 - address of the digest, loaded into XL on entry and written
	 *        back at label 5
	 *   x5 - address of the 16 byte counter block; its last 32-bit word
	 *        is the big-endian lower counter
	 *   x6 - address of the AES round keys
	 *   x7 - number of AES rounds
	 *   [sp, #.Lframe_local_offset] (after frame_push) - pointer to the
	 *        lengths block, which encryption overwrites with the tag;
	 *        zero skips finalisation and stores counter and digest
	 *        instead.  Presumably the first stack-passed argument -
	 *        confirm against the frame_push definition.
	 *   [sp, #40] - decryption only: pointer to the supplied tag
	 *   [sp, #48] - decryption only: authsize
	 *
	 * Inside the loop w8 holds the lower counter in CPU byte order and
	 * w9 the number of blocks handled by the current iteration (4, or
	 * fewer in the final partial round).  x19 is used as a scratch
	 * register and is covered by frame_push 1.
	 *
	 * When decryption finalises, w0 returns 0 if the first authsize
	 * bytes of the computed and supplied tags match and -1 otherwise.
	 */
	.align		6
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f			// partial final round?
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	/*
	 * The condition flags still come from the subs at label 0: on this
	 * path nothing in between, including both helper routines, contains
	 * a flag setting instruction.
	 */
	bne		0b

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

	/*
	 * Fewer than 64 bytes were left for this round: write the output
	 * with the same overlapping scheme that was used for the loads.
	 */
6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 *
	 * NOTE(review): this prototype does not match what the code reads.
	 * pmull_gcm_do_crypt consumes x0 as a byte count, takes the round
	 * key pointer in x6 and the round count in x7, and fetches a tag
	 * pointer from the stack.  Confirm against the C declaration and
	 * update the prototype accordingly.
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 *
	 * NOTE(review): this prototype does not match what the code reads.
	 * pmull_gcm_do_crypt consumes x0 as a byte count, takes the round
	 * key pointer in x6 and the round count in x7, fetches a lengths
	 * pointer, a tag pointer and authsize from the stack, and returns
	 * the tag comparison result in w0.  Confirm against the C
	 * declaration and update the prototype accordingly.
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

	/*
	 * Fold up to four blocks into the GHASH state.
	 *
	 * In:   XL        - current digest
	 *       INP0-INP3 - input blocks, right aligned: only the last w9
	 *                   registers are consumed
	 *       w9        - number of blocks (bit 2 set selects the
	 *                   straight-line four block path; otherwise the
	 *                   low two bits select 1, 2 or 3 blocks)
	 *       SHASH, SHASH2, HH, HH3, HH4, HH34 - key material as set up
	 *                   by pmull_gcm_do_crypt
	 * Out:  XL        - updated digest
	 * Clobbers MASK, T1, T2, TT3, TT4, XM, XH/IN1, XL2, XM2, XH2.
	 * Does not modify the condition flags or any general purpose
	 * register.
	 */
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

	/*
	 * Generate four blocks of AES-CTR key stream and XOR them into
	 * INP0-INP3.
	 *
	 * In:   x5 - address of the counter block (upper 96 bits are used)
	 *       w8 - lower counter value following the last of the four
	 *            blocks; the blocks use w8 - 4 ... w8 - 1
	 *       x6 - address of the AES round keys
	 *       x7 - number of AES rounds, tested bit-wise as in enc_block
	 *       K0-K5, KK, KL, KM - as set up by load_round_keys
	 * Out:  INP0-INP3 - input XORed with the key stream
	 * Clobbers x10-x13, KS0-KS3 and K6-K9.  Does not modify the
	 * condition flags.
	 */
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	/*
	 * Index vectors for tbl/tbx: sixteen 0xff bytes followed by the
	 * identity permutation 0..15, twice over.  A 16 byte load at a
	 * variable offset into this table yields a vector that shifts the
	 * data bytes, with 0xff marking the lanes that are out of range
	 * (zeroed by tbl, left untouched by tbx).
	 */
	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous