/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p64
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm
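
	//
	// Fallback path for CPUs that only implement the 8x8->16 bit variant
	// of PMULL: the 64x64->128 bit products used above are assembled from
	// eight 8-bit polynomial multiplications of byte-rotated copies of the
	// operands (see __pmull_p8_tail). The macro below precomputes the
	// constants this needs - the k00_16/k32_48 masks, the byte rotation
	// vectors perm1-perm3 and the pre-rotated copies of SHASH and SHASH2 -
	// so that they do not have to be recomputed for every block.
	//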
	.macro		__pmull_pre_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	frame_push	5

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4

0:	ld1		{SHASH.2d}, [x22]
	ld1		{XL.2d}, [x20]
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x23, 1f
	ld1		{T1.2d}, [x23]
	mov		x23, xzr
	b		2f

1:	ld1		{T1.2d}, [x21], #16
	sub		w19, w19, #1

2:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64	T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w19, 3f

	if_will_cond_yield_neon
	st1		{XL.2d}, [x20]
	do_cond_yield_neon
	b		0b
	endif_yield_neon

	b		1b

3:	st1		{XL.2d}, [x20]
	frame_pop
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)
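
	//
	// The GCM routines below operate on two blocks of input at a time:
	// the AES rounds for the two counter blocks (KS0/KS1) are interleaved
	// with the GHASH multiply/reduce steps for the two data blocks
	// (INP0/INP1), so that the AES and PMULL instruction sequences can
	// overlap in the pipeline.
	//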
	KS0		.req	v8
	KS1		.req	v9
	INP0		.req	v10
	INP1		.req	v11

	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	load_round_keys	w7, x6

	movi		MASK.16b, #0xe1
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE(	rev	x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11

	rev64		T1.16b, INP0.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	KS0, v21

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v22

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v22

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v23

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v23

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS0, v24

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	enc_round	KS1, v24

	eor		XL.16b, XM.16b, T2.16b

	enc_round	KS0, v25

	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS1, v25

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	enc_round	KS0, v26

	eor		XL.16b, XL.16b, T2.16b
	rev64		T1.16b, INP1.16b

	enc_round	KS1, v26

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS0, v27

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS1, v27

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS0, v28

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	KS1, v28

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS0, v29

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b

	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b

	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b

	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev	x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  u32 const rk[], int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  u32 const rk[], int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)
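
	/*
	 * Usage sketch (illustrative only, not the actual glue code): callers
	 * are expected to claim the NEON unit around any call into the
	 * routines above, and to pick the _p64 or _p8 GHASH variant depending
	 * on whether the CPU implements the 64x64->128 bit PMULL instruction,
	 * e.g.:
	 *
	 *	kernel_neon_begin();
	 *	if (have_64x64_pmull)		// hypothetical feature flag
	 *		pmull_ghash_update_p64(blocks, dg, src, key, head);
	 *	else
	 *		pmull_ghash_update_p8(blocks, dg, src, key, head);
	 *	kernel_neon_end();
	 */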