/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p64
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

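	//
	// Fallback setup for CPUs that lack the 64x64->128 bit PMULL
	// instruction: precompute the mask constants k00_16/k32_48 and
	// byte-rotated copies of SHASH and SHASH2 (sh1-sh4, ss1-ss4).
	// These are loop invariants of the 8x8 bit PMULL based
	// multiplication implemented by __pmull_p8_tail above.
	//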
	.macro		__pmull_pre_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	b		1f

0:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

1:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS		.req	v8
	CTR		.req	v9
	INP		.req	v10

	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

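	//
	// Combined GCM encryption/decryption, one block per iteration.
	// The AES rounds that turn the counter block into key stream are
	// interleaved with the GHASH multiplication of the ciphertext
	// block, so that the two largely independent instruction streams
	// can execute in parallel.
	//
	// Per AAPCS64, the arguments documented in the prototypes below
	// arrive as w0 = blocks, x1 = dg, x2 = dst, x3 = src,
	// x4 = ghash key, x5 = ctr, w6 = rounds and x7 = ks.
	//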
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE(	rev	x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	.if		\enc == 1
	ld1		{KS.16b}, [x7]
	.endif

0:	ld1		{CTR.8b}, [x5]			// load upper counter
	ld1		{INP.16b}, [x3], #16
	rev		x9, x8
	add		x8, x8, #1
	sub		w0, w0, #1
	ins		CTR.d[1], x9			// set lower counter

	.if		\enc == 1
	eor		INP.16b, INP.16b, KS.16b	// encrypt input
	st1		{INP.16b}, [x2], #16
	.endif

	rev64		T1.16b, INP.16b

	cmp		w6, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	CTR, v21

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	CTR, v22

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	CTR, v23

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	CTR, v24

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	CTR, v25

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	CTR, v26

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	CTR, v27

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	enc_round	CTR, v28

	eor		XL.16b, XM.16b, T2.16b

	enc_round	CTR, v29

	ext		T2.16b, XL.16b, XL.16b, #8

	aese		CTR.16b, v30.16b

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS.16b, CTR.16b, v31.16b

	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP.16b, INP.16b, KS.16b
	st1		{INP.16b}, [x2], #16
	.endif

	cbnz		w0, 0b

CPU_LE(	rev	x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS.16b}, [x7]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	CTR, v17
	enc_round	CTR, v18
3:	enc_round	CTR, v19
	enc_round	CTR, v20
	b		1b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)