/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align	4

/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare - setup NEON registers for encryption
 * - dec_prepare - setup NEON registers for decryption
 * - enc_switch_key - change to new key after having prepared for encryption
 * - encrypt_block - encrypt a single block
 * - decrypt_block - decrypt a single block
 * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */

#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP	ldp x29, x30, [sp],#16

#if INTERLEAVE == 2

aes_encrypt_block2x:
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block2x)

#elif INTERLEAVE == 4

aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block4x)

#else
#error INTERLEAVE should equal 2 or 4
#endif

	.macro		do_encrypt_block2x
	bl		aes_encrypt_block2x
	.endm

	.macro		do_decrypt_block2x
	bl		aes_decrypt_block2x
	.endm

	.macro		do_encrypt_block4x
	bl		aes_encrypt_block4x
	.endm

	.macro		do_decrypt_block4x
	bl		aes_decrypt_block4x
	.endm

#else
#define FRAME_PUSH
#define FRAME_POP

	.macro		do_encrypt_block2x
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block2x
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_encrypt_block4x
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block4x
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

#endif
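
	/*
	 * Note: per the AAPCS64 calling convention, each function below
	 * receives its arguments in x0, x1, x2, ... in the order they are
	 * listed in its prototype comment (e.g. x0 = out, x1 = in,
	 * x2 = rk, w3 = rounds, w4 = blocks for the ECB routines).
	 */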
	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, int first)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, int first)
	 */

AES_ENTRY(aes_ecb_encrypt)
	FRAME_PUSH
	cbz		w5, .LecbencloopNx

	enc_prepare	w3, x2, x5

.LecbencloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lecbenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	do_encrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	do_encrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lecbencout
#endif
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_encrypt)


AES_ENTRY(aes_ecb_decrypt)
	FRAME_PUSH
	cbz		w5, .LecbdecloopNx

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lecbdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	do_decrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	do_decrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lecbdecout
#endif
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_decrypt)


	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[], int first)
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[], int first)
	 */

AES_ENTRY(aes_cbc_encrypt)
	cbz		w6, .Lcbcencloop

	ld1		{v0.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop:
	ld1		{v1.16b}, [x1], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
	st1		{v0.16b}, [x5]			/* return iv */
	ret
AES_ENDPROC(aes_cbc_encrypt)


AES_ENTRY(aes_cbc_decrypt)
	FRAME_PUSH
	cbz		w6, .LcbcdecloopNx

	ld1		{v7.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lcbcdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	mov		v2.16b, v0.16b
	mov		v3.16b, v1.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v2.16b
	mov		v7.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	do_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lcbcdecout
#endif
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
	mov		v7.16b, v1.16b			/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	FRAME_POP
	st1		{v7.16b}, [x5]			/* return iv */
	ret
AES_ENDPROC(aes_cbc_decrypt)


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[], int first)
	 */
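	/*
	 * The counter is kept in v4 in big-endian (memory) order; its low
	 * 64 bits are also kept byte swapped in x8 so they can be
	 * incremented with ordinary arithmetic.  The interleaved path does
	 * not handle a carry out of those low 64 bits, so a conservative
	 * 32-bit overflow check (cmn w8, w4) sends such calls to the
	 * single block loop, which fixes up the carry in .Lctrcarry.
	 */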
AES_ENTRY(aes_ctr_encrypt)
	FRAME_PUSH
	cbz		w6, .Lctrnotfirst	/* 1st time around? */
	enc_prepare	w3, x2, x6
	ld1		{v4.16b}, [x5]

.Lctrnotfirst:
	umov		x8, v4.d[1]		/* keep swabbed ctr in reg */
	rev		x8, x8
#if INTERLEAVE >= 2
	cmn		w8, w4			/* 32 bit overflow? */
	bcs		.Lctrloop
.LctrloopNx:
	subs		w4, w4, #INTERLEAVE
	bmi		.Lctr1x
#if INTERLEAVE == 2
	mov		v0.8b, v4.8b
	mov		v1.8b, v4.8b
	rev		x7, x8
	add		x8, x8, #1
	ins		v0.d[1], x7
	rev		x7, x8
	add		x8, x8, #1
	ins		v1.d[1], x7
	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v2.16b
	eor		v1.16b, v1.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
	dup		v7.4s, w8
	mov		v0.16b, v4.16b
	add		v7.4s, v7.4s, v8.4s
	mov		v1.16b, v4.16b
	rev32		v8.16b, v7.16b
	mov		v2.16b, v4.16b
	mov		v3.16b, v4.16b
	mov		v1.s[3], v8.s[0]
	mov		v2.s[3], v8.s[1]
	mov		v3.s[3], v8.s[2]
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
	do_encrypt_block4x
	eor		v0.16b, v5.16b, v0.16b
	ld1		{v5.16b}, [x1], #16		/* get 1 input block */
	eor		v1.16b, v6.16b, v1.16b
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	add		x8, x8, #INTERLEAVE
#endif
	rev		x7, x8
	ins		v4.d[1], x7
	cbz		w4, .Lctrout
	b		.LctrloopNx
.Lctr1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lctrout
#endif
.Lctrloop:
	mov		v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	adds		x8, x8, #1		/* increment BE ctr */
	rev		x7, x8
	ins		v4.d[1], x7
	bcs		.Lctrcarry		/* overflow? */

.Lctrcarrydone:
	subs		w4, w4, #1
	bmi		.Lctrtailblock		/* blocks <0 means tail block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	bne		.Lctrloop

.Lctrout:
	st1		{v4.16b}, [x5]		/* return next CTR value */
	FRAME_POP
	ret

.Lctrtailblock:
	st1		{v0.16b}, [x0]
	FRAME_POP
	ret

.Lctrcarry:
	umov		x7, v4.d[0]		/* load upper word of ctr */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		v4.d[0], x7
	b		.Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
	.ltorg


	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm
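	/*
	 * next_tweak advances the XTS tweak, i.e. multiplies it by x in
	 * GF(2^128) modulo x^128 + x^7 + x^2 + x + 1: both 64-bit lanes are
	 * doubled, the bit shifted out of the low lane is carried into the
	 * high lane, and the bit shifted out of the high lane is reduced
	 * back into the low byte as 0x87 (the .Lxts_mul_x constant below).
	 */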
.Lxts_mul_x:
CPU_LE(	.quad		1, 0x87		)
CPU_BE(	.quad		0x87, 1		)

AES_ENTRY(aes_xts_encrypt)
	FRAME_PUSH
	cbz		w7, .LxtsencloopNx

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	enc_switch_key	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.LxtsencNx

.LxtsencloopNx:
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
.LxtsencNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsencoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsencNx
.LxtsencoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsencout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8
	eor		v3.16b, v3.16b, v7.16b
	do_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencout
	b		.LxtsencloopNx
#endif
.Lxtsenc1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lxtsencout
#endif
.Lxtsencloop:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsencout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsencloop
.Lxtsencout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_encrypt)


AES_ENTRY(aes_xts_decrypt)
	FRAME_PUSH
	cbz		w7, .LxtsdecloopNx

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	dec_prepare	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.LxtsdecNx

.LxtsdecloopNx:
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
.LxtsdecNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsdecoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsdecNx
.LxtsdecoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsdecout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8
	eor		v3.16b, v3.16b, v7.16b
	do_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	b		.LxtsdecloopNx
#endif
.Lxtsdec1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lxtsdecout
#endif
.Lxtsdecloop:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsdecout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_decrypt)

	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
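	/*
	 * This is a CBC-MAC style update: each input block is xor'ed into
	 * the digest in v0, which is then encrypted.  enc_before forces one
	 * encryption of the digest before any input is consumed, and
	 * enc_after decides whether the digest is encrypted after the final
	 * block; the csinv below yields a nonzero value for all but the
	 * last block, so intermediate blocks are always encrypted.
	 */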
AES_ENTRY(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbnz		w5, .Lmacenc

.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	ret
AES_ENDPROC(aes_mac_update)