/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4

/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare	- setup NEON registers for encryption
 * - dec_prepare	- setup NEON registers for decryption
 * - enc_switch_key	- change to new key after having prepared for encryption
 * - encrypt_block	- encrypt a single block
 * - decrypt_block	- decrypt a single block
 * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */

#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP	ldp x29, x30, [sp],#16

#if INTERLEAVE == 2

aes_encrypt_block2x:
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block2x)

#elif INTERLEAVE == 4

aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block4x)

#else
#error INTERLEAVE should equal 2 or 4
#endif

	.macro		do_encrypt_block2x
	bl		aes_encrypt_block2x
	.endm

	.macro		do_decrypt_block2x
	bl		aes_decrypt_block2x
	.endm

	.macro		do_encrypt_block4x
	bl		aes_encrypt_block4x
	.endm

	.macro		do_decrypt_block4x
	bl		aes_decrypt_block4x
	.endm

#else
#define FRAME_PUSH
#define FRAME_POP

	.macro		do_encrypt_block2x
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block2x
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_encrypt_block4x
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block4x
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, int first)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, int first)
	 */

AES_ENTRY(aes_ecb_encrypt)
	FRAME_PUSH
	cbz		w5, .LecbencloopNx

	enc_prepare	w3, x2, x5

.LecbencloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lecbenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	do_encrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	do_encrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lecbencout
#endif
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_encrypt)
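
	/*
	 * For reference only: the per-block contract of aes_ecb_encrypt()
	 * above, written as a plain C sketch. This is not kernel code; u8 is
	 * assumed to be the usual kernel byte type and aes_encrypt_one() is a
	 * hypothetical single-block helper standing in for whatever the
	 * encrypt_block macro expands to. The assembly above additionally
	 * skips key setup unless 'first' is set and consumes INTERLEAVE
	 * blocks per iteration when it can.
	 *
	 *	// illustrative sketch of the ECB data flow, not the implementation
	 *	static void ecb_encrypt_sketch(u8 *out, const u8 *in,
	 *				       const u8 *rk, int rounds, int blocks)
	 *	{
	 *		while (blocks-- > 0) {
	 *			aes_encrypt_one(out, in, rk, rounds);	// one 16-byte block
	 *			in += 16;
	 *			out += 16;
	 *		}
	 *	}
	 */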


AES_ENTRY(aes_ecb_decrypt)
	FRAME_PUSH
	cbz		w5, .LecbdecloopNx

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lecbdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	do_decrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	do_decrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lecbdecout
#endif
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_decrypt)


	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[], int first)
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[], int first)
	 */

AES_ENTRY(aes_cbc_encrypt)
	cbz		w6, .Lcbcencloop

	ld1		{v0.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x5

.Lcbcencloop:
	ld1		{v1.16b}, [x1], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
	ret
AES_ENDPROC(aes_cbc_encrypt)


AES_ENTRY(aes_cbc_decrypt)
	FRAME_PUSH
	cbz		w6, .LcbcdecloopNx

	ld1		{v7.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x5

.LcbcdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lcbcdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	mov		v2.16b, v0.16b
	mov		v3.16b, v1.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v2.16b
	mov		v7.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	do_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lcbcdecout
#endif
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x5, w6
	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
	mov		v7.16b, v1.16b			/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_cbc_decrypt)
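
	/*
	 * CBC decryption above saves a copy of each ciphertext block before
	 * decrypting it in place, because that ciphertext is the chaining
	 * value ("iv") for the following block. Illustrative C sketch of the
	 * chaining only, assuming a hypothetical single-block helper
	 * aes_decrypt_one() that is not part of this file's macro interface:
	 *
	 *	// sketch of the CBC chaining, not the kernel implementation
	 *	static void cbc_decrypt_sketch(u8 *out, const u8 *in, const u8 *rk,
	 *				       int rounds, int blocks, u8 iv[16])
	 *	{
	 *		u8 ct[16];
	 *		int i;
	 *
	 *		while (blocks-- > 0) {
	 *			memcpy(ct, in, 16);		// ct is the next iv
	 *			aes_decrypt_one(out, in, rk, rounds);
	 *			for (i = 0; i < 16; i++)
	 *				out[i] ^= iv[i];	// xor with iv => pt
	 *			memcpy(iv, ct, 16);
	 *			in += 16;
	 *			out += 16;
	 *		}
	 *	}
	 */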


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[], int first)
	 */

AES_ENTRY(aes_ctr_encrypt)
	FRAME_PUSH
	cbnz		w6, .Lctrfirst		/* 1st time around? */
	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
	rev		x5, x5
#if INTERLEAVE >= 2
	cmn		w5, w4			/* 32 bit overflow? */
	bcs		.Lctrinc
	add		x5, x5, #1		/* increment BE ctr */
	b		.LctrincNx
#else
	b		.Lctrinc
#endif
.Lctrfirst:
	enc_prepare	w3, x2, x6
	ld1		{v4.16b}, [x5]
	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
	rev		x5, x5
#if INTERLEAVE >= 2
	cmn		w5, w4			/* 32 bit overflow? */
	bcs		.Lctrloop
.LctrloopNx:
	subs		w4, w4, #INTERLEAVE
	bmi		.Lctr1x
#if INTERLEAVE == 2
	mov		v0.8b, v4.8b
	mov		v1.8b, v4.8b
	rev		x7, x5
	add		x5, x5, #1
	ins		v0.d[1], x7
	rev		x7, x5
	add		x5, x5, #1
	ins		v1.d[1], x7
	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v2.16b
	eor		v1.16b, v1.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
	dup		v7.4s, w5
	mov		v0.16b, v4.16b
	add		v7.4s, v7.4s, v8.4s
	mov		v1.16b, v4.16b
	rev32		v8.16b, v7.16b
	mov		v2.16b, v4.16b
	mov		v3.16b, v4.16b
	mov		v1.s[3], v8.s[0]
	mov		v2.s[3], v8.s[1]
	mov		v3.s[3], v8.s[2]
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
	do_encrypt_block4x
	eor		v0.16b, v5.16b, v0.16b
	ld1		{v5.16b}, [x1], #16		/* get 1 input block */
	eor		v1.16b, v6.16b, v1.16b
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	add		x5, x5, #INTERLEAVE
#endif
	cbz		w4, .LctroutNx
.LctrincNx:
	rev		x7, x5
	ins		v4.d[1], x7
	b		.LctrloopNx
.LctroutNx:
	sub		x5, x5, #1
	rev		x7, x5
	ins		v4.d[1], x7
	b		.Lctrout
.Lctr1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lctrout
#endif
.Lctrloop:
	mov		v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7
	subs		w4, w4, #1
	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	beq		.Lctrout
.Lctrinc:
	adds		x5, x5, #1		/* increment BE ctr */
	rev		x7, x5
	ins		v4.d[1], x7
	bcc		.Lctrloop		/* no overflow? */
	umov		x7, v4.d[0]		/* load upper word of ctr */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		v4.d[0], x7
	b		.Lctrloop
.Lctrhalfblock:
	ld1		{v3.8b}, [x1]
	eor		v3.8b, v0.8b, v3.8b
	st1		{v3.8b}, [x0]
.Lctrout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ctr_encrypt)
	.ltorg
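
	/*
	 * aes_ctr_encrypt() above keeps the low 64 bits of the big-endian
	 * counter byte-reversed in x5 so it can be bumped with a plain add;
	 * only when that 64-bit addition carries (.Lctrinc) does it touch the
	 * upper 64 bits held in v4.d[0]. Illustrative C sketch of the same
	 * 128-bit big-endian increment, using the kernel's be64 helpers for
	 * the byte swaps that the rev instructions perform:
	 *
	 *	// sketch of the counter increment with carry into the upper half
	 *	static void ctr_increment_sketch(__be64 ctr[2])
	 *	{
	 *		u64 lo = be64_to_cpu(ctr[1]) + 1;
	 *
	 *		ctr[1] = cpu_to_be64(lo);
	 *		if (lo == 0)				// carry out of the low half
	 *			ctr[0] = cpu_to_be64(be64_to_cpu(ctr[0]) + 1);
	 *	}
	 */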


	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

.Lxts_mul_x:
	.word		1, 0, 0x87, 0

AES_ENTRY(aes_xts_encrypt)
	FRAME_PUSH
	cbz		w7, .LxtsencloopNx

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	enc_switch_key	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.LxtsencNx

.LxtsencloopNx:
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
.LxtsencNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsencoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsencNx
.LxtsencoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsencout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8
	eor		v3.16b, v3.16b, v7.16b
	do_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencout
	b		.LxtsencloopNx
#endif
.Lxtsenc1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lxtsencout
#endif
.Lxtsencloop:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsencout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsencloop
.Lxtsencout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_encrypt)
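
	/*
	 * The next_tweak macro used above multiplies the tweak by x in
	 * GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1
	 * (hence the 0x87 in .Lxts_mul_x): the 128-bit tweak is shifted left
	 * by one bit and, if a bit drops off the top, 0x87 is xored into the
	 * low byte. The sshr/and/ext/eor sequence performs that shift and the
	 * cross-half carry in one pass. Illustrative C sketch, viewing the
	 * tweak as two little-endian 64-bit words exactly as the NEON code
	 * does (t[0] = low half, t[1] = high half):
	 *
	 *	// sketch of the GF(2^128) doubling done by next_tweak
	 *	static void next_tweak_sketch(u64 t[2])
	 *	{
	 *		u64 fb = (t[1] & (1ULL << 63)) ? 0x87 : 0;	// reduction term
	 *
	 *		t[1] = (t[1] << 1) | (t[0] >> 63);	// carry low -> high
	 *		t[0] = (t[0] << 1) ^ fb;
	 *	}
	 */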


AES_ENTRY(aes_xts_decrypt)
	FRAME_PUSH
	cbz		w7, .LxtsdecloopNx

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	dec_prepare	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.LxtsdecNx

.LxtsdecloopNx:
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
.LxtsdecNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsdecoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsdecNx
.LxtsdecoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsdecout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8
	eor		v3.16b, v3.16b, v7.16b
	do_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	b		.LxtsdecloopNx
#endif
.Lxtsdec1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lxtsdecout
#endif
.Lxtsdecloop:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsdecout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_decrypt)
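
	/*
	 * Both XTS routines apply the xor-cipher-xor pattern visible in the
	 * scalar loops above: the block is xored with the current tweak,
	 * passed through the block cipher keyed with rk1, xored with the same
	 * tweak again, and the tweak is then doubled for the next block.
	 * Illustrative C sketch of the decrypt path, assuming hypothetical
	 * helpers aes_decrypt_one() and next_tweak_sketch() (the latter as
	 * sketched after aes_xts_encrypt above):
	 *
	 *	// sketch of the per-block XTS transform, not the kernel code
	 *	static void xts_decrypt_sketch(u8 *out, const u8 *in, const u8 *rk1,
	 *				       int rounds, int blocks, u64 tweak[2])
	 *	{
	 *		u8 *t = (u8 *)tweak;
	 *		int i;
	 *
	 *		while (blocks-- > 0) {
	 *			for (i = 0; i < 16; i++)
	 *				out[i] = in[i] ^ t[i];		// xor with tweak
	 *			aes_decrypt_one(out, out, rk1, rounds);
	 *			for (i = 0; i < 16; i++)
	 *				out[i] ^= t[i];			// xor again => pt
	 *			next_tweak_sketch(tweak);
	 *			in += 16;
	 *			out += 16;
	 *		}
	 *	}
	 */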