/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

        .text
        .align  4

#ifndef MAX_STRIDE
#define MAX_STRIDE      4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
        encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
        decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
        encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
        decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         */

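        /*
         * Rough C-level equivalent of the ECB routines below (a sketch only;
         * aes_encrypt()/aes_decrypt() are hypothetical stand-ins for a single
         * AES block operation with the expanded key rk):
         *
         *      for (int i = 0; i < blocks; i++)
         *              aes_encrypt(out + 16 * i, in + 16 * i, rk, rounds);
         *
         * The code below unrolls this loop MAX_STRIDE (4 or 5) blocks at a
         * time and handles any remainder one block per iteration.
         */
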
AES_FUNC_START(aes_ecb_encrypt)
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        enc_prepare     w3, x2, x5

.LecbencloopNx:
        subs    w4, w4, #MAX_STRIDE
        bmi     .Lecbenc1x
        ld1     {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
ST4(    bl      aes_encrypt_block4x             )
ST5(    ld1     {v4.16b}, [x1], #16             )
ST5(    bl      aes_encrypt_block5x             )
        st1     {v0.16b-v3.16b}, [x0], #64
ST5(    st1     {v4.16b}, [x0], #16             )
        b       .LecbencloopNx
.Lecbenc1x:
        adds    w4, w4, #MAX_STRIDE
        beq     .Lecbencout
.Lecbencloop:
        ld1     {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        st1     {v0.16b}, [x0], #16
        subs    w4, w4, #1
        bne     .Lecbencloop
.Lecbencout:
        ldp     x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        dec_prepare     w3, x2, x5

.LecbdecloopNx:
        subs    w4, w4, #MAX_STRIDE
        bmi     .Lecbdec1x
        ld1     {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
ST4(    bl      aes_decrypt_block4x             )
ST5(    ld1     {v4.16b}, [x1], #16             )
ST5(    bl      aes_decrypt_block5x             )
        st1     {v0.16b-v3.16b}, [x0], #64
ST5(    st1     {v4.16b}, [x0], #16             )
        b       .LecbdecloopNx
.Lecbdec1x:
        adds    w4, w4, #MAX_STRIDE
        beq     .Lecbdecout
.Lecbdecloop:
        ld1     {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        st1     {v0.16b}, [x0], #16
        subs    w4, w4, #1
        bne     .Lecbdecloop
.Lecbdecout:
        ldp     x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_ecb_decrypt)


        /*
         * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         */

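        /*
         * Rough C-level equivalent of CBC encryption as implemented below (a
         * sketch only; aes_encrypt() and xor_block() are hypothetical
         * single-block helpers, not functions defined in this file):
         *
         *      for (int i = 0; i < blocks; i++) {
         *              xor_block(tmp, in + 16 * i, iv);   // tmp = pt ^ iv
         *              aes_encrypt(iv, tmp, rk, rounds);  // iv = new ct
         *              memcpy(out + 16 * i, iv, 16);
         *      }
         *      // the final ciphertext block is written back through iv[]
         *
         * The ESSIV variants differ only in that the caller-provided IV is
         * first encrypted with the second key schedule rk2 (expanded for
         * AES-256, hence the fixed 14 rounds) before the CBC chain starts.
         */
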
AES_FUNC_START(aes_essiv_cbc_encrypt)
        ld1     {v4.16b}, [x5]                  /* get iv */

        mov     w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   v4, w8, x6, x7, w9
        enc_switch_key  w3, x2, x6
        b       .Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
        ld1     {v4.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

.Lcbcencloop4x:
        subs    w4, w4, #4
        bmi     .Lcbcenc1x
        ld1     {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        eor     v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
        eor     v1.16b, v1.16b, v0.16b
        encrypt_block   v1, w3, x2, x6, w7
        eor     v2.16b, v2.16b, v1.16b
        encrypt_block   v2, w3, x2, x6, w7
        eor     v3.16b, v3.16b, v2.16b
        encrypt_block   v3, w3, x2, x6, w7
        st1     {v0.16b-v3.16b}, [x0], #64
        mov     v4.16b, v3.16b
        b       .Lcbcencloop4x
.Lcbcenc1x:
        adds    w4, w4, #4
        beq     .Lcbcencout
.Lcbcencloop:
        ld1     {v0.16b}, [x1], #16             /* get next pt block */
        eor     v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
        encrypt_block   v4, w3, x2, x6, w7
        st1     {v4.16b}, [x0], #16
        subs    w4, w4, #1
        bne     .Lcbcencloop
.Lcbcencout:
        st1     {v4.16b}, [x5]                  /* return iv */
        ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        ld1     {cbciv.16b}, [x5]               /* get iv */

        mov     w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   cbciv, w8, x6, x7, w9
        b       .Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        ld1     {cbciv.16b}, [x5]               /* get iv */
.Lessivcbcdecstart:
        dec_prepare     w3, x2, x6

.LcbcdecloopNx:
        subs    w4, w4, #MAX_STRIDE
        bmi     .Lcbcdec1x
        ld1     {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
#if MAX_STRIDE == 5
        ld1     {v4.16b}, [x1], #16             /* get 1 ct block */
        mov     v5.16b, v0.16b
        mov     v6.16b, v1.16b
        mov     v7.16b, v2.16b
        bl      aes_decrypt_block5x
        sub     x1, x1, #32
        eor     v0.16b, v0.16b, cbciv.16b
        eor     v1.16b, v1.16b, v5.16b
        ld1     {v5.16b}, [x1], #16             /* reload 1 ct block */
        ld1     {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor     v2.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v7.16b
        eor     v4.16b, v4.16b, v5.16b
#else
        mov     v4.16b, v0.16b
        mov     v5.16b, v1.16b
        mov     v6.16b, v2.16b
        bl      aes_decrypt_block4x
        sub     x1, x1, #16
        eor     v0.16b, v0.16b, cbciv.16b
        eor     v1.16b, v1.16b, v4.16b
        ld1     {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor     v2.16b, v2.16b, v5.16b
        eor     v3.16b, v3.16b, v6.16b
#endif
        st1     {v0.16b-v3.16b}, [x0], #64
ST5(    st1     {v4.16b}, [x0], #16             )
        b       .LcbcdecloopNx
.Lcbcdec1x:
        adds    w4, w4, #MAX_STRIDE
        beq     .Lcbcdecout
.Lcbcdecloop:
        ld1     {v1.16b}, [x1], #16             /* get next ct block */
        mov     v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
        eor     v0.16b, v0.16b, cbciv.16b       /* xor with iv => pt */
        mov     cbciv.16b, v1.16b               /* ct is next iv */
        st1     {v0.16b}, [x0], #16
        subs    w4, w4, #1
        bne     .Lcbcdecloop
.Lcbcdecout:
        st1     {cbciv.16b}, [x5]               /* return iv */
        ldp     x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


        /*
         * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         */

AES_FUNC_START(aes_cbc_cts_encrypt)
        adr_l   x8, .Lcts_permute_table
        sub     x4, x4, #16
        add     x9, x8, #32
        add     x8, x8, x4
        sub     x9, x9, x4
        ld1     {v3.16b}, [x8]
        ld1     {v4.16b}, [x9]

        ld1     {v0.16b}, [x1], x4              /* overlapping loads */
        ld1     {v1.16b}, [x1]

        ld1     {v5.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

        eor     v0.16b, v0.16b, v5.16b          /* xor with iv */
        tbl     v1.16b, {v1.16b}, v4.16b
        encrypt_block   v0, w3, x2, x6, w7

        eor     v1.16b, v1.16b, v0.16b
        tbl     v0.16b, {v0.16b}, v3.16b
        encrypt_block   v1, w3, x2, x6, w7

        add     x4, x0, x4
        st1     {v0.16b}, [x4]                  /* overlapping stores */
        st1     {v1.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
        adr_l   x8, .Lcts_permute_table
        sub     x4, x4, #16
        add     x9, x8, #32
        add     x8, x8, x4
        sub     x9, x9, x4
        ld1     {v3.16b}, [x8]
        ld1     {v4.16b}, [x9]

        ld1     {v0.16b}, [x1], x4              /* overlapping loads */
        ld1     {v1.16b}, [x1]

        ld1     {v5.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6

        decrypt_block   v0, w3, x2, x6, w7
        tbl     v2.16b, {v0.16b}, v3.16b
        eor     v2.16b, v2.16b, v1.16b

        tbx     v0.16b, {v1.16b}, v4.16b
        decrypt_block   v0, w3, x2, x6, w7
        eor     v0.16b, v0.16b, v5.16b          /* xor with iv */

        add     x4, x0, x4
        st1     {v2.16b}, [x4]                  /* overlapping stores */
        st1     {v0.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_decrypt)

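        /*
         * The 48-byte permute table below is shared by the CBC-CTS routines
         * above and by the CTR/XCTR and XTS tail handling further down: 16
         * bytes of 0xff, the byte indexes 0x0-0xf, then another 16 bytes of
         * 0xff.  Loading a 16-byte window at an offset derived from the
         * length of the final partial block yields a tbl/tbx mask that shifts
         * that partial block into position: 0xff is an out-of-range index,
         * which tbl turns into a zero byte and tbx leaves unchanged in the
         * destination.  Combined with the overlapping 16-byte loads and
         * stores, this avoids any byte-wise copying of the tail.
         */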
        .section        ".rodata", "a"
        .align  6
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .previous

        /*
         * This macro generates the code for CTR and XCTR mode.
         */
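        /*
         * At a high level (a sketch only; E() stands in for one AES block
         * encryption with the expanded key, and the helper notation is not
         * code from this file):
         *
         *   CTR:  keystream[i] = E(IV + i), where the last 8 bytes of the IV
         *         are treated as a big-endian counter; the rare carry into
         *         the first 8 bytes is handled in an out-of-line subsection.
         *   XCTR: keystream[i] = E(IV ^ le64(byte_ctr / 16 + i + 1)), i.e. a
         *         little-endian 64-bit block counter, effectively starting
         *         at 1, XORed into the first 8 bytes of the IV.
         *
         *   out[i] = in[i] ^ keystream[i]
         *
         * Up to MAX_STRIDE keystream blocks are computed per iteration; the
         * final partial stride and any partial block are handled with
         * overlapping loads/stores driven by .Lcts_permute_table.
         */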
.macro ctr_encrypt xctr
        // Arguments
        OUT             .req x0
        IN              .req x1
        KEY             .req x2
        ROUNDS_W        .req w3
        BYTES_W         .req w4
        IV              .req x5
        BYTE_CTR_W      .req w6         // XCTR only
        // Intermediate values
        CTR_W           .req w11        // XCTR only
        CTR             .req x11        // XCTR only
        IV_PART         .req x12
        BLOCKS          .req x13
        BLOCKS_W        .req w13

        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        enc_prepare     ROUNDS_W, KEY, IV_PART
        ld1     {vctr.16b}, [IV]

        /*
         * Keep 64 bits of the IV in a register.  For CTR mode this lets us
         * easily increment the IV.  For XCTR mode this lets us efficiently
         * XOR the 64-bit counter with the IV.
         */
        .if \xctr
                umov    IV_PART, vctr.d[0]
                lsr     CTR_W, BYTE_CTR_W, #4
        .else
                umov    IV_PART, vctr.d[1]
                rev     IV_PART, IV_PART
        .endif

.LctrloopNx\xctr:
        add     BLOCKS_W, BYTES_W, #15
        sub     BYTES_W, BYTES_W, #MAX_STRIDE << 4
        lsr     BLOCKS_W, BLOCKS_W, #4
        mov     w8, #MAX_STRIDE
        cmp     BLOCKS_W, w8
        csel    BLOCKS_W, BLOCKS_W, w8, lt

        /*
         * Set up the counter values in v0-v{MAX_STRIDE-1}.
         *
         * If we are encrypting less than MAX_STRIDE blocks, the tail block
         * handling code expects the last keystream block to be in
         * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
         * MAX_STRIDE=5, then v3 and v4 should have the next two counter
         * blocks.
         */
        .if \xctr
                add     CTR, CTR, BLOCKS
        .else
                adds    IV_PART, IV_PART, BLOCKS
        .endif
        mov     v0.16b, vctr.16b
        mov     v1.16b, vctr.16b
        mov     v2.16b, vctr.16b
        mov     v3.16b, vctr.16b
ST5(    mov     v4.16b, vctr.16b                )
        .if \xctr
                sub     x6, CTR, #MAX_STRIDE - 1
                sub     x7, CTR, #MAX_STRIDE - 2
                sub     x8, CTR, #MAX_STRIDE - 3
                sub     x9, CTR, #MAX_STRIDE - 4
ST5(            sub     x10, CTR, #MAX_STRIDE - 5       )
                eor     x6, x6, IV_PART
                eor     x7, x7, IV_PART
                eor     x8, x8, IV_PART
                eor     x9, x9, IV_PART
ST5(            eor     x10, x10, IV_PART       )
                mov     v0.d[0], x6
                mov     v1.d[0], x7
                mov     v2.d[0], x8
                mov     v3.d[0], x9
ST5(            mov     v4.d[0], x10    )
        .else
                bcs     0f
                .subsection     1
                /*
                 * This subsection handles carries.
                 *
                 * Conditional branching here is allowed with respect to time
                 * invariance since the branches are dependent on the IV
                 * instead of the plaintext or key.  This code is rarely
                 * executed in practice anyway.
                 */

                /* Apply carry to outgoing counter. */
0:              umov    x8, vctr.d[0]
                rev     x8, x8
                add     x8, x8, #1
                rev     x8, x8
                ins     vctr.d[0], x8

                /*
                 * Apply carry to counter blocks if needed.
                 *
                 * Since the carry flag was set, we know 0 <= IV_PART <
                 * MAX_STRIDE.  Using the value of IV_PART we can determine
                 * how many counter blocks need to be updated.
                 */
                cbz     IV_PART, 2f
                adr     x16, 1f
                sub     x16, x16, IV_PART, lsl #3
                br      x16
                bti     c
                mov     v0.d[0], vctr.d[0]
                bti     c
                mov     v1.d[0], vctr.d[0]
                bti     c
                mov     v2.d[0], vctr.d[0]
                bti     c
                mov     v3.d[0], vctr.d[0]
ST5(            bti     c                       )
ST5(            mov     v4.d[0], vctr.d[0]      )
1:              b       2f
                .previous

2:              rev     x7, IV_PART
                ins     vctr.d[1], x7
                sub     x7, IV_PART, #MAX_STRIDE - 1
                sub     x8, IV_PART, #MAX_STRIDE - 2
                sub     x9, IV_PART, #MAX_STRIDE - 3
                rev     x7, x7
                rev     x8, x8
                mov     v1.d[1], x7
                rev     x9, x9
ST5(            sub     x10, IV_PART, #MAX_STRIDE - 4   )
                mov     v2.d[1], x8
ST5(            rev     x10, x10        )
                mov     v3.d[1], x9
ST5(            mov     v4.d[1], x10    )
        .endif

        /*
         * If there are at least MAX_STRIDE blocks left, XOR the data with
         * keystream and store.  Otherwise jump to tail handling.
         */
        tbnz    BYTES_W, #31, .Lctrtail\xctr
        ld1     {v5.16b-v7.16b}, [IN], #48
ST4(    bl      aes_encrypt_block4x             )
ST5(    bl      aes_encrypt_block5x             )
        eor     v0.16b, v5.16b, v0.16b
ST4(    ld1     {v5.16b}, [IN], #16             )
        eor     v1.16b, v6.16b, v1.16b
ST5(    ld1     {v5.16b-v6.16b}, [IN], #32      )
        eor     v2.16b, v7.16b, v2.16b
        eor     v3.16b, v5.16b, v3.16b
ST5(    eor     v4.16b, v6.16b, v4.16b          )
        st1     {v0.16b-v3.16b}, [OUT], #64
ST5(    st1     {v4.16b}, [OUT], #16            )
        cbz     BYTES_W, .Lctrout\xctr
        b       .LctrloopNx\xctr

.Lctrout\xctr:
        .if !\xctr
                st1     {vctr.16b}, [IV]        /* return next CTR value */
        .endif
        ldp     x29, x30, [sp], #16
        ret

.Lctrtail\xctr:
        /*
         * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
         *
         * This code expects the last keystream block to be in
         * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
         * MAX_STRIDE=5, then v3 and v4 should have the next two counter
         * blocks.
         *
         * This allows us to store the ciphertext by writing to overlapping
         * regions of memory.  Any invalid ciphertext blocks get overwritten
         * by correctly computed blocks.  This approach greatly simplifies the
         * logic for storing the ciphertext.
         */
        mov     x16, #16
        ands    w7, BYTES_W, #0xf
        csel    x13, x7, x16, ne

ST5(    cmp     BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(    csel    x14, x16, xzr, gt               )
        cmp     BYTES_W, #48 - (MAX_STRIDE << 4)
        csel    x15, x16, xzr, gt
        cmp     BYTES_W, #32 - (MAX_STRIDE << 4)
        csel    x16, x16, xzr, gt
        cmp     BYTES_W, #16 - (MAX_STRIDE << 4)

        adr_l   x9, .Lcts_permute_table
        add     x9, x9, x13
        ble     .Lctrtail1x\xctr

ST5(    ld1     {v5.16b}, [IN], x14             )
        ld1     {v6.16b}, [IN], x15
        ld1     {v7.16b}, [IN], x16

ST4(    bl      aes_encrypt_block4x             )
ST5(    bl      aes_encrypt_block5x             )

        ld1     {v8.16b}, [IN], x13
        ld1     {v9.16b}, [IN]
        ld1     {v10.16b}, [x9]

ST4(    eor     v6.16b, v6.16b, v0.16b          )
ST4(    eor     v7.16b, v7.16b, v1.16b          )
ST4(    tbl     v3.16b, {v3.16b}, v10.16b       )
ST4(    eor     v8.16b, v8.16b, v2.16b          )
ST4(    eor     v9.16b, v9.16b, v3.16b          )

ST5(    eor     v5.16b, v5.16b, v0.16b          )
ST5(    eor     v6.16b, v6.16b, v1.16b          )
ST5(    tbl     v4.16b, {v4.16b}, v10.16b       )
ST5(    eor     v7.16b, v7.16b, v2.16b          )
ST5(    eor     v8.16b, v8.16b, v3.16b          )
ST5(    eor     v9.16b, v9.16b, v4.16b          )

ST5(    st1     {v5.16b}, [OUT], x14            )
        st1     {v6.16b}, [OUT], x15
        st1     {v7.16b}, [OUT], x16
        add     x13, x13, OUT
        st1     {v9.16b}, [x13]                 // overlapping stores
        st1     {v8.16b}, [OUT]
        b       .Lctrout\xctr

.Lctrtail1x\xctr:
        /*
         * Handle <= 16 bytes of plaintext
         *
         * This code always reads and writes 16 bytes.  To avoid out of
         * bounds accesses, XCTR and CTR modes must use a temporary buffer
         * when encrypting/decrypting less than 16 bytes.
         *
         * This code is unusual in that it loads the input and stores the
         * output relative to the end of the buffers rather than relative to
         * the start.  This causes unusual behaviour when encrypting/decrypting
         * less than 16 bytes; the end of the data is expected to be at the
         * end of the temporary buffer rather than the start of the data being
         * at the start of the temporary buffer.
         */
        sub     x8, x7, #16
        csel    x7, x7, x8, eq
        add     IN, IN, x7
        add     OUT, OUT, x7
        ld1     {v5.16b}, [IN]
        ld1     {v6.16b}, [OUT]
ST5(    mov     v3.16b, v4.16b                  )
        encrypt_block   v3, ROUNDS_W, KEY, x8, w7
        ld1     {v10.16b-v11.16b}, [x9]
        tbl     v3.16b, {v3.16b}, v10.16b
        sshr    v11.16b, v11.16b, #7
        eor     v5.16b, v5.16b, v3.16b
        bif     v5.16b, v6.16b, v11.16b
        st1     {v5.16b}, [OUT]
        b       .Lctrout\xctr

        // Arguments
        .unreq OUT
        .unreq IN
        .unreq KEY
        .unreq ROUNDS_W
        .unreq BYTES_W
        .unreq IV
        .unreq BYTE_CTR_W       // XCTR only
        // Intermediate values
        .unreq CTR_W            // XCTR only
        .unreq CTR              // XCTR only
        .unreq IV_PART
        .unreq BLOCKS
        .unreq BLOCKS_W
.endm

        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int bytes, u8 ctr[])
         *
         * The input and output buffers must always be at least 16 bytes even
         * if encrypting/decrypting less than 16 bytes.  Otherwise out of
         * bounds accesses will occur.  The data to be encrypted/decrypted is
         * expected to be at the end of this 16-byte temporary buffer rather
         * than the start.
         */

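        /*
         * For example (a sketch of the expected calling convention, not code
         * in this file): to encrypt 5 bytes, the caller places them in the
         * last 5 bytes of a 16-byte buffer, i.e. buf[11..15], and passes a
         * pointer to those 5 bytes; the tail code above then steps the
         * pointer back by 11 bytes and performs a single 16-byte load and
         * store that stay entirely within the buffer.
         */
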
AES_FUNC_START(aes_ctr_encrypt)
        ctr_encrypt 0
AES_FUNC_END(aes_ctr_encrypt)

        /*
         * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                  int bytes, u8 const iv[], int byte_ctr)
         *
         * The input and output buffers must always be at least 16 bytes even
         * if encrypting/decrypting less than 16 bytes.  Otherwise out of
         * bounds accesses will occur.  The data to be encrypted/decrypted is
         * expected to be at the end of this 16-byte temporary buffer rather
         * than the start.
         */

AES_FUNC_START(aes_xctr_encrypt)
        ctr_encrypt 1
AES_FUNC_END(aes_xctr_encrypt)


        /*
         * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         */

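        /*
         * The next_tweak macro below multiplies the current tweak by x in
         * GF(2^128) with the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1
         * (0x87).  Treating the 16-byte tweak as a 128-bit little-endian
         * value t, this is roughly (a sketch, not code from this file):
         *
         *      carry = t >> 127;
         *      t = (t << 1) ^ (carry ? 0x87 : 0);
         *
         * xts_load_mask sets up xtsmask as { 0x1, 0x87 } in its two 64-bit
         * lanes, so a single sshr/and/ext/eor sequence both propagates the
         * carry out of the low 64 bits into bit 0 of the high 64 bits and
         * folds the carry out of bit 127 back into the low byte.
         */
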
        .macro          next_tweak, out, in, tmp
        sshr    \tmp\().2d,  \in\().2d,   #63
        and     \tmp\().16b, \tmp\().16b, xtsmask.16b
        add     \out\().2d,  \in\().2d,   \in\().2d
        ext     \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor     \out\().16b, \out\().16b, \tmp\().16b
        .endm

        .macro          xts_load_mask, tmp
        movi    xtsmask.2s, #0x1
        movi    \tmp\().2s, #0x87
        uzp1    xtsmask.4s, xtsmask.4s, \tmp\().4s
        .endm

AES_FUNC_START(aes_xts_encrypt)
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        ld1     {v4.16b}, [x6]
        xts_load_mask   v8
        cbz     w7, .Lxtsencnotfirst

        enc_prepare     w3, x5, x8
        xts_cts_skip_tw w7, .LxtsencNx
        encrypt_block   v4, w3, x5, x8, w7      /* first tweak */
        enc_switch_key  w3, x2, x8
        b       .LxtsencNx

.Lxtsencnotfirst:
        enc_prepare     w3, x2, x8
.LxtsencloopNx:
        next_tweak      v4, v4, v8
.LxtsencNx:
        subs    w4, w4, #64
        bmi     .Lxtsenc1x
        ld1     {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v8
        eor     v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor     v3.16b, v3.16b, v7.16b
        bl      aes_encrypt_block4x
        eor     v3.16b, v3.16b, v7.16b
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        st1     {v0.16b-v3.16b}, [x0], #64
        mov     v4.16b, v7.16b
        cbz     w4, .Lxtsencret
        xts_reload_mask v8
        b       .LxtsencloopNx
.Lxtsenc1x:
        adds    w4, w4, #64
        beq     .Lxtsencout
        subs    w4, w4, #16
        bmi     .LxtsencctsNx
.Lxtsencloop:
        ld1     {v0.16b}, [x1], #16
.Lxtsencctsout:
        eor     v0.16b, v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x8, w7
        eor     v0.16b, v0.16b, v4.16b
        cbz     w4, .Lxtsencout
        subs    w4, w4, #16
        next_tweak      v4, v4, v8
        bmi     .Lxtsenccts
        st1     {v0.16b}, [x0], #16
        b       .Lxtsencloop
.Lxtsencout:
        st1     {v0.16b}, [x0]
.Lxtsencret:
        st1     {v4.16b}, [x6]
        ldp     x29, x30, [sp], #16
        ret

.LxtsencctsNx:
        mov     v0.16b, v3.16b
        sub     x0, x0, #16
.Lxtsenccts:
        adr_l   x8, .Lcts_permute_table

        add     x1, x1, w4, sxtw        /* rewind input pointer */
        add     w4, w4, #16             /* # bytes in final block */
        add     x9, x8, #32
        add     x8, x8, x4
        sub     x9, x9, x4
        add     x4, x0, x4              /* output address of final block */

        ld1     {v1.16b}, [x1]          /* load final block */
        ld1     {v2.16b}, [x8]
        ld1     {v3.16b}, [x9]

        tbl     v2.16b, {v0.16b}, v2.16b
        tbx     v0.16b, {v1.16b}, v3.16b
        st1     {v2.16b}, [x4]          /* overlapping stores */
        mov     w4, wzr
        b       .Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        /* subtract 16 bytes if we are doing CTS */
        sub     w8, w4, #0x10
        tst     w4, #0xf
        csel    w4, w4, w8, eq

        ld1     {v4.16b}, [x6]
        xts_load_mask   v8
        xts_cts_skip_tw w7, .Lxtsdecskiptw
        cbz     w7, .Lxtsdecnotfirst

        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7      /* first tweak */
.Lxtsdecskiptw:
        dec_prepare     w3, x2, x8
        b       .LxtsdecNx

.Lxtsdecnotfirst:
        dec_prepare     w3, x2, x8
.LxtsdecloopNx:
        next_tweak      v4, v4, v8
.LxtsdecNx:
        subs    w4, w4, #64
        bmi     .Lxtsdec1x
        ld1     {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v8
        eor     v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor     v3.16b, v3.16b, v7.16b
        bl      aes_decrypt_block4x
        eor     v3.16b, v3.16b, v7.16b
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        st1     {v0.16b-v3.16b}, [x0], #64
        mov     v4.16b, v7.16b
        cbz     w4, .Lxtsdecout
        xts_reload_mask v8
        b       .LxtsdecloopNx
.Lxtsdec1x:
        adds    w4, w4, #64
        beq     .Lxtsdecout
        subs    w4, w4, #16
.Lxtsdecloop:
        ld1     {v0.16b}, [x1], #16
        bmi     .Lxtsdeccts
.Lxtsdecctsout:
        eor     v0.16b, v0.16b, v4.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor     v0.16b, v0.16b, v4.16b
        st1     {v0.16b}, [x0], #16
        cbz     w4, .Lxtsdecout
        subs    w4, w4, #16
        next_tweak      v4, v4, v8
        b       .Lxtsdecloop
.Lxtsdecout:
        st1     {v4.16b}, [x6]
        ldp     x29, x30, [sp], #16
        ret

.Lxtsdeccts:
        adr_l   x8, .Lcts_permute_table

        add     x1, x1, w4, sxtw        /* rewind input pointer */
        add     w4, w4, #16             /* # bytes in final block */
        add     x9, x8, #32
        add     x8, x8, x4
        sub     x9, x9, x4
        add     x4, x0, x4              /* output address of final block */

        next_tweak      v5, v4, v8

        ld1     {v1.16b}, [x1]          /* load final block */
        ld1     {v2.16b}, [x8]
        ld1     {v3.16b}, [x9]

        eor     v0.16b, v0.16b, v5.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor     v0.16b, v0.16b, v5.16b

        tbl     v2.16b, {v0.16b}, v2.16b
        tbx     v0.16b, {v1.16b}, v3.16b

        st1     {v2.16b}, [x4]          /* overlapping stores */
        mov     w4, wzr
        b       .Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

        /*
         * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
         *                int blocks, u8 dg[], int enc_before, int enc_after)
         */
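        /*
         * Roughly, at the C level (a sketch only; E() stands in for one AES
         * block encryption with the expanded key rk):
         *
         *      if (enc_before)
         *              dg = E(dg);
         *      while (blocks--) {
         *              dg ^= *in++;
         *              if (blocks || enc_after)
         *                      dg = E(dg);
         *      }
         *
         * dg[] is updated in place; the return value is the number of blocks
         * still to be processed, which is non-zero only when cond_yield stops
         * the loop early.
         */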
AES_FUNC_START(aes_mac_update)
        ld1     {v0.16b}, [x4]                  /* get dg */
        enc_prepare     w2, x1, x7
        cbz     w5, .Lmacloop4x

        encrypt_block   v0, w2, x1, x7, w8

.Lmacloop4x:
        subs    w3, w3, #4
        bmi     .Lmac1x
        ld1     {v1.16b-v4.16b}, [x0], #64      /* get next pt block */
        eor     v0.16b, v0.16b, v1.16b          /* ..and xor with dg */
        encrypt_block   v0, w2, x1, x7, w8
        eor     v0.16b, v0.16b, v2.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor     v0.16b, v0.16b, v3.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor     v0.16b, v0.16b, v4.16b
        cmp     w3, wzr
        csinv   x5, x6, xzr, eq
        cbz     w5, .Lmacout
        encrypt_block   v0, w2, x1, x7, w8
        st1     {v0.16b}, [x4]                  /* return dg */
        cond_yield      .Lmacout, x7, x8
        b       .Lmacloop4x
.Lmac1x:
        add     w3, w3, #4
.Lmacloop:
        cbz     w3, .Lmacout
        ld1     {v1.16b}, [x0], #16             /* get next pt block */
        eor     v0.16b, v0.16b, v1.16b          /* ..and xor with dg */

        subs    w3, w3, #1
        csinv   x5, x6, xzr, eq
        cbz     w5, .Lmacout

.Lmacenc:
        encrypt_block   v0, w2, x1, x7, w8
        b       .Lmacloop

.Lmacout:
        st1     {v0.16b}, [x4]                  /* return dg */
        mov     w0, w3
        ret
AES_FUNC_END(aes_mac_update)