/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		crypto-neon-fp-armv8
	.align		3

	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		enc_dround_3x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	.endm

	.macro		dec_dround_3x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	.endm

	.macro		enc_fround_3x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	.macro		dec_fround_3x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.8		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.8		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm
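
	/*
	 * For reference, the round/key-size dispatch performed by do_block
	 * corresponds roughly to the following C sketch; 'nrounds' plays the
	 * role of r3 and is 10, 12 or 14, and the names are illustrative
	 * only, not actual kernel helpers:
	 *
	 *	for (i = 0; i < nrounds - 1; i++)	// the dround pairs
	 *		st = aesmc(aese(st, rk[i]));	// round + MixColumns
	 *	st = aese(st, rk[nrounds - 1]);		// final round...
	 *	st ^= rk[nrounds];			// ...plus AddRoundKey
	 */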
	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q2,
	 * q10 - q13 and ip.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_3x version only)
	 *   q2        : third in/output block (_3x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_3x, enc_fround_3x
ENDPROC(aes_encrypt_3x)

	.align		6
aes_decrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_3x, dec_fround_3x
ENDPROC(aes_decrypt_3x)

	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.8		{q14}, [ip]		@ load last round key
	.endm

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbencloop3x:
	subs		r4, r4, #3
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2}, [r1]!
	bl		aes_encrypt_3x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	b		.Lecbencloop3x
.Lecbenc1x:
	adds		r4, r4, #3
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)

ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbdecloop3x:
	subs		r4, r4, #3
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2}, [r1]!
	bl		aes_decrypt_3x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	b		.Lecbdecloop3x
.Lecbdec1x:
	adds		r4, r4, #3
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)

ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q6}, [r5]		@ keep iv in q6
	prepare_key	r2, r3
.Lcbcdecloop3x:
	subs		r4, r4, #3
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2}, [r1]!
	vmov		q3, q0
	vmov		q4, q1
	vmov		q5, q2
	bl		aes_decrypt_3x
	veor		q0, q0, q6
	veor		q1, q1, q3
	veor		q2, q2, q4
	vmov		q6, q5
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	b		.Lcbcdecloop3x
.Lcbcdec1x:
	adds		r4, r4, #3
	beq		.Lcbcdecout
	vmov		q15, q14		@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q6, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q6}, [r5]		@ return iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)
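
	/*
	 * Note on the 1x loop in ce_aes_cbc_decrypt above: the final
	 * AddRoundKey of the decryption and the CBC chaining XOR are merged
	 * by biasing the last round key, since
	 *
	 *	pt = rounds(ct) ^ rk[last] ^ prev_ct
	 *	   = rounds(ct) ^ (rk[last] ^ prev_ct)
	 *
	 * This is why q15 preserves the real last round key while q14 is
	 * recomputed as q15 ^ prev_ct on every iteration.
	 */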
	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q6}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s27			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
.Lctrloop3x:
	subs		r4, r4, #3
	bmi		.Lctr1x
	add		r6, r6, #1
	vmov		q0, q6
	vmov		q1, q6
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q6
	vmov		s7, ip
	rev		ip, r6
	add		r6, r6, #1
	vmov		s11, ip
	vld1.8		{q3-q4}, [r1]!
	vld1.8		{q5}, [r1]!
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	vmov		s27, ip
	b		.Lctrloop3x
.Lctr1x:
	adds		r4, r4, #3
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q6
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s27, ip
	bcs		.Lctrcarry

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q6}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

.Lctrcarry:
	.irp		sreg, s26, s25, s24
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)

	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
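
	/*
	 * next_tweak multiplies the 128-bit tweak by x in GF(2^128), modulo
	 * x^128 + x^7 + x^2 + x + 1, which is where the 0x87 reduction
	 * constant in .Lxts_mul_x below comes from. A byte-wise C sketch of
	 * the same operation (illustrative only, not the NEON data flow):
	 *
	 *	void gf128mul_x(u8 t[16])
	 *	{
	 *		int carry = t[15] >> 7;		// bit shifted out
	 *		int i;
	 *
	 *		for (i = 15; i > 0; i--)	// t <<= 1
	 *			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
	 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
	 *	}
	 */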
	.align		3
.Lxts_mul_x:
	.quad		1, 0x87

ce_aes_xts_init:
	vldr		d14, .Lxts_mul_x
	vldr		d15, .Lxts_mul_x + 8

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]		@ load 'first' arg
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)

ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0

	teq		r6, #0			@ start of a block? (init leaves r6 != 0)
	bne		.Lxtsenc3x		@ if so, tweak is fresh: skip doubling

.Lxtsencloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsenc3x:
	subs		r4, r4, #3
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1]!		@ get 3 pt blocks
	vld1.8		{q2}, [r1]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0]!		@ write 3 ct blocks
	vst1.8		{q2}, [r0]!
	vmov		q3, q5
	teq		r4, #0
	beq		.Lxtsencout
	b		.Lxtsencloop3x
.Lxtsenc1x:
	adds		r4, r4, #3
	beq		.Lxtsencout
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
	veor		q0, q0, q3
	bl		aes_encrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	beq		.Lxtsencout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q3}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_encrypt)

ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0

	teq		r6, #0			@ start of a block? (init leaves r6 != 0)
	bne		.Lxtsdec3x		@ if so, tweak is fresh: skip doubling

.Lxtsdecloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsdec3x:
	subs		r4, r4, #3
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1]!		@ get 3 ct blocks
	vld1.8		{q2}, [r1]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_decrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0]!		@ write 3 pt blocks
	vst1.8		{q2}, [r0]!
	vmov		q3, q5
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop3x
.Lxtsdec1x:
	adds		r4, r4, #3
	beq		.Lxtsdecout
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	veor		q0, q0, q3
	bl		aes_decrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	beq		.Lxtsdecout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q3}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_decrypt)

	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *			       AES sbox substitution on each byte in
	 *			       'input'
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *					   operation on round key *src
	 */
ENTRY(ce_aes_invert)
	vld1.8		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.8		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)
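
	/*
	 * ce_aes_sub and ce_aes_invert serve the key expansion done in the C
	 * glue code. As a rough sketch (names illustrative, not the actual
	 * glue code), deriving the first word of a new round key would look
	 * like:
	 *
	 *	u32 sub = ce_aes_sub(prev_w3);		// SubWord via aese
	 *	rk_w0 = prev_w0 ^ ror32(sub, 8) ^ rcon;	// RotWord + Rcon
	 *
	 * while ce_aes_invert() applies aesimc to each middle encryption
	 * round key to produce the decryption key schedule expected by the
	 * aesd/aesimc based decryption path.
	 */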