/*
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		crypto-neon-fp-armv8
	.align		3

	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		enc_dround_3x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	.endm

	.macro		dec_dround_3x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	.endm

	.macro		enc_fround_3x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	.macro		dec_fround_3x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.8		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.8		{q10-q11}, [ip]!
	beq		1f			@ AES-192: 12 rounds
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\dround		q12, q13
	\fround		q10, q11, q14
	bx		lr
	.endm
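
	/*
	 * For reference, a rough C model of the round structure that
	 * do_block streams through the key schedule (the sub_bytes/
	 * shift_rows/mix_columns helper names are illustrative, not part
	 * of this file). AESE performs AddRoundKey followed by SubBytes
	 * and ShiftRows, and AESMC performs MixColumns, so the final round
	 * simply omits the aesmc step and xors in the last round key:
	 *
	 *	for (i = 0; i < rounds - 1; i++) {
	 *		st = shift_rows(sub_bytes(st ^ rk[i]));	// aese
	 *		st = mix_columns(st);			// aesmc
	 *	}
	 *	st = shift_rows(sub_bytes(st ^ rk[rounds - 1]));
	 *	st ^= rk[rounds];	// veor with q14, the final round key
	 */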

	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q2 and ip.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_3x version only)
	 *   q2        : third in/output block (_3x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   ip        : address of 3rd round key
	 *   q14       : final round key
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_3x, enc_fround_3x
ENDPROC(aes_encrypt_3x)

	.align		6
aes_decrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_3x, dec_fround_3x
ENDPROC(aes_decrypt_3x)

	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.8		{q14}, [ip]		@ load last round key
	.endm

	/*
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbencloop3x:
	subs		r4, r4, #3
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_encrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbencloop3x
.Lecbenc1x:
	adds		r4, r4, #3
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)

ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbdecloop3x:
	subs		r4, r4, #3
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_decrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbdecloop3x
.Lecbdec1x:
	adds		r4, r4, #3
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

	/*
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1, :64]!	@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
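
	/*
	 * CBC encryption above is inherently serial: each block must be
	 * xored with the previous ciphertext before it can be encrypted.
	 * CBC decryption below has no such dependency, which is what makes
	 * the 3x interleaved path possible. An illustrative C model of one
	 * 3x iteration (hypothetical helpers, not part of this file):
	 *
	 *	p[0] = aes_decrypt(c[0]) ^ iv;
	 *	p[1] = aes_decrypt(c[1]) ^ c[0];
	 *	p[2] = aes_decrypt(c[2]) ^ c[1];
	 *	iv = c[2];		// carried in q6 across iterations
	 */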

ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q6}, [r5]		@ keep iv in q6
	prepare_key	r2, r3
.Lcbcdecloop3x:
	subs		r4, r4, #3
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	vmov		q3, q0
	vmov		q4, q1
	vmov		q5, q2
	bl		aes_decrypt_3x
	veor		q0, q0, q6
	veor		q1, q1, q3
	veor		q2, q2, q4
	vmov		q6, q5
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lcbcdecloop3x
.Lcbcdec1x:
	adds		r4, r4, #3
	beq		.Lcbcdecout
	vmov		q15, q14		@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1, :64]!	@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q6, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q6}, [r5]		@ store iv back for next call
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)

	/*
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks, u8 ctr[])
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q6}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s27			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
.Lctrloop3x:
	subs		r4, r4, #3
	bmi		.Lctr1x
	add		r6, r6, #1
	vmov		q0, q6
	vmov		q1, q6
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q6
	vmov		s7, ip
	rev		ip, r6
	add		r6, r6, #1
	vmov		s11, ip
	vld1.8		{q3-q4}, [r1, :64]!
	vld1.8		{q5}, [r1, :64]!
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	rev		ip, r6
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	vmov		s27, ip
	b		.Lctrloop3x
.Lctr1x:
	adds		r4, r4, #3
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q6
	bl		aes_encrypt
	subs		r4, r4, #1
	bmi		.Lctrhalfblock		@ blocks < 0 means 1/2 block
	vld1.8		{q3}, [r1, :64]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0, :64]!

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s27, ip
	bcs		.Lctrcarry
	teq		r4, #0
	bne		.Lctrloop
.Lctrout:
	vst1.8		{q6}, [r5]
	pop		{r4-r6, pc}

.Lctrhalfblock:
	vld1.8		{d1}, [r1, :64]
	veor		d0, d0, d1
	vst1.8		{d0}, [r0, :64]
	pop		{r4-r6, pc}

.Lctrcarry:
	.irp		sreg, s26, s25, s24
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		0f
	.endr
0:	teq		r4, #0
	beq		.Lctrout
	b		.Lctrloop
ENDPROC(ce_aes_ctr_encrypt)

	/*
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
	 *		      int rounds, int blocks, u8 iv[], u8 const rk2[],
	 *		      int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
	 *		      int rounds, int blocks, u8 iv[], u8 const rk2[],
	 *		      int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm

	.align		3
.Lxts_mul_x:
	.quad		1, 0x87

ce_aes_xts_init:
	vldr		d14, .Lxts_mul_x
	vldr		d15, .Lxts_mul_x + 8

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
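
	/*
	 * The next_tweak macro above multiplies the 128-bit tweak by x in
	 * GF(2^128), reducing modulo x^128 + x^7 + x^2 + x + 1; that is why
	 * .Lxts_mul_x holds the constant 0x87. Treating the tweak as two
	 * 64-bit halves, an illustrative C equivalent is:
	 *
	 *	u64 carry = (s64)hi >> 63;	// vshr.s64: 0 or all ones
	 *	hi = (hi << 1) | (lo >> 63);
	 *	lo = (lo << 1) ^ (carry & 0x87);
	 *
	 * The vext.8 swap of the two mask halves is what routes the low
	 * half's carry bit into the high half and the 0x87 reduction term
	 * into the low half.
	 */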

ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc3x

.Lxtsencloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsenc3x:
	subs		r4, r4, #3
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5
	teq		r4, #0
	beq		.Lxtsencout
	b		.Lxtsencloop3x
.Lxtsenc1x:
	adds		r4, r4, #3
	beq		.Lxtsencout
.Lxtsencloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3
	bl		aes_encrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsencout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q3}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_encrypt)


ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec3x

.Lxtsdecloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsdec3x:
	subs		r4, r4, #3
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_decrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop3x
.Lxtsdec1x:
	adds		r4, r4, #3
	beq		.Lxtsdecout
.Lxtsdecloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3
	add		ip, r2, #32		@ 3rd round key
	bl		aes_decrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsdecout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q3}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_decrypt)

	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
	 */
ENTRY(ce_aes_invert)
	vld1.8		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.8		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)
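
	/*
	 * The two helpers above exist for the C key expansion code:
	 * ce_aes_sub() provides the SubWord step of the AES key schedule
	 * (byte-wise substitution commutes with rotation, so RotWord can be
	 * applied afterwards as a ror32), and ce_aes_invert() converts an
	 * encryption round key into the matching decryption round key for
	 * the aesd/aesimc based routines. A hypothetical sketch of how a
	 * caller might derive the next round key word (the rcon[] table and
	 * surrounding variables are assumptions, not defined here):
	 *
	 *	u32 rko = ror32(ce_aes_sub(prev_word), 8) ^ rcon[i] ^ rk[0];
	 */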