/*
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		crypto-neon-fp-armv8
	.align		3

	/*
	 * Single full AES round on one state register:
	 * aese/aesd with the round key, followed by (Inv)MixColumns.
	 */
	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

	/*
	 * Double round on q0: two full rounds using two consecutive round keys.
	 */
	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	/*
	 * Final rounds on q0: one full round with key1, one round without the
	 * MixColumns step with key2, then the last round key (key3) is xored in.
	 */
	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	/*
	 * 3-way interleaved versions of the macros above, operating on the
	 * three independent states in q0, q1 and q2.
	 */
	.macro		enc_dround_3x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	.endm

	.macro		dec_dround_3x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	.endm

	.macro		enc_fround_3x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	.macro		dec_fround_3x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	/*
	 * Body of the core transforms. On entry ip must point at the 3rd
	 * round key; q8/q9 hold the first two round keys and q14 the last.
	 * The remaining round keys are streamed through q10-q13, with each
	 * load issued ahead of the double round that consumes the previously
	 * loaded pair. The flags set by the initial cmp stay live until the
	 * blo/beq below (only NEON instructions are executed in between), so
	 * they select how many rounds are run. Ends with bx lr.
	 */
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.8		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.8		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm

	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These clobber q0 - q2 (the state registers), q10 - q13
	 * (round key scratch), ip and the condition flags; all other registers
	 * are preserved.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_3x version only)
	 *   q2        : third in/output block (_3x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:				@ entered with ip already set up
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_3x, enc_fround_3x
ENDPROC(aes_encrypt_3x)

	.align		6
aes_decrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_3x, dec_fround_3x
ENDPROC(aes_decrypt_3x)

	/*
	 * Load the round keys that stay resident across a whole call:
	 * q8/q9 = first two round keys, q14 = round key number \rounds.
	 * Clobbers ip.
	 */
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.8		{q14}, [ip]		@ load last round key
	.endm

	/*
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks)
	 *
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds, r4 = blocks (from the stack).
	 * Blocks are processed three at a time while at least three remain,
	 * then one at a time. A block count of zero returns immediately.
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks
	prepare_key	r2, r3
.Lecbencloop3x:
	subs		r4, r4, #3
	bmi		.Lecbenc1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2}, [r1]!
	bl		aes_encrypt_3x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	b		.Lecbencloop3x
.Lecbenc1x:
	adds		r4, r4, #3		@ undo the speculative subtract
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)

ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks
	prepare_key	r2, r3
.Lecbdecloop3x:
	subs		r4, r4, #3
	bmi		.Lecbdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2}, [r1]!
	bl		aes_decrypt_3x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	b		.Lecbdecloop3x
.Lecbdec1x:
	adds		r4, r4, #3		@ undo the speculative subtract
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

	/*
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 *
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds, r4 = blocks, r5 = iv.
	 * On return iv[] holds the chaining value for the next call. A block
	 * count of zero leaves iv[] untouched.
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	teq		r4, #0			@ nothing to do?
	beq		.Lcbcencout		@ the loop below assumes >= 1 block
	vld1.8		{q0}, [r5]		@ q0 = chaining value
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!		@ ct block is also the next iv
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ write back the last ct block as iv
.Lcbcencout:
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)

ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q6}, [r5]		@ keep iv in q6
	prepare_key	r2, r3
.Lcbcdecloop3x:
	subs		r4, r4, #3
	bmi		.Lcbcdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2}, [r1]!
	vmov		q3, q0			@ keep copies of the ct blocks,
	vmov		q4, q1			@ they are the chaining values
	vmov		q5, q2			@ for the blocks that follow
	bl		aes_decrypt_3x
	veor		q0, q0, q6		@ xor with iv / previous ct
	veor		q1, q1, q3
	veor		q2, q2, q4
	vmov		q6, q5			@ last ct block becomes the iv
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	b		.Lcbcdecloop3x
.Lcbcdec1x:
	adds		r4, r4, #3		@ undo the speculative subtract
	beq		.Lcbcdecout
	vmov		q15, q14		@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q6, q0
	bl		aes_decrypt		@ final veor with q14 also undoes the chaining
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q6}, [r5]		@ write back the iv for the next call
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)

	/*
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks, u8 ctr[])
	 *
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds, r4 = blocks, r5 = ctr.
	 * q6 holds the counter block as stored in memory; r6 mirrors its last
	 * 32-bit word (s27) in byte-reversed form so that it can be incremented
	 * with plain integer arithmetic.
	 *
	 * The 3x loop increments only that last word, so it is bypassed when
	 * adding the block count to r6 would carry out of 32 bits; the single
	 * block loop propagates the carry into the other counter words.
	 *
	 * A negative block count requests a tail block: only the key stream
	 * for the current counter is written to out[] (no input is read and
	 * ctr[] is not updated).
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = ctr
	vld1.8		{q6}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s27			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
.Lctrloop3x:
	subs		r4, r4, #3
	bmi		.Lctr1x			@ fewer than 3 blocks left
	add		r6, r6, #1
	vmov		q0, q6			@ q0 = ctr
	vmov		q1, q6
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q6
	vmov		s7, ip			@ q1 = ctr + 1
	rev		ip, r6
	add		r6, r6, #1
	vmov		s11, ip			@ q2 = ctr + 2
	vld1.8		{q3-q4}, [r1]!
	vld1.8		{q5}, [r1]!
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	vmov		s27, ip			@ q6 = ctr + 3
	b		.Lctrloop3x
.Lctr1x:
	adds		r4, r4, #3		@ undo the speculative subtract
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q6
	bl		aes_encrypt
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s27, ip
	bcs		.Lctrcarry		@ rev/vmov leave the carry from adds intact
	teq		r4, #0
	bne		.Lctrloop
.Lctrout:
	vst1.8		{q6}, [r5]		@ write back the updated ctr
	pop		{r4-r6, pc}

.Lctrtailblock:
	@ The :64 qualifier asserts that out[] is 8 byte aligned on this path.
	vst1.8		{q0}, [r0, :64]		@ return just the key stream
	pop		{r4-r6, pc}

.Lctrcarry:
	@ The last counter word wrapped: ripple the carry through the
	@ remaining words, stopping at the first one that does not wrap.
	.irp		sreg, s26, s25, s24
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		0f
	.endr
0:	teq		r4, #0
	beq		.Lctrout
	b		.Lctrloop
ENDPROC(ce_aes_ctr_encrypt)

	/*
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		      int blocks, u8 iv[], u8 const rk2[], int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		      int blocks, u8 iv[], u8 const rk2[], int first)
	 *
	 * r0 = out, r1 = in, r2 = rk1, r3 = rounds, r4 = blocks, r5 = iv.
	 * On return iv[] holds the tweak that was applied to the last block
	 * processed, which is why a call with first == 0 starts by advancing
	 * the tweak it loaded.
	 */

	/*
	 * Compute the next tweak: \out = \in shifted left by one bit as a
	 * 128-bit value, with the bit shifted out of the top folded back in
	 * as 0x87. \const must hold the .Lxts_mul_x constants; \tmp is
	 * clobbered.
	 */
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63		@ all ones where the top bit is set
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in		@ shift each half left by one
	vext.8		\tmp, \tmp, \tmp, #8	@ swap halves to cross the carries over
	veor		\out, \out, \tmp
	.endm

	.align		3
.Lxts_mul_x:
	.quad		1, 0x87

	/*
	 * Shared XTS prologue, called right after the caller has pushed
	 * {r4-r6, lr}; the stack offsets below depend on exactly that frame.
	 * Loads q7 (d14/d15) with the tweak constants, r4 = blocks, r5 = iv
	 * and q0 = *iv. If first != 1 it returns with r6 = first. Otherwise
	 * q0 is encrypted with key 2 and r6 is left holding the rk2 pointer
	 * (expected to be non-NULL), so callers see r6 != 0 in that case.
	 * Clobbers q8, q9, q14 and everything the core transform clobbers when
	 * the iv is encrypted.
	 */
ce_aes_xts_init:
	vldr		d14, .Lxts_mul_x
	vldr		d15, .Lxts_mul_x + 8

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)

ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0			@ q3 = current tweak

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc3x

.Lxtsencloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsenc3x:
	subs		r4, r4, #3
	bmi		.Lxtsenc1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!		@ get 3 pt blocks
	vld1.8		{q2}, [r1]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0]!		@ write 3 ct blocks
	vst1.8		{q2}, [r0]!
	vmov		q3, q5			@ q3 = last tweak used
	teq		r4, #0
	beq		.Lxtsencout
	b		.Lxtsencloop3x
.Lxtsenc1x:
	adds		r4, r4, #3		@ undo the speculative subtract
	beq		.Lxtsencout
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
	veor		q0, q0, q3
	bl		aes_encrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	beq		.Lxtsencout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q3}, [r5]		@ write back the last tweak used
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_encrypt)


ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0			@ q3 = current tweak

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec3x

.Lxtsdecloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsdec3x:
	subs		r4, r4, #3
	bmi		.Lxtsdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!		@ get 3 ct blocks
	vld1.8		{q2}, [r1]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_decrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0]!		@ write 3 pt blocks
	vst1.8		{q2}, [r0]!
	vmov		q3, q5			@ q3 = last tweak used
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop3x
.Lxtsdec1x:
	adds		r4, r4, #3		@ undo the speculative subtract
	beq		.Lxtsdecout
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	veor		q0, q0, q3
	bl		aes_decrypt		@ sets up ip itself
	veor		q0, q0, q3
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	beq		.Lxtsdecout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q3}, [r5]		@ write back the last tweak used
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_decrypt)

	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
	 *
	 * The input word is replicated into all four lanes of q1 and used as
	 * the round key for an all-zero state; lane 0 of the result is
	 * returned in r0. Clobbers q0 and q1.
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
	 *
	 * r0 = dst, r1 = src (16 bytes each). Clobbers q0.
	 */
ENTRY(ce_aes_invert)
	vld1.8		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.8		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)