/*
 * aes-ce-core.S - AES in ECB/CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		crypto-neon-fp-armv8
	.align		3

	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		enc_dround_3x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	.endm

	.macro		dec_dround_3x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	.endm

	.macro		enc_fround_3x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	.macro		dec_fround_3x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.8		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.8		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm
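
	/*
	 * Note on do_block: the single 'cmp r3, #12' classifies the key size,
	 * and the two conditional branches pick the tail for 10 (AES-128),
	 * 12 (AES-192) or 14 (AES-256) rounds. In terms of the aese/aesmc
	 * instruction semantics, the sequence computes (sketch only, not
	 * part of the build):
	 *
	 *	for (i = 0; i < rounds - 1; i++)
	 *		state = aesmc(aese(state, rk[i]));	  // full rounds
	 *	state = aese(state, rk[rounds - 1]) ^ rk[rounds]; // final round
	 */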

	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q2 and ip.
	 * Arguments:
	 *   q0  : first in/output block
	 *   q1  : second in/output block (_3x version only)
	 *   q2  : third in/output block (_3x version only)
	 *   q8  : first round key
	 *   q9  : second round key
	 *   q14 : final round key
	 *   r2  : address of round key array
	 *   r3  : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_3x, enc_fround_3x
ENDPROC(aes_encrypt_3x)

	.align		6
aes_decrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_3x, dec_fround_3x
ENDPROC(aes_decrypt_3x)

	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.8		{q14}, [ip]		@ load last round key
	.endm

	/*
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbencloop3x:
	subs		r4, r4, #3
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_encrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbencloop3x
.Lecbenc1x:
	adds		r4, r4, #3
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)

ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbdecloop3x:
	subs		r4, r4, #3
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_decrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbdecloop3x
.Lecbdec1x:
	adds		r4, r4, #3
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

	/*
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1, :64]!	@ get next pt block
	veor		q0, q0, q1		@ ...and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
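
	/*
	 * CBC decryption computes P[i] = Dec(C[i]) ^ C[i-1]. The 3x path
	 * below saves copies of the ciphertext blocks in q3-q5 and xors
	 * them back in after aes_decrypt_3x. The single-block path avoids
	 * the extra veor per block: since the cipher ends with
	 * 'veor q0, q0, q14' (the final round key), folding the previous
	 * ciphertext into q14 (q14 = last_rk ^ prev_ct) makes aes_decrypt
	 * produce Dec(C[i]) ^ C[i-1] directly.
	 */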

ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q6}, [r5]		@ keep iv in q6
	prepare_key	r2, r3
.Lcbcdecloop3x:
	subs		r4, r4, #3
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	vmov		q3, q0
	vmov		q4, q1
	vmov		q5, q2
	bl		aes_decrypt_3x
	veor		q0, q0, q6
	veor		q1, q1, q3
	veor		q2, q2, q4
	vmov		q6, q5
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lcbcdecloop3x
.Lcbcdec1x:
	adds		r4, r4, #3
	beq		.Lcbcdecout
	vmov		q15, q14		@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1, :64]!	@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q6, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q6}, [r5]		@ store updated iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)

	/*
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		      int rounds, int blocks, u8 ctr[])
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q6}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s27			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
.Lctrloop3x:
	subs		r4, r4, #3
	bmi		.Lctr1x
	add		r6, r6, #1
	vmov		q0, q6
	vmov		q1, q6
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q6
	vmov		s7, ip
	rev		ip, r6
	add		r6, r6, #1
	vmov		s11, ip
	vld1.8		{q3-q4}, [r1, :64]!
	vld1.8		{q5}, [r1, :64]!
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	rev		ip, r6
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	vmov		s27, ip
	b		.Lctrloop3x
.Lctr1x:
	adds		r4, r4, #3
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q6
	bl		aes_encrypt
	subs		r4, r4, #1
	bmi		.Lctrhalfblock		@ blocks < 0 means 1/2 block
	vld1.8		{q3}, [r1, :64]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0, :64]!

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s27, ip
	bcs		.Lctrcarry
	teq		r4, #0
	bne		.Lctrloop
.Lctrout:
	vst1.8		{q6}, [r5]
	pop		{r4-r6, pc}

.Lctrhalfblock:
	vld1.8		{d1}, [r1, :64]
	veor		d0, d0, d1
	vst1.8		{d0}, [r0, :64]
	pop		{r4-r6, pc}

.Lctrcarry:
	.irp		sreg, s26, s25, s24
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		0f
	.endr
0:	teq		r4, #0
	beq		.Lctrout
	b		.Lctrloop
ENDPROC(ce_aes_ctr_encrypt)

	/*
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
	 *		      int rounds, int blocks, u8 iv[], u8 const rk2[],
	 *		      int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
	 *		      int rounds, int blocks, u8 iv[], u8 const rk2[],
	 *		      int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm

	.align		3
.Lxts_mul_x:
	.quad		1, 0x87
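
	/*
	 * next_tweak advances the XTS tweak by one block: multiplication by x
	 * in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1 (hence the 0x87
	 * constant above). Both 64-bit lanes are doubled with vadd.u64, the
	 * low lane's carry moves into bit 0 of the high lane, and a carry out
	 * of the high lane wraps around as 0x87 into the low lane. Equivalent
	 * C sketch (illustration only, not part of the build):
	 *
	 *	void next_tweak(u64 t[2])
	 *	{
	 *		u64 wrap = (u64)((s64)t[1] >> 63); // all-ones on carry
	 *		t[1] = (t[1] << 1) | (t[0] >> 63);
	 *		t[0] = (t[0] << 1) ^ (wrap & 0x87);
	 *	}
	 */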

ce_aes_xts_init:
	vldr		d14, .Lxts_mul_x
	vldr		d15, .Lxts_mul_x + 8

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)

ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc3x

.Lxtsencloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsenc3x:
	subs		r4, r4, #3
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5
	teq		r4, #0
	beq		.Lxtsencout
	b		.Lxtsencloop3x
.Lxtsenc1x:
	adds		r4, r4, #3
	beq		.Lxtsencout
.Lxtsencloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3
	bl		aes_encrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsencout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q3}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_encrypt)

ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec3x

.Lxtsdecloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsdec3x:
	subs		r4, r4, #3
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_decrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop3x
.Lxtsdec1x:
	adds		r4, r4, #3
	beq		.Lxtsdecout
.Lxtsdecloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3
	add		ip, r2, #32		@ 3rd round key
	bl		aes_decrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsdecout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q3}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_decrypt)

	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *			       AES sbox substitution on each byte in
	 *			       'input'
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse
	 *					   MixColumns operation on
	 *					   round key *src
	 */
ENTRY(ce_aes_invert)
	vld1.8		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.8		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)
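
	/*
	 * ce_aes_sub works because aese with an all-zero state reduces to
	 * ShiftRows(SubBytes(key)), and vdup.32 replicates the input word
	 * into all four columns, so ShiftRows only permutes identical bytes;
	 * lane s0 therefore holds SubWord(input). Together with ce_aes_invert
	 * (InvMixColumns on a round key, as required by the equivalent
	 * inverse cipher), these helpers let C code run the AES key schedule.
	 * A hypothetical caller might look like this (sketch only; rki, rko,
	 * kwords and rcon are assumed names, not this file's API):
	 *
	 *	rko[0] = ror32(ce_aes_sub(rki[kwords - 1]), 8) ^ rcon[i]
	 *		 ^ rki[0];
	 */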