/*
 * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch	armv8-a+crypto

	/*
	 * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
	 *			     u32 *macp, u8 const rk[], u32 rounds);
	 */
ENTRY(ce_aes_ccm_auth_data)
	frame_push	7

	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	mov	x23, x4
	mov	x24, x5

	ldr	w25, [x22]			/* leftover from prev round? */
	ld1	{v0.16b}, [x0]			/* load mac */
	cbz	w25, 1f
	sub	w25, w25, #16
	eor	v1.16b, v1.16b, v1.16b
0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
	subs	w21, w21, #1
	add	w25, w25, #1
	ins	v1.b[0], w7
	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
	beq	8f				/* out of input? */
	cbnz	w25, 0b
	eor	v0.16b, v0.16b, v1.16b
1:	ld1	{v3.4s}, [x23]			/* load first round key */
	prfm	pldl1strm, [x20]
	cmp	w24, #12			/* which key size? */
	add	x6, x23, #16
	sub	w7, w24, #2			/* modified # of rounds */
	bmi	2f
	bne	5f
	mov	v5.16b, v3.16b
	b	4f
2:	mov	v4.16b, v3.16b
	ld1	{v5.4s}, [x6], #16		/* load 2nd round key */
3:	aese	v0.16b, v4.16b
	aesmc	v0.16b, v0.16b
4:	ld1	{v3.4s}, [x6], #16		/* load next round key */
	aese	v0.16b, v5.16b
	aesmc	v0.16b, v0.16b
5:	ld1	{v4.4s}, [x6], #16		/* load next round key */
	subs	w7, w7, #3
	aese	v0.16b, v3.16b
	aesmc	v0.16b, v0.16b
	ld1	{v5.4s}, [x6], #16		/* load next round key */
	bpl	3b
	aese	v0.16b, v4.16b
	subs	w21, w21, #16			/* last data? */
	eor	v0.16b, v0.16b, v5.16b		/* final round */
	bmi	6f
	ld1	{v1.16b}, [x20], #16		/* load next input block */
	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
	beq	6f

	if_will_cond_yield_neon
	st1	{v0.16b}, [x19]			/* store mac */
	do_cond_yield_neon
	ld1	{v0.16b}, [x19]			/* reload mac */
	endif_yield_neon

	b	1b
6:	st1	{v0.16b}, [x19]			/* store mac */
	beq	10f
	adds	w21, w21, #16
	beq	10f
	mov	w25, w21
7:	ldrb	w7, [x20], #1
	umov	w6, v0.b[0]
	eor	w6, w6, w7
	strb	w6, [x19], #1
	subs	w21, w21, #1
	beq	10f
	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
	b	7b
8:	mov	w7, w25
	add	w25, w25, #16
9:	ext	v1.16b, v1.16b, v1.16b, #1
	adds	w7, w7, #1
	bne	9b
	eor	v0.16b, v0.16b, v1.16b
	st1	{v0.16b}, [x19]
10:	str	w25, [x22]

	frame_pop
	ret
ENDPROC(ce_aes_ccm_auth_data)

	/*
	 * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
	 *			 u32 rounds);
	 */
ENTRY(ce_aes_ccm_final)
	ld1	{v3.4s}, [x2], #16		/* load first round key */
	ld1	{v0.16b}, [x0]			/* load mac */
	cmp	w3, #12				/* which key size? */
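	/*
	 * The branches below dispatch on the round count in w3 (10, 12 or
	 * 14 for AES-128/192/256), using the flags set by the cmp above
	 * (the sub does not update them): bmi takes the 10-round entry,
	 * the fall-through handles 12 rounds, and bne takes the 14-round
	 * entry, so each key size enters the 3-round loop at the right
	 * offset.
	 */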
	sub	w3, w3, #2			/* modified # of rounds */
	ld1	{v1.16b}, [x1]			/* load 1st ctriv */
	bmi	0f
	bne	3f
	mov	v5.16b, v3.16b
	b	2f
0:	mov	v4.16b, v3.16b
1:	ld1	{v5.4s}, [x2], #16		/* load next round key */
	aese	v0.16b, v4.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v4.16b
	aesmc	v1.16b, v1.16b
2:	ld1	{v3.4s}, [x2], #16		/* load next round key */
	aese	v0.16b, v5.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v5.16b
	aesmc	v1.16b, v1.16b
3:	ld1	{v4.4s}, [x2], #16		/* load next round key */
	subs	w3, w3, #3
	aese	v0.16b, v3.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v3.16b
	aesmc	v1.16b, v1.16b
	bpl	1b
	aese	v0.16b, v4.16b
	aese	v1.16b, v4.16b
	/* final round key cancels out */
	eor	v0.16b, v0.16b, v1.16b		/* en-/decrypt the mac */
	st1	{v0.16b}, [x0]			/* store result */
	ret
ENDPROC(ce_aes_ccm_final)

	.macro	aes_ccm_do_crypt,enc
	frame_push	8

	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	mov	x23, x4
	mov	x24, x5
	mov	x25, x6

	ldr	x26, [x25, #8]			/* load lower ctr */
	ld1	{v0.16b}, [x24]			/* load mac */
CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
0:	/* outer loop */
	ld1	{v1.8b}, [x25]			/* load upper ctr */
	prfm	pldl1strm, [x20]
	add	x26, x26, #1
	rev	x9, x26
	cmp	w23, #12			/* which key size? */
	sub	w7, w23, #2			/* get modified # of rounds */
	ins	v1.d[1], x9			/* no carry in lower ctr */
	ld1	{v3.4s}, [x22]			/* load first round key */
	add	x10, x22, #16
	bmi	1f
	bne	4f
	mov	v5.16b, v3.16b
	b	3f
1:	mov	v4.16b, v3.16b
	ld1	{v5.4s}, [x10], #16		/* load 2nd round key */
2:	/* inner loop: 3 rounds, 2x interleaved */
	aese	v0.16b, v4.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v4.16b
	aesmc	v1.16b, v1.16b
3:	ld1	{v3.4s}, [x10], #16		/* load next round key */
	aese	v0.16b, v5.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v5.16b
	aesmc	v1.16b, v1.16b
4:	ld1	{v4.4s}, [x10], #16		/* load next round key */
	subs	w7, w7, #3
	aese	v0.16b, v3.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v3.16b
	aesmc	v1.16b, v1.16b
	ld1	{v5.4s}, [x10], #16		/* load next round key */
	bpl	2b
	aese	v0.16b, v4.16b
	aese	v1.16b, v4.16b
	subs	w21, w21, #16
	bmi	7f				/* partial block? */
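	/*
	 * Full block: v0 (mac) and v1 (ctr) have been through every round
	 * except the final AddRoundKey, and v5 holds the last round key.
	 * That xor is folded into the data xors below so that v2 is
	 * pt ^ rk[last] in both directions: v1 ^ v2 then yields the
	 * ciphertext (enc) or v2 ^ v5 the plaintext (dec), and v0 ^ v2
	 * completes the MAC round while chaining the plaintext into the
	 * CBC-MAC.
	 */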
	ld1	{v2.16b}, [x20], #16		/* load next input block */
	.if	\enc == 1
	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
	.else
	eor	v2.16b, v2.16b, v1.16b		/* xor with crypted ctr */
	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
	.endif
	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
	st1	{v1.16b}, [x19], #16		/* write output block */
	beq	5f

	if_will_cond_yield_neon
	st1	{v0.16b}, [x24]			/* store mac */
	do_cond_yield_neon
	ld1	{v0.16b}, [x24]			/* reload mac */
	endif_yield_neon

	b	0b
5:
CPU_LE(	rev	x26, x26		)
	st1	{v0.16b}, [x24]			/* store mac */
	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */

6:	frame_pop
	ret

7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
	st1	{v0.16b}, [x24]			/* store mac */
	add	w21, w21, #16			/* process partial tail block */
8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
	umov	w6, v1.b[0]			/* get top crypted ctr byte */
	umov	w7, v0.b[0]			/* get top mac byte */
	.if	\enc == 1
	eor	w7, w7, w9
	eor	w9, w9, w6
	.else
	eor	w9, w9, w6
	eor	w7, w7, w9
	.endif
	strb	w9, [x19], #1			/* store out byte */
	strb	w7, [x24], #1			/* store mac byte */
	subs	w21, w21, #1
	beq	6b
	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
	b	8b
	.endm

	/*
	 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
	 *			   u8 const rk[], u32 rounds, u8 mac[],
	 *			   u8 ctr[]);
	 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
	 *			   u8 const rk[], u32 rounds, u8 mac[],
	 *			   u8 ctr[]);
	 */
ENTRY(ce_aes_ccm_encrypt)
	aes_ccm_do_crypt	1
ENDPROC(ce_aes_ccm_encrypt)

ENTRY(ce_aes_ccm_decrypt)
	aes_ccm_do_crypt	0
ENDPROC(ce_aes_ccm_decrypt)
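	/*
	 * A sketch of the expected call sequence from the C glue code
	 * (the in-kernel driver lives in arch/arm64/crypto/aes-ce-ccm-glue.c;
	 * assoc, assoclen, macp and ctr0 are illustrative names, the rest
	 * follow the prototypes above):
	 *
	 *	ce_aes_ccm_auth_data(mac, assoc, assoclen, &macp, rk, rounds);
	 *	ce_aes_ccm_encrypt(out, in, cbytes, rk, rounds, mac, ctr);
	 *	ce_aes_ccm_final(mac, ctr0, rk, rounds);
	 *
	 * i.e. CBC-MAC the associated data, CTR-encrypt the payload while
	 * folding each block into the MAC, then encrypt the MAC with the
	 * initial counter block (ctr0) to produce the authentication tag.
	 */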