1/* SPDX-License-Identifier: GPL-2.0-only */ 2/* 3 * Bit sliced AES using NEON instructions 4 * 5 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> 6 */ 7 8/* 9 * The algorithm implemented here is described in detail by the paper 10 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and 11 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf) 12 * 13 * This implementation is based primarily on the OpenSSL implementation 14 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org> 15 */ 16 17#include <linux/linkage.h> 18#include <asm/assembler.h> 19 20 .text 21 22 rounds .req x11 23 bskey .req x12 24 25 .macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 26 eor \b2, \b2, \b1 27 eor \b5, \b5, \b6 28 eor \b3, \b3, \b0 29 eor \b6, \b6, \b2 30 eor \b5, \b5, \b0 31 eor \b6, \b6, \b3 32 eor \b3, \b3, \b7 33 eor \b7, \b7, \b5 34 eor \b3, \b3, \b4 35 eor \b4, \b4, \b5 36 eor \b2, \b2, \b7 37 eor \b3, \b3, \b1 38 eor \b1, \b1, \b5 39 .endm 40 41 .macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 42 eor \b0, \b0, \b6 43 eor \b1, \b1, \b4 44 eor \b4, \b4, \b6 45 eor \b2, \b2, \b0 46 eor \b6, \b6, \b1 47 eor \b1, \b1, \b5 48 eor \b5, \b5, \b3 49 eor \b3, \b3, \b7 50 eor \b7, \b7, \b5 51 eor \b2, \b2, \b5 52 eor \b4, \b4, \b7 53 .endm 54 55 .macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5 56 eor \b1, \b1, \b7 57 eor \b4, \b4, \b7 58 eor \b7, \b7, \b5 59 eor \b1, \b1, \b3 60 eor \b2, \b2, \b5 61 eor \b3, \b3, \b7 62 eor \b6, \b6, \b1 63 eor \b2, \b2, \b0 64 eor \b5, \b5, \b3 65 eor \b4, \b4, \b6 66 eor \b0, \b0, \b6 67 eor \b1, \b1, \b4 68 .endm 69 70 .macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2 71 eor \b1, \b1, \b5 72 eor \b2, \b2, \b7 73 eor \b3, \b3, \b1 74 eor \b4, \b4, \b5 75 eor \b7, \b7, \b5 76 eor \b3, \b3, \b4 77 eor \b5, \b5, \b0 78 eor \b3, \b3, \b7 79 eor \b6, \b6, \b2 80 eor \b2, \b2, \b1 81 eor \b6, \b6, \b3 82 eor \b3, \b3, \b0 83 eor \b5, \b5, \b6 84 .endm 85 86 .macro mul_gf4, x0, x1, y0, y1, t0, t1 87 eor \t0, \y0, \y1 88 and \t0, \t0, \x0 89 eor \x0, \x0, \x1 90 and \t1, \x1, \y0 91 and \x0, \x0, \y1 92 eor \x1, \t1, \t0 93 eor \x0, \x0, \t1 94 .endm 95 96 .macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1 97 eor \t0, \y0, \y1 98 eor \t1, \y2, \y3 99 and \t0, \t0, \x0 100 and \t1, \t1, \x2 101 eor \x0, \x0, \x1 102 eor \x2, \x2, \x3 103 and \x1, \x1, \y0 104 and \x3, \x3, \y2 105 and \x0, \x0, \y1 106 and \x2, \x2, \y3 107 eor \x1, \x1, \x0 108 eor \x2, \x2, \x3 109 eor \x0, \x0, \t0 110 eor \x3, \x3, \t1 111 .endm 112 113 .macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \ 114 y0, y1, y2, y3, t0, t1, t2, t3 115 eor \t0, \x0, \x2 116 eor \t1, \x1, \x3 117 mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3 118 eor \y0, \y0, \y2 119 eor \y1, \y1, \y3 120 mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2 121 eor \x0, \x0, \t0 122 eor \x2, \x2, \t0 123 eor \x1, \x1, \t1 124 eor \x3, \x3, \t1 125 eor \t0, \x4, \x6 126 eor \t1, \x5, \x7 127 mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2 128 eor \y0, \y0, \y2 129 eor \y1, \y1, \y3 130 mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3 131 eor \x4, \x4, \t0 132 eor \x6, \x6, \t0 133 eor \x5, \x5, \t1 134 eor \x7, \x7, \t1 135 .endm 136 137 .macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \ 138 t0, t1, t2, t3, s0, s1, s2, s3 139 eor \t3, \x4, \x6 140 eor \t0, \x5, \x7 141 eor \t1, \x1, \x3 142 eor \s1, \x7, \x6 143 eor \s0, \x0, \x2 144 eor \s3, \t3, \t0 145 orr \t2, \t0, \t1 146 and \s2, \t3, \s0 147 orr \t3, \t3, \s0 148 eor \s0, \s0, \t1 149 and \t0, \t0, \t1 150 eor \t1, \x3, \x2 151 and \s3, \s3, \s0 152 and \s1, \s1, \t1 153 eor \t1, \x4, \x5 154 eor \s0, \x1, \x0 155 eor \t3, \t3, \s1 156 eor \t2, \t2, \s1 157 and \s1, \t1, \s0 158 orr \t1, \t1, \s0 159 eor \t3, \t3, \s3 160 eor \t0, \t0, \s1 161 eor \t2, \t2, \s2 162 eor \t1, \t1, \s3 163 eor \t0, \t0, \s2 164 and \s0, \x7, \x3 165 eor \t1, \t1, \s2 166 and \s1, \x6, \x2 167 and \s2, \x5, \x1 168 orr \s3, \x4, \x0 169 eor \t3, \t3, \s0 170 eor \t1, \t1, \s2 171 eor \s0, \t0, \s3 172 eor \t2, \t2, \s1 173 and \s2, \t3, \t1 174 eor \s1, \t2, \s2 175 eor \s3, \s0, \s2 176 bsl \s1, \t1, \s0 177 not \t0, \s0 178 bsl \s0, \s1, \s3 179 bsl \t0, \s1, \s3 180 bsl \s3, \t3, \t2 181 eor \t3, \t3, \t2 182 and \s2, \s0, \s3 183 eor \t1, \t1, \t0 184 eor \s2, \s2, \t3 185 mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ 186 \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 187 .endm 188 189 .macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ 190 t0, t1, t2, t3, s0, s1, s2, s3 191 in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ 192 \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b 193 inv_gf256 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \ 194 \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ 195 \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ 196 \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b 197 out_bs_ch \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ 198 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b 199 .endm 200 201 .macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ 202 t0, t1, t2, t3, s0, s1, s2, s3 203 inv_in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ 204 \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b 205 inv_gf256 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \ 206 \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ 207 \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ 208 \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b 209 inv_out_bs_ch \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ 210 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b 211 .endm 212 213 .macro enc_next_rk 214 ldp q16, q17, [bskey], #128 215 ldp q18, q19, [bskey, #-96] 216 ldp q20, q21, [bskey, #-64] 217 ldp q22, q23, [bskey, #-32] 218 .endm 219 220 .macro dec_next_rk 221 ldp q16, q17, [bskey, #-128]! 222 ldp q18, q19, [bskey, #32] 223 ldp q20, q21, [bskey, #64] 224 ldp q22, q23, [bskey, #96] 225 .endm 226 227 .macro add_round_key, x0, x1, x2, x3, x4, x5, x6, x7 228 eor \x0\().16b, \x0\().16b, v16.16b 229 eor \x1\().16b, \x1\().16b, v17.16b 230 eor \x2\().16b, \x2\().16b, v18.16b 231 eor \x3\().16b, \x3\().16b, v19.16b 232 eor \x4\().16b, \x4\().16b, v20.16b 233 eor \x5\().16b, \x5\().16b, v21.16b 234 eor \x6\().16b, \x6\().16b, v22.16b 235 eor \x7\().16b, \x7\().16b, v23.16b 236 .endm 237 238 .macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask 239 tbl \x0\().16b, {\x0\().16b}, \mask\().16b 240 tbl \x1\().16b, {\x1\().16b}, \mask\().16b 241 tbl \x2\().16b, {\x2\().16b}, \mask\().16b 242 tbl \x3\().16b, {\x3\().16b}, \mask\().16b 243 tbl \x4\().16b, {\x4\().16b}, \mask\().16b 244 tbl \x5\().16b, {\x5\().16b}, \mask\().16b 245 tbl \x6\().16b, {\x6\().16b}, \mask\().16b 246 tbl \x7\().16b, {\x7\().16b}, \mask\().16b 247 .endm 248 249 .macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ 250 t0, t1, t2, t3, t4, t5, t6, t7, inv 251 ext \t0\().16b, \x0\().16b, \x0\().16b, #12 252 ext \t1\().16b, \x1\().16b, \x1\().16b, #12 253 eor \x0\().16b, \x0\().16b, \t0\().16b 254 ext \t2\().16b, \x2\().16b, \x2\().16b, #12 255 eor \x1\().16b, \x1\().16b, \t1\().16b 256 ext \t3\().16b, \x3\().16b, \x3\().16b, #12 257 eor \x2\().16b, \x2\().16b, \t2\().16b 258 ext \t4\().16b, \x4\().16b, \x4\().16b, #12 259 eor \x3\().16b, \x3\().16b, \t3\().16b 260 ext \t5\().16b, \x5\().16b, \x5\().16b, #12 261 eor \x4\().16b, \x4\().16b, \t4\().16b 262 ext \t6\().16b, \x6\().16b, \x6\().16b, #12 263 eor \x5\().16b, \x5\().16b, \t5\().16b 264 ext \t7\().16b, \x7\().16b, \x7\().16b, #12 265 eor \x6\().16b, \x6\().16b, \t6\().16b 266 eor \t1\().16b, \t1\().16b, \x0\().16b 267 eor \x7\().16b, \x7\().16b, \t7\().16b 268 ext \x0\().16b, \x0\().16b, \x0\().16b, #8 269 eor \t2\().16b, \t2\().16b, \x1\().16b 270 eor \t0\().16b, \t0\().16b, \x7\().16b 271 eor \t1\().16b, \t1\().16b, \x7\().16b 272 ext \x1\().16b, \x1\().16b, \x1\().16b, #8 273 eor \t5\().16b, \t5\().16b, \x4\().16b 274 eor \x0\().16b, \x0\().16b, \t0\().16b 275 eor \t6\().16b, \t6\().16b, \x5\().16b 276 eor \x1\().16b, \x1\().16b, \t1\().16b 277 ext \t0\().16b, \x4\().16b, \x4\().16b, #8 278 eor \t4\().16b, \t4\().16b, \x3\().16b 279 ext \t1\().16b, \x5\().16b, \x5\().16b, #8 280 eor \t7\().16b, \t7\().16b, \x6\().16b 281 ext \x4\().16b, \x3\().16b, \x3\().16b, #8 282 eor \t3\().16b, \t3\().16b, \x2\().16b 283 ext \x5\().16b, \x7\().16b, \x7\().16b, #8 284 eor \t4\().16b, \t4\().16b, \x7\().16b 285 ext \x3\().16b, \x6\().16b, \x6\().16b, #8 286 eor \t3\().16b, \t3\().16b, \x7\().16b 287 ext \x6\().16b, \x2\().16b, \x2\().16b, #8 288 eor \x7\().16b, \t1\().16b, \t5\().16b 289 .ifb \inv 290 eor \x2\().16b, \t0\().16b, \t4\().16b 291 eor \x4\().16b, \x4\().16b, \t3\().16b 292 eor \x5\().16b, \x5\().16b, \t7\().16b 293 eor \x3\().16b, \x3\().16b, \t6\().16b 294 eor \x6\().16b, \x6\().16b, \t2\().16b 295 .else 296 eor \t3\().16b, \t3\().16b, \x4\().16b 297 eor \x5\().16b, \x5\().16b, \t7\().16b 298 eor \x2\().16b, \x3\().16b, \t6\().16b 299 eor \x3\().16b, \t0\().16b, \t4\().16b 300 eor \x4\().16b, \x6\().16b, \t2\().16b 301 mov \x6\().16b, \t3\().16b 302 .endif 303 .endm 304 305 .macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ 306 t0, t1, t2, t3, t4, t5, t6, t7 307 ext \t0\().16b, \x0\().16b, \x0\().16b, #8 308 ext \t6\().16b, \x6\().16b, \x6\().16b, #8 309 ext \t7\().16b, \x7\().16b, \x7\().16b, #8 310 eor \t0\().16b, \t0\().16b, \x0\().16b 311 ext \t1\().16b, \x1\().16b, \x1\().16b, #8 312 eor \t6\().16b, \t6\().16b, \x6\().16b 313 ext \t2\().16b, \x2\().16b, \x2\().16b, #8 314 eor \t7\().16b, \t7\().16b, \x7\().16b 315 ext \t3\().16b, \x3\().16b, \x3\().16b, #8 316 eor \t1\().16b, \t1\().16b, \x1\().16b 317 ext \t4\().16b, \x4\().16b, \x4\().16b, #8 318 eor \t2\().16b, \t2\().16b, \x2\().16b 319 ext \t5\().16b, \x5\().16b, \x5\().16b, #8 320 eor \t3\().16b, \t3\().16b, \x3\().16b 321 eor \t4\().16b, \t4\().16b, \x4\().16b 322 eor \t5\().16b, \t5\().16b, \x5\().16b 323 eor \x0\().16b, \x0\().16b, \t6\().16b 324 eor \x1\().16b, \x1\().16b, \t6\().16b 325 eor \x2\().16b, \x2\().16b, \t0\().16b 326 eor \x4\().16b, \x4\().16b, \t2\().16b 327 eor \x3\().16b, \x3\().16b, \t1\().16b 328 eor \x1\().16b, \x1\().16b, \t7\().16b 329 eor \x2\().16b, \x2\().16b, \t7\().16b 330 eor \x4\().16b, \x4\().16b, \t6\().16b 331 eor \x5\().16b, \x5\().16b, \t3\().16b 332 eor \x3\().16b, \x3\().16b, \t6\().16b 333 eor \x6\().16b, \x6\().16b, \t4\().16b 334 eor \x4\().16b, \x4\().16b, \t7\().16b 335 eor \x5\().16b, \x5\().16b, \t7\().16b 336 eor \x7\().16b, \x7\().16b, \t5\().16b 337 mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ 338 \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1 339 .endm 340 341 .macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1 342 ushr \t0\().2d, \b0\().2d, #\n 343 ushr \t1\().2d, \b1\().2d, #\n 344 eor \t0\().16b, \t0\().16b, \a0\().16b 345 eor \t1\().16b, \t1\().16b, \a1\().16b 346 and \t0\().16b, \t0\().16b, \mask\().16b 347 and \t1\().16b, \t1\().16b, \mask\().16b 348 eor \a0\().16b, \a0\().16b, \t0\().16b 349 shl \t0\().2d, \t0\().2d, #\n 350 eor \a1\().16b, \a1\().16b, \t1\().16b 351 shl \t1\().2d, \t1\().2d, #\n 352 eor \b0\().16b, \b0\().16b, \t0\().16b 353 eor \b1\().16b, \b1\().16b, \t1\().16b 354 .endm 355 356 .macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3 357 movi \t0\().16b, #0x55 358 movi \t1\().16b, #0x33 359 swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3 360 swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3 361 movi \t0\().16b, #0x0f 362 swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3 363 swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3 364 swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3 365 swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3 366 .endm 367 368 369 .align 6 370M0: .octa 0x0004080c0105090d02060a0e03070b0f 371 372M0SR: .octa 0x0004080c05090d010a0e02060f03070b 373SR: .octa 0x0f0e0d0c0a09080b0504070600030201 374SRM0: .octa 0x01060b0c0207080d0304090e00050a0f 375 376M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 377ISR: .octa 0x0f0e0d0c080b0a090504070602010003 378ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f 379 380 /* 381 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) 382 */ 383SYM_FUNC_START(aesbs_convert_key) 384 ld1 {v7.4s}, [x1], #16 // load round 0 key 385 ld1 {v17.4s}, [x1], #16 // load round 1 key 386 387 movi v8.16b, #0x01 // bit masks 388 movi v9.16b, #0x02 389 movi v10.16b, #0x04 390 movi v11.16b, #0x08 391 movi v12.16b, #0x10 392 movi v13.16b, #0x20 393 movi v14.16b, #0x40 394 movi v15.16b, #0x80 395 ldr q16, M0 396 397 sub x2, x2, #1 398 str q7, [x0], #16 // save round 0 key 399 400.Lkey_loop: 401 tbl v7.16b ,{v17.16b}, v16.16b 402 ld1 {v17.4s}, [x1], #16 // load next round key 403 404 cmtst v0.16b, v7.16b, v8.16b 405 cmtst v1.16b, v7.16b, v9.16b 406 cmtst v2.16b, v7.16b, v10.16b 407 cmtst v3.16b, v7.16b, v11.16b 408 cmtst v4.16b, v7.16b, v12.16b 409 cmtst v5.16b, v7.16b, v13.16b 410 cmtst v6.16b, v7.16b, v14.16b 411 cmtst v7.16b, v7.16b, v15.16b 412 not v0.16b, v0.16b 413 not v1.16b, v1.16b 414 not v5.16b, v5.16b 415 not v6.16b, v6.16b 416 417 subs x2, x2, #1 418 stp q0, q1, [x0], #128 419 stp q2, q3, [x0, #-96] 420 stp q4, q5, [x0, #-64] 421 stp q6, q7, [x0, #-32] 422 b.ne .Lkey_loop 423 424 movi v7.16b, #0x63 // compose .L63 425 eor v17.16b, v17.16b, v7.16b 426 str q17, [x0] 427 ret 428SYM_FUNC_END(aesbs_convert_key) 429 430 .align 4 431SYM_FUNC_START_LOCAL(aesbs_encrypt8) 432 ldr q9, [bskey], #16 // round 0 key 433 ldr q8, M0SR 434 ldr q24, SR 435 436 eor v10.16b, v0.16b, v9.16b // xor with round0 key 437 eor v11.16b, v1.16b, v9.16b 438 tbl v0.16b, {v10.16b}, v8.16b 439 eor v12.16b, v2.16b, v9.16b 440 tbl v1.16b, {v11.16b}, v8.16b 441 eor v13.16b, v3.16b, v9.16b 442 tbl v2.16b, {v12.16b}, v8.16b 443 eor v14.16b, v4.16b, v9.16b 444 tbl v3.16b, {v13.16b}, v8.16b 445 eor v15.16b, v5.16b, v9.16b 446 tbl v4.16b, {v14.16b}, v8.16b 447 eor v10.16b, v6.16b, v9.16b 448 tbl v5.16b, {v15.16b}, v8.16b 449 eor v11.16b, v7.16b, v9.16b 450 tbl v6.16b, {v10.16b}, v8.16b 451 tbl v7.16b, {v11.16b}, v8.16b 452 453 bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 454 455 sub rounds, rounds, #1 456 b .Lenc_sbox 457 458.Lenc_loop: 459 shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 460.Lenc_sbox: 461 sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ 462 v13, v14, v15 463 subs rounds, rounds, #1 464 b.cc .Lenc_done 465 466 enc_next_rk 467 468 mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \ 469 v13, v14, v15 470 471 add_round_key v0, v1, v2, v3, v4, v5, v6, v7 472 473 b.ne .Lenc_loop 474 ldr q24, SRM0 475 b .Lenc_loop 476 477.Lenc_done: 478 ldr q12, [bskey] // last round key 479 480 bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11 481 482 eor v0.16b, v0.16b, v12.16b 483 eor v1.16b, v1.16b, v12.16b 484 eor v4.16b, v4.16b, v12.16b 485 eor v6.16b, v6.16b, v12.16b 486 eor v3.16b, v3.16b, v12.16b 487 eor v7.16b, v7.16b, v12.16b 488 eor v2.16b, v2.16b, v12.16b 489 eor v5.16b, v5.16b, v12.16b 490 ret 491SYM_FUNC_END(aesbs_encrypt8) 492 493 .align 4 494SYM_FUNC_START_LOCAL(aesbs_decrypt8) 495 lsl x9, rounds, #7 496 add bskey, bskey, x9 497 498 ldr q9, [bskey, #-112]! // round 0 key 499 ldr q8, M0ISR 500 ldr q24, ISR 501 502 eor v10.16b, v0.16b, v9.16b // xor with round0 key 503 eor v11.16b, v1.16b, v9.16b 504 tbl v0.16b, {v10.16b}, v8.16b 505 eor v12.16b, v2.16b, v9.16b 506 tbl v1.16b, {v11.16b}, v8.16b 507 eor v13.16b, v3.16b, v9.16b 508 tbl v2.16b, {v12.16b}, v8.16b 509 eor v14.16b, v4.16b, v9.16b 510 tbl v3.16b, {v13.16b}, v8.16b 511 eor v15.16b, v5.16b, v9.16b 512 tbl v4.16b, {v14.16b}, v8.16b 513 eor v10.16b, v6.16b, v9.16b 514 tbl v5.16b, {v15.16b}, v8.16b 515 eor v11.16b, v7.16b, v9.16b 516 tbl v6.16b, {v10.16b}, v8.16b 517 tbl v7.16b, {v11.16b}, v8.16b 518 519 bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 520 521 sub rounds, rounds, #1 522 b .Ldec_sbox 523 524.Ldec_loop: 525 shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 526.Ldec_sbox: 527 inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ 528 v13, v14, v15 529 subs rounds, rounds, #1 530 b.cc .Ldec_done 531 532 dec_next_rk 533 534 add_round_key v0, v1, v6, v4, v2, v7, v3, v5 535 536 inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \ 537 v13, v14, v15 538 539 b.ne .Ldec_loop 540 ldr q24, ISRM0 541 b .Ldec_loop 542.Ldec_done: 543 ldr q12, [bskey, #-16] // last round key 544 545 bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11 546 547 eor v0.16b, v0.16b, v12.16b 548 eor v1.16b, v1.16b, v12.16b 549 eor v6.16b, v6.16b, v12.16b 550 eor v4.16b, v4.16b, v12.16b 551 eor v2.16b, v2.16b, v12.16b 552 eor v7.16b, v7.16b, v12.16b 553 eor v3.16b, v3.16b, v12.16b 554 eor v5.16b, v5.16b, v12.16b 555 ret 556SYM_FUNC_END(aesbs_decrypt8) 557 558 /* 559 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 560 * int blocks) 561 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 562 * int blocks) 563 */ 564 .macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 565 frame_push 5 566 567 mov x19, x0 568 mov x20, x1 569 mov x21, x2 570 mov x22, x3 571 mov x23, x4 572 57399: mov x5, #1 574 lsl x5, x5, x23 575 subs w23, w23, #8 576 csel x23, x23, xzr, pl 577 csel x5, x5, xzr, mi 578 579 ld1 {v0.16b}, [x20], #16 580 tbnz x5, #1, 0f 581 ld1 {v1.16b}, [x20], #16 582 tbnz x5, #2, 0f 583 ld1 {v2.16b}, [x20], #16 584 tbnz x5, #3, 0f 585 ld1 {v3.16b}, [x20], #16 586 tbnz x5, #4, 0f 587 ld1 {v4.16b}, [x20], #16 588 tbnz x5, #5, 0f 589 ld1 {v5.16b}, [x20], #16 590 tbnz x5, #6, 0f 591 ld1 {v6.16b}, [x20], #16 592 tbnz x5, #7, 0f 593 ld1 {v7.16b}, [x20], #16 594 5950: mov bskey, x21 596 mov rounds, x22 597 bl \do8 598 599 st1 {\o0\().16b}, [x19], #16 600 tbnz x5, #1, 1f 601 st1 {\o1\().16b}, [x19], #16 602 tbnz x5, #2, 1f 603 st1 {\o2\().16b}, [x19], #16 604 tbnz x5, #3, 1f 605 st1 {\o3\().16b}, [x19], #16 606 tbnz x5, #4, 1f 607 st1 {\o4\().16b}, [x19], #16 608 tbnz x5, #5, 1f 609 st1 {\o5\().16b}, [x19], #16 610 tbnz x5, #6, 1f 611 st1 {\o6\().16b}, [x19], #16 612 tbnz x5, #7, 1f 613 st1 {\o7\().16b}, [x19], #16 614 615 cbz x23, 1f 616 b 99b 617 6181: frame_pop 619 ret 620 .endm 621 622 .align 4 623SYM_FUNC_START(aesbs_ecb_encrypt) 624 __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 625SYM_FUNC_END(aesbs_ecb_encrypt) 626 627 .align 4 628SYM_FUNC_START(aesbs_ecb_decrypt) 629 __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 630SYM_FUNC_END(aesbs_ecb_decrypt) 631 632 /* 633 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 634 * int blocks, u8 iv[]) 635 */ 636 .align 4 637SYM_FUNC_START(aesbs_cbc_decrypt) 638 frame_push 6 639 640 mov x19, x0 641 mov x20, x1 642 mov x21, x2 643 mov x22, x3 644 mov x23, x4 645 mov x24, x5 646 64799: mov x6, #1 648 lsl x6, x6, x23 649 subs w23, w23, #8 650 csel x23, x23, xzr, pl 651 csel x6, x6, xzr, mi 652 653 ld1 {v0.16b}, [x20], #16 654 mov v25.16b, v0.16b 655 tbnz x6, #1, 0f 656 ld1 {v1.16b}, [x20], #16 657 mov v26.16b, v1.16b 658 tbnz x6, #2, 0f 659 ld1 {v2.16b}, [x20], #16 660 mov v27.16b, v2.16b 661 tbnz x6, #3, 0f 662 ld1 {v3.16b}, [x20], #16 663 mov v28.16b, v3.16b 664 tbnz x6, #4, 0f 665 ld1 {v4.16b}, [x20], #16 666 mov v29.16b, v4.16b 667 tbnz x6, #5, 0f 668 ld1 {v5.16b}, [x20], #16 669 mov v30.16b, v5.16b 670 tbnz x6, #6, 0f 671 ld1 {v6.16b}, [x20], #16 672 mov v31.16b, v6.16b 673 tbnz x6, #7, 0f 674 ld1 {v7.16b}, [x20] 675 6760: mov bskey, x21 677 mov rounds, x22 678 bl aesbs_decrypt8 679 680 ld1 {v24.16b}, [x24] // load IV 681 682 eor v1.16b, v1.16b, v25.16b 683 eor v6.16b, v6.16b, v26.16b 684 eor v4.16b, v4.16b, v27.16b 685 eor v2.16b, v2.16b, v28.16b 686 eor v7.16b, v7.16b, v29.16b 687 eor v0.16b, v0.16b, v24.16b 688 eor v3.16b, v3.16b, v30.16b 689 eor v5.16b, v5.16b, v31.16b 690 691 st1 {v0.16b}, [x19], #16 692 mov v24.16b, v25.16b 693 tbnz x6, #1, 1f 694 st1 {v1.16b}, [x19], #16 695 mov v24.16b, v26.16b 696 tbnz x6, #2, 1f 697 st1 {v6.16b}, [x19], #16 698 mov v24.16b, v27.16b 699 tbnz x6, #3, 1f 700 st1 {v4.16b}, [x19], #16 701 mov v24.16b, v28.16b 702 tbnz x6, #4, 1f 703 st1 {v2.16b}, [x19], #16 704 mov v24.16b, v29.16b 705 tbnz x6, #5, 1f 706 st1 {v7.16b}, [x19], #16 707 mov v24.16b, v30.16b 708 tbnz x6, #6, 1f 709 st1 {v3.16b}, [x19], #16 710 mov v24.16b, v31.16b 711 tbnz x6, #7, 1f 712 ld1 {v24.16b}, [x20], #16 713 st1 {v5.16b}, [x19], #16 7141: st1 {v24.16b}, [x24] // store IV 715 716 cbz x23, 2f 717 b 99b 718 7192: frame_pop 720 ret 721SYM_FUNC_END(aesbs_cbc_decrypt) 722 723 .macro next_tweak, out, in, const, tmp 724 sshr \tmp\().2d, \in\().2d, #63 725 and \tmp\().16b, \tmp\().16b, \const\().16b 726 add \out\().2d, \in\().2d, \in\().2d 727 ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 728 eor \out\().16b, \out\().16b, \tmp\().16b 729 .endm 730 731 /* 732 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 733 * int blocks, u8 iv[]) 734 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 735 * int blocks, u8 iv[]) 736 */ 737SYM_FUNC_START_LOCAL(__xts_crypt8) 738 mov x6, #1 739 lsl x6, x6, x23 740 subs w23, w23, #8 741 csel x23, x23, xzr, pl 742 csel x6, x6, xzr, mi 743 744 ld1 {v0.16b}, [x20], #16 745 next_tweak v26, v25, v30, v31 746 eor v0.16b, v0.16b, v25.16b 747 tbnz x6, #1, 0f 748 749 ld1 {v1.16b}, [x20], #16 750 next_tweak v27, v26, v30, v31 751 eor v1.16b, v1.16b, v26.16b 752 tbnz x6, #2, 0f 753 754 ld1 {v2.16b}, [x20], #16 755 next_tweak v28, v27, v30, v31 756 eor v2.16b, v2.16b, v27.16b 757 tbnz x6, #3, 0f 758 759 ld1 {v3.16b}, [x20], #16 760 next_tweak v29, v28, v30, v31 761 eor v3.16b, v3.16b, v28.16b 762 tbnz x6, #4, 0f 763 764 ld1 {v4.16b}, [x20], #16 765 str q29, [sp, #.Lframe_local_offset] 766 eor v4.16b, v4.16b, v29.16b 767 next_tweak v29, v29, v30, v31 768 tbnz x6, #5, 0f 769 770 ld1 {v5.16b}, [x20], #16 771 str q29, [sp, #.Lframe_local_offset + 16] 772 eor v5.16b, v5.16b, v29.16b 773 next_tweak v29, v29, v30, v31 774 tbnz x6, #6, 0f 775 776 ld1 {v6.16b}, [x20], #16 777 str q29, [sp, #.Lframe_local_offset + 32] 778 eor v6.16b, v6.16b, v29.16b 779 next_tweak v29, v29, v30, v31 780 tbnz x6, #7, 0f 781 782 ld1 {v7.16b}, [x20], #16 783 str q29, [sp, #.Lframe_local_offset + 48] 784 eor v7.16b, v7.16b, v29.16b 785 next_tweak v29, v29, v30, v31 786 7870: mov bskey, x21 788 mov rounds, x22 789 br x16 790SYM_FUNC_END(__xts_crypt8) 791 792 .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 793 frame_push 6, 64 794 795 mov x19, x0 796 mov x20, x1 797 mov x21, x2 798 mov x22, x3 799 mov x23, x4 800 mov x24, x5 801 802 movi v30.2s, #0x1 803 movi v25.2s, #0x87 804 uzp1 v30.4s, v30.4s, v25.4s 805 ld1 {v25.16b}, [x24] 806 80799: adr x16, \do8 808 bl __xts_crypt8 809 810 ldp q16, q17, [sp, #.Lframe_local_offset] 811 ldp q18, q19, [sp, #.Lframe_local_offset + 32] 812 813 eor \o0\().16b, \o0\().16b, v25.16b 814 eor \o1\().16b, \o1\().16b, v26.16b 815 eor \o2\().16b, \o2\().16b, v27.16b 816 eor \o3\().16b, \o3\().16b, v28.16b 817 818 st1 {\o0\().16b}, [x19], #16 819 mov v25.16b, v26.16b 820 tbnz x6, #1, 1f 821 st1 {\o1\().16b}, [x19], #16 822 mov v25.16b, v27.16b 823 tbnz x6, #2, 1f 824 st1 {\o2\().16b}, [x19], #16 825 mov v25.16b, v28.16b 826 tbnz x6, #3, 1f 827 st1 {\o3\().16b}, [x19], #16 828 mov v25.16b, v29.16b 829 tbnz x6, #4, 1f 830 831 eor \o4\().16b, \o4\().16b, v16.16b 832 eor \o5\().16b, \o5\().16b, v17.16b 833 eor \o6\().16b, \o6\().16b, v18.16b 834 eor \o7\().16b, \o7\().16b, v19.16b 835 836 st1 {\o4\().16b}, [x19], #16 837 tbnz x6, #5, 1f 838 st1 {\o5\().16b}, [x19], #16 839 tbnz x6, #6, 1f 840 st1 {\o6\().16b}, [x19], #16 841 tbnz x6, #7, 1f 842 st1 {\o7\().16b}, [x19], #16 843 844 cbz x23, 1f 845 st1 {v25.16b}, [x24] 846 847 b 99b 848 8491: st1 {v25.16b}, [x24] 850 frame_pop 851 ret 852 .endm 853 854SYM_FUNC_START(aesbs_xts_encrypt) 855 __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 856SYM_FUNC_END(aesbs_xts_encrypt) 857 858SYM_FUNC_START(aesbs_xts_decrypt) 859 __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 860SYM_FUNC_END(aesbs_xts_decrypt) 861 862 .macro next_ctr, v 863 mov \v\().d[1], x8 864 adds x8, x8, #1 865 mov \v\().d[0], x7 866 adc x7, x7, xzr 867 rev64 \v\().16b, \v\().16b 868 .endm 869 870 /* 871 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], 872 * int rounds, int blocks, u8 iv[], u8 final[]) 873 */ 874SYM_FUNC_START(aesbs_ctr_encrypt) 875 frame_push 8 876 877 mov x19, x0 878 mov x20, x1 879 mov x21, x2 880 mov x22, x3 881 mov x23, x4 882 mov x24, x5 883 mov x25, x6 884 885 cmp x25, #0 886 cset x26, ne 887 add x23, x23, x26 // do one extra block if final 888 889 ldp x7, x8, [x24] 890 ld1 {v0.16b}, [x24] 891CPU_LE( rev x7, x7 ) 892CPU_LE( rev x8, x8 ) 893 adds x8, x8, #1 894 adc x7, x7, xzr 895 89699: mov x9, #1 897 lsl x9, x9, x23 898 subs w23, w23, #8 899 csel x23, x23, xzr, pl 900 csel x9, x9, xzr, le 901 902 tbnz x9, #1, 0f 903 next_ctr v1 904 tbnz x9, #2, 0f 905 next_ctr v2 906 tbnz x9, #3, 0f 907 next_ctr v3 908 tbnz x9, #4, 0f 909 next_ctr v4 910 tbnz x9, #5, 0f 911 next_ctr v5 912 tbnz x9, #6, 0f 913 next_ctr v6 914 tbnz x9, #7, 0f 915 next_ctr v7 916 9170: mov bskey, x21 918 mov rounds, x22 919 bl aesbs_encrypt8 920 921 lsr x9, x9, x26 // disregard the extra block 922 tbnz x9, #0, 0f 923 924 ld1 {v8.16b}, [x20], #16 925 eor v0.16b, v0.16b, v8.16b 926 st1 {v0.16b}, [x19], #16 927 tbnz x9, #1, 1f 928 929 ld1 {v9.16b}, [x20], #16 930 eor v1.16b, v1.16b, v9.16b 931 st1 {v1.16b}, [x19], #16 932 tbnz x9, #2, 2f 933 934 ld1 {v10.16b}, [x20], #16 935 eor v4.16b, v4.16b, v10.16b 936 st1 {v4.16b}, [x19], #16 937 tbnz x9, #3, 3f 938 939 ld1 {v11.16b}, [x20], #16 940 eor v6.16b, v6.16b, v11.16b 941 st1 {v6.16b}, [x19], #16 942 tbnz x9, #4, 4f 943 944 ld1 {v12.16b}, [x20], #16 945 eor v3.16b, v3.16b, v12.16b 946 st1 {v3.16b}, [x19], #16 947 tbnz x9, #5, 5f 948 949 ld1 {v13.16b}, [x20], #16 950 eor v7.16b, v7.16b, v13.16b 951 st1 {v7.16b}, [x19], #16 952 tbnz x9, #6, 6f 953 954 ld1 {v14.16b}, [x20], #16 955 eor v2.16b, v2.16b, v14.16b 956 st1 {v2.16b}, [x19], #16 957 tbnz x9, #7, 7f 958 959 ld1 {v15.16b}, [x20], #16 960 eor v5.16b, v5.16b, v15.16b 961 st1 {v5.16b}, [x19], #16 962 9638: next_ctr v0 964 st1 {v0.16b}, [x24] 965 cbz x23, .Lctr_done 966 967 b 99b 968 969.Lctr_done: 970 frame_pop 971 ret 972 973 /* 974 * If we are handling the tail of the input (x6 != NULL), return the 975 * final keystream block back to the caller. 976 */ 9770: cbz x25, 8b 978 st1 {v0.16b}, [x25] 979 b 8b 9801: cbz x25, 8b 981 st1 {v1.16b}, [x25] 982 b 8b 9832: cbz x25, 8b 984 st1 {v4.16b}, [x25] 985 b 8b 9863: cbz x25, 8b 987 st1 {v6.16b}, [x25] 988 b 8b 9894: cbz x25, 8b 990 st1 {v3.16b}, [x25] 991 b 8b 9925: cbz x25, 8b 993 st1 {v7.16b}, [x25] 994 b 8b 9956: cbz x25, 8b 996 st1 {v2.16b}, [x25] 997 b 8b 9987: cbz x25, 8b 999 st1 {v5.16b}, [x25] 1000 b 8b 1001SYM_FUNC_END(aesbs_ctr_encrypt) 1002