/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text

	/*
	 * Register aliases used by the aesbs_encrypt8/aesbs_decrypt8 cores
	 * and set up by their callers:
	 *   rounds - AES round counter (counts down inside the cores)
	 *   bskey  - pointer into the bit-sliced key schedule produced by
	 *            aesbs_convert_key()
	 */
	rounds		.req	x11
	bskey		.req	x12

	/*
	 * in_bs_ch - change of basis applied to the eight bit slices of the
	 * state before inversion in GF(2^8) (used by the 'sbox' macro below).
	 * Each operand is one bit slice across all 8 blocks; the fixed XOR
	 * network is the linear input layer of the bit-sliced S-box circuit
	 * from the Kaesper/Schwabe paper referenced above.
	 */
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b2, \b2, \b1
	eor		\b5, \b5, \b6
	eor		\b3, \b3, \b0
	eor		\b6, \b6, \b2
	eor		\b5, \b5, \b0
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b4, \b4, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b1, \b1, \b5
	.endm

	/*
	 * out_bs_ch - inverse change of basis applied to the slices after
	 * GF(2^8) inversion (output layer of the 'sbox' macro below).
	 */
	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	eor		\b4, \b4, \b6
	eor		\b2, \b2, \b0
	eor		\b6, \b6, \b1
	eor		\b1, \b1, \b5
	eor		\b5, \b5, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b2, \b2, \b5
	eor		\b4, \b4, \b7
	.endm

	/*
	 * inv_in_bs_ch - input basis change for the inverse S-box (used by
	 * 'inv_sbox' below).  Note the permuted formal parameter order: the
	 * caller's slice registers are rewired here rather than with moves.
	 */
	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor		\b1, \b1, \b7
	eor		\b4, \b4, \b7
	eor		\b7, \b7, \b5
	eor		\b1, \b1, \b3
	eor		\b2, \b2, \b5
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b1
	eor		\b2, \b2, \b0
	eor		\b5, \b5, \b3
	eor		\b4, \b4, \b6
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	.endm

	/*
	 * inv_out_bs_ch - output basis change for the inverse S-box, again
	 * with a permuted parameter order to avoid register moves.
	 */
	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor		\b1, \b1, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b4, \b4, \b5
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b5, \b5, \b0
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b2
	eor		\b2, \b2, \b1
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b0
	eor		\b5, \b5, \b6
	.endm

	/*
	 * mul_gf4 - multiply (\x0,\x1) by (\y0,\y1) in place, where each
	 * register pair holds the two bit slices of a GF(2^2) element
	 * (tower-field building block of the GF(2^8) inverter).
	 * \t0, \t1 are clobbered; \y0, \y1 are preserved.
	 */
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	eor		\t0, \y0, \y1
	and		\t0, \t0, \x0
	eor		\x0, \x0, \x1
	and		\t1, \x1, \y0
	and		\x0, \x0, \y1
	eor		\x1, \t1, \t0
	eor		\x0, \x0, \t1
	.endm

	/*
	 * mul_gf4_n_gf4 - two interleaved GF(2^2) multiplications:
	 * (\x0,\x1) *= (\y0,\y1) and (\x2,\x3) *= (\y2,\y3), with the second
	 * product's slices left in a complemented-style arrangement as
	 * required by the surrounding mul_gf16_2 network (matches the
	 * original OpenSSL bsaes code).  \t0, \t1 are clobbered.
	 */
	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor		\t0, \y0, \y1
	eor 		\t1, \y2, \y3
	and		\t0, \t0, \x0
	and		\t1, \t1, \x2
	eor		\x0, \x0, \x1
	eor		\x2, \x2, \x3
	and		\x1, \x1, \y0
	and		\x3, \x3, \y2
	and		\x0, \x0, \y1
	and		\x2, \x2, \y3
	eor		\x1, \x1, \x0
	eor		\x2, \x2, \x3
	eor		\x0, \x0, \t0
	eor		\x3, \x3, \t1
	.endm

	/*
	 * mul_gf16_2 - multiply the two GF(2^4) halves of the state
	 * (\x0..\x3 and \x4..\x7) by the GF(2^4) element in \y0..\y3,
	 * built from the GF(2^2) multipliers above.  \y0..\y3 and
	 * \t0..\t3 are clobbered.
	 */
	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	eor		\t0, \x0, \x2
	eor		\t1, \x1, \x3
	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor		\x0, \x0, \t0
	eor		\x2, \x2, \t0
	eor		\x1, \x1, \t1
	eor		\x3, \x3, \t1
	eor		\t0, \x4, \x6
	eor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
	eor		\x4, \x4, \t0
	eor		\x6, \x6, \t0
	eor		\x5, \x5, \t1
	eor		\x7, \x7, \t1
	.endm

	/*
	 * inv_gf256 - compute the multiplicative inverse in GF(2^8) of the
	 * bit-sliced element in \x0..\x7, using the tower-field
	 * decomposition GF(2^8)/GF(2^4)/GF(2^2).  This is the nonlinear
	 * core shared by the S-box and inverse S-box; the eight temporaries
	 * \t0..\t3, \s0..\s3 are clobbered.  The instruction order encodes
	 * the Boolean circuit - do not reorder.
	 */
	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	eor		\t3, \x4, \x6
	eor		\t0, \x5, \x7
	eor		\t1, \x1, \x3
	eor		\s1, \x7, \x6
	eor		\s0, \x0, \x2
	eor		\s3, \t3, \t0
	orr		\t2, \t0, \t1
	and		\s2, \t3, \s0
	orr		\t3, \t3, \s0
	eor		\s0, \s0, \t1
	and		\t0, \t0, \t1
	eor		\t1, \x3, \x2
	and		\s3, \s3, \s0
	and		\s1, \s1, \t1
	eor		\t1, \x4, \x5
	eor		\s0, \x1, \x0
	eor		\t3, \t3, \s1
	eor		\t2, \t2, \s1
	and		\s1, \t1, \s0
	orr		\t1, \t1, \s0
	eor		\t3, \t3, \s3
	eor		\t0, \t0, \s1
	eor		\t2, \t2, \s2
	eor		\t1, \t1, \s3
	eor		\t0, \t0, \s2
	and		\s0, \x7, \x3
	eor		\t1, \t1, \s2
	and		\s1, \x6, \x2
	and		\s2, \x5, \x1
	orr		\s3, \x4, \x0
	eor		\t3, \t3, \s0
	eor		\t1, \t1, \s2
	eor		\s0, \t0, \s3
	eor		\t2, \t2, \s1
	and		\s2, \t3, \t1
	eor		\s1, \t2, \s2
	eor		\s3, \s0, \s2
	bsl		\s1, \t1, \s0		// bitwise select: constant-time mux
	not		\t0, \s0
	bsl		\s0, \s1, \s3
	bsl		\t0, \s1, \s3
	bsl		\s3, \t3, \t2
	eor		\t3, \t3, \t2
	and		\s2, \s0, \s3
	eor		\t1, \t1, \t0
	eor		\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

	/*
	 * sbox - apply the AES SubBytes step to the bit-sliced state in
	 * \b0..\b7: input basis change, GF(2^8) inversion, output basis
	 * change.  \t0..\t3 and \s0..\s3 are clobbered.  The permuted
	 * operand order on inv_gf256/out_bs_ch matches the wiring of the
	 * basis-change networks above.
	 */
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

	/*
	 * inv_sbox - apply the AES InvSubBytes step to the bit-sliced state
	 * in \b0..\b7 (same structure as 'sbox' with the inverse basis
	 * changes).  \t0..\t3 and \s0..\s3 are clobbered.
	 */
	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

	/*
	 * enc_next_rk - load the next 128-byte bit-sliced round key into
	 * v16-v23, advancing bskey forward.
	 */
	.macro		enc_next_rk
	ldp		q16, q17, [bskey], #128
	ldp		q18, q19, [bskey, #-96]
	ldp		q20, q21, [bskey, #-64]
	ldp		q22, q23, [bskey, #-32]
	.endm

	/*
	 * dec_next_rk - load the previous 128-byte bit-sliced round key into
	 * v16-v23; decryption walks the key schedule backwards, hence the
	 * pre-decrement.
	 */
	.macro		dec_next_rk
	ldp		q16, q17, [bskey, #-128]!
	ldp		q18, q19, [bskey, #32]
	ldp		q20, q21, [bskey, #64]
	ldp		q22, q23, [bskey, #96]
	.endm

	/*
	 * add_round_key - XOR the round key slices in v16-v23 into the
	 * state slices \x0..\x7 (AES AddRoundKey on bit-sliced data).
	 */
	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor		\x0\().16b, \x0\().16b, v16.16b
	eor		\x1\().16b, \x1\().16b, v17.16b
	eor		\x2\().16b, \x2\().16b, v18.16b
	eor		\x3\().16b, \x3\().16b, v19.16b
	eor		\x4\().16b, \x4\().16b, v20.16b
	eor		\x5\().16b, \x5\().16b, v21.16b
	eor		\x6\().16b, \x6\().16b, v22.16b
	eor		\x7\().16b, \x7\().16b, v23.16b
	.endm

	/*
	 * shift_rows - apply a byte permutation (ShiftRows or its inverse,
	 * depending on the tbl index vector in \mask) to every slice.
	 */
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

	/*
	 * mix_cols - AES MixColumns on the bit-sliced state, implemented
	 * with byte rotations (ext #12 = rotate by 4 bytes, ext #8 = rotate
	 * by 8) and XORs.  \t0..\t7 are clobbered.  When \inv is non-blank
	 * the tail wires the outputs for use by inv_mix_cols below instead
	 * of the plain encryption ordering.
	 */
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor		\x2\().16b, \x2\().16b, \t2\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor		\x3\().16b, \x3\().16b, \t3\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor		\x4\().16b, \x4\().16b, \t4\().16b
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor		\x5\().16b, \x5\().16b, \t5\().16b
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor		\x6\().16b, \x6\().16b, \t6\().16b
	eor		\t1\().16b, \t1\().16b, \x0\().16b
	eor		\x7\().16b, \x7\().16b, \t7\().16b
	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x1\().16b
	eor		\t0\().16b, \t0\().16b, \x7\().16b
	eor		\t1\().16b, \t1\().16b, \x7\().16b
	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t5\().16b, \t5\().16b, \x4\().16b
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	eor		\t6\().16b, \t6\().16b, \x5\().16b
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x3\().16b
	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x6\().16b
	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x2\().16b
	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x7\().16b
	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x7\().16b
	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor		\x7\().16b, \t1\().16b, \t5\().16b
	.ifb		\inv
	eor		\x2\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t3\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor		\t3\().16b, \t3\().16b, \x4\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x2\().16b, \x3\().16b, \t6\().16b
	eor		\x3\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x6\().16b, \t2\().16b
	mov		\x6\().16b, \t3\().16b
	.endif
	.endm

	/*
	 * inv_mix_cols - AES InvMixColumns: a pre-mix built from 8-byte
	 * rotations followed by mix_cols with \inv set (InvMixColumns
	 * factors as this pre-mix composed with MixColumns).  \t0..\t7 are
	 * clobbered.
	 */
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t0\().16b, \t0\().16b, \x0\().16b
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t6\().16b, \t6\().16b, \x6\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x7\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t1\().16b, \t1\().16b, \x1\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x2\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x3\().16b
	eor		\t4\().16b, \t4\().16b, \x4\().16b
	eor		\t5\().16b, \t5\().16b, \x5\().16b
	eor		\x0\().16b, \x0\().16b, \t6\().16b
	eor		\x1\().16b, \x1\().16b, \t6\().16b
	eor		\x2\().16b, \x2\().16b, \t0\().16b
	eor		\x4\().16b, \x4\().16b, \t2\().16b
	eor		\x3\().16b, \x3\().16b, \t1\().16b
	eor		\x1\().16b, \x1\().16b, \t7\().16b
	eor		\x2\().16b, \x2\().16b, \t7\().16b
	eor		\x4\().16b, \x4\().16b, \t6\().16b
	eor		\x5\().16b, \x5\().16b, \t3\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t7\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

	/*
	 * swapmove_2x - classic SWAPMOVE step, two registers at a time:
	 * exchange the bits selected by \mask between \b0 and \a0 (shifted
	 * apart by \n bit positions), and likewise between \b1 and \a1.
	 * \t0, \t1 are clobbered.
	 */
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr		\t0\().2d, \b0\().2d, #\n
	ushr		\t1\().2d, \b1\().2d, #\n
	eor		\t0\().16b, \t0\().16b, \a0\().16b
	eor		\t1\().16b, \t1\().16b, \a1\().16b
	and		\t0\().16b, \t0\().16b, \mask\().16b
	and		\t1\().16b, \t1\().16b, \mask\().16b
	eor		\a0\().16b, \a0\().16b, \t0\().16b
	shl		\t0\().2d, \t0\().2d, #\n
	eor		\a1\().16b, \a1\().16b, \t1\().16b
	shl		\t1\().2d, \t1\().2d, #\n
	eor		\b0\().16b, \b0\().16b, \t0\().16b
	eor		\b1\().16b, \b1\().16b, \t1\().16b
	.endm

	/*
	 * bitslice - transpose 8 x 128-bit blocks into 8 bit-slice vectors
	 * (bit i of every byte gathered into one register) using three
	 * rounds of SWAPMOVE with masks 0x55/0x33/0x0f.  The same macro is
	 * used in both directions in this file, i.e. the permutation is its
	 * own inverse.  \t0..\t3 are clobbered.
	 */
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi		\t0\().16b, #0x55
	movi		\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi		\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm

	/*
	 * Index vectors for the tbl instruction.  Per the original bsaes
	 * naming: M0 is the bitslice byte-interleave permutation, SR is
	 * ShiftRows, and M0SR/SRM0 compose the two so that a single tbl
	 * performs both permutations (NOTE(review): composition inferred
	 * from the names and usage; verify against the OpenSSL bsaes code).
	 */
	.align		6
M0:	.octa		0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f

	/* inverse-ShiftRows variants of the index vectors above */
M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 *
	 * Convert an expanded AES key schedule (rk[], from the generic key
	 * expansion) into the bit-sliced format consumed by the routines
	 * below: the round 0 key is stored as is, each inner round key is
	 * permuted by M0 and expanded into eight 16-byte bitmask vectors
	 * (one per key bit slice, 128 bytes per round), and the final key
	 * is stored XORed with 0x63 (the S-box constant, so it can be
	 * folded into the last AddRoundKey).
	 */
ENTRY(aesbs_convert_key)
	ld1		{v7.4s}, [x1], #16		// load round 0 key
	ld1		{v17.4s}, [x1], #16		// load round 1 key

	movi		v8.16b, #0x01			// bit masks
	movi		v9.16b, #0x02
	movi		v10.16b, #0x04
	movi		v11.16b, #0x08
	movi		v12.16b, #0x10
	movi		v13.16b, #0x20
	movi		v14.16b, #0x40
	movi		v15.16b, #0x80
	ldr		q16, M0

	sub		x2, x2, #1
	str		q7, [x0], #16		// save round 0 key

.Lkey_loop:
	tbl		v7.16b ,{v17.16b}, v16.16b	// permute by M0
	ld1		{v17.4s}, [x1], #16		// load next round key

	cmtst		v0.16b, v7.16b, v8.16b	// per-bit test -> all-ones/zero
	cmtst		v1.16b, v7.16b, v9.16b
	cmtst		v2.16b, v7.16b, v10.16b
	cmtst		v3.16b, v7.16b, v11.16b
	cmtst		v4.16b, v7.16b, v12.16b
	cmtst		v5.16b, v7.16b, v13.16b
	cmtst		v6.16b, v7.16b, v14.16b
	cmtst		v7.16b, v7.16b, v15.16b
	not		v0.16b, v0.16b		// slices 0, 1, 5, 6 are stored
	not		v1.16b, v1.16b		// complemented (absorbs the NOTs
	not		v5.16b, v5.16b		// of the bit-sliced S-box)
	not		v6.16b, v6.16b

	subs		x2, x2, #1
	stp		q0, q1, [x0], #128
	stp		q2, q3, [x0, #-96]
	stp		q4, q5, [x0, #-64]
	stp		q6, q7, [x0, #-32]
	b.ne		.Lkey_loop

	movi		v7.16b, #0x63		// compose .L63
	eor		v17.16b, v17.16b, v7.16b
	str		q17, [x0]
	ret
ENDPROC(aesbs_convert_key)

	/*
	 * aesbs_encrypt8 - encrypt the 8 blocks in v0-v7 in parallel.
	 * In:  bskey -> bit-sliced key schedule, rounds = round count,
	 *      v0-v7 = plaintext blocks.
	 * Out: ciphertext in v0, v1, v4, v6, v3, v7, v2, v5 (in that block
	 *      order - see the output register lists at the call sites).
	 * Clobbers v8-v15, v24, rounds, bskey.
	 */
	.align		4
aesbs_encrypt8:
	ldr		q9, [bskey], #16		// round 0 key
	ldr		q8, M0SR
	ldr		q24, SR

	eor		v10.16b, v0.16b, v9.16b	// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Lenc_sbox

.Lenc_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Lenc_done

	enc_next_rk

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	b.ne		.Lenc_loop
	ldr		q24, SRM0		// penultimate round: switch mask so the
	b		.Lenc_loop		// final shift_rows also un-interleaves

.Lenc_done:
	ldr		q12, [bskey]		// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
ENDPROC(aesbs_encrypt8)

	/*
	 * aesbs_decrypt8 - decrypt the 8 blocks in v0-v7 in parallel.
	 * Same contract as aesbs_encrypt8, but walks the key schedule
	 * backwards (starting at rounds * 128 - 112 from bskey) and the
	 * plaintext comes out in v0, v1, v6, v4, v2, v7, v3, v5.
	 * Clobbers v8-v15, v24, x9, rounds, bskey.
	 */
	.align		4
aesbs_decrypt8:
	lsl		x9, rounds, #7		// 128 bytes of bit-sliced key/round
	add		bskey, bskey, x9

	ldr		q9, [bskey, #-112]!		// round 0 key
	ldr		q8, M0ISR
	ldr		q24, ISR

	eor		v10.16b, v0.16b, v9.16b	// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Ldec_sbox

.Ldec_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Ldec_done

	dec_next_rk

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	b.ne		.Ldec_loop
	ldr		q24, ISRM0		// last round uses the composed mask
	b		.Ldec_loop
.Ldec_done:
	ldr		q12, [bskey, #-16]	// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
ENDPROC(aesbs_decrypt8)

	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 *
	 * __ecb_crypt processes up to 8 blocks per iteration.  x5 holds
	 * (1 << blocks) when fewer than 8 blocks remain (else 0), so
	 * 'tbnz x5, #n' cuts the load/store ladders short after n blocks.
	 * \o0..\o7 name the output registers in the block order produced
	 * by \do8.  frame_push/frame_pop and cond_yield_neon come from
	 * <asm/assembler.h>.
	 */
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	5

	mov		x19, x0		// out
	mov		x20, x1		// in
	mov		x21, x2		// bit-sliced key schedule
	mov		x22, x3		// rounds
	mov		x23, x4		// blocks remaining

99:	mov		x5, #1
	lsl		x5, x5, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl	// remaining := max(remaining - 8, 0)
	csel		x5, x5, xzr, mi		// partial-batch bitmask (0 if full 8)

	ld1		{v0.16b}, [x20], #16
	tbnz		x5, #1, 0f
	ld1		{v1.16b}, [x20], #16
	tbnz		x5, #2, 0f
	ld1		{v2.16b}, [x20], #16
	tbnz		x5, #3, 0f
	ld1		{v3.16b}, [x20], #16
	tbnz		x5, #4, 0f
	ld1		{v4.16b}, [x20], #16
	tbnz		x5, #5, 0f
	ld1		{v5.16b}, [x20], #16
	tbnz		x5, #6, 0f
	ld1		{v6.16b}, [x20], #16
	tbnz		x5, #7, 0f
	ld1		{v7.16b}, [x20], #16

0:	mov		bskey, x21
	mov		rounds, x22
	bl		\do8

	st1		{\o0\().16b}, [x19], #16
	tbnz		x5, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	tbnz		x5, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	tbnz		x5, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	tbnz		x5, #4, 1f
	st1		{\o4\().16b}, [x19], #16
	tbnz		x5, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x5, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x5, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f
	cond_yield_neon
	b		99b

1:	frame_pop
	ret
	.endm

	.align		4
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

	.align		4
ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)

	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 *
	 * CBC decryption of up to 8 blocks per iteration.  The ciphertext
	 * blocks are saved in v25-v31 before decryption so each plaintext
	 * can be XORed with the previous ciphertext; v24 carries the
	 * chaining value (IV) across iterations, and the last ciphertext
	 * handled becomes the new IV stored back to iv[].  x6 is the same
	 * partial-batch bitmask as x5 in __ecb_crypt.
	 */
	.align		4
ENTRY(aesbs_cbc_decrypt)
	frame_push	6

	mov		x19, x0		// out
	mov		x20, x1		// in
	mov		x21, x2		// bit-sliced key schedule
	mov		x22, x3		// rounds
	mov		x23, x4		// blocks remaining
	mov		x24, x5		// iv

99:	mov		x6, #1
	lsl		x6, x6, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl
	csel		x6, x6, xzr, mi

	ld1		{v0.16b}, [x20], #16
	mov		v25.16b, v0.16b		// keep ciphertext for chaining
	tbnz		x6, #1, 0f
	ld1		{v1.16b}, [x20], #16
	mov		v26.16b, v1.16b
	tbnz		x6, #2, 0f
	ld1		{v2.16b}, [x20], #16
	mov		v27.16b, v2.16b
	tbnz		x6, #3, 0f
	ld1		{v3.16b}, [x20], #16
	mov		v28.16b, v3.16b
	tbnz		x6, #4, 0f
	ld1		{v4.16b}, [x20], #16
	mov		v29.16b, v4.16b
	tbnz		x6, #5, 0f
	ld1		{v5.16b}, [x20], #16
	mov		v30.16b, v5.16b
	tbnz		x6, #6, 0f
	ld1		{v6.16b}, [x20], #16
	mov		v31.16b, v6.16b
	tbnz		x6, #7, 0f
	ld1		{v7.16b}, [x20]		// no writeback: reloaded below if needed

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_decrypt8

	ld1		{v24.16b}, [x24]	// load IV

	eor		v1.16b, v1.16b, v25.16b	// plaintext = D(C[i]) ^ C[i-1]
	eor		v6.16b, v6.16b, v26.16b	// (output block order of decrypt8)
	eor		v4.16b, v4.16b, v27.16b
	eor		v2.16b, v2.16b, v28.16b
	eor		v7.16b, v7.16b, v29.16b
	eor		v0.16b, v0.16b, v24.16b
	eor		v3.16b, v3.16b, v30.16b
	eor		v5.16b, v5.16b, v31.16b

	st1		{v0.16b}, [x19], #16
	mov		v24.16b, v25.16b	// track last ciphertext as next IV
	tbnz		x6, #1, 1f
	st1		{v1.16b}, [x19], #16
	mov		v24.16b, v26.16b
	tbnz		x6, #2, 1f
	st1		{v6.16b}, [x19], #16
	mov		v24.16b, v27.16b
	tbnz		x6, #3, 1f
	st1		{v4.16b}, [x19], #16
	mov		v24.16b, v28.16b
	tbnz		x6, #4, 1f
	st1		{v2.16b}, [x19], #16
	mov		v24.16b, v29.16b
	tbnz		x6, #5, 1f
	st1		{v7.16b}, [x19], #16
	mov		v24.16b, v30.16b
	tbnz		x6, #6, 1f
	st1		{v3.16b}, [x19], #16
	mov		v24.16b, v31.16b
	tbnz		x6, #7, 1f
	ld1		{v24.16b}, [x20], #16	// 8th ciphertext = next IV
	st1		{v5.16b}, [x19], #16
1:	st1		{v24.16b}, [x24]	// store IV

	cbz		x23, 2f
	cond_yield_neon
	b		99b

2:	frame_pop
	ret
ENDPROC(aesbs_cbc_decrypt)

	/*
	 * next_tweak - compute the next XTS tweak: \out = \in * x in
	 * GF(2^128) with the low/high qword layout given by .Lxts_mul_x
	 * (doubling plus conditional XOR of the 0x87 feedback constant).
	 * \const holds .Lxts_mul_x; \tmp is clobbered.
	 */
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d, \in\().2d, #63	// replicate top bits
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d, \in\().2d, \in\().2d	// double both qwords
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b	// apply carry/feedback
	.endm

	.align		4
.Lxts_mul_x:
CPU_LE(	.quad		1, 0x87		)
CPU_BE(	.quad		0x87, 1		)

	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 *
	 * __xts_crypt8 loads up to 8 blocks, XORs each with its tweak and
	 * derives the next one.  Tweaks for blocks 0-3 stay live in
	 * v25-v28; tweaks for blocks 4-7 are spilled to the frame's local
	 * area (v29 is reused for each), to be reloaded by __xts_crypt
	 * after the cipher runs.  Tail-jumps to the cipher core in x7.
	 */
__xts_crypt8:
	mov		x6, #1
	lsl		x6, x6, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl
	csel		x6, x6, xzr, mi		// partial-batch bitmask

	ld1		{v0.16b}, [x20], #16
	next_tweak	v26, v25, v30, v31
	eor		v0.16b, v0.16b, v25.16b
	tbnz		x6, #1, 0f

	ld1		{v1.16b}, [x20], #16
	next_tweak	v27, v26, v30, v31
	eor		v1.16b, v1.16b, v26.16b
	tbnz		x6, #2, 0f

	ld1		{v2.16b}, [x20], #16
	next_tweak	v28, v27, v30, v31
	eor		v2.16b, v2.16b, v27.16b
	tbnz		x6, #3, 0f

	ld1		{v3.16b}, [x20], #16
	next_tweak	v29, v28, v30, v31
	eor		v3.16b, v3.16b, v28.16b
	tbnz		x6, #4, 0f

	ld1		{v4.16b}, [x20], #16
	str		q29, [sp, #.Lframe_local_offset]	// spill tweak 4
	eor		v4.16b, v4.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #5, 0f

	ld1		{v5.16b}, [x20], #16
	str		q29, [sp, #.Lframe_local_offset + 16]	// spill tweak 5
	eor		v5.16b, v5.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #6, 0f

	ld1		{v6.16b}, [x20], #16
	str		q29, [sp, #.Lframe_local_offset + 32]	// spill tweak 6
	eor		v6.16b, v6.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #7, 0f

	ld1		{v7.16b}, [x20], #16
	str		q29, [sp, #.Lframe_local_offset + 48]	// spill tweak 7
	eor		v7.16b, v7.16b, v29.16b
	next_tweak	v29, v29, v30, v31

0:	mov		bskey, x21
	mov		rounds, x22
	br		x7			// tail-call cipher core; it returns
ENDPROC(__xts_crypt8)				// to __xts_crypt's 'bl' site

	/*
	 * __xts_crypt - outer XTS loop: run __xts_crypt8 + cipher core,
	 * then XOR the outputs with their tweaks again (blocks 4-7 use the
	 * tweaks reloaded from the stack into q16-q19) and store.  v25
	 * tracks the tweak to carry over; it is written back to iv[] both
	 * when yielding and on exit.
	 */
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	6, 64

	mov		x19, x0		// out
	mov		x20, x1		// in
	mov		x21, x2		// bit-sliced key schedule
	mov		x22, x3		// rounds
	mov		x23, x4		// blocks remaining
	mov		x24, x5		// iv

0:	ldr		q30, .Lxts_mul_x	// reloaded after a NEON yield
	ld1		{v25.16b}, [x24]

99:	adr		x7, \do8
	bl		__xts_crypt8

	ldp		q16, q17, [sp, #.Lframe_local_offset]	// tweaks 4-7
	ldp		q18, q19, [sp, #.Lframe_local_offset + 32]

	eor		\o0\().16b, \o0\().16b, v25.16b
	eor		\o1\().16b, \o1\().16b, v26.16b
	eor		\o2\().16b, \o2\().16b, v27.16b
	eor		\o3\().16b, \o3\().16b, v28.16b

	st1		{\o0\().16b}, [x19], #16
	mov		v25.16b, v26.16b	// carry next tweak forward
	tbnz		x6, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	mov		v25.16b, v27.16b
	tbnz		x6, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	mov		v25.16b, v28.16b
	tbnz		x6, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	mov		v25.16b, v29.16b
	tbnz		x6, #4, 1f

	eor		\o4\().16b, \o4\().16b, v16.16b
	eor		\o5\().16b, \o5\().16b, v17.16b
	eor		\o6\().16b, \o6\().16b, v18.16b
	eor		\o7\().16b, \o7\().16b, v19.16b

	st1		{\o4\().16b}, [x19], #16
	tbnz		x6, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x6, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x6, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f
	st1		{v25.16b}, [x24]	// persist tweak before yielding

	cond_yield_neon	0b
	b		99b

1:	st1		{v25.16b}, [x24]
	frame_pop
	ret
	.endm

ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)

	/*
	 * next_ctr - materialize the big-endian 128-bit counter held in
	 * x7 (hi) / x8 (lo) into \v, then increment x7:x8 with carry.
	 */
	.macro		next_ctr, v
	mov		\v\().d[1], x8
	adds		x8, x8, #1
	mov		\v\().d[0], x7
	adc		x7, x7, xzr
	rev64		\v\().16b, \v\().16b	// back to big-endian byte order
	.endm

	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[], u8 final[])
	 *
	 * CTR mode, up to 8 blocks per iteration.  If final[] is non-NULL,
	 * one extra counter block is encrypted and its keystream is written
	 * to final[] for the caller to handle the partial tail block; note
	 * the 'le' condition (rather than 'mi') on the bitmask csel, and
	 * the 'lsr x9, x9, x26' which drops the extra block from the
	 * store bitmask.
	 */
ENTRY(aesbs_ctr_encrypt)
	frame_push	8

	mov		x19, x0		// out
	mov		x20, x1		// in
	mov		x21, x2		// bit-sliced key schedule
	mov		x22, x3		// rounds
	mov		x23, x4		// blocks remaining
	mov		x24, x5		// iv (counter)
	mov		x25, x6		// final keystream block or NULL

	cmp		x25, #0
	cset		x26, ne
	add		x23, x23, x26		// do one extra block if final

98:	ldp		x7, x8, [x24]		// counter as two 64-bit halves
	ld1		{v0.16b}, [x24]
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	adds		x8, x8, #1
	adc		x7, x7, xzr

99:	mov		x9, #1
	lsl		x9, x9, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl
	csel		x9, x9, xzr, le		// le: extra block counts too

	tbnz		x9, #1, 0f
	next_ctr	v1
	tbnz		x9, #2, 0f
	next_ctr	v2
	tbnz		x9, #3, 0f
	next_ctr	v3
	tbnz		x9, #4, 0f
	next_ctr	v4
	tbnz		x9, #5, 0f
	next_ctr	v5
	tbnz		x9, #6, 0f
	next_ctr	v6
	tbnz		x9, #7, 0f
	next_ctr	v7

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_encrypt8

	lsr		x9, x9, x26		// disregard the extra block
	tbnz		x9, #0, 0f

	ld1		{v8.16b}, [x20], #16
	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x19], #16
	tbnz		x9, #1, 1f

	ld1		{v9.16b}, [x20], #16
	eor		v1.16b, v1.16b, v9.16b
	st1		{v1.16b}, [x19], #16
	tbnz		x9, #2, 2f

	ld1		{v10.16b}, [x20], #16
	eor		v4.16b, v4.16b, v10.16b	// decrypt8/encrypt8 block order
	st1		{v4.16b}, [x19], #16
	tbnz		x9, #3, 3f

	ld1		{v11.16b}, [x20], #16
	eor		v6.16b, v6.16b, v11.16b
	st1		{v6.16b}, [x19], #16
	tbnz		x9, #4, 4f

	ld1		{v12.16b}, [x20], #16
	eor		v3.16b, v3.16b, v12.16b
	st1		{v3.16b}, [x19], #16
	tbnz		x9, #5, 5f

	ld1		{v13.16b}, [x20], #16
	eor		v7.16b, v7.16b, v13.16b
	st1		{v7.16b}, [x19], #16
	tbnz		x9, #6, 6f

	ld1		{v14.16b}, [x20], #16
	eor		v2.16b, v2.16b, v14.16b
	st1		{v2.16b}, [x19], #16
	tbnz		x9, #7, 7f

	ld1		{v15.16b}, [x20], #16
	eor		v5.16b, v5.16b, v15.16b
	st1		{v5.16b}, [x19], #16

8:	next_ctr	v0
	st1		{v0.16b}, [x24]		// persist updated counter
	cbz		x23, .Lctr_done

	cond_yield_neon	98b
	b		99b

.Lctr_done:
	frame_pop
	ret

	/*
	 * If we are handling the tail of the input (x6 != NULL), return the
	 * final keystream block back to the caller.
	 */
0:	cbz		x25, 8b
	st1		{v0.16b}, [x25]
	b		8b
1:	cbz		x25, 8b
	st1		{v1.16b}, [x25]
	b		8b
2:	cbz		x25, 8b
	st1		{v4.16b}, [x25]
	b		8b
3:	cbz		x25, 8b
	st1		{v6.16b}, [x25]
	b		8b
4:	cbz		x25, 8b
	st1		{v3.16b}, [x25]
	b		8b
5:	cbz		x25, 8b
	st1		{v7.16b}, [x25]
	b		8b
6:	cbz		x25, 8b
	st1		{v2.16b}, [x25]
	b		8b
7:	cbz		x25, 8b
	st1		{v5.16b}, [x25]
	b		8b
ENDPROC(aesbs_ctr_encrypt)