/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text

        rounds          .req    x11
        bskey           .req    x12

        .macro          in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor             \b2, \b2, \b1
        eor             \b5, \b5, \b6
        eor             \b3, \b3, \b0
        eor             \b6, \b6, \b2
        eor             \b5, \b5, \b0
        eor             \b6, \b6, \b3
        eor             \b3, \b3, \b7
        eor             \b7, \b7, \b5
        eor             \b3, \b3, \b4
        eor             \b4, \b4, \b5
        eor             \b2, \b2, \b7
        eor             \b3, \b3, \b1
        eor             \b1, \b1, \b5
        .endm

        .macro          out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor             \b0, \b0, \b6
        eor             \b1, \b1, \b4
        eor             \b4, \b4, \b6
        eor             \b2, \b2, \b0
        eor             \b6, \b6, \b1
        eor             \b1, \b1, \b5
        eor             \b5, \b5, \b3
        eor             \b3, \b3, \b7
        eor             \b7, \b7, \b5
        eor             \b2, \b2, \b5
        eor             \b4, \b4, \b7
        .endm

        .macro          inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
        eor             \b1, \b1, \b7
        eor             \b4, \b4, \b7
        eor             \b7, \b7, \b5
        eor             \b1, \b1, \b3
        eor             \b2, \b2, \b5
        eor             \b3, \b3, \b7
        eor             \b6, \b6, \b1
        eor             \b2, \b2, \b0
        eor             \b5, \b5, \b3
        eor             \b4, \b4, \b6
        eor             \b0, \b0, \b6
        eor             \b1, \b1, \b4
        .endm

        .macro          inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
        eor             \b1, \b1, \b5
        eor             \b2, \b2, \b7
        eor             \b3, \b3, \b1
        eor             \b4, \b4, \b5
        eor             \b7, \b7, \b5
        eor             \b3, \b3, \b4
        eor             \b5, \b5, \b0
        eor             \b3, \b3, \b7
        eor             \b6, \b6, \b2
        eor             \b2, \b2, \b1
        eor             \b6, \b6, \b3
        eor             \b3, \b3, \b0
        eor             \b5, \b5, \b6
        .endm

        .macro          mul_gf4, x0, x1, y0, y1, t0, t1
        eor             \t0, \y0, \y1
        and             \t0, \t0, \x0
        eor             \x0, \x0, \x1
        and             \t1, \x1, \y0
        and             \x0, \x0, \y1
        eor             \x1, \t1, \t0
        eor             \x0, \x0, \t1
        .endm

        .macro          mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
        eor             \t0, \y0, \y1
        eor             \t1, \y2, \y3
        and             \t0, \t0, \x0
        and             \t1, \t1, \x2
        eor             \x0, \x0, \x1
        eor             \x2, \x2, \x3
        and             \x1, \x1, \y0
        and             \x3, \x3, \y2
        and             \x0, \x0, \y1
        and             \x2, \x2, \y3
        eor             \x1, \x1, \x0
        eor             \x2, \x2, \x3
        eor             \x0, \x0, \t0
        eor             \x3, \x3, \t1
        .endm

        .macro          mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
                                    y0, y1, y2, y3, t0, t1, t2, t3
        eor             \t0, \x0, \x2
        eor             \t1, \x1, \x3
        mul_gf4         \x0, \x1, \y0, \y1, \t2, \t3
        eor             \y0, \y0, \y2
        eor             \y1, \y1, \y3
        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
        eor             \x0, \x0, \t0
        eor             \x2, \x2, \t0
        eor             \x1, \x1, \t1
        eor             \x3, \x3, \t1
        eor             \t0, \x4, \x6
        eor             \t1, \x5, \x7
        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
        eor             \y0, \y0, \y2
        eor             \y1, \y1, \y3
        mul_gf4         \x4, \x5, \y0, \y1, \t2, \t3
        eor             \x4, \x4, \t0
        eor             \x6, \x6, \t0
        eor             \x5, \x5, \t1
        eor             \x7, \x7, \t1
        .endm

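        /*
         * Compute the multiplicative inverse in GF(2^8) of each of the eight
         * bit sliced inputs, using the GF(2^8) -> GF(2^4) -> GF(2^2) tower
         * field decomposition described in the Kaesper/Schwabe paper cited
         * above. This is the core of the S-box; the in/out_bs_ch macros
         * above supply the input/output basis changes and the affine part.
         */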
        .macro          inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
                                   t0, t1, t2, t3, s0, s1, s2, s3
        eor             \t3, \x4, \x6
        eor             \t0, \x5, \x7
        eor             \t1, \x1, \x3
        eor             \s1, \x7, \x6
        eor             \s0, \x0, \x2
        eor             \s3, \t3, \t0
        orr             \t2, \t0, \t1
        and             \s2, \t3, \s0
        orr             \t3, \t3, \s0
        eor             \s0, \s0, \t1
        and             \t0, \t0, \t1
        eor             \t1, \x3, \x2
        and             \s3, \s3, \s0
        and             \s1, \s1, \t1
        eor             \t1, \x4, \x5
        eor             \s0, \x1, \x0
        eor             \t3, \t3, \s1
        eor             \t2, \t2, \s1
        and             \s1, \t1, \s0
        orr             \t1, \t1, \s0
        eor             \t3, \t3, \s3
        eor             \t0, \t0, \s1
        eor             \t2, \t2, \s2
        eor             \t1, \t1, \s3
        eor             \t0, \t0, \s2
        and             \s0, \x7, \x3
        eor             \t1, \t1, \s2
        and             \s1, \x6, \x2
        and             \s2, \x5, \x1
        orr             \s3, \x4, \x0
        eor             \t3, \t3, \s0
        eor             \t1, \t1, \s2
        eor             \s0, \t0, \s3
        eor             \t2, \t2, \s1
        and             \s2, \t3, \t1
        eor             \s1, \t2, \s2
        eor             \s3, \s0, \s2
        bsl             \s1, \t1, \s0
        not             \t0, \s0
        bsl             \s0, \s1, \s3
        bsl             \t0, \s1, \s3
        bsl             \s3, \t3, \t2
        eor             \t3, \t3, \t2
        and             \s2, \s0, \s3
        eor             \t1, \t1, \t0
        eor             \s2, \s2, \t3
        mul_gf16_2      \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                        \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
        .endm

        .macro          sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                              t0, t1, t2, t3, s0, s1, s2, s3
        in_bs_ch        \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
                        \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        out_bs_ch       \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
        .endm

        .macro          inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                                  t0, t1, t2, t3, s0, s1, s2, s3
        inv_in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
                        \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        inv_out_bs_ch   \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
        .endm

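        /*
         * The bit sliced key schedule produced by aesbs_convert_key uses
         * eight 16 byte vectors (128 bytes) per round: enc_next_rk walks it
         * front to back, dec_next_rk back to front.
         */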
        .macro          enc_next_rk
        ldp             q16, q17, [bskey], #128
        ldp             q18, q19, [bskey, #-96]
        ldp             q20, q21, [bskey, #-64]
        ldp             q22, q23, [bskey, #-32]
        .endm

        .macro          dec_next_rk
        ldp             q16, q17, [bskey, #-128]!
        ldp             q18, q19, [bskey, #32]
        ldp             q20, q21, [bskey, #64]
        ldp             q22, q23, [bskey, #96]
        .endm

        .macro          add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
        eor             \x0\().16b, \x0\().16b, v16.16b
        eor             \x1\().16b, \x1\().16b, v17.16b
        eor             \x2\().16b, \x2\().16b, v18.16b
        eor             \x3\().16b, \x3\().16b, v19.16b
        eor             \x4\().16b, \x4\().16b, v20.16b
        eor             \x5\().16b, \x5\().16b, v21.16b
        eor             \x6\().16b, \x6\().16b, v22.16b
        eor             \x7\().16b, \x7\().16b, v23.16b
        .endm

        .macro          shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
        tbl             \x0\().16b, {\x0\().16b}, \mask\().16b
        tbl             \x1\().16b, {\x1\().16b}, \mask\().16b
        tbl             \x2\().16b, {\x2\().16b}, \mask\().16b
        tbl             \x3\().16b, {\x3\().16b}, \mask\().16b
        tbl             \x4\().16b, {\x4\().16b}, \mask\().16b
        tbl             \x5\().16b, {\x5\().16b}, \mask\().16b
        tbl             \x6\().16b, {\x6\().16b}, \mask\().16b
        tbl             \x7\().16b, {\x7\().16b}, \mask\().16b
        .endm

        .macro          mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                                  t0, t1, t2, t3, t4, t5, t6, t7, inv
        ext             \t0\().16b, \x0\().16b, \x0\().16b, #12
        ext             \t1\().16b, \x1\().16b, \x1\().16b, #12
        eor             \x0\().16b, \x0\().16b, \t0\().16b
        ext             \t2\().16b, \x2\().16b, \x2\().16b, #12
        eor             \x1\().16b, \x1\().16b, \t1\().16b
        ext             \t3\().16b, \x3\().16b, \x3\().16b, #12
        eor             \x2\().16b, \x2\().16b, \t2\().16b
        ext             \t4\().16b, \x4\().16b, \x4\().16b, #12
        eor             \x3\().16b, \x3\().16b, \t3\().16b
        ext             \t5\().16b, \x5\().16b, \x5\().16b, #12
        eor             \x4\().16b, \x4\().16b, \t4\().16b
        ext             \t6\().16b, \x6\().16b, \x6\().16b, #12
        eor             \x5\().16b, \x5\().16b, \t5\().16b
        ext             \t7\().16b, \x7\().16b, \x7\().16b, #12
        eor             \x6\().16b, \x6\().16b, \t6\().16b
        eor             \t1\().16b, \t1\().16b, \x0\().16b
        eor             \x7\().16b, \x7\().16b, \t7\().16b
        ext             \x0\().16b, \x0\().16b, \x0\().16b, #8
        eor             \t2\().16b, \t2\().16b, \x1\().16b
        eor             \t0\().16b, \t0\().16b, \x7\().16b
        eor             \t1\().16b, \t1\().16b, \x7\().16b
        ext             \x1\().16b, \x1\().16b, \x1\().16b, #8
        eor             \t5\().16b, \t5\().16b, \x4\().16b
        eor             \x0\().16b, \x0\().16b, \t0\().16b
        eor             \t6\().16b, \t6\().16b, \x5\().16b
        eor             \x1\().16b, \x1\().16b, \t1\().16b
        ext             \t0\().16b, \x4\().16b, \x4\().16b, #8
        eor             \t4\().16b, \t4\().16b, \x3\().16b
        ext             \t1\().16b, \x5\().16b, \x5\().16b, #8
        eor             \t7\().16b, \t7\().16b, \x6\().16b
        ext             \x4\().16b, \x3\().16b, \x3\().16b, #8
        eor             \t3\().16b, \t3\().16b, \x2\().16b
        ext             \x5\().16b, \x7\().16b, \x7\().16b, #8
        eor             \t4\().16b, \t4\().16b, \x7\().16b
        ext             \x3\().16b, \x6\().16b, \x6\().16b, #8
        eor             \t3\().16b, \t3\().16b, \x7\().16b
        ext             \x6\().16b, \x2\().16b, \x2\().16b, #8
        eor             \x7\().16b, \t1\().16b, \t5\().16b
        .ifb            \inv
        eor             \x2\().16b, \t0\().16b, \t4\().16b
        eor             \x4\().16b, \x4\().16b, \t3\().16b
        eor             \x5\().16b, \x5\().16b, \t7\().16b
        eor             \x3\().16b, \x3\().16b, \t6\().16b
        eor             \x6\().16b, \x6\().16b, \t2\().16b
        .else
        eor             \t3\().16b, \t3\().16b, \x4\().16b
        eor             \x5\().16b, \x5\().16b, \t7\().16b
        eor             \x2\().16b, \x3\().16b, \t6\().16b
        eor             \x3\().16b, \t0\().16b, \t4\().16b
        eor             \x4\().16b, \x6\().16b, \t2\().16b
        mov             \x6\().16b, \t3\().16b
        .endif
        .endm

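        /*
         * InvMixColumns: a preprocessing step on the bit sliced state
         * followed by the forward mix_cols transform (invoked with \inv
         * set), following the decomposition used by the original bsaes code.
         */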
        .macro          inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                                      t0, t1, t2, t3, t4, t5, t6, t7
        ext             \t0\().16b, \x0\().16b, \x0\().16b, #8
        ext             \t6\().16b, \x6\().16b, \x6\().16b, #8
        ext             \t7\().16b, \x7\().16b, \x7\().16b, #8
        eor             \t0\().16b, \t0\().16b, \x0\().16b
        ext             \t1\().16b, \x1\().16b, \x1\().16b, #8
        eor             \t6\().16b, \t6\().16b, \x6\().16b
        ext             \t2\().16b, \x2\().16b, \x2\().16b, #8
        eor             \t7\().16b, \t7\().16b, \x7\().16b
        ext             \t3\().16b, \x3\().16b, \x3\().16b, #8
        eor             \t1\().16b, \t1\().16b, \x1\().16b
        ext             \t4\().16b, \x4\().16b, \x4\().16b, #8
        eor             \t2\().16b, \t2\().16b, \x2\().16b
        ext             \t5\().16b, \x5\().16b, \x5\().16b, #8
        eor             \t3\().16b, \t3\().16b, \x3\().16b
        eor             \t4\().16b, \t4\().16b, \x4\().16b
        eor             \t5\().16b, \t5\().16b, \x5\().16b
        eor             \x0\().16b, \x0\().16b, \t6\().16b
        eor             \x1\().16b, \x1\().16b, \t6\().16b
        eor             \x2\().16b, \x2\().16b, \t0\().16b
        eor             \x4\().16b, \x4\().16b, \t2\().16b
        eor             \x3\().16b, \x3\().16b, \t1\().16b
        eor             \x1\().16b, \x1\().16b, \t7\().16b
        eor             \x2\().16b, \x2\().16b, \t7\().16b
        eor             \x4\().16b, \x4\().16b, \t6\().16b
        eor             \x5\().16b, \x5\().16b, \t3\().16b
        eor             \x3\().16b, \x3\().16b, \t6\().16b
        eor             \x6\().16b, \x6\().16b, \t4\().16b
        eor             \x4\().16b, \x4\().16b, \t7\().16b
        eor             \x5\().16b, \x5\().16b, \t7\().16b
        eor             \x7\().16b, \x7\().16b, \t5\().16b
        mix_cols        \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                        \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
        .endm

        .macro          swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
        ushr            \t0\().2d, \b0\().2d, #\n
        ushr            \t1\().2d, \b1\().2d, #\n
        eor             \t0\().16b, \t0\().16b, \a0\().16b
        eor             \t1\().16b, \t1\().16b, \a1\().16b
        and             \t0\().16b, \t0\().16b, \mask\().16b
        and             \t1\().16b, \t1\().16b, \mask\().16b
        eor             \a0\().16b, \a0\().16b, \t0\().16b
        shl             \t0\().2d, \t0\().2d, #\n
        eor             \a1\().16b, \a1\().16b, \t1\().16b
        shl             \t1\().2d, \t1\().2d, #\n
        eor             \b0\().16b, \b0\().16b, \t0\().16b
        eor             \b1\().16b, \b1\().16b, \t1\().16b
        .endm

        .macro          bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
        movi            \t0\().16b, #0x55
        movi            \t1\().16b, #0x33
        swapmove_2x     \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
        swapmove_2x     \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
        movi            \t0\().16b, #0x0f
        swapmove_2x     \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
        swapmove_2x     \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
        swapmove_2x     \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
        swapmove_2x     \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
        .endm

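        /*
         * tbl permutation masks: M0 produces the byte order expected by the
         * bit sliced representation, SR/ISR perform (inverse) ShiftRows in
         * that representation, and M0SR, SRM0, M0ISR and ISRM0 are
         * compositions of the two so that a single tbl per register suffices.
         */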
        .align          6
M0:     .octa           0x0004080c0105090d02060a0e03070b0f

M0SR:   .octa           0x0004080c05090d010a0e02060f03070b
SR:     .octa           0x0f0e0d0c0a09080b0504070600030201
SRM0:   .octa           0x01060b0c0207080d0304090e00050a0f

M0ISR:  .octa           0x0004080c0d0105090a0e0206070b0f03
ISR:    .octa           0x0f0e0d0c080b0a090504070602010003
ISRM0:  .octa           0x0306090c00070a0d01040b0e0205080f

        /*
         * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
         */
ENTRY(aesbs_convert_key)
        ld1             {v7.4s}, [x1], #16              // load round 0 key
        ld1             {v17.4s}, [x1], #16             // load round 1 key

        movi            v8.16b,  #0x01                  // bit masks
        movi            v9.16b,  #0x02
        movi            v10.16b, #0x04
        movi            v11.16b, #0x08
        movi            v12.16b, #0x10
        movi            v13.16b, #0x20
        movi            v14.16b, #0x40
        movi            v15.16b, #0x80
        ldr             q16, M0

        sub             x2, x2, #1
        str             q7, [x0], #16                   // save round 0 key

.Lkey_loop:
        tbl             v7.16b, {v17.16b}, v16.16b
        ld1             {v17.4s}, [x1], #16             // load next round key

        cmtst           v0.16b, v7.16b, v8.16b
        cmtst           v1.16b, v7.16b, v9.16b
        cmtst           v2.16b, v7.16b, v10.16b
        cmtst           v3.16b, v7.16b, v11.16b
        cmtst           v4.16b, v7.16b, v12.16b
        cmtst           v5.16b, v7.16b, v13.16b
        cmtst           v6.16b, v7.16b, v14.16b
        cmtst           v7.16b, v7.16b, v15.16b
        not             v0.16b, v0.16b
        not             v1.16b, v1.16b
        not             v5.16b, v5.16b
        not             v6.16b, v6.16b

        subs            x2, x2, #1
        stp             q0, q1, [x0], #128
        stp             q2, q3, [x0, #-96]
        stp             q4, q5, [x0, #-64]
        stp             q6, q7, [x0, #-32]
        b.ne            .Lkey_loop

        movi            v7.16b, #0x63                   // compose .L63
        eor             v17.16b, v17.16b, v7.16b
        str             q17, [x0]
        ret
ENDPROC(aesbs_convert_key)

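        /*
         * aesbs_encrypt8/aesbs_decrypt8: encrypt or decrypt the eight AES
         * blocks passed in v0-v7 in parallel, using the bit sliced key
         * schedule at bskey for the given number of rounds. The output
         * blocks are returned in a permuted register order, which the
         * callers below account for.
         */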
        .align          4
aesbs_encrypt8:
        ldr             q9, [bskey], #16                // round 0 key
        ldr             q8, M0SR
        ldr             q24, SR

        eor             v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor             v11.16b, v1.16b, v9.16b
        tbl             v0.16b, {v10.16b}, v8.16b
        eor             v12.16b, v2.16b, v9.16b
        tbl             v1.16b, {v11.16b}, v8.16b
        eor             v13.16b, v3.16b, v9.16b
        tbl             v2.16b, {v12.16b}, v8.16b
        eor             v14.16b, v4.16b, v9.16b
        tbl             v3.16b, {v13.16b}, v8.16b
        eor             v15.16b, v5.16b, v9.16b
        tbl             v4.16b, {v14.16b}, v8.16b
        eor             v10.16b, v6.16b, v9.16b
        tbl             v5.16b, {v15.16b}, v8.16b
        eor             v11.16b, v7.16b, v9.16b
        tbl             v6.16b, {v10.16b}, v8.16b
        tbl             v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub             rounds, rounds, #1
        b               .Lenc_sbox

.Lenc_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
        sbox            v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                        v13, v14, v15
        subs            rounds, rounds, #1
        b.cc            .Lenc_done

        enc_next_rk

        mix_cols        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
                        v13, v14, v15

        add_round_key   v0, v1, v2, v3, v4, v5, v6, v7

        b.ne            .Lenc_loop
        ldr             q24, SRM0
        b               .Lenc_loop

.Lenc_done:
        ldr             q12, [bskey]                    // last round key

        bitslice        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

        eor             v0.16b, v0.16b, v12.16b
        eor             v1.16b, v1.16b, v12.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v6.16b, v6.16b, v12.16b
        eor             v3.16b, v3.16b, v12.16b
        eor             v7.16b, v7.16b, v12.16b
        eor             v2.16b, v2.16b, v12.16b
        eor             v5.16b, v5.16b, v12.16b
        ret
ENDPROC(aesbs_encrypt8)

        .align          4
aesbs_decrypt8:
        lsl             x9, rounds, #7
        add             bskey, bskey, x9

        ldr             q9, [bskey, #-112]!             // round 0 key
        ldr             q8, M0ISR
        ldr             q24, ISR

        eor             v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor             v11.16b, v1.16b, v9.16b
        tbl             v0.16b, {v10.16b}, v8.16b
        eor             v12.16b, v2.16b, v9.16b
        tbl             v1.16b, {v11.16b}, v8.16b
        eor             v13.16b, v3.16b, v9.16b
        tbl             v2.16b, {v12.16b}, v8.16b
        eor             v14.16b, v4.16b, v9.16b
        tbl             v3.16b, {v13.16b}, v8.16b
        eor             v15.16b, v5.16b, v9.16b
        tbl             v4.16b, {v14.16b}, v8.16b
        eor             v10.16b, v6.16b, v9.16b
        tbl             v5.16b, {v15.16b}, v8.16b
        eor             v11.16b, v7.16b, v9.16b
        tbl             v6.16b, {v10.16b}, v8.16b
        tbl             v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub             rounds, rounds, #1
        b               .Ldec_sbox

.Ldec_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
        inv_sbox        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                        v13, v14, v15
        subs            rounds, rounds, #1
        b.cc            .Ldec_done

        dec_next_rk

        add_round_key   v0, v1, v6, v4, v2, v7, v3, v5

        inv_mix_cols    v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
                        v13, v14, v15

        b.ne            .Ldec_loop
        ldr             q24, ISRM0
        b               .Ldec_loop
.Ldec_done:
        ldr             q12, [bskey, #-16]              // last round key

        bitslice        v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

        eor             v0.16b, v0.16b, v12.16b
        eor             v1.16b, v1.16b, v12.16b
        eor             v6.16b, v6.16b, v12.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v2.16b, v2.16b, v12.16b
        eor             v7.16b, v7.16b, v12.16b
        eor             v3.16b, v3.16b, v12.16b
        eor             v5.16b, v5.16b, v12.16b
        ret
ENDPROC(aesbs_decrypt8)

        /*
         * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         */
        .macro          __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        frame_push      5

        mov             x19, x0
        mov             x20, x1
        mov             x21, x2
        mov             x22, x3
        mov             x23, x4

99:     mov             x5, #1
        lsl             x5, x5, x23
        subs            w23, w23, #8
        csel            x23, x23, xzr, pl
        csel            x5, x5, xzr, mi

        ld1             {v0.16b}, [x20], #16
        tbnz            x5, #1, 0f
        ld1             {v1.16b}, [x20], #16
        tbnz            x5, #2, 0f
        ld1             {v2.16b}, [x20], #16
        tbnz            x5, #3, 0f
        ld1             {v3.16b}, [x20], #16
        tbnz            x5, #4, 0f
        ld1             {v4.16b}, [x20], #16
        tbnz            x5, #5, 0f
        ld1             {v5.16b}, [x20], #16
        tbnz            x5, #6, 0f
        ld1             {v6.16b}, [x20], #16
        tbnz            x5, #7, 0f
        ld1             {v7.16b}, [x20], #16

0:      mov             bskey, x21
        mov             rounds, x22
        bl              \do8

        st1             {\o0\().16b}, [x19], #16
        tbnz            x5, #1, 1f
        st1             {\o1\().16b}, [x19], #16
        tbnz            x5, #2, 1f
        st1             {\o2\().16b}, [x19], #16
        tbnz            x5, #3, 1f
        st1             {\o3\().16b}, [x19], #16
        tbnz            x5, #4, 1f
        st1             {\o4\().16b}, [x19], #16
        tbnz            x5, #5, 1f
        st1             {\o5\().16b}, [x19], #16
        tbnz            x5, #6, 1f
        st1             {\o6\().16b}, [x19], #16
        tbnz            x5, #7, 1f
        st1             {\o7\().16b}, [x19], #16

        cbz             x23, 1f
        cond_yield_neon
        b               99b

1:      frame_pop
        ret
        .endm

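        /*
         * The register lists passed to __ecb_crypt below match the permuted
         * order in which aesbs_encrypt8 and aesbs_decrypt8 return their
         * output blocks.
         */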
        .align          4
ENTRY(aesbs_ecb_encrypt)
        __ecb_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

        .align          4
ENTRY(aesbs_ecb_decrypt)
        __ecb_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)

        /*
         * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
        .align          4
ENTRY(aesbs_cbc_decrypt)
        frame_push      6

        mov             x19, x0
        mov             x20, x1
        mov             x21, x2
        mov             x22, x3
        mov             x23, x4
        mov             x24, x5

99:     mov             x6, #1
        lsl             x6, x6, x23
        subs            w23, w23, #8
        csel            x23, x23, xzr, pl
        csel            x6, x6, xzr, mi

        ld1             {v0.16b}, [x20], #16
        mov             v25.16b, v0.16b
        tbnz            x6, #1, 0f
        ld1             {v1.16b}, [x20], #16
        mov             v26.16b, v1.16b
        tbnz            x6, #2, 0f
        ld1             {v2.16b}, [x20], #16
        mov             v27.16b, v2.16b
        tbnz            x6, #3, 0f
        ld1             {v3.16b}, [x20], #16
        mov             v28.16b, v3.16b
        tbnz            x6, #4, 0f
        ld1             {v4.16b}, [x20], #16
        mov             v29.16b, v4.16b
        tbnz            x6, #5, 0f
        ld1             {v5.16b}, [x20], #16
        mov             v30.16b, v5.16b
        tbnz            x6, #6, 0f
        ld1             {v6.16b}, [x20], #16
        mov             v31.16b, v6.16b
        tbnz            x6, #7, 0f
        ld1             {v7.16b}, [x20]

0:      mov             bskey, x21
        mov             rounds, x22
        bl              aesbs_decrypt8

        ld1             {v24.16b}, [x24]                // load IV

        eor             v1.16b, v1.16b, v25.16b
        eor             v6.16b, v6.16b, v26.16b
        eor             v4.16b, v4.16b, v27.16b
        eor             v2.16b, v2.16b, v28.16b
        eor             v7.16b, v7.16b, v29.16b
        eor             v0.16b, v0.16b, v24.16b
        eor             v3.16b, v3.16b, v30.16b
        eor             v5.16b, v5.16b, v31.16b

        st1             {v0.16b}, [x19], #16
        mov             v24.16b, v25.16b
        tbnz            x6, #1, 1f
        st1             {v1.16b}, [x19], #16
        mov             v24.16b, v26.16b
        tbnz            x6, #2, 1f
        st1             {v6.16b}, [x19], #16
        mov             v24.16b, v27.16b
        tbnz            x6, #3, 1f
        st1             {v4.16b}, [x19], #16
        mov             v24.16b, v28.16b
        tbnz            x6, #4, 1f
        st1             {v2.16b}, [x19], #16
        mov             v24.16b, v29.16b
        tbnz            x6, #5, 1f
        st1             {v7.16b}, [x19], #16
        mov             v24.16b, v30.16b
        tbnz            x6, #6, 1f
        st1             {v3.16b}, [x19], #16
        mov             v24.16b, v31.16b
        tbnz            x6, #7, 1f
        ld1             {v24.16b}, [x20], #16
        st1             {v5.16b}, [x19], #16
1:      st1             {v24.16b}, [x24]                // store IV

        cbz             x23, 2f
        cond_yield_neon
        b               99b

2:      frame_pop
        ret
ENDPROC(aesbs_cbc_decrypt)

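        /*
         * next_tweak: derive the next XTS tweak from \in by multiplying it
         * by x in GF(2^128), using \const (.Lxts_mul_x, i.e. 0x87) as the
         * reduction constant for x^128 + x^7 + x^2 + x + 1.
         */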
        .macro          next_tweak, out, in, const, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, \const\().16b
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

        .align          4
.Lxts_mul_x:
CPU_LE( .quad           1, 0x87 )
CPU_BE( .quad           0x87, 1 )

        /*
         * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
__xts_crypt8:
        mov             x6, #1
        lsl             x6, x6, x23
        subs            w23, w23, #8
        csel            x23, x23, xzr, pl
        csel            x6, x6, xzr, mi

        ld1             {v0.16b}, [x20], #16
        next_tweak      v26, v25, v30, v31
        eor             v0.16b, v0.16b, v25.16b
        tbnz            x6, #1, 0f

        ld1             {v1.16b}, [x20], #16
        next_tweak      v27, v26, v30, v31
        eor             v1.16b, v1.16b, v26.16b
        tbnz            x6, #2, 0f

        ld1             {v2.16b}, [x20], #16
        next_tweak      v28, v27, v30, v31
        eor             v2.16b, v2.16b, v27.16b
        tbnz            x6, #3, 0f

        ld1             {v3.16b}, [x20], #16
        next_tweak      v29, v28, v30, v31
        eor             v3.16b, v3.16b, v28.16b
        tbnz            x6, #4, 0f

        ld1             {v4.16b}, [x20], #16
        str             q29, [sp, #.Lframe_local_offset]
        eor             v4.16b, v4.16b, v29.16b
        next_tweak      v29, v29, v30, v31
        tbnz            x6, #5, 0f

        ld1             {v5.16b}, [x20], #16
        str             q29, [sp, #.Lframe_local_offset + 16]
        eor             v5.16b, v5.16b, v29.16b
        next_tweak      v29, v29, v30, v31
        tbnz            x6, #6, 0f

        ld1             {v6.16b}, [x20], #16
        str             q29, [sp, #.Lframe_local_offset + 32]
        eor             v6.16b, v6.16b, v29.16b
        next_tweak      v29, v29, v30, v31
        tbnz            x6, #7, 0f

        ld1             {v7.16b}, [x20], #16
        str             q29, [sp, #.Lframe_local_offset + 48]
        eor             v7.16b, v7.16b, v29.16b
        next_tweak      v29, v29, v30, v31

0:      mov             bskey, x21
        mov             rounds, x22
        br              x7
ENDPROC(__xts_crypt8)

        .macro          __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        frame_push      6, 64

        mov             x19, x0
        mov             x20, x1
        mov             x21, x2
        mov             x22, x3
        mov             x23, x4
        mov             x24, x5

0:      ldr             q30, .Lxts_mul_x
        ld1             {v25.16b}, [x24]

99:     adr             x7, \do8
        bl              __xts_crypt8

        ldp             q16, q17, [sp, #.Lframe_local_offset]
        ldp             q18, q19, [sp, #.Lframe_local_offset + 32]

        eor             \o0\().16b, \o0\().16b, v25.16b
        eor             \o1\().16b, \o1\().16b, v26.16b
        eor             \o2\().16b, \o2\().16b, v27.16b
        eor             \o3\().16b, \o3\().16b, v28.16b

        st1             {\o0\().16b}, [x19], #16
        mov             v25.16b, v26.16b
        tbnz            x6, #1, 1f
        st1             {\o1\().16b}, [x19], #16
        mov             v25.16b, v27.16b
        tbnz            x6, #2, 1f
        st1             {\o2\().16b}, [x19], #16
        mov             v25.16b, v28.16b
        tbnz            x6, #3, 1f
        st1             {\o3\().16b}, [x19], #16
        mov             v25.16b, v29.16b
        tbnz            x6, #4, 1f

        eor             \o4\().16b, \o4\().16b, v16.16b
        eor             \o5\().16b, \o5\().16b, v17.16b
        eor             \o6\().16b, \o6\().16b, v18.16b
        eor             \o7\().16b, \o7\().16b, v19.16b

        st1             {\o4\().16b}, [x19], #16
        tbnz            x6, #5, 1f
        st1             {\o5\().16b}, [x19], #16
        tbnz            x6, #6, 1f
        st1             {\o6\().16b}, [x19], #16
        tbnz            x6, #7, 1f
        st1             {\o7\().16b}, [x19], #16

        cbz             x23, 1f
        st1             {v25.16b}, [x24]

        cond_yield_neon 0b
        b               99b

1:      st1             {v25.16b}, [x24]
        frame_pop
        ret
        .endm

ENTRY(aesbs_xts_encrypt)
        __xts_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
        __xts_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)

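        /*
         * next_ctr: write the current value of the 128-bit counter (kept as
         * native integers in x7:x8, most significant half in x7) into \v in
         * big endian byte order, then increment the counter.
         */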
        .macro          next_ctr, v
        mov             \v\().d[1], x8
        adds            x8, x8, #1
        mov             \v\().d[0], x7
        adc             x7, x7, xzr
        rev64           \v\().16b, \v\().16b
        .endm

        /*
         * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
         *                   int rounds, int blocks, u8 iv[], u8 final[])
         */
ENTRY(aesbs_ctr_encrypt)
        frame_push      8

        mov             x19, x0
        mov             x20, x1
        mov             x21, x2
        mov             x22, x3
        mov             x23, x4
        mov             x24, x5
        mov             x25, x6

        cmp             x25, #0
        cset            x26, ne
        add             x23, x23, x26           // do one extra block if final

98:     ldp             x7, x8, [x24]
        ld1             {v0.16b}, [x24]
CPU_LE( rev             x7, x7 )
CPU_LE( rev             x8, x8 )
        adds            x8, x8, #1
        adc             x7, x7, xzr

99:     mov             x9, #1
        lsl             x9, x9, x23
        subs            w23, w23, #8
        csel            x23, x23, xzr, pl
        csel            x9, x9, xzr, le

        tbnz            x9, #1, 0f
        next_ctr        v1
        tbnz            x9, #2, 0f
        next_ctr        v2
        tbnz            x9, #3, 0f
        next_ctr        v3
        tbnz            x9, #4, 0f
        next_ctr        v4
        tbnz            x9, #5, 0f
        next_ctr        v5
        tbnz            x9, #6, 0f
        next_ctr        v6
        tbnz            x9, #7, 0f
        next_ctr        v7

0:      mov             bskey, x21
        mov             rounds, x22
        bl              aesbs_encrypt8

        lsr             x9, x9, x26             // disregard the extra block
        tbnz            x9, #0, 0f

        ld1             {v8.16b}, [x20], #16
        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x19], #16
        tbnz            x9, #1, 1f

        ld1             {v9.16b}, [x20], #16
        eor             v1.16b, v1.16b, v9.16b
        st1             {v1.16b}, [x19], #16
        tbnz            x9, #2, 2f

        ld1             {v10.16b}, [x20], #16
        eor             v4.16b, v4.16b, v10.16b
        st1             {v4.16b}, [x19], #16
        tbnz            x9, #3, 3f

        ld1             {v11.16b}, [x20], #16
        eor             v6.16b, v6.16b, v11.16b
        st1             {v6.16b}, [x19], #16
        tbnz            x9, #4, 4f

        ld1             {v12.16b}, [x20], #16
        eor             v3.16b, v3.16b, v12.16b
        st1             {v3.16b}, [x19], #16
        tbnz            x9, #5, 5f

        ld1             {v13.16b}, [x20], #16
        eor             v7.16b, v7.16b, v13.16b
        st1             {v7.16b}, [x19], #16
        tbnz            x9, #6, 6f

        ld1             {v14.16b}, [x20], #16
        eor             v2.16b, v2.16b, v14.16b
        st1             {v2.16b}, [x19], #16
        tbnz            x9, #7, 7f

        ld1             {v15.16b}, [x20], #16
        eor             v5.16b, v5.16b, v15.16b
        st1             {v5.16b}, [x19], #16

8:      next_ctr        v0
        st1             {v0.16b}, [x24]
        cbz             x23, .Lctr_done

        cond_yield_neon 98b
        b               99b

.Lctr_done:
        frame_pop
        ret

        /*
         * If we are handling the tail of the input (x6 != NULL), return the
         * final keystream block back to the caller.
         */
0:      cbz             x25, 8b
        st1             {v0.16b}, [x25]
        b               8b
1:      cbz             x25, 8b
        st1             {v1.16b}, [x25]
        b               8b
2:      cbz             x25, 8b
        st1             {v4.16b}, [x25]
        b               8b
3:      cbz             x25, 8b
        st1             {v6.16b}, [x25]
        b               8b
4:      cbz             x25, 8b
        st1             {v3.16b}, [x25]
        b               8b
5:      cbz             x25, 8b
        st1             {v7.16b}, [x25]
        b               8b
6:      cbz             x25, 8b
        st1             {v2.16b}, [x25]
        b               8b
7:      cbz             x25, 8b
        st1             {v5.16b}, [x25]
        b               8b
ENDPROC(aesbs_ctr_encrypt)