/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

/* struct aria_ctx: */
#define enc_key 0
#define dec_key 272
#define rounds 544

/* register macros */
#define CTX %rdi


#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )

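/*
 * Add one to the 128-bit little-endian value in x.  minus_one is
 * expected to hold -1 in its low qword and 0 in its high qword (as set
 * up by the CTR code below): the vpsubq adds 1 to the low qword only,
 * and the vpcmpeqq/vpslldq pair propagates a carry into the high qword
 * when the low qword wraps around.
 */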
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

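/*
 * Apply an 8-bit -> 8-bit transform to every byte of x as two 4-bit
 * table lookups: the low nibble indexes lo_t and the high nibble
 * (shifted down by 4) indexes hi_t via vpshufb, and the two partial
 * results are XORed together.  mask4bit must hold 0x0f in every byte.
 */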
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

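/*
 * Byte-slice 16 blocks held in 16 registers: after the transposes and
 * shuffles, each register ends up holding a single byte position of
 * all 16 blocks, which is the layout the round macros below operate
 * on.  st0/st1 are 16-byte scratch slots in memory;
 * debyteslice_16x16b() undoes the transform.
 */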
#define byteslice_16x16b(a0, b0, c0, d0, \
			 a1, b1, c1, d1, \
			 a2, b2, c2, d2, \
			 a3, b3, c3, d3, \
			 st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
			   a1, b1, c1, d1, \
			   a2, b2, c2, d2, \
			   a3, b3, c3, d3, \
			   st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     rio) \
	vmovdqu (0 * 16)(rio), x0; \
	vmovdqu (1 * 16)(rio), x1; \
	vmovdqu (2 * 16)(rio), x2; \
	vmovdqu (3 * 16)(rio), x3; \
	vmovdqu (4 * 16)(rio), x4; \
	vmovdqu (5 * 16)(rio), x5; \
	vmovdqu (6 * 16)(rio), x6; \
	vmovdqu (7 * 16)(rio), x7; \
	vmovdqu (8 * 16)(rio), y0; \
	vmovdqu (9 * 16)(rio), y1; \
	vmovdqu (10 * 16)(rio), y2; \
	vmovdqu (11 * 16)(rio), y3; \
	vmovdqu (12 * 16)(rio), y4; \
	vmovdqu (13 * 16)(rio), y5; \
	vmovdqu (14 * 16)(rio), y6; \
	vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      y0, y1, y2, y3, \
		      y4, y5, y6, y7, \
		      mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
			 x4, x5, x6, x7, \
			 y0, y1, y2, y3, \
			 y4, y5, y6, y7, \
			 (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem) \
	vmovdqu x0, 0 * 16(mem); \
	vmovdqu x1, 1 * 16(mem); \
	vmovdqu x2, 2 * 16(mem); \
	vmovdqu x3, 3 * 16(mem); \
	vmovdqu x4, 4 * 16(mem); \
	vmovdqu x5, 5 * 16(mem); \
	vmovdqu x6, 6 * 16(mem); \
	vmovdqu x7, 7 * 16(mem); \
	vmovdqu y0, 8 * 16(mem); \
	vmovdqu y1, 9 * 16(mem); \
	vmovdqu y2, 10 * 16(mem); \
	vmovdqu y3, 11 * 16(mem); \
	vmovdqu y4, 12 * 16(mem); \
	vmovdqu y5, 13 * 16(mem); \
	vmovdqu y6, 14 * 16(mem); \
	vmovdqu y7, 15 * 16(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, idx) \
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, idx) \
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

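/*
 * XOR one 16-byte round key into the byte-sliced state.  Because each
 * register holds one byte position of all 16 blocks, every key byte is
 * broadcast to a whole register with vpbroadcastb before the XOR.
 */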
#define aria_ark_8way(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      t0, rk, idx, round) \
	/* AddRoundKey */ \
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
	vpxor t0, x0, x0; \
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
	vpxor t0, x1, x1; \
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
	vpxor t0, x2, x2; \
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
	vpxor t0, x3, x3; \
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
	vpxor t0, x4, x4; \
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
	vpxor t0, x5, x5; \
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
	vpxor t0, x6, x6; \
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
	vpxor t0, x7, x7;

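/*
 * S-box layer on eight byte-sliced registers using GF-NI.
 * vgf2p8affineinvqb applies an affine transform (bit-matrix plus
 * constant) to the GF(2^8) inverse of each byte; combined with the
 * forward/inverse/identity bit-matrices defined in .rodata below this
 * evaluates ARIA's four S-boxes (S1, S2 and their inverses) without
 * table lookups.
 */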
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    t0, t1, t2, t3, \
			    t4, t5, t6, t7) \
	vpbroadcastq .Ltf_s2_bitmatrix, t0; \
	vpbroadcastq .Ltf_inv_bitmatrix, t1; \
	vpbroadcastq .Ltf_id_bitmatrix, t2; \
	vpbroadcastq .Ltf_aff_bitmatrix, t3; \
	vpbroadcastq .Ltf_x2_bitmatrix, t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7

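/*
 * Same S-box layer using AES-NI: vaesenclast/vaesdeclast with an
 * all-zero round key reduce to (Inv)ShiftRows + (Inv)SubBytes, the
 * vpshufb with .Linv_shift_row/.Lshift_row cancels the row shuffling
 * so that only the byte substitution remains, and filter_8bit applies
 * the extra affine transforms needed for ARIA's S2 and X2.
 */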
#define aria_sbox_8way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       t0, t1, t2, t3, \
		       t4, t5, t6, t7) \
	vpxor t7, t7, t7; \
	vmovdqa .Linv_shift_row, t0; \
	vmovdqa .Lshift_row, t1; \
	vpbroadcastd .L0f0f0f0f, t6; \
	vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
	vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
	vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
	\
	vaesenclast t7, x0, x0; \
	vaesenclast t7, x4, x4; \
	vaesenclast t7, x1, x1; \
	vaesenclast t7, x5, x5; \
	vaesdeclast t7, x2, x2; \
	vaesdeclast t7, x6, x6; \
	\
	/* AES inverse shift rows */ \
	vpshufb t0, x0, x0; \
	vpshufb t0, x4, x4; \
	vpshufb t0, x1, x1; \
	vpshufb t0, x5, x5; \
	vpshufb t1, x3, x3; \
	vpshufb t1, x7, x7; \
	vpshufb t1, x2, x2; \
	vpshufb t1, x6, x6; \
	\
	/* affine transformation for S2 */ \
	filter_8bit(x1, t2, t3, t6, t0); \
	/* affine transformation for S2 */ \
	filter_8bit(x5, t2, t3, t6, t0); \
	\
	/* affine transformation for X2 */ \
	filter_8bit(x3, t4, t5, t6, t0); \
	/* affine transformation for X2 */ \
	filter_8bit(x7, t4, t5, t6, t0); \
	vaesdeclast t7, x3, x3; \
	vaesdeclast t7, x7, x7;

#define aria_diff_m(x0, x1, x2, x3, \
		    t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxor x0, x3, t0; \
	vpxor x1, x0, t1; \
	vpxor x2, x1, t2; \
	vpxor x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxor t2, x0, x0; \
	vpxor x1, t3, t3; \
	vpxor t0, x2, x2; \
	vpxor t1, x3, x1; \
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxor y4, y0, y0; \
	vpxor y5, y1, y1; \
	vpxor y6, y2, y2; \
	vpxor y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxor x4, x0, x0; \
	vpxor x5, x1, x1; \
	vpxor x6, x2, x2; \
	vpxor x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxor x4, y4, y4; \
	vpxor x5, y5, y5; \
	vpxor x6, y6, y6; \
	vpxor x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxor x0, y0, y0; \
	vpxor x1, y1, y1; \
	vpxor x2, y2, y2; \
	vpxor x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7;

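/*
 * Round macros.  aria_fo()/aria_fe() implement ARIA's odd/even round
 * functions (AddRoundKey, S-box layer, diffusion) and aria_ff() the
 * final round (AddRoundKey, S-box layer, final AddRoundKey, no
 * diffusion).  Each macro processes the 16 byte-sliced blocks as two
 * 8-register halves, spilling the inactive half to mem_tmp; the _gfni
 * variants further below are identical except for the S-box macro
 * they use.
 */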
#define aria_fe(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);

#define aria_fe_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

.section	.rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
	cmpl $12, rounds(CTX);
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, rounds(CTX);
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)

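/*
 * Generate 16 consecutive big-endian counter blocks from the IV at
 * (%r8): blocks 0-7 are written to the keystream buffer and reloaded
 * into %xmm0-%xmm7, blocks 8-15 are left in %xmm8-%xmm15, and the IV
 * is advanced by 16 blocks.
 */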
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

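/*
 * CTR mode: the counter blocks prepared above are encrypted with the
 * keystream buffer standing in for dst/src scratch (the real dst/src
 * pointers are stashed in %r10/%r11), then XORed with the source data
 * and written to the destination.
 */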
SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)

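/*
 * Same round structure as __aria_aesni_avx_crypt_16way above, but the
 * S-box layer is computed with the GF-NI based round macros instead of
 * the AES-NI ones.
 */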
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);
	cmpl $12, rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	cmpl $14, rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)