/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/* struct aria_ctx: */
#define enc_key 0
#define dec_key 272
#define rounds 544

/* register macros */
#define CTX %rdi


#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
			 a1, b1, c1, d1, \
			 a2, b2, c2, d2, \
			 a3, b3, c3, d3, \
			 st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

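/*
 * Companion to byteslice_16x16b: transposes the byte-sliced state back so
 * that each xmm register again holds one contiguous 16-byte block (used
 * after the last round, before the result is written out).
 */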
#define debyteslice_16x16b(a0, b0, c0, d0, \
			   a1, b1, c1, d1, \
			   a2, b2, c2, d2, \
			   a3, b3, c3, d3, \
			   st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     rio) \
	vmovdqu (0 * 16)(rio), x0; \
	vmovdqu (1 * 16)(rio), x1; \
	vmovdqu (2 * 16)(rio), x2; \
	vmovdqu (3 * 16)(rio), x3; \
	vmovdqu (4 * 16)(rio), x4; \
	vmovdqu (5 * 16)(rio), x5; \
	vmovdqu (6 * 16)(rio), x6; \
	vmovdqu (7 * 16)(rio), x7; \
	vmovdqu (8 * 16)(rio), y0; \
	vmovdqu (9 * 16)(rio), y1; \
	vmovdqu (10 * 16)(rio), y2; \
	vmovdqu (11 * 16)(rio), y3; \
	vmovdqu (12 * 16)(rio), y4; \
	vmovdqu (13 * 16)(rio), y5; \
	vmovdqu (14 * 16)(rio), y6; \
	vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      y0, y1, y2, y3, \
		      y4, y5, y6, y7, \
		      mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
			 x4, x5, x6, x7, \
			 y0, y1, y2, y3, \
			 y4, y5, y6, y7, \
			 (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem) \
	vmovdqu x0, 0 * 16(mem); \
	vmovdqu x1, 1 * 16(mem); \
	vmovdqu x2, 2 * 16(mem); \
	vmovdqu x3, 3 * 16(mem); \
	vmovdqu x4, 4 * 16(mem); \
	vmovdqu x5, 5 * 16(mem); \
	vmovdqu x6, 6 * 16(mem); \
	vmovdqu x7, 7 * 16(mem); \
	vmovdqu y0, 8 * 16(mem); \
	vmovdqu y1, 9 * 16(mem); \
	vmovdqu y2, 10 * 16(mem); \
	vmovdqu y3, 11 * 16(mem); \
	vmovdqu y4, 12 * 16(mem); \
	vmovdqu y5, 13 * 16(mem); \
	vmovdqu y6, 14 * 16(mem); \
	vmovdqu y7, 15 * 16(mem);

#define aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, idx) \
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, idx) \
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

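/*
 * AddRoundKey for the byte-sliced state: each byte of the 128-bit round key
 * at (rk + round * 16) is broadcast to all lanes with vpbroadcastb and then
 * XORed into the matching state register.
 */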
#define aria_ark_8way(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      t0, rk, idx, round) \
	/* AddRoundKey */ \
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
	vpxor t0, x0, x0; \
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
	vpxor t0, x1, x1; \
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
	vpxor t0, x2, x2; \
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
	vpxor t0, x3, x3; \
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
	vpxor t0, x4, x4; \
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
	vpxor t0, x5, x5; \
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
	vpxor t0, x6, x6; \
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
	vpxor t0, x7, x7;

#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    t0, t1, t2, t3, \
			    t4, t5, t6, t7) \
	vpbroadcastq .Ltf_s2_bitmatrix, t0; \
	vpbroadcastq .Ltf_inv_bitmatrix, t1; \
	vpbroadcastq .Ltf_id_bitmatrix, t2; \
	vpbroadcastq .Ltf_aff_bitmatrix, t3; \
	vpbroadcastq .Ltf_x2_bitmatrix, t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7

#define aria_sbox_8way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       t0, t1, t2, t3, \
		       t4, t5, t6, t7) \
	vpxor t7, t7, t7; \
	vmovdqa .Linv_shift_row, t0; \
	vmovdqa .Lshift_row, t1; \
	vpbroadcastd .L0f0f0f0f, t6; \
	vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
	vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
	vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
	\
	vaesenclast t7, x0, x0; \
	vaesenclast t7, x4, x4; \
	vaesenclast t7, x1, x1; \
	vaesenclast t7, x5, x5; \
	vaesdeclast t7, x2, x2; \
	vaesdeclast t7, x6, x6; \
	\
	/* AES inverse shift rows */ \
	vpshufb t0, x0, x0; \
	vpshufb t0, x4, x4; \
	vpshufb t0, x1, x1; \
	vpshufb t0, x5, x5; \
	vpshufb t1, x3, x3; \
	vpshufb t1, x7, x7; \
	vpshufb t1, x2, x2; \
	vpshufb t1, x6, x6; \
	\
	/* affine transformation for S2 */ \
	filter_8bit(x1, t2, t3, t6, t0); \
	/* affine transformation for S2 */ \
	filter_8bit(x5, t2, t3, t6, t0); \
	\
	/* affine transformation for X2 */ \
	filter_8bit(x3, t4, t5, t6, t0); \
	/* affine transformation for X2 */ \
	filter_8bit(x7, t4, t5, t6, t0); \
	vaesdeclast t7, x3, x3; \
	vaesdeclast t7, x7, x7;

#define aria_diff_m(x0, x1, x2, x3, \
		    t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxor x0, x3, t0; \
	vpxor x1, x0, t1; \
	vpxor x2, x1, t2; \
	vpxor x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxor t2, x0, x0; \
	vpxor x1, t3, t3; \
	vpxor t0, x2, x2; \
	vpxor t1, x3, x1; \
	vmovdqu t3, x3;

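/*
 * Word-level diffusion: XOR-mixes the four 32-bit words T0..T3 of the ARIA
 * state, held here as the register groups x0..x3, x4..x7, y0..y3 and
 * y4..y7, following the sequence noted in the comments below.
 */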
#define aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxor y4, y0, y0; \
	vpxor y5, y1, y1; \
	vpxor y6, y2, y2; \
	vpxor y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxor x4, x0, x0; \
	vpxor x5, x1, x1; \
	vpxor x6, x2, x2; \
	vpxor x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxor x4, y4, y4; \
	vpxor x5, y5, y5; \
	vpxor x6, y6, y6; \
	vpxor x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxor x0, y0, y0; \
	vpxor x1, y1, y1; \
	vpxor x2, y2, y2; \
	vpxor x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7;

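/*
 * Round helpers: aria_fo/aria_fe are the odd/even ARIA round functions
 * (AddRoundKey, substitution layer, diffusion), processing the 16 blocks as
 * two batches of eight registers via the scratch area at mem_tmp. aria_ff
 * is the final round, which applies a second AddRoundKey instead of the
 * diffusion layer.
 */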
#define aria_fe(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);

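/*
 * GFNI variants of the round helpers: same control flow as
 * aria_fe/aria_fo/aria_ff, but the substitution layer is computed with
 * vgf2p8affine(inv)qb (aria_sbox_8way_gfni) instead of AES-NI plus vpshufb
 * table filters.
 */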
#define aria_fe_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 * 1 1 0 0 0 0 0 1     x0     0
 * 0 1 0 0 1 0 0 0     x1     0
 * 1 1 0 0 1 1 1 1     x2     0
 * 0 1 1 0 1 0 0 1     x3     1
 * 0 1 0 0 1 1 0 0  *  x4  +  0
 * 0 1 0 1 1 0 0 0     x5     0
 * 0 0 0 0 0 1 0 1     x6     0
 * 1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 * 1 0 1 1 0 0 0 1     x0     0
 * 0 1 1 1 1 0 1 1     x1     0
 * 0 0 0 1 1 0 1 0     x2     1
 * 0 1 0 0 0 1 0 0     x3     0
 * 0 0 1 1 1 0 1 1  *  x4  +  0
 * 0 1 0 0 1 0 0 0     x5     0
 * 1 1 0 1 0 0 1 1     x6     0
 * 0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

.section .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

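/*
 * Core 16-way round loop. The byte-sliced state arrives in %xmm0..%xmm15;
 * the dst buffer (%rsi) doubles as a 256-byte temporary area for the two
 * eight-register halves (%rax = first half, %r8 = second half).
 */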
SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
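/*
 * Generate 16 consecutive big-endian counter blocks: blocks 1-8 are stored
 * to the keystream buffer (%rcx) and reloaded into %xmm0..%xmm7, blocks
 * 9-16 are left in %xmm8..%xmm15, and the next IV is written back to (%r8).
 */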
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
	cmpl $12, rounds(CTX);
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, rounds(CTX);
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)

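/*
 * CTR mode: generate the counter blocks, encrypt them in place (reusing the
 * keystream buffer as the scratch/dst area), then XOR the result with src
 * into dst.
 */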
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)

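/*
 * GFNI counterpart of __aria_aesni_avx_crypt_16way: identical register and
 * scratch-buffer usage, using the GFNI round helpers.
 */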
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);
	cmpl $12, rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	cmpl $14, rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)