1/* SPDX-License-Identifier: GPL-2.0-or-later */ 2/* 3 * ARIA Cipher 16-way parallel algorithm (AVX) 4 * 5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com> 6 * 7 */ 8 9#include <linux/linkage.h> 10#include <linux/cfi_types.h> 11#include <asm/asm-offsets.h> 12#include <asm/frame.h> 13 14/* register macros */ 15#define CTX %rdi 16 17 18#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \ 19 ( (((a0) & 1) << 0) | \ 20 (((a1) & 1) << 1) | \ 21 (((a2) & 1) << 2) | \ 22 (((a3) & 1) << 3) | \ 23 (((a4) & 1) << 4) | \ 24 (((a5) & 1) << 5) | \ 25 (((a6) & 1) << 6) | \ 26 (((a7) & 1) << 7) ) 27 28#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \ 29 ( ((l7) << (0 * 8)) | \ 30 ((l6) << (1 * 8)) | \ 31 ((l5) << (2 * 8)) | \ 32 ((l4) << (3 * 8)) | \ 33 ((l3) << (4 * 8)) | \ 34 ((l2) << (5 * 8)) | \ 35 ((l1) << (6 * 8)) | \ 36 ((l0) << (7 * 8)) ) 37 38#define inc_le128(x, minus_one, tmp) \ 39 vpcmpeqq minus_one, x, tmp; \ 40 vpsubq minus_one, x, x; \ 41 vpslldq $8, tmp, tmp; \ 42 vpsubq tmp, x, x; 43 44#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ 45 vpand x, mask4bit, tmp0; \ 46 vpandn x, mask4bit, x; \ 47 vpsrld $4, x, x; \ 48 \ 49 vpshufb tmp0, lo_t, tmp0; \ 50 vpshufb x, hi_t, x; \ 51 vpxor tmp0, x, x; 52 53#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ 54 vpunpckhdq x1, x0, t2; \ 55 vpunpckldq x1, x0, x0; \ 56 \ 57 vpunpckldq x3, x2, t1; \ 58 vpunpckhdq x3, x2, x2; \ 59 \ 60 vpunpckhqdq t1, x0, x1; \ 61 vpunpcklqdq t1, x0, x0; \ 62 \ 63 vpunpckhqdq x2, t2, x3; \ 64 vpunpcklqdq x2, t2, x2; 65 66#define byteslice_16x16b(a0, b0, c0, d0, \ 67 a1, b1, c1, d1, \ 68 a2, b2, c2, d2, \ 69 a3, b3, c3, d3, \ 70 st0, st1) \ 71 vmovdqu d2, st0; \ 72 vmovdqu d3, st1; \ 73 transpose_4x4(a0, a1, a2, a3, d2, d3); \ 74 transpose_4x4(b0, b1, b2, b3, d2, d3); \ 75 vmovdqu st0, d2; \ 76 vmovdqu st1, d3; \ 77 \ 78 vmovdqu a0, st0; \ 79 vmovdqu a1, st1; \ 80 transpose_4x4(c0, c1, c2, c3, a0, a1); \ 81 transpose_4x4(d0, d1, d2, d3, a0, a1); \ 82 \ 83 vmovdqu .Lshufb_16x16b, a0; \ 84 vmovdqu st1, a1; \ 85 vpshufb a0, a2, a2; \ 86 vpshufb a0, a3, a3; \ 87 vpshufb a0, b0, b0; \ 88 vpshufb a0, b1, b1; \ 89 vpshufb a0, b2, b2; \ 90 vpshufb a0, b3, b3; \ 91 vpshufb a0, a1, a1; \ 92 vpshufb a0, c0, c0; \ 93 vpshufb a0, c1, c1; \ 94 vpshufb a0, c2, c2; \ 95 vpshufb a0, c3, c3; \ 96 vpshufb a0, d0, d0; \ 97 vpshufb a0, d1, d1; \ 98 vpshufb a0, d2, d2; \ 99 vpshufb a0, d3, d3; \ 100 vmovdqu d3, st1; \ 101 vmovdqu st0, d3; \ 102 vpshufb a0, d3, a0; \ 103 vmovdqu d2, st0; \ 104 \ 105 transpose_4x4(a0, b0, c0, d0, d2, d3); \ 106 transpose_4x4(a1, b1, c1, d1, d2, d3); \ 107 vmovdqu st0, d2; \ 108 vmovdqu st1, d3; \ 109 \ 110 vmovdqu b0, st0; \ 111 vmovdqu b1, st1; \ 112 transpose_4x4(a2, b2, c2, d2, b0, b1); \ 113 transpose_4x4(a3, b3, c3, d3, b0, b1); \ 114 vmovdqu st0, b0; \ 115 vmovdqu st1, b1; \ 116 /* does not adjust output bytes inside vectors */ 117 118#define debyteslice_16x16b(a0, b0, c0, d0, \ 119 a1, b1, c1, d1, \ 120 a2, b2, c2, d2, \ 121 a3, b3, c3, d3, \ 122 st0, st1) \ 123 vmovdqu d2, st0; \ 124 vmovdqu d3, st1; \ 125 transpose_4x4(a0, a1, a2, a3, d2, d3); \ 126 transpose_4x4(b0, b1, b2, b3, d2, d3); \ 127 vmovdqu st0, d2; \ 128 vmovdqu st1, d3; \ 129 \ 130 vmovdqu a0, st0; \ 131 vmovdqu a1, st1; \ 132 transpose_4x4(c0, c1, c2, c3, a0, a1); \ 133 transpose_4x4(d0, d1, d2, d3, a0, a1); \ 134 \ 135 vmovdqu .Lshufb_16x16b, a0; \ 136 vmovdqu st1, a1; \ 137 vpshufb a0, a2, a2; \ 138 vpshufb a0, a3, a3; \ 139 vpshufb a0, b0, b0; \ 140 vpshufb a0, b1, b1; \ 141 vpshufb a0, b2, b2; \ 142 vpshufb a0, b3, b3; \ 143 vpshufb a0, a1, a1; \ 144 vpshufb a0, c0, c0; \ 145 vpshufb a0, c1, c1; \ 146 vpshufb a0, c2, c2; \ 147 vpshufb a0, c3, c3; \ 148 vpshufb a0, d0, d0; \ 149 vpshufb a0, d1, d1; \ 150 vpshufb a0, d2, d2; \ 151 vpshufb a0, d3, d3; \ 152 vmovdqu d3, st1; \ 153 vmovdqu st0, d3; \ 154 vpshufb a0, d3, a0; \ 155 vmovdqu d2, st0; \ 156 \ 157 transpose_4x4(c0, d0, a0, b0, d2, d3); \ 158 transpose_4x4(c1, d1, a1, b1, d2, d3); \ 159 vmovdqu st0, d2; \ 160 vmovdqu st1, d3; \ 161 \ 162 vmovdqu b0, st0; \ 163 vmovdqu b1, st1; \ 164 transpose_4x4(c2, d2, a2, b2, b0, b1); \ 165 transpose_4x4(c3, d3, a3, b3, b0, b1); \ 166 vmovdqu st0, b0; \ 167 vmovdqu st1, b1; \ 168 /* does not adjust output bytes inside vectors */ 169 170/* load blocks to registers and apply pre-whitening */ 171#define inpack16_pre(x0, x1, x2, x3, \ 172 x4, x5, x6, x7, \ 173 y0, y1, y2, y3, \ 174 y4, y5, y6, y7, \ 175 rio) \ 176 vmovdqu (0 * 16)(rio), x0; \ 177 vmovdqu (1 * 16)(rio), x1; \ 178 vmovdqu (2 * 16)(rio), x2; \ 179 vmovdqu (3 * 16)(rio), x3; \ 180 vmovdqu (4 * 16)(rio), x4; \ 181 vmovdqu (5 * 16)(rio), x5; \ 182 vmovdqu (6 * 16)(rio), x6; \ 183 vmovdqu (7 * 16)(rio), x7; \ 184 vmovdqu (8 * 16)(rio), y0; \ 185 vmovdqu (9 * 16)(rio), y1; \ 186 vmovdqu (10 * 16)(rio), y2; \ 187 vmovdqu (11 * 16)(rio), y3; \ 188 vmovdqu (12 * 16)(rio), y4; \ 189 vmovdqu (13 * 16)(rio), y5; \ 190 vmovdqu (14 * 16)(rio), y6; \ 191 vmovdqu (15 * 16)(rio), y7; 192 193/* byteslice pre-whitened blocks and store to temporary memory */ 194#define inpack16_post(x0, x1, x2, x3, \ 195 x4, x5, x6, x7, \ 196 y0, y1, y2, y3, \ 197 y4, y5, y6, y7, \ 198 mem_ab, mem_cd) \ 199 byteslice_16x16b(x0, x1, x2, x3, \ 200 x4, x5, x6, x7, \ 201 y0, y1, y2, y3, \ 202 y4, y5, y6, y7, \ 203 (mem_ab), (mem_cd)); \ 204 \ 205 vmovdqu x0, 0 * 16(mem_ab); \ 206 vmovdqu x1, 1 * 16(mem_ab); \ 207 vmovdqu x2, 2 * 16(mem_ab); \ 208 vmovdqu x3, 3 * 16(mem_ab); \ 209 vmovdqu x4, 4 * 16(mem_ab); \ 210 vmovdqu x5, 5 * 16(mem_ab); \ 211 vmovdqu x6, 6 * 16(mem_ab); \ 212 vmovdqu x7, 7 * 16(mem_ab); \ 213 vmovdqu y0, 0 * 16(mem_cd); \ 214 vmovdqu y1, 1 * 16(mem_cd); \ 215 vmovdqu y2, 2 * 16(mem_cd); \ 216 vmovdqu y3, 3 * 16(mem_cd); \ 217 vmovdqu y4, 4 * 16(mem_cd); \ 218 vmovdqu y5, 5 * 16(mem_cd); \ 219 vmovdqu y6, 6 * 16(mem_cd); \ 220 vmovdqu y7, 7 * 16(mem_cd); 221 222#define write_output(x0, x1, x2, x3, \ 223 x4, x5, x6, x7, \ 224 y0, y1, y2, y3, \ 225 y4, y5, y6, y7, \ 226 mem) \ 227 vmovdqu x0, 0 * 16(mem); \ 228 vmovdqu x1, 1 * 16(mem); \ 229 vmovdqu x2, 2 * 16(mem); \ 230 vmovdqu x3, 3 * 16(mem); \ 231 vmovdqu x4, 4 * 16(mem); \ 232 vmovdqu x5, 5 * 16(mem); \ 233 vmovdqu x6, 6 * 16(mem); \ 234 vmovdqu x7, 7 * 16(mem); \ 235 vmovdqu y0, 8 * 16(mem); \ 236 vmovdqu y1, 9 * 16(mem); \ 237 vmovdqu y2, 10 * 16(mem); \ 238 vmovdqu y3, 11 * 16(mem); \ 239 vmovdqu y4, 12 * 16(mem); \ 240 vmovdqu y5, 13 * 16(mem); \ 241 vmovdqu y6, 14 * 16(mem); \ 242 vmovdqu y7, 15 * 16(mem); \ 243 244#define aria_store_state_8way(x0, x1, x2, x3, \ 245 x4, x5, x6, x7, \ 246 mem_tmp, idx) \ 247 vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \ 248 vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \ 249 vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \ 250 vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \ 251 vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \ 252 vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \ 253 vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \ 254 vmovdqu x7, ((idx + 7) * 16)(mem_tmp); 255 256#define aria_load_state_8way(x0, x1, x2, x3, \ 257 x4, x5, x6, x7, \ 258 mem_tmp, idx) \ 259 vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \ 260 vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \ 261 vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \ 262 vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \ 263 vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \ 264 vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \ 265 vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \ 266 vmovdqu ((idx + 7) * 16)(mem_tmp), x7; 267 268#define aria_ark_8way(x0, x1, x2, x3, \ 269 x4, x5, x6, x7, \ 270 t0, t1, t2, rk, \ 271 idx, round) \ 272 /* AddRoundKey */ \ 273 vbroadcastss ((round * 16) + idx + 0)(rk), t0; \ 274 vpsrld $24, t0, t2; \ 275 vpshufb t1, t2, t2; \ 276 vpxor t2, x0, x0; \ 277 vpsrld $16, t0, t2; \ 278 vpshufb t1, t2, t2; \ 279 vpxor t2, x1, x1; \ 280 vpsrld $8, t0, t2; \ 281 vpshufb t1, t2, t2; \ 282 vpxor t2, x2, x2; \ 283 vpshufb t1, t0, t2; \ 284 vpxor t2, x3, x3; \ 285 vbroadcastss ((round * 16) + idx + 4)(rk), t0; \ 286 vpsrld $24, t0, t2; \ 287 vpshufb t1, t2, t2; \ 288 vpxor t2, x4, x4; \ 289 vpsrld $16, t0, t2; \ 290 vpshufb t1, t2, t2; \ 291 vpxor t2, x5, x5; \ 292 vpsrld $8, t0, t2; \ 293 vpshufb t1, t2, t2; \ 294 vpxor t2, x6, x6; \ 295 vpshufb t1, t0, t2; \ 296 vpxor t2, x7, x7; 297 298#ifdef CONFIG_AS_GFNI 299#define aria_sbox_8way_gfni(x0, x1, x2, x3, \ 300 x4, x5, x6, x7, \ 301 t0, t1, t2, t3, \ 302 t4, t5, t6, t7) \ 303 vmovdqa .Ltf_s2_bitmatrix, t0; \ 304 vmovdqa .Ltf_inv_bitmatrix, t1; \ 305 vmovdqa .Ltf_id_bitmatrix, t2; \ 306 vmovdqa .Ltf_aff_bitmatrix, t3; \ 307 vmovdqa .Ltf_x2_bitmatrix, t4; \ 308 vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ 309 vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ 310 vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ 311 vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \ 312 vgf2p8affineinvqb $0, t2, x2, x2; \ 313 vgf2p8affineinvqb $0, t2, x6, x6; \ 314 vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \ 315 vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ 316 vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ 317 vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ 318 vgf2p8affineinvqb $0, t2, x3, x3; \ 319 vgf2p8affineinvqb $0, t2, x7, x7 320 321#endif /* CONFIG_AS_GFNI */ 322 323#define aria_sbox_8way(x0, x1, x2, x3, \ 324 x4, x5, x6, x7, \ 325 t0, t1, t2, t3, \ 326 t4, t5, t6, t7) \ 327 vmovdqa .Linv_shift_row, t0; \ 328 vmovdqa .Lshift_row, t1; \ 329 vbroadcastss .L0f0f0f0f, t6; \ 330 vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \ 331 vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \ 332 vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \ 333 vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \ 334 \ 335 vaesenclast t7, x0, x0; \ 336 vaesenclast t7, x4, x4; \ 337 vaesenclast t7, x1, x1; \ 338 vaesenclast t7, x5, x5; \ 339 vaesdeclast t7, x2, x2; \ 340 vaesdeclast t7, x6, x6; \ 341 \ 342 /* AES inverse shift rows */ \ 343 vpshufb t0, x0, x0; \ 344 vpshufb t0, x4, x4; \ 345 vpshufb t0, x1, x1; \ 346 vpshufb t0, x5, x5; \ 347 vpshufb t1, x3, x3; \ 348 vpshufb t1, x7, x7; \ 349 vpshufb t1, x2, x2; \ 350 vpshufb t1, x6, x6; \ 351 \ 352 /* affine transformation for S2 */ \ 353 filter_8bit(x1, t2, t3, t6, t0); \ 354 /* affine transformation for S2 */ \ 355 filter_8bit(x5, t2, t3, t6, t0); \ 356 \ 357 /* affine transformation for X2 */ \ 358 filter_8bit(x3, t4, t5, t6, t0); \ 359 /* affine transformation for X2 */ \ 360 filter_8bit(x7, t4, t5, t6, t0); \ 361 vaesdeclast t7, x3, x3; \ 362 vaesdeclast t7, x7, x7; 363 364#define aria_diff_m(x0, x1, x2, x3, \ 365 t0, t1, t2, t3) \ 366 /* T = rotr32(X, 8); */ \ 367 /* X ^= T */ \ 368 vpxor x0, x3, t0; \ 369 vpxor x1, x0, t1; \ 370 vpxor x2, x1, t2; \ 371 vpxor x3, x2, t3; \ 372 /* X = T ^ rotr(X, 16); */ \ 373 vpxor t2, x0, x0; \ 374 vpxor x1, t3, t3; \ 375 vpxor t0, x2, x2; \ 376 vpxor t1, x3, x1; \ 377 vmovdqu t3, x3; 378 379#define aria_diff_word(x0, x1, x2, x3, \ 380 x4, x5, x6, x7, \ 381 y0, y1, y2, y3, \ 382 y4, y5, y6, y7) \ 383 /* t1 ^= t2; */ \ 384 vpxor y0, x4, x4; \ 385 vpxor y1, x5, x5; \ 386 vpxor y2, x6, x6; \ 387 vpxor y3, x7, x7; \ 388 \ 389 /* t2 ^= t3; */ \ 390 vpxor y4, y0, y0; \ 391 vpxor y5, y1, y1; \ 392 vpxor y6, y2, y2; \ 393 vpxor y7, y3, y3; \ 394 \ 395 /* t0 ^= t1; */ \ 396 vpxor x4, x0, x0; \ 397 vpxor x5, x1, x1; \ 398 vpxor x6, x2, x2; \ 399 vpxor x7, x3, x3; \ 400 \ 401 /* t3 ^= t1; */ \ 402 vpxor x4, y4, y4; \ 403 vpxor x5, y5, y5; \ 404 vpxor x6, y6, y6; \ 405 vpxor x7, y7, y7; \ 406 \ 407 /* t2 ^= t0; */ \ 408 vpxor x0, y0, y0; \ 409 vpxor x1, y1, y1; \ 410 vpxor x2, y2, y2; \ 411 vpxor x3, y3, y3; \ 412 \ 413 /* t1 ^= t2; */ \ 414 vpxor y0, x4, x4; \ 415 vpxor y1, x5, x5; \ 416 vpxor y2, x6, x6; \ 417 vpxor y3, x7, x7; 418 419#define aria_fe(x0, x1, x2, x3, \ 420 x4, x5, x6, x7, \ 421 y0, y1, y2, y3, \ 422 y4, y5, y6, y7, \ 423 mem_tmp, rk, round) \ 424 vpxor y7, y7, y7; \ 425 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 426 y0, y7, y2, rk, 8, round); \ 427 \ 428 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ 429 y0, y1, y2, y3, y4, y5, y6, y7); \ 430 \ 431 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 432 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 433 aria_store_state_8way(x0, x1, x2, x3, \ 434 x4, x5, x6, x7, \ 435 mem_tmp, 8); \ 436 \ 437 aria_load_state_8way(x0, x1, x2, x3, \ 438 x4, x5, x6, x7, \ 439 mem_tmp, 0); \ 440 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 441 y0, y7, y2, rk, 0, round); \ 442 \ 443 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ 444 y0, y1, y2, y3, y4, y5, y6, y7); \ 445 \ 446 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 447 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 448 aria_store_state_8way(x0, x1, x2, x3, \ 449 x4, x5, x6, x7, \ 450 mem_tmp, 0); \ 451 aria_load_state_8way(y0, y1, y2, y3, \ 452 y4, y5, y6, y7, \ 453 mem_tmp, 8); \ 454 aria_diff_word(x0, x1, x2, x3, \ 455 x4, x5, x6, x7, \ 456 y0, y1, y2, y3, \ 457 y4, y5, y6, y7); \ 458 /* aria_diff_byte() \ 459 * T3 = ABCD -> BADC \ 460 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ 461 * T0 = ABCD -> CDAB \ 462 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ 463 * T1 = ABCD -> DCBA \ 464 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ 465 */ \ 466 aria_diff_word(x2, x3, x0, x1, \ 467 x7, x6, x5, x4, \ 468 y0, y1, y2, y3, \ 469 y5, y4, y7, y6); \ 470 aria_store_state_8way(x3, x2, x1, x0, \ 471 x6, x7, x4, x5, \ 472 mem_tmp, 0); 473 474#define aria_fo(x0, x1, x2, x3, \ 475 x4, x5, x6, x7, \ 476 y0, y1, y2, y3, \ 477 y4, y5, y6, y7, \ 478 mem_tmp, rk, round) \ 479 vpxor y7, y7, y7; \ 480 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 481 y0, y7, y2, rk, 8, round); \ 482 \ 483 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 484 y0, y1, y2, y3, y4, y5, y6, y7); \ 485 \ 486 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 487 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 488 aria_store_state_8way(x0, x1, x2, x3, \ 489 x4, x5, x6, x7, \ 490 mem_tmp, 8); \ 491 \ 492 aria_load_state_8way(x0, x1, x2, x3, \ 493 x4, x5, x6, x7, \ 494 mem_tmp, 0); \ 495 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 496 y0, y7, y2, rk, 0, round); \ 497 \ 498 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 499 y0, y1, y2, y3, y4, y5, y6, y7); \ 500 \ 501 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 502 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 503 aria_store_state_8way(x0, x1, x2, x3, \ 504 x4, x5, x6, x7, \ 505 mem_tmp, 0); \ 506 aria_load_state_8way(y0, y1, y2, y3, \ 507 y4, y5, y6, y7, \ 508 mem_tmp, 8); \ 509 aria_diff_word(x0, x1, x2, x3, \ 510 x4, x5, x6, x7, \ 511 y0, y1, y2, y3, \ 512 y4, y5, y6, y7); \ 513 /* aria_diff_byte() \ 514 * T1 = ABCD -> BADC \ 515 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ 516 * T2 = ABCD -> CDAB \ 517 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ 518 * T3 = ABCD -> DCBA \ 519 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ 520 */ \ 521 aria_diff_word(x0, x1, x2, x3, \ 522 x5, x4, x7, x6, \ 523 y2, y3, y0, y1, \ 524 y7, y6, y5, y4); \ 525 aria_store_state_8way(x3, x2, x1, x0, \ 526 x6, x7, x4, x5, \ 527 mem_tmp, 0); 528 529#define aria_ff(x0, x1, x2, x3, \ 530 x4, x5, x6, x7, \ 531 y0, y1, y2, y3, \ 532 y4, y5, y6, y7, \ 533 mem_tmp, rk, round, last_round) \ 534 vpxor y7, y7, y7; \ 535 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 536 y0, y7, y2, rk, 8, round); \ 537 \ 538 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ 539 y0, y1, y2, y3, y4, y5, y6, y7); \ 540 \ 541 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 542 y0, y7, y2, rk, 8, last_round); \ 543 \ 544 aria_store_state_8way(x0, x1, x2, x3, \ 545 x4, x5, x6, x7, \ 546 mem_tmp, 8); \ 547 \ 548 aria_load_state_8way(x0, x1, x2, x3, \ 549 x4, x5, x6, x7, \ 550 mem_tmp, 0); \ 551 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 552 y0, y7, y2, rk, 0, round); \ 553 \ 554 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ 555 y0, y1, y2, y3, y4, y5, y6, y7); \ 556 \ 557 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 558 y0, y7, y2, rk, 0, last_round); \ 559 \ 560 aria_load_state_8way(y0, y1, y2, y3, \ 561 y4, y5, y6, y7, \ 562 mem_tmp, 8); 563 564#ifdef CONFIG_AS_GFNI 565#define aria_fe_gfni(x0, x1, x2, x3, \ 566 x4, x5, x6, x7, \ 567 y0, y1, y2, y3, \ 568 y4, y5, y6, y7, \ 569 mem_tmp, rk, round) \ 570 vpxor y7, y7, y7; \ 571 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 572 y0, y7, y2, rk, 8, round); \ 573 \ 574 aria_sbox_8way_gfni(x2, x3, x0, x1, \ 575 x6, x7, x4, x5, \ 576 y0, y1, y2, y3, \ 577 y4, y5, y6, y7); \ 578 \ 579 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 580 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 581 aria_store_state_8way(x0, x1, x2, x3, \ 582 x4, x5, x6, x7, \ 583 mem_tmp, 8); \ 584 \ 585 aria_load_state_8way(x0, x1, x2, x3, \ 586 x4, x5, x6, x7, \ 587 mem_tmp, 0); \ 588 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 589 y0, y7, y2, rk, 0, round); \ 590 \ 591 aria_sbox_8way_gfni(x2, x3, x0, x1, \ 592 x6, x7, x4, x5, \ 593 y0, y1, y2, y3, \ 594 y4, y5, y6, y7); \ 595 \ 596 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 597 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 598 aria_store_state_8way(x0, x1, x2, x3, \ 599 x4, x5, x6, x7, \ 600 mem_tmp, 0); \ 601 aria_load_state_8way(y0, y1, y2, y3, \ 602 y4, y5, y6, y7, \ 603 mem_tmp, 8); \ 604 aria_diff_word(x0, x1, x2, x3, \ 605 x4, x5, x6, x7, \ 606 y0, y1, y2, y3, \ 607 y4, y5, y6, y7); \ 608 /* aria_diff_byte() \ 609 * T3 = ABCD -> BADC \ 610 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ 611 * T0 = ABCD -> CDAB \ 612 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ 613 * T1 = ABCD -> DCBA \ 614 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ 615 */ \ 616 aria_diff_word(x2, x3, x0, x1, \ 617 x7, x6, x5, x4, \ 618 y0, y1, y2, y3, \ 619 y5, y4, y7, y6); \ 620 aria_store_state_8way(x3, x2, x1, x0, \ 621 x6, x7, x4, x5, \ 622 mem_tmp, 0); 623 624#define aria_fo_gfni(x0, x1, x2, x3, \ 625 x4, x5, x6, x7, \ 626 y0, y1, y2, y3, \ 627 y4, y5, y6, y7, \ 628 mem_tmp, rk, round) \ 629 vpxor y7, y7, y7; \ 630 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 631 y0, y7, y2, rk, 8, round); \ 632 \ 633 aria_sbox_8way_gfni(x0, x1, x2, x3, \ 634 x4, x5, x6, x7, \ 635 y0, y1, y2, y3, \ 636 y4, y5, y6, y7); \ 637 \ 638 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 639 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 640 aria_store_state_8way(x0, x1, x2, x3, \ 641 x4, x5, x6, x7, \ 642 mem_tmp, 8); \ 643 \ 644 aria_load_state_8way(x0, x1, x2, x3, \ 645 x4, x5, x6, x7, \ 646 mem_tmp, 0); \ 647 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 648 y0, y7, y2, rk, 0, round); \ 649 \ 650 aria_sbox_8way_gfni(x0, x1, x2, x3, \ 651 x4, x5, x6, x7, \ 652 y0, y1, y2, y3, \ 653 y4, y5, y6, y7); \ 654 \ 655 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 656 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 657 aria_store_state_8way(x0, x1, x2, x3, \ 658 x4, x5, x6, x7, \ 659 mem_tmp, 0); \ 660 aria_load_state_8way(y0, y1, y2, y3, \ 661 y4, y5, y6, y7, \ 662 mem_tmp, 8); \ 663 aria_diff_word(x0, x1, x2, x3, \ 664 x4, x5, x6, x7, \ 665 y0, y1, y2, y3, \ 666 y4, y5, y6, y7); \ 667 /* aria_diff_byte() \ 668 * T1 = ABCD -> BADC \ 669 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ 670 * T2 = ABCD -> CDAB \ 671 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ 672 * T3 = ABCD -> DCBA \ 673 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ 674 */ \ 675 aria_diff_word(x0, x1, x2, x3, \ 676 x5, x4, x7, x6, \ 677 y2, y3, y0, y1, \ 678 y7, y6, y5, y4); \ 679 aria_store_state_8way(x3, x2, x1, x0, \ 680 x6, x7, x4, x5, \ 681 mem_tmp, 0); 682 683#define aria_ff_gfni(x0, x1, x2, x3, \ 684 x4, x5, x6, x7, \ 685 y0, y1, y2, y3, \ 686 y4, y5, y6, y7, \ 687 mem_tmp, rk, round, last_round) \ 688 vpxor y7, y7, y7; \ 689 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 690 y0, y7, y2, rk, 8, round); \ 691 \ 692 aria_sbox_8way_gfni(x2, x3, x0, x1, \ 693 x6, x7, x4, x5, \ 694 y0, y1, y2, y3, \ 695 y4, y5, y6, y7); \ 696 \ 697 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 698 y0, y7, y2, rk, 8, last_round); \ 699 \ 700 aria_store_state_8way(x0, x1, x2, x3, \ 701 x4, x5, x6, x7, \ 702 mem_tmp, 8); \ 703 \ 704 aria_load_state_8way(x0, x1, x2, x3, \ 705 x4, x5, x6, x7, \ 706 mem_tmp, 0); \ 707 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 708 y0, y7, y2, rk, 0, round); \ 709 \ 710 aria_sbox_8way_gfni(x2, x3, x0, x1, \ 711 x6, x7, x4, x5, \ 712 y0, y1, y2, y3, \ 713 y4, y5, y6, y7); \ 714 \ 715 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 716 y0, y7, y2, rk, 0, last_round); \ 717 \ 718 aria_load_state_8way(y0, y1, y2, y3, \ 719 y4, y5, y6, y7, \ 720 mem_tmp, 8); 721 722#endif /* CONFIG_AS_GFNI */ 723 724/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ 725.section .rodata.cst16, "aM", @progbits, 16 726.align 16 727 728#define SHUFB_BYTES(idx) \ 729 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) 730 731.Lshufb_16x16b: 732 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); 733/* For isolating SubBytes from AESENCLAST, inverse shift row */ 734.Linv_shift_row: 735 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b 736 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 737.Lshift_row: 738 .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03 739 .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b 740/* For CTR-mode IV byteswap */ 741.Lbswap128_mask: 742 .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 743 .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 744 745/* AES inverse affine and S2 combined: 746 * 1 1 0 0 0 0 0 1 x0 0 747 * 0 1 0 0 1 0 0 0 x1 0 748 * 1 1 0 0 1 1 1 1 x2 0 749 * 0 1 1 0 1 0 0 1 x3 1 750 * 0 1 0 0 1 1 0 0 * x4 + 0 751 * 0 1 0 1 1 0 0 0 x5 0 752 * 0 0 0 0 0 1 0 1 x6 0 753 * 1 1 1 0 0 1 1 1 x7 1 754 */ 755.Ltf_lo__inv_aff__and__s2: 756 .octa 0x92172DA81A9FA520B2370D883ABF8500 757.Ltf_hi__inv_aff__and__s2: 758 .octa 0x2B15FFC1AF917B45E6D8320C625CB688 759 760/* X2 and AES forward affine combined: 761 * 1 0 1 1 0 0 0 1 x0 0 762 * 0 1 1 1 1 0 1 1 x1 0 763 * 0 0 0 1 1 0 1 0 x2 1 764 * 0 1 0 0 0 1 0 0 x3 0 765 * 0 0 1 1 1 0 1 1 * x4 + 0 766 * 0 1 0 0 1 0 0 0 x5 0 767 * 1 1 0 1 0 0 1 1 x6 0 768 * 0 1 0 0 1 0 1 0 x7 0 769 */ 770.Ltf_lo__x2__and__fwd_aff: 771 .octa 0xEFAE0544FCBD1657B8F95213ABEA4100 772.Ltf_hi__x2__and__fwd_aff: 773 .octa 0x3F893781E95FE1576CDA64D2BA0CB204 774 775#ifdef CONFIG_AS_GFNI 776.section .rodata.cst8, "aM", @progbits, 8 777.align 8 778/* AES affine: */ 779#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) 780.Ltf_aff_bitmatrix: 781 .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), 782 BV8(1, 1, 0, 0, 0, 1, 1, 1), 783 BV8(1, 1, 1, 0, 0, 0, 1, 1), 784 BV8(1, 1, 1, 1, 0, 0, 0, 1), 785 BV8(1, 1, 1, 1, 1, 0, 0, 0), 786 BV8(0, 1, 1, 1, 1, 1, 0, 0), 787 BV8(0, 0, 1, 1, 1, 1, 1, 0), 788 BV8(0, 0, 0, 1, 1, 1, 1, 1)) 789 .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), 790 BV8(1, 1, 0, 0, 0, 1, 1, 1), 791 BV8(1, 1, 1, 0, 0, 0, 1, 1), 792 BV8(1, 1, 1, 1, 0, 0, 0, 1), 793 BV8(1, 1, 1, 1, 1, 0, 0, 0), 794 BV8(0, 1, 1, 1, 1, 1, 0, 0), 795 BV8(0, 0, 1, 1, 1, 1, 1, 0), 796 BV8(0, 0, 0, 1, 1, 1, 1, 1)) 797 798/* AES inverse affine: */ 799#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) 800.Ltf_inv_bitmatrix: 801 .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), 802 BV8(1, 0, 0, 1, 0, 0, 1, 0), 803 BV8(0, 1, 0, 0, 1, 0, 0, 1), 804 BV8(1, 0, 1, 0, 0, 1, 0, 0), 805 BV8(0, 1, 0, 1, 0, 0, 1, 0), 806 BV8(0, 0, 1, 0, 1, 0, 0, 1), 807 BV8(1, 0, 0, 1, 0, 1, 0, 0), 808 BV8(0, 1, 0, 0, 1, 0, 1, 0)) 809 .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), 810 BV8(1, 0, 0, 1, 0, 0, 1, 0), 811 BV8(0, 1, 0, 0, 1, 0, 0, 1), 812 BV8(1, 0, 1, 0, 0, 1, 0, 0), 813 BV8(0, 1, 0, 1, 0, 0, 1, 0), 814 BV8(0, 0, 1, 0, 1, 0, 0, 1), 815 BV8(1, 0, 0, 1, 0, 1, 0, 0), 816 BV8(0, 1, 0, 0, 1, 0, 1, 0)) 817 818/* S2: */ 819#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) 820.Ltf_s2_bitmatrix: 821 .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), 822 BV8(0, 0, 1, 1, 1, 1, 1, 1), 823 BV8(1, 1, 1, 0, 1, 1, 0, 1), 824 BV8(1, 1, 0, 0, 0, 0, 1, 1), 825 BV8(0, 1, 0, 0, 0, 0, 1, 1), 826 BV8(1, 1, 0, 0, 1, 1, 1, 0), 827 BV8(0, 1, 1, 0, 0, 0, 1, 1), 828 BV8(1, 1, 1, 1, 0, 1, 1, 0)) 829 .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), 830 BV8(0, 0, 1, 1, 1, 1, 1, 1), 831 BV8(1, 1, 1, 0, 1, 1, 0, 1), 832 BV8(1, 1, 0, 0, 0, 0, 1, 1), 833 BV8(0, 1, 0, 0, 0, 0, 1, 1), 834 BV8(1, 1, 0, 0, 1, 1, 1, 0), 835 BV8(0, 1, 1, 0, 0, 0, 1, 1), 836 BV8(1, 1, 1, 1, 0, 1, 1, 0)) 837 838/* X2: */ 839#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) 840.Ltf_x2_bitmatrix: 841 .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), 842 BV8(0, 0, 1, 0, 0, 1, 1, 0), 843 BV8(0, 0, 0, 0, 1, 0, 1, 0), 844 BV8(1, 1, 1, 0, 0, 0, 1, 1), 845 BV8(1, 1, 1, 0, 1, 1, 0, 0), 846 BV8(0, 1, 1, 0, 1, 0, 1, 1), 847 BV8(1, 0, 1, 1, 1, 1, 0, 1), 848 BV8(1, 0, 0, 1, 0, 0, 1, 1)) 849 .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), 850 BV8(0, 0, 1, 0, 0, 1, 1, 0), 851 BV8(0, 0, 0, 0, 1, 0, 1, 0), 852 BV8(1, 1, 1, 0, 0, 0, 1, 1), 853 BV8(1, 1, 1, 0, 1, 1, 0, 0), 854 BV8(0, 1, 1, 0, 1, 0, 1, 1), 855 BV8(1, 0, 1, 1, 1, 1, 0, 1), 856 BV8(1, 0, 0, 1, 0, 0, 1, 1)) 857 858/* Identity matrix: */ 859.Ltf_id_bitmatrix: 860 .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), 861 BV8(0, 1, 0, 0, 0, 0, 0, 0), 862 BV8(0, 0, 1, 0, 0, 0, 0, 0), 863 BV8(0, 0, 0, 1, 0, 0, 0, 0), 864 BV8(0, 0, 0, 0, 1, 0, 0, 0), 865 BV8(0, 0, 0, 0, 0, 1, 0, 0), 866 BV8(0, 0, 0, 0, 0, 0, 1, 0), 867 BV8(0, 0, 0, 0, 0, 0, 0, 1)) 868 .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), 869 BV8(0, 1, 0, 0, 0, 0, 0, 0), 870 BV8(0, 0, 1, 0, 0, 0, 0, 0), 871 BV8(0, 0, 0, 1, 0, 0, 0, 0), 872 BV8(0, 0, 0, 0, 1, 0, 0, 0), 873 BV8(0, 0, 0, 0, 0, 1, 0, 0), 874 BV8(0, 0, 0, 0, 0, 0, 1, 0), 875 BV8(0, 0, 0, 0, 0, 0, 0, 1)) 876#endif /* CONFIG_AS_GFNI */ 877 878/* 4-bit mask */ 879.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 880.align 4 881.L0f0f0f0f: 882 .long 0x0f0f0f0f 883 884.text 885 886SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way) 887 /* input: 888 * %r9: rk 889 * %rsi: dst 890 * %rdx: src 891 * %xmm0..%xmm15: 16 byte-sliced blocks 892 */ 893 894 FRAME_BEGIN 895 896 movq %rsi, %rax; 897 leaq 8 * 16(%rax), %r8; 898 899 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 900 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 901 %xmm15, %rax, %r8); 902 aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, 903 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 904 %rax, %r9, 0); 905 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 906 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 907 %xmm15, %rax, %r9, 1); 908 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 909 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 910 %rax, %r9, 2); 911 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 912 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 913 %xmm15, %rax, %r9, 3); 914 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 915 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 916 %rax, %r9, 4); 917 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 918 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 919 %xmm15, %rax, %r9, 5); 920 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 921 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 922 %rax, %r9, 6); 923 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 924 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 925 %xmm15, %rax, %r9, 7); 926 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 927 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 928 %rax, %r9, 8); 929 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 930 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 931 %xmm15, %rax, %r9, 9); 932 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 933 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 934 %rax, %r9, 10); 935 cmpl $12, ARIA_CTX_rounds(CTX); 936 jne .Laria_192; 937 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 938 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 939 %xmm15, %rax, %r9, 11, 12); 940 jmp .Laria_end; 941.Laria_192: 942 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 943 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 944 %xmm15, %rax, %r9, 11); 945 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 946 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 947 %rax, %r9, 12); 948 cmpl $14, ARIA_CTX_rounds(CTX); 949 jne .Laria_256; 950 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 951 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 952 %xmm15, %rax, %r9, 13, 14); 953 jmp .Laria_end; 954.Laria_256: 955 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 956 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 957 %xmm15, %rax, %r9, 13); 958 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 959 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 960 %rax, %r9, 14); 961 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 962 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 963 %xmm15, %rax, %r9, 15, 16); 964.Laria_end: 965 debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4, 966 %xmm9, %xmm13, %xmm0, %xmm5, 967 %xmm10, %xmm14, %xmm3, %xmm6, 968 %xmm11, %xmm15, %xmm2, %xmm7, 969 (%rax), (%r8)); 970 971 FRAME_END 972 RET; 973SYM_FUNC_END(__aria_aesni_avx_crypt_16way) 974 975SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way) 976 /* input: 977 * %rdi: ctx, CTX 978 * %rsi: dst 979 * %rdx: src 980 */ 981 982 FRAME_BEGIN 983 984 leaq ARIA_CTX_enc_key(CTX), %r9; 985 986 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 987 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 988 %xmm15, %rdx); 989 990 call __aria_aesni_avx_crypt_16way; 991 992 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 993 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 994 %xmm15, %rax); 995 996 FRAME_END 997 RET; 998SYM_FUNC_END(aria_aesni_avx_encrypt_16way) 999 1000SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way) 1001 /* input: 1002 * %rdi: ctx, CTX 1003 * %rsi: dst 1004 * %rdx: src 1005 */ 1006 1007 FRAME_BEGIN 1008 1009 leaq ARIA_CTX_dec_key(CTX), %r9; 1010 1011 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 1012 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1013 %xmm15, %rdx); 1014 1015 call __aria_aesni_avx_crypt_16way; 1016 1017 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 1018 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1019 %xmm15, %rax); 1020 1021 FRAME_END 1022 RET; 1023SYM_FUNC_END(aria_aesni_avx_decrypt_16way) 1024 1025SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way) 1026 /* input: 1027 * %rdi: ctx 1028 * %rsi: dst 1029 * %rdx: src 1030 * %rcx: keystream 1031 * %r8: iv (big endian, 128bit) 1032 */ 1033 1034 FRAME_BEGIN 1035 /* load IV and byteswap */ 1036 vmovdqu (%r8), %xmm8; 1037 1038 vmovdqa .Lbswap128_mask (%rip), %xmm1; 1039 vpshufb %xmm1, %xmm8, %xmm3; /* be => le */ 1040 1041 vpcmpeqd %xmm0, %xmm0, %xmm0; 1042 vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */ 1043 1044 /* construct IVs */ 1045 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1046 vpshufb %xmm1, %xmm3, %xmm9; 1047 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1048 vpshufb %xmm1, %xmm3, %xmm10; 1049 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1050 vpshufb %xmm1, %xmm3, %xmm11; 1051 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1052 vpshufb %xmm1, %xmm3, %xmm12; 1053 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1054 vpshufb %xmm1, %xmm3, %xmm13; 1055 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1056 vpshufb %xmm1, %xmm3, %xmm14; 1057 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1058 vpshufb %xmm1, %xmm3, %xmm15; 1059 vmovdqu %xmm8, (0 * 16)(%rcx); 1060 vmovdqu %xmm9, (1 * 16)(%rcx); 1061 vmovdqu %xmm10, (2 * 16)(%rcx); 1062 vmovdqu %xmm11, (3 * 16)(%rcx); 1063 vmovdqu %xmm12, (4 * 16)(%rcx); 1064 vmovdqu %xmm13, (5 * 16)(%rcx); 1065 vmovdqu %xmm14, (6 * 16)(%rcx); 1066 vmovdqu %xmm15, (7 * 16)(%rcx); 1067 1068 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1069 vpshufb %xmm1, %xmm3, %xmm8; 1070 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1071 vpshufb %xmm1, %xmm3, %xmm9; 1072 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1073 vpshufb %xmm1, %xmm3, %xmm10; 1074 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1075 vpshufb %xmm1, %xmm3, %xmm11; 1076 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1077 vpshufb %xmm1, %xmm3, %xmm12; 1078 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1079 vpshufb %xmm1, %xmm3, %xmm13; 1080 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1081 vpshufb %xmm1, %xmm3, %xmm14; 1082 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1083 vpshufb %xmm1, %xmm3, %xmm15; 1084 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1085 vpshufb %xmm1, %xmm3, %xmm4; 1086 vmovdqu %xmm4, (%r8); 1087 1088 vmovdqu (0 * 16)(%rcx), %xmm0; 1089 vmovdqu (1 * 16)(%rcx), %xmm1; 1090 vmovdqu (2 * 16)(%rcx), %xmm2; 1091 vmovdqu (3 * 16)(%rcx), %xmm3; 1092 vmovdqu (4 * 16)(%rcx), %xmm4; 1093 vmovdqu (5 * 16)(%rcx), %xmm5; 1094 vmovdqu (6 * 16)(%rcx), %xmm6; 1095 vmovdqu (7 * 16)(%rcx), %xmm7; 1096 1097 FRAME_END 1098 RET; 1099SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way) 1100 1101SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way) 1102 /* input: 1103 * %rdi: ctx 1104 * %rsi: dst 1105 * %rdx: src 1106 * %rcx: keystream 1107 * %r8: iv (big endian, 128bit) 1108 */ 1109 FRAME_BEGIN 1110 1111 call __aria_aesni_avx_ctr_gen_keystream_16way; 1112 1113 leaq (%rsi), %r10; 1114 leaq (%rdx), %r11; 1115 leaq (%rcx), %rsi; 1116 leaq (%rcx), %rdx; 1117 leaq ARIA_CTX_enc_key(CTX), %r9; 1118 1119 call __aria_aesni_avx_crypt_16way; 1120 1121 vpxor (0 * 16)(%r11), %xmm1, %xmm1; 1122 vpxor (1 * 16)(%r11), %xmm0, %xmm0; 1123 vpxor (2 * 16)(%r11), %xmm3, %xmm3; 1124 vpxor (3 * 16)(%r11), %xmm2, %xmm2; 1125 vpxor (4 * 16)(%r11), %xmm4, %xmm4; 1126 vpxor (5 * 16)(%r11), %xmm5, %xmm5; 1127 vpxor (6 * 16)(%r11), %xmm6, %xmm6; 1128 vpxor (7 * 16)(%r11), %xmm7, %xmm7; 1129 vpxor (8 * 16)(%r11), %xmm8, %xmm8; 1130 vpxor (9 * 16)(%r11), %xmm9, %xmm9; 1131 vpxor (10 * 16)(%r11), %xmm10, %xmm10; 1132 vpxor (11 * 16)(%r11), %xmm11, %xmm11; 1133 vpxor (12 * 16)(%r11), %xmm12, %xmm12; 1134 vpxor (13 * 16)(%r11), %xmm13, %xmm13; 1135 vpxor (14 * 16)(%r11), %xmm14, %xmm14; 1136 vpxor (15 * 16)(%r11), %xmm15, %xmm15; 1137 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 1138 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1139 %xmm15, %r10); 1140 1141 FRAME_END 1142 RET; 1143SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way) 1144 1145#ifdef CONFIG_AS_GFNI 1146SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way) 1147 /* input: 1148 * %r9: rk 1149 * %rsi: dst 1150 * %rdx: src 1151 * %xmm0..%xmm15: 16 byte-sliced blocks 1152 */ 1153 1154 FRAME_BEGIN 1155 1156 movq %rsi, %rax; 1157 leaq 8 * 16(%rax), %r8; 1158 1159 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, 1160 %xmm4, %xmm5, %xmm6, %xmm7, 1161 %xmm8, %xmm9, %xmm10, %xmm11, 1162 %xmm12, %xmm13, %xmm14, 1163 %xmm15, %rax, %r8); 1164 aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11, 1165 %xmm12, %xmm13, %xmm14, %xmm15, 1166 %xmm0, %xmm1, %xmm2, %xmm3, 1167 %xmm4, %xmm5, %xmm6, %xmm7, 1168 %rax, %r9, 0); 1169 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1170 %xmm4, %xmm5, %xmm6, %xmm7, 1171 %xmm8, %xmm9, %xmm10, %xmm11, 1172 %xmm12, %xmm13, %xmm14, 1173 %xmm15, %rax, %r9, 1); 1174 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1175 %xmm12, %xmm13, %xmm14, %xmm15, 1176 %xmm0, %xmm1, %xmm2, %xmm3, 1177 %xmm4, %xmm5, %xmm6, %xmm7, 1178 %rax, %r9, 2); 1179 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1180 %xmm4, %xmm5, %xmm6, %xmm7, 1181 %xmm8, %xmm9, %xmm10, %xmm11, 1182 %xmm12, %xmm13, %xmm14, 1183 %xmm15, %rax, %r9, 3); 1184 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1185 %xmm12, %xmm13, %xmm14, %xmm15, 1186 %xmm0, %xmm1, %xmm2, %xmm3, 1187 %xmm4, %xmm5, %xmm6, %xmm7, 1188 %rax, %r9, 4); 1189 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1190 %xmm4, %xmm5, %xmm6, %xmm7, 1191 %xmm8, %xmm9, %xmm10, %xmm11, 1192 %xmm12, %xmm13, %xmm14, 1193 %xmm15, %rax, %r9, 5); 1194 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1195 %xmm12, %xmm13, %xmm14, %xmm15, 1196 %xmm0, %xmm1, %xmm2, %xmm3, 1197 %xmm4, %xmm5, %xmm6, %xmm7, 1198 %rax, %r9, 6); 1199 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1200 %xmm4, %xmm5, %xmm6, %xmm7, 1201 %xmm8, %xmm9, %xmm10, %xmm11, 1202 %xmm12, %xmm13, %xmm14, 1203 %xmm15, %rax, %r9, 7); 1204 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1205 %xmm12, %xmm13, %xmm14, %xmm15, 1206 %xmm0, %xmm1, %xmm2, %xmm3, 1207 %xmm4, %xmm5, %xmm6, %xmm7, 1208 %rax, %r9, 8); 1209 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1210 %xmm4, %xmm5, %xmm6, %xmm7, 1211 %xmm8, %xmm9, %xmm10, %xmm11, 1212 %xmm12, %xmm13, %xmm14, 1213 %xmm15, %rax, %r9, 9); 1214 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1215 %xmm12, %xmm13, %xmm14, %xmm15, 1216 %xmm0, %xmm1, %xmm2, %xmm3, 1217 %xmm4, %xmm5, %xmm6, %xmm7, 1218 %rax, %r9, 10); 1219 cmpl $12, ARIA_CTX_rounds(CTX); 1220 jne .Laria_gfni_192; 1221 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 1222 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1223 %xmm15, %rax, %r9, 11, 12); 1224 jmp .Laria_gfni_end; 1225.Laria_gfni_192: 1226 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1227 %xmm4, %xmm5, %xmm6, %xmm7, 1228 %xmm8, %xmm9, %xmm10, %xmm11, 1229 %xmm12, %xmm13, %xmm14, 1230 %xmm15, %rax, %r9, 11); 1231 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1232 %xmm12, %xmm13, %xmm14, %xmm15, 1233 %xmm0, %xmm1, %xmm2, %xmm3, 1234 %xmm4, %xmm5, %xmm6, %xmm7, 1235 %rax, %r9, 12); 1236 cmpl $14, ARIA_CTX_rounds(CTX); 1237 jne .Laria_gfni_256; 1238 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1239 %xmm4, %xmm5, %xmm6, %xmm7, 1240 %xmm8, %xmm9, %xmm10, %xmm11, 1241 %xmm12, %xmm13, %xmm14, 1242 %xmm15, %rax, %r9, 13, 14); 1243 jmp .Laria_gfni_end; 1244.Laria_gfni_256: 1245 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1246 %xmm4, %xmm5, %xmm6, %xmm7, 1247 %xmm8, %xmm9, %xmm10, %xmm11, 1248 %xmm12, %xmm13, %xmm14, 1249 %xmm15, %rax, %r9, 13); 1250 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1251 %xmm12, %xmm13, %xmm14, %xmm15, 1252 %xmm0, %xmm1, %xmm2, %xmm3, 1253 %xmm4, %xmm5, %xmm6, %xmm7, 1254 %rax, %r9, 14); 1255 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1256 %xmm4, %xmm5, %xmm6, %xmm7, 1257 %xmm8, %xmm9, %xmm10, %xmm11, 1258 %xmm12, %xmm13, %xmm14, 1259 %xmm15, %rax, %r9, 15, 16); 1260.Laria_gfni_end: 1261 debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4, 1262 %xmm9, %xmm13, %xmm0, %xmm5, 1263 %xmm10, %xmm14, %xmm3, %xmm6, 1264 %xmm11, %xmm15, %xmm2, %xmm7, 1265 (%rax), (%r8)); 1266 1267 FRAME_END 1268 RET; 1269SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way) 1270 1271SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way) 1272 /* input: 1273 * %rdi: ctx, CTX 1274 * %rsi: dst 1275 * %rdx: src 1276 */ 1277 1278 FRAME_BEGIN 1279 1280 leaq ARIA_CTX_enc_key(CTX), %r9; 1281 1282 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 1283 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1284 %xmm15, %rdx); 1285 1286 call __aria_aesni_avx_gfni_crypt_16way; 1287 1288 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 1289 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1290 %xmm15, %rax); 1291 1292 FRAME_END 1293 RET; 1294SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way) 1295 1296SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way) 1297 /* input: 1298 * %rdi: ctx, CTX 1299 * %rsi: dst 1300 * %rdx: src 1301 */ 1302 1303 FRAME_BEGIN 1304 1305 leaq ARIA_CTX_dec_key(CTX), %r9; 1306 1307 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 1308 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1309 %xmm15, %rdx); 1310 1311 call __aria_aesni_avx_gfni_crypt_16way; 1312 1313 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 1314 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1315 %xmm15, %rax); 1316 1317 FRAME_END 1318 RET; 1319SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way) 1320 1321SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way) 1322 /* input: 1323 * %rdi: ctx 1324 * %rsi: dst 1325 * %rdx: src 1326 * %rcx: keystream 1327 * %r8: iv (big endian, 128bit) 1328 */ 1329 FRAME_BEGIN 1330 1331 call __aria_aesni_avx_ctr_gen_keystream_16way 1332 1333 leaq (%rsi), %r10; 1334 leaq (%rdx), %r11; 1335 leaq (%rcx), %rsi; 1336 leaq (%rcx), %rdx; 1337 leaq ARIA_CTX_enc_key(CTX), %r9; 1338 1339 call __aria_aesni_avx_gfni_crypt_16way; 1340 1341 vpxor (0 * 16)(%r11), %xmm1, %xmm1; 1342 vpxor (1 * 16)(%r11), %xmm0, %xmm0; 1343 vpxor (2 * 16)(%r11), %xmm3, %xmm3; 1344 vpxor (3 * 16)(%r11), %xmm2, %xmm2; 1345 vpxor (4 * 16)(%r11), %xmm4, %xmm4; 1346 vpxor (5 * 16)(%r11), %xmm5, %xmm5; 1347 vpxor (6 * 16)(%r11), %xmm6, %xmm6; 1348 vpxor (7 * 16)(%r11), %xmm7, %xmm7; 1349 vpxor (8 * 16)(%r11), %xmm8, %xmm8; 1350 vpxor (9 * 16)(%r11), %xmm9, %xmm9; 1351 vpxor (10 * 16)(%r11), %xmm10, %xmm10; 1352 vpxor (11 * 16)(%r11), %xmm11, %xmm11; 1353 vpxor (12 * 16)(%r11), %xmm12, %xmm12; 1354 vpxor (13 * 16)(%r11), %xmm13, %xmm13; 1355 vpxor (14 * 16)(%r11), %xmm14, %xmm14; 1356 vpxor (15 * 16)(%r11), %xmm15, %xmm15; 1357 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 1358 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1359 %xmm15, %r10); 1360 1361 FRAME_END 1362 RET; 1363SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way) 1364#endif /* CONFIG_AS_GFNI */ 1365