/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 64-way parallel algorithm (AVX512)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi


#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )

#define add_le128(out, in, lo_counter, hi_counter1) \
	vpaddq lo_counter, in, out; \
	vpcmpuq $1, lo_counter, out, %k1; \
	kaddb %k1, %k1, %k1; \
	vpaddq hi_counter1, out, out{%k1};

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandq x, mask4bit, tmp0; \
	vpandnq x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxorq tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
			 a1, b1, c1, d1, \
			 a2, b2, c2, d2, \
			 a3, b3, c3, d3, \
			 st0, st1) \
	vmovdqu64 d2, st0; \
	vmovdqu64 d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu64 st0, d2; \
	vmovdqu64 st1, d3; \
	\
	vmovdqu64 a0, st0; \
	vmovdqu64 a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti64x2 .Lshufb_16x16b, a0; \
	vmovdqu64 st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu64 d3, st1; \
	vmovdqu64 st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu64 d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu64 st0, d2; \
	vmovdqu64 st1, d3; \
	\
	vmovdqu64 b0, st0; \
	vmovdqu64 b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu64 st0, b0; \
	vmovdqu64 st1, b1; \
	/* does not adjust output bytes inside vectors */

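/*
 * Byte-sliced layout (informal sketch): the 16 zmm registers together
 * hold 64 ARIA blocks of 16 bytes.  After byteslice_16x16b() each
 * register ends up holding a single byte position of all 64 blocks, so
 * the per-byte steps below (AddRoundKey, S-box layer) operate on one
 * byte of the state across all blocks at once.  debyteslice_16x16b()
 * undoes this transposition before the result is written out.
 */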
#define debyteslice_16x16b(a0, b0, c0, d0, \
			   a1, b1, c1, d1, \
			   a2, b2, c2, d2, \
			   a3, b3, c3, d3, \
			   st0, st1) \
	vmovdqu64 d2, st0; \
	vmovdqu64 d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu64 st0, d2; \
	vmovdqu64 st1, d3; \
	\
	vmovdqu64 a0, st0; \
	vmovdqu64 a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti64x2 .Lshufb_16x16b, a0; \
	vmovdqu64 st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu64 d3, st1; \
	vmovdqu64 st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu64 d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu64 st0, d2; \
	vmovdqu64 st1, d3; \
	\
	vmovdqu64 b0, st0; \
	vmovdqu64 b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu64 st0, b0; \
	vmovdqu64 st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     rio) \
	vmovdqu64 (0 * 64)(rio), x0; \
	vmovdqu64 (1 * 64)(rio), x1; \
	vmovdqu64 (2 * 64)(rio), x2; \
	vmovdqu64 (3 * 64)(rio), x3; \
	vmovdqu64 (4 * 64)(rio), x4; \
	vmovdqu64 (5 * 64)(rio), x5; \
	vmovdqu64 (6 * 64)(rio), x6; \
	vmovdqu64 (7 * 64)(rio), x7; \
	vmovdqu64 (8 * 64)(rio), y0; \
	vmovdqu64 (9 * 64)(rio), y1; \
	vmovdqu64 (10 * 64)(rio), y2; \
	vmovdqu64 (11 * 64)(rio), y3; \
	vmovdqu64 (12 * 64)(rio), y4; \
	vmovdqu64 (13 * 64)(rio), y5; \
	vmovdqu64 (14 * 64)(rio), y6; \
	vmovdqu64 (15 * 64)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      y0, y1, y2, y3, \
		      y4, y5, y6, y7, \
		      mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
			 x4, x5, x6, x7, \
			 y0, y1, y2, y3, \
			 y4, y5, y6, y7, \
			 (mem_ab), (mem_cd)); \
	\
	vmovdqu64 x0, 0 * 64(mem_ab); \
	vmovdqu64 x1, 1 * 64(mem_ab); \
	vmovdqu64 x2, 2 * 64(mem_ab); \
	vmovdqu64 x3, 3 * 64(mem_ab); \
	vmovdqu64 x4, 4 * 64(mem_ab); \
	vmovdqu64 x5, 5 * 64(mem_ab); \
	vmovdqu64 x6, 6 * 64(mem_ab); \
	vmovdqu64 x7, 7 * 64(mem_ab); \
	vmovdqu64 y0, 0 * 64(mem_cd); \
	vmovdqu64 y1, 1 * 64(mem_cd); \
	vmovdqu64 y2, 2 * 64(mem_cd); \
	vmovdqu64 y3, 3 * 64(mem_cd); \
	vmovdqu64 y4, 4 * 64(mem_cd); \
	vmovdqu64 y5, 5 * 64(mem_cd); \
	vmovdqu64 y6, 6 * 64(mem_cd); \
	vmovdqu64 y7, 7 * 64(mem_cd);

#define write_output(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem) \
	vmovdqu64 x0, 0 * 64(mem); \
	vmovdqu64 x1, 1 * 64(mem); \
	vmovdqu64 x2, 2 * 64(mem); \
	vmovdqu64 x3, 3 * 64(mem); \
	vmovdqu64 x4, 4 * 64(mem); \
	vmovdqu64 x5, 5 * 64(mem); \
	vmovdqu64 x6, 6 * 64(mem); \
	vmovdqu64 x7, 7 * 64(mem); \
	vmovdqu64 y0, 8 * 64(mem); \
	vmovdqu64 y1, 9 * 64(mem); \
	vmovdqu64 y2, 10 * 64(mem); \
	vmovdqu64 y3, 11 * 64(mem); \
	vmovdqu64 y4, 12 * 64(mem); \
	vmovdqu64 y5, 13 * 64(mem); \
	vmovdqu64 y6, 14 * 64(mem); \
	vmovdqu64 y7, 15 * 64(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, idx) \
	vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp); \
	vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp); \
	vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp); \
	vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp); \
	vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp); \
	vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp); \
	vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp); \
	vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, idx) \
	vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0; \
	vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1; \
	vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2; \
	vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3; \
	vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4; \
	vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5; \
	vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6; \
	vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;

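/*
 * AddRoundKey in byte-sliced form (sketch): since each x/y register
 * holds one byte position of all 64 blocks, the matching round-key
 * byte is broadcast with vpbroadcastb and XORed into the whole
 * register.  The byte offsets (3, 2, 1, 0, 7, 6, 5, 4, ...) follow the
 * byte ordering produced by byteslice_16x16b().
 */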
#define aria_ark_16way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7, \
		       t0, rk, round) \
	/* AddRoundKey */ \
	vpbroadcastb ((round * 16) + 3)(rk), t0; \
	vpxorq t0, x0, x0; \
	vpbroadcastb ((round * 16) + 2)(rk), t0; \
	vpxorq t0, x1, x1; \
	vpbroadcastb ((round * 16) + 1)(rk), t0; \
	vpxorq t0, x2, x2; \
	vpbroadcastb ((round * 16) + 0)(rk), t0; \
	vpxorq t0, x3, x3; \
	vpbroadcastb ((round * 16) + 7)(rk), t0; \
	vpxorq t0, x4, x4; \
	vpbroadcastb ((round * 16) + 6)(rk), t0; \
	vpxorq t0, x5, x5; \
	vpbroadcastb ((round * 16) + 5)(rk), t0; \
	vpxorq t0, x6, x6; \
	vpbroadcastb ((round * 16) + 4)(rk), t0; \
	vpxorq t0, x7, x7; \
	vpbroadcastb ((round * 16) + 11)(rk), t0; \
	vpxorq t0, y0, y0; \
	vpbroadcastb ((round * 16) + 10)(rk), t0; \
	vpxorq t0, y1, y1; \
	vpbroadcastb ((round * 16) + 9)(rk), t0; \
	vpxorq t0, y2, y2; \
	vpbroadcastb ((round * 16) + 8)(rk), t0; \
	vpxorq t0, y3, y3; \
	vpbroadcastb ((round * 16) + 15)(rk), t0; \
	vpxorq t0, y4, y4; \
	vpbroadcastb ((round * 16) + 14)(rk), t0; \
	vpxorq t0, y5, y5; \
	vpbroadcastb ((round * 16) + 13)(rk), t0; \
	vpxorq t0, y6, y6; \
	vpbroadcastb ((round * 16) + 12)(rk), t0; \
	vpxorq t0, y7, y7;

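/*
 * S-box layer via GFNI (sketch).  ARIA uses four 8-bit S-boxes: S1
 * (the AES S-box), S2, and their inverses X1 and X2.  S1 and S2 are an
 * affine map applied after GF(2^8) inversion, so each is a single
 * vgf2p8affineinvqb with the matching bit-matrix and constant.  X1 and
 * X2 are inversion applied after an affine map: vgf2p8affineqb (tf_inv
 * for X1, tf_x2 for X2), followed by vgf2p8affineinvqb with the
 * identity matrix and constant 0, which reduces to plain field
 * inversion.  The bit-matrices themselves live in .rodata below.
 */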
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    t0, t1, t2, t3, \
			    t4, t5, t6, t7) \
	vpbroadcastq .Ltf_s2_bitmatrix, t0; \
	vpbroadcastq .Ltf_inv_bitmatrix, t1; \
	vpbroadcastq .Ltf_id_bitmatrix, t2; \
	vpbroadcastq .Ltf_aff_bitmatrix, t3; \
	vpbroadcastq .Ltf_x2_bitmatrix, t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7;

#define aria_sbox_16way_gfni(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     t0, t1, t2, t3, \
			     t4, t5, t6, t7) \
	vpbroadcastq .Ltf_s2_bitmatrix, t0; \
	vpbroadcastq .Ltf_inv_bitmatrix, t1; \
	vpbroadcastq .Ltf_id_bitmatrix, t2; \
	vpbroadcastq .Ltf_aff_bitmatrix, t3; \
	vpbroadcastq .Ltf_x2_bitmatrix, t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7; \
	vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5; \
	vgf2p8affineqb $(tf_inv_const), t1, y2, y2; \
	vgf2p8affineqb $(tf_inv_const), t1, y6, y6; \
	vgf2p8affineinvqb $0, t2, y2, y2; \
	vgf2p8affineinvqb $0, t2, y6, y6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4; \
	vgf2p8affineqb $(tf_x2_const), t4, y3, y3; \
	vgf2p8affineqb $(tf_x2_const), t4, y7, y7; \
	vgf2p8affineinvqb $0, t2, y3, y3; \
	vgf2p8affineinvqb $0, t2, y7, y7;


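/*
 * Diffusion layer (sketch).  ARIA's 16x16 byte diffusion matrix is
 * applied in three byte-sliced steps: aria_diff_m() mixes the four
 * byte positions within each 32-bit word (the rotations mentioned in
 * the comments below turn into plain XORs between registers in this
 * representation), aria_diff_word() XORs the four 32-bit words into
 * each other, and the remaining byte permutation ("aria_diff_byte") is
 * folded into the register order passed to the second aria_diff_word()
 * call in aria_fe_gfni()/aria_fo_gfni().
 */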
#define aria_diff_m(x0, x1, x2, x3, \
		    t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxorq x0, x3, t0; \
	vpxorq x1, x0, t1; \
	vpxorq x2, x1, t2; \
	vpxorq x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxorq t2, x0, x0; \
	vpxorq x1, t3, t3; \
	vpxorq t0, x2, x2; \
	vpxorq t1, x3, x1; \
	vmovdqu64 t3, x3;

#define aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxorq y0, x4, x4; \
	vpxorq y1, x5, x5; \
	vpxorq y2, x6, x6; \
	vpxorq y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxorq y4, y0, y0; \
	vpxorq y5, y1, y1; \
	vpxorq y6, y2, y2; \
	vpxorq y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxorq x4, x0, x0; \
	vpxorq x5, x1, x1; \
	vpxorq x6, x2, x2; \
	vpxorq x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxorq x4, y4, y4; \
	vpxorq x5, y5, y5; \
	vpxorq x6, y6, y6; \
	vpxorq x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxorq x0, y0, y0; \
	vpxorq x1, y1, y1; \
	vpxorq x2, y2, y2; \
	vpxorq x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxorq y0, x4, x4; \
	vpxorq y1, x5, x5; \
	vpxorq y2, x6, x6; \
	vpxorq y3, x7, x7;

#define aria_fe_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     z0, z1, z2, z3, \
		     z4, z5, z6, z7, \
		     mem_tmp, rk, round) \
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7, \
		       z0, rk, round); \
	\
	aria_sbox_16way_gfni(x2, x3, x0, x1, \
			     x6, x7, x4, x5, \
			     y2, y3, y0, y1, \
			     y6, y7, y4, y5, \
			     z0, z1, z2, z3, \
			     z4, z5, z6, z7); \
	\
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \


#define aria_fo_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     z0, z1, z2, z3, \
		     z4, z5, z6, z7, \
		     mem_tmp, rk, round) \
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7, \
		       z0, rk, round); \
	\
	aria_sbox_16way_gfni(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     z0, z1, z2, z3, \
			     z4, z5, z6, z7); \
	\
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4);

#define aria_ff_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     z0, z1, z2, z3, \
		     z4, z5, z6, z7, \
		     mem_tmp, rk, round, last_round) \
	aria_ark_16way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7, \
		       z0, rk, round); \
	aria_sbox_16way_gfni(x2, x3, x0, x1, \
			     x6, x7, x4, x5, \
			     y2, y3, y0, y1, \
			     y6, y7, y4, y5, \
			     z0, z1, z2, z3, \
			     z4, z5, z6, z7); \
	aria_ark_16way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7, \
		       z0, rk, last_round);


.section	.rodata.cst64, "aM", @progbits, 64
.align 64
.Lcounter0123_lo:
	.quad 0, 0
	.quad 1, 0
	.quad 2, 0
	.quad 3, 0

.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst16, "aM", @progbits, 16
.align 16

.Lcounter4444_lo:
	.quad 4, 0
.Lcounter8888_lo:
	.quad 8, 0
.Lcounter16161616_lo:
	.quad 16, 0
.Lcounter1111_hi:
	.quad 0, 1

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

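/*
 * Bit-matrix constants for GF2P8AFFINEQB / GF2P8AFFINEINVQB (sketch).
 * BV8() packs one matrix row into a byte (a0 is bit 0) and BM8X8()
 * packs the eight rows into the 64-bit matrix operand with row l0 in
 * the most significant byte.  The tf_*_const values are the affine
 * constants added after the matrix multiply, encoded like a BV8() row
 * (e.g. tf_aff_const is 0x63, the AES affine constant).
 */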
.section	.rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

.text
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %zmm0..%zmm15: byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 64(%rax), %r8;

	inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
		      %zmm4, %zmm5, %zmm6, %zmm7,
		      %zmm8, %zmm9, %zmm10, %zmm11,
		      %zmm12, %zmm13, %zmm14,
		      %zmm15, %rax, %r8);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 0);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 1);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 2);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 3);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 4);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 5);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 6);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 7);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 8);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 9);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 10);
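	/*
	 * Dispatch on the key length: ARIA_CTX_rounds is 12 for ARIA-128,
	 * 14 for ARIA-192 and 16 for ARIA-256.  The closing aria_ff_gfni()
	 * applies AddRoundKey, the S-box layer and then the final round
	 * key (the output whitening) instead of the diffusion layer.
	 */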
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 11);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 13);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 14);
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
			   %zmm8, %zmm13, %zmm2, %zmm7,
			   %zmm11, %zmm14, %zmm1, %zmm4,
			   %zmm10, %zmm15, %zmm0, %zmm5,
			   (%rax), (%r8));
	FRAME_END
	RET;
SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rdx);

	call __aria_gfni_avx512_crypt_64way;

	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rdx);

	call __aria_gfni_avx512_crypt_64way;

	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)

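/*
 * CTR keystream block generation (sketch): the 128-bit big-endian IV
 * at (%r8) is expanded into 64 consecutive counter values, left in
 * %zmm0..%zmm15 (four 128-bit blocks per register, already swapped
 * back to big endian), and IV + 64 is written back to (%r8).
 * aria_gfni_avx512_ctr_crypt_64way() below encrypts these blocks and
 * XORs them with the source data.
 */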
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN

	vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
	vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
	vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
	vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
	vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
	vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;

	/* load IV and byteswap */
	movq 8(%r8), %r11;
	movq (%r8), %r10;
	bswapq %r11;
	bswapq %r10;
	vbroadcasti64x2 (%r8), %zmm20;
	vpshufb %zmm19, %zmm20, %zmm20;

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 64), %r11;
	ja .Lload_ctr_carry;

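	/*
	 * Fast path: the next 64 counter values cannot overflow the low
	 * 64 bits (checked above), so plain vpaddq on the low qword is
	 * enough.  Otherwise add_le128() is used: vpcmpuq with predicate
	 * 1 (unsigned "lt") flags each 128-bit lane whose low qword
	 * wrapped, kaddb doubles the mask so the bit moves onto the
	 * matching high-qword lane, and the masked vpaddq then adds the
	 * carry into the high qword.
	 */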
	/* construct IVs */
	vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */
	vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
	vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
	vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
	vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
	vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
	vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
	vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
	vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
	vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
	vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
	vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
	vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
	vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
	vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
	vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
	jmp .Lload_ctr_done;

.Lload_ctr_carry:
	/* construct IVs */
	add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */
	add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
	add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
	add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
	add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
	add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
	add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
	add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
	add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
	add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
	add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
	add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
	add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
	add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
	add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
	add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */

.Lload_ctr_done:
	/* Byte-swap IVs and update counter. */
	addq $64, %r11;
	adcq $0, %r10;
	vpshufb %zmm19, %zmm15, %zmm15;
	vpshufb %zmm19, %zmm14, %zmm14;
	vpshufb %zmm19, %zmm13, %zmm13;
	vpshufb %zmm19, %zmm12, %zmm12;
	vpshufb %zmm19, %zmm11, %zmm11;
	vpshufb %zmm19, %zmm10, %zmm10;
	vpshufb %zmm19, %zmm9, %zmm9;
	vpshufb %zmm19, %zmm8, %zmm8;
	bswapq %r11;
	bswapq %r10;
	vpshufb %zmm19, %zmm7, %zmm7;
	vpshufb %zmm19, %zmm6, %zmm6;
	vpshufb %zmm19, %zmm5, %zmm5;
	vpshufb %zmm19, %zmm4, %zmm4;
	vpshufb %zmm19, %zmm3, %zmm3;
	vpshufb %zmm19, %zmm2, %zmm2;
	vpshufb %zmm19, %zmm1, %zmm1;
	vpshufb %zmm19, %zmm0, %zmm0;
	movq %r11, 8(%r8);
	movq %r10, (%r8);

	FRAME_END
	RET;
SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_gfni_avx512_ctr_gen_keystream_64way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_gfni_avx512_crypt_64way;

	vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
	vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
	vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
	vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
	vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
	vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
	vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
	vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
	vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
	vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
	vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
	vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
	vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
	vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
	vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
	vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		     %zmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)