/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15


#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;

#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
__serpent_enc_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk8_avx)

.align 8
__serpent_dec_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

						 K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk8_avx)

ENTRY(serpent_ecb_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_enc_8way_avx)

ENTRY(serpent_ecb_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_dec_8way_avx)

ENTRY(serpent_cbc_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_cbc_dec_8way_avx)

ENTRY(serpent_ctr_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ctr_8way_avx)

ENTRY(serpent_xts_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_enc_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_enc_8way_avx)

ENTRY(serpent_xts_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_dec_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_dec_8way_avx)
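/*
 * For reference, the C glue code is expected to declare the ENTRY()s above
 * roughly as sketched below.  This is a hedged sketch only: the symbol names
 * match this file, but the exact context and IV type names (struct
 * serpent_ctx, le128) are assumptions based on the usual AVX glue headers
 * and are not defined here.
 *
 *	asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src);
 *	asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src);
 *	asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src);
 *	asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx,
 *					     u8 *dst, const u8 *src,
 *					     le128 *iv);
 *	asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src,
 *						 le128 *iv);
 *	asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src,
 *						 le128 *iv);
 */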