/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 */

#include <linux/linkage.h>

.file "serpent-sse2-x86_64-asm_64.S"
.text

/* First argument register: base address from which get_key() loads key words. */
#define CTX %rdi

/**********************************************************************
  8-way SSE2 serpent
 **********************************************************************/

/*
 * Register allocation.
 *
 * The eight blocks are handled as two groups of four.  Group 1 lives in
 * RA1..RE1 and group 2 in RA2..RE2.  The two-group macros below (K2, LK2,
 * KL2, S, SP) take the un-suffixed names (RA, RB, ...) and paste the
 * "1" / "2" suffix on themselves.
 */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define RA2 %xmm5
#define RB2 %xmm6
#define RC2 %xmm7
#define RD2 %xmm8
#define RE2 %xmm9

/* All-ones constant (set with pcmpeqd by both entry points); "pxor RNOT, x" inverts x. */
#define RNOT %xmm10

/* Round-key words, each broadcast to all four 32-bit lanes by get_key(). */
#define RK0 %xmm11
#define RK1 %xmm12
#define RK2 %xmm13
#define RK3 %xmm14

/*
 * S-box macros used by the encryption routine (S0..S7).
 *
 * Each S-box is split into two halves, <name>_1 and <name>_2, which must be
 * executed in that order on the same five registers.  x0..x3 carry the data
 * and x4 is the fifth working register.  The position of the result within
 * x0..x4 differs per S-box; the callers account for that by permuting the
 * register names they pass to the following macro.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	por x0, x3; \
	pxor x4, x0; \
	pxor x2, x4; \
	pxor RNOT, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x4, x1; \
	pxor x0, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	pxor x3, x0; \
	por x0, x4; \
	pxor x2, x0; \
	pand x1, x2; \
	pxor x2, x3; \
	pxor RNOT, x1; \
	pxor x4, x2; \
	pxor x2, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x1; \
	pxor x3, x0; \
	pxor RNOT, x3; \
	pand x1, x4; \
	por x1, x0; \
	pxor x2, x3; \
	pxor x3, x0; \
	pxor x3, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	pxor x4, x3; \
	por x4, x1; \
	pxor x2, x4; \
	pand x0, x2; \
	pxor x1, x2; \
	por x0, x1; \
	pxor RNOT, x0; \
	pxor x2, x0; \
	pxor x1, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	pxor RNOT, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pand x2, x0; \
	pxor x3, x0; \
	por x4, x3; \
	pxor x1, x2; \
	pxor x1, x3; \
	pand x0, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	pand x3, x2; \
	por x1, x3; \
	pxor RNOT, x0; \
	pxor x0, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	por x2, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x3, x1; \
	por x0, x3; \
	pand x0, x4; \
	pxor x2, x0; \
	pxor x1, x2; \
	pand x3, x1; \
	pxor x3, x2; \
	por x4, x0; \
	pxor x3, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	pxor x0, x1; \
	pand x3, x0; \
	pand x4, x3; \
	pxor x2, x3; \
	por x1, x4; \
	pand x1, x2; \
	pxor x3, x4; \
	pxor x3, x0; \
	pxor x2, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x4, x0; \
	pxor x2, x3; \
	por x4, x2; \
	pxor x1, x0; \
	pxor x3, x4; \
	por x0, x2; \
	pxor x1, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	pand x0, x1; \
	pxor x4, x1; \
	pand x2, x4; \
	pxor x3, x2; \
	pxor x0, x4; \
	por x1, x3; \
	pxor RNOT, x1; \
	pxor x0, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x0, x1; \
	pxor x1, x2; \
	pxor RNOT, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	pand x4, x1; \
	por x3, x4; \
	pxor x0, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	pand x3, x0; \
	pxor x3, x1; \
	pxor x2, x3; \
	pxor x1, x0; \
	pand x4, x2; \
	pxor x2, x1; \
	pand x0, x2; \
	pxor x2, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x3; \
	pxor x2, x1; \
	pxor x0, x2; \
	pand x3, x0; \
	por x3, x1; \
	pxor RNOT, x4; \
	pxor x1, x0; \
	pxor x2, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	pxor x4, x3; \
	pxor x0, x4; \
	pand x0, x2; \
	pxor x1, x4; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x3; \
	pxor x2, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	pxor RNOT, x1; \
	movdqa x1, x4; \
	pxor RNOT, x0; \
	pand x2, x1; \
	pxor x3, x1; \
	por x4, x3; \
	pxor x2, x4; \
	pxor x3, x2; \
	pxor x0, x3; \
	por x1, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	pand x0, x2; \
	pxor x4, x0; \
	pxor x3, x4; \
	pand x0, x3; \
	pxor x1, x4; \
	pxor x4, x2; \
	pxor x1, x3; \
	por x0, x4; \
	pxor x1, x4;

/*
 * S-box macros used by the decryption routine (SI0..SI7).
 *
 * Same conventions as S0..S7: two halves per S-box, run _1 then _2 on the
 * same five registers.
 */
#define SI0_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pxor x0, x1; \
	por x1, x3; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	pxor x3, x2; \
	pxor x0, x3; \
	pand x1, x0; \
	pxor x2, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	pand x3, x2; \
	pxor x4, x3; \
	pxor x3, x2; \
	pxor x3, x1; \
	pand x0, x3; \
	pxor x0, x1; \
	pxor x2, x0; \
	pxor x3, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	movdqa x0, x4; \
	pxor x2, x0; \
	pxor RNOT, x2; \
	por x1, x4; \
	pxor x3, x4; \
	pand x1, x3; \
	pxor x2, x1; \
	pand x4, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	pxor x1, x4; \
	por x3, x1; \
	pxor x0, x3; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x4, x2; \
	pxor x0, x1; \
	pxor x1, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x3, x4; \
	pxor RNOT, x3; \
	por x2, x3; \
	pxor x4, x2; \
	pxor x0, x4; \
	pxor x1, x3; \
	por x2, x1; \
	pxor x0, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	pxor x4, x1; \
	por x3, x4; \
	pxor x3, x2; \
	pxor x2, x4; \
	pand x1, x2; \
	pxor x3, x2; \
	pxor x4, x3; \
	pxor x0, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x1, x4; \
	pand x2, x1; \
	pxor x0, x1; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x3, x0; \
	por x1, x3; \
	pxor x2, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	pxor x2, x0; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x1; \
	pand x2, x0; \
	pxor x3, x4; \
	pxor x0, x3; \
	pxor x1, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	pxor x3, x2; \
	movdqa x0, x4; \
	pand x1, x0; \
	pxor x2, x0; \
	por x3, x2; \
	pxor RNOT, x4; \
	pxor x0, x1; \
	pxor x2, x0; \
	pand x4, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	pxor x0, x2; \
	por x4, x0; \
	pxor x3, x0; \
	pand x2, x3; \
	pxor x3, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x1, x4; \
	pxor x3, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x2, x1; \
	pxor x4, x2; \
	pxor x3, x1; \
	pand x4, x3; \
	pxor x3, x2; \
	por x0, x3; \
	pxor RNOT, x0; \
	pxor x2, x3; \
	por x0, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	pxor x1, x4; \
	pxor x4, x2; \
	pand x0, x4; \
	pxor x1, x0; \
	pxor x3, x1; \
	pand x2, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x4, x2; \
	pxor x3, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	movdqa x0, x4; \
	pand x3, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x1, x3; \
	por x4, x2; \
	pxor x3, x2; \
	pand x0, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	pxor RNOT, x0; \
	pxor x1, x3; \
	pand x2, x1; \
	pxor x0, x4; \
	pxor x4, x3; \
	pxor x2, x4; \
	pxor x1, x0; \
	pxor x0, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x2, x0; \
	por x4, x2; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	por x3, x1; \
	pxor x0, x4; \
	pand x2, x0; \
	pxor x1, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	pand x2, x1; \
	pxor x2, x3; \
	pxor x3, x4; \
	pand x3, x2; \
	por x0, x3; \
	pxor x4, x1; \
	pxor x4, x3; \
	pand x0, x4; \
	pxor x2, x4;

/*
 * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from CTX
 * into t and broadcast it to all four 32-bit lanes (pshufd $0).
 * i selects a group of four consecutive words, j the word within the group.
 */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/*
 * K2: load key words 0..3 of key group i into RK0..RK3 and XOR them into
 * x0..x3 of both block groups.  x4 is accepted but not used.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	pxor RK0, x0 ## 1; \
	pxor RK1, x1 ## 1; \
	pxor RK2, x2 ## 1; \
	pxor RK3, x3 ## 1; \
	pxor RK0, x0 ## 2; \
	pxor RK1, x1 ## 2; \
	pxor RK2, x2 ## 2; \
	pxor RK3, x3 ## 2;

/*
 * LK2: linear mixing step followed by XOR with key group i, applied to both
 * block groups.  Per group, with rol = 32-bit rotate left built from
 * pslld/psrld/por and x4 as scratch:
 *
 *	x0 = rol(x0, 13);  x2 = rol(x2, 3);
 *	x1 ^= x0 ^ x2;     x3 ^= x2 ^ (x0 << 3);
 *	x1 = rol(x1, 1);   x3 = rol(x3, 7);
 *	x0 ^= x1 ^ x3;     x2 ^= x3 ^ (x1 << 7);
 *	x0 = rol(x0, 5);   x2 = rol(x2, 22);
 *	x0 ^= RK0; x1 ^= RK1; x2 ^= RK2; x3 ^= RK3;
 *
 * The four get_key() loads are interleaved with the arithmetic; each one is
 * issued before the first use of the RKn register it fills.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $13, x0 ## 1; \
	psrld $(32 - 13), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	pxor x0 ## 1, x1 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	pslld $3, x2 ## 1; \
	psrld $(32 - 3), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	pxor x2 ## 1, x1 ## 1; \
	movdqa x0 ## 2, x4 ## 2; \
	pslld $13, x0 ## 2; \
	psrld $(32 - 13), x4 ## 2; \
	por x4 ## 2, x0 ## 2; \
	pxor x0 ## 2, x1 ## 2; \
	movdqa x2 ## 2, x4 ## 2; \
	pslld $3, x2 ## 2; \
	psrld $(32 - 3), x4 ## 2; \
	por x4 ## 2, x2 ## 2; \
	pxor x2 ## 2, x1 ## 2; \
	movdqa x1 ## 1, x4 ## 1; \
	pslld $1, x1 ## 1; \
	psrld $(32 - 1), x4 ## 1; \
	por x4 ## 1, x1 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $3, x4 ## 1; \
	pxor x2 ## 1, x3 ## 1; \
	pxor x4 ## 1, x3 ## 1; \
	movdqa x3 ## 1, x4 ## 1; \
	get_key(i, 1, RK1); \
	movdqa x1 ## 2, x4 ## 2; \
	pslld $1, x1 ## 2; \
	psrld $(32 - 1), x4 ## 2; \
	por x4 ## 2, x1 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	pslld $3, x4 ## 2; \
	pxor x2 ## 2, x3 ## 2; \
	pxor x4 ## 2, x3 ## 2; \
	movdqa x3 ## 2, x4 ## 2; \
	get_key(i, 3, RK3); \
	pslld $7, x3 ## 1; \
	psrld $(32 - 7), x4 ## 1; \
	por x4 ## 1, x3 ## 1; \
	movdqa x1 ## 1, x4 ## 1; \
	pslld $7, x4 ## 1; \
	pxor x1 ## 1, x0 ## 1; \
	pxor x3 ## 1, x0 ## 1; \
	pxor x3 ## 1, x2 ## 1; \
	pxor x4 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	pslld $7, x3 ## 2; \
	psrld $(32 - 7), x4 ## 2; \
	por x4 ## 2, x3 ## 2; \
	movdqa x1 ## 2, x4 ## 2; \
	pslld $7, x4 ## 2; \
	pxor x1 ## 2, x0 ## 2; \
	pxor x3 ## 2, x0 ## 2; \
	pxor x3 ## 2, x2 ## 2; \
	pxor x4 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	pxor RK1, x1 ## 1; \
	pxor RK3, x3 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $5, x0 ## 1; \
	psrld $(32 - 5), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	pslld $22, x2 ## 1; \
	psrld $(32 - 22), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	pxor RK0, x0 ## 1; \
	pxor RK2, x2 ## 1; \
	pxor RK1, x1 ## 2; \
	pxor RK3, x3 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	pslld $5, x0 ## 2; \
	psrld $(32 - 5), x4 ## 2; \
	por x4 ## 2, x0 ## 2; \
	movdqa x2 ## 2, x4 ## 2; \
	pslld $22, x2 ## 2; \
	psrld $(32 - 22), x4 ## 2; \
	por x4 ## 2, x2 ## 2; \
	pxor RK0, x0 ## 2; \
	pxor RK2, x2 ## 2;

/*
 * KL2: XOR with RK0..RK3 followed by the reverse of the LK2 mixing step,
 * applied to both block groups.  Per group, with ror = 32-bit rotate right
 * and x4 as scratch:
 *
 *	x0 ^= RK0; x1 ^= RK1; x2 ^= RK2; x3 ^= RK3;
 *	x0 = ror(x0, 5);   x2 = ror(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);  x0 ^= x1 ^ x3;
 *	x1 = ror(x1, 1);   x3 = ror(x3, 7);
 *	x3 ^= x2 ^ (x0 << 3);  x1 ^= x0 ^ x2;
 *	x0 = ror(x0, 13);  x2 = ror(x2, 3);
 *
 * This macro does NOT load any key: the parameter i is unused here, and
 * RK0..RK3 must already hold the wanted key words (the decryption routine
 * has SP() load them immediately before each KL2).
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	pxor RK0, x0 ## 1; \
	pxor RK2, x2 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	psrld $5, x0 ## 1; \
	pslld $(32 - 5), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	pxor RK3, x3 ## 1; \
	pxor RK1, x1 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	psrld $22, x2 ## 1; \
	pslld $(32 - 22), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	pxor x3 ## 1, x2 ## 1; \
	pxor RK0, x0 ## 2; \
	pxor RK2, x2 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	psrld $5, x0 ## 2; \
	pslld $(32 - 5), x4 ## 2; \
	por x4 ## 2, x0 ## 2; \
	pxor RK3, x3 ## 2; \
	pxor RK1, x1 ## 2; \
	movdqa x2 ## 2, x4 ## 2; \
	psrld $22, x2 ## 2; \
	pslld $(32 - 22), x4 ## 2; \
	por x4 ## 2, x2 ## 2; \
	pxor x3 ## 2, x2 ## 2; \
	pxor x3 ## 1, x0 ## 1; \
	movdqa x1 ## 1, x4 ## 1; \
	pslld $7, x4 ## 1; \
	pxor x1 ## 1, x0 ## 1; \
	pxor x4 ## 1, x2 ## 1; \
	movdqa x1 ## 1, x4 ## 1; \
	psrld $1, x1 ## 1; \
	pslld $(32 - 1), x4 ## 1; \
	por x4 ## 1, x1 ## 1; \
	pxor x3 ## 2, x0 ## 2; \
	movdqa x1 ## 2, x4 ## 2; \
	pslld $7, x4 ## 2; \
	pxor x1 ## 2, x0 ## 2; \
	pxor x4 ## 2, x2 ## 2; \
	movdqa x1 ## 2, x4 ## 2; \
	psrld $1, x1 ## 2; \
	pslld $(32 - 1), x4 ## 2; \
	por x4 ## 2, x1 ## 2; \
	movdqa x3 ## 1, x4 ## 1; \
	psrld $7, x3 ## 1; \
	pslld $(32 - 7), x4 ## 1; \
	por x4 ## 1, x3 ## 1; \
	pxor x0 ## 1, x1 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $3, x4 ## 1; \
	pxor x4 ## 1, x3 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	movdqa x3 ## 2, x4 ## 2; \
	psrld $7, x3 ## 2; \
	pslld $(32 - 7), x4 ## 2; \
	por x4 ## 2, x3 ## 2; \
	pxor x0 ## 2, x1 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	pslld $3, x4 ## 2; \
	pxor x4 ## 2, x3 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	psrld $13, x0 ## 1; \
	pslld $(32 - 13), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	pxor x2 ## 1, x1 ## 1; \
	pxor x2 ## 1, x3 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	psrld $3, x2 ## 1; \
	pslld $(32 - 3), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	psrld $13, x0 ## 2; \
	pslld $(32 - 13), x4 ## 2; \
	por x4 ## 2, x0 ## 2; \
	pxor x2 ## 2, x1 ## 2; \
	pxor x2 ## 2, x3 ## 2; \
	movdqa x2 ## 2, x4 ## 2; \
	psrld $3, x2 ## 2; \
	pslld $(32 - 3), x4 ## 2; \
	por x4 ## 2, x2 ## 2;

/*
 * S: apply both halves of S-box SBOX to block group 1, then to block group 2.
 * SBOX is the un-suffixed S-box name (S0..S7, SI0..SI7).
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * SP: like S, but additionally loads key group i into RK0..RK3, with the
 * loads interleaved between the S-box halves.  The halves are issued as
 * _1(group 1), _1(group 2), _2(group 1), _2(group 2); the two groups use
 * disjoint registers, so each group still sees _1 before _2.
 * The loaded RK0..RK3 are not consumed here.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 3, RK3); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * transpose_4x4: transpose the 4x4 matrix of 32-bit words held in x0..x3
 * (one row per register) in place.  t1 and t2 are clobbered; t0 is accepted
 * for a uniform argument list but not touched.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0, t2; \
	punpckldq x1, x0; \
	punpckhdq x1, t2; \
	movdqa x2, t1; \
	punpckhdq x3, x2; \
	punpckldq x3, t1; \
	movdqa x0, x1; \
	punpcklqdq t1, x0; \
	punpckhqdq t1, x1; \
	movdqa t2, x3; \
	punpcklqdq x2, t2; \
	punpckhqdq x2, x3; \
	movdqa t2, x2;

/*
 * read_blocks: load 4 x 16 bytes from (in) with unaligned loads and transpose,
 * so that afterwards xN holds 32-bit word N of each of the four 16-byte
 * chunks.
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in), x0; \
	movdqu (1*4*4)(in), x1; \
	movdqu (2*4*4)(in), x2; \
	movdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks: transpose x0..x3 back and store 4 x 16 bytes to (out) with
 * unaligned stores.
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

/*
 * xor_blocks: transpose x0..x3 back, XOR each 16-byte result with the data
 * currently stored at (out) and write the result to (out).  t0 is used as
 * the load temporary.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out), t0; \
	pxor t0, x0; \
	movdqu x0, (0*4*4)(out); \
	movdqu (1*4*4)(out), t0; \
	pxor t0, x1; \
	movdqu x1, (1*4*4)(out); \
	movdqu (2*4*4)(out), t0; \
	pxor t0, x2; \
	movdqu x2, (2*4*4)(out); \
	movdqu (3*4*4)(out), t0; \
	pxor t0, x3; \
	movdqu x3, (3*4*4)(out);

/*
 * __serpent_enc_blk_8way: process 8 x 16 bytes from src through key mixing
 * with key group 0, then S0..S7 cycled four times with LK2 (key groups
 * 1..31) between them, and a final K2 with key group 32.
 *
 * The first 64 bytes of src/dst are handled in RA1..RE1, the next 64 bytes
 * (address + 4*4*4, kept in %rax) in RA2..RE2.  The register-name
 * permutations passed to each macro track where the previous S-box left its
 * outputs and must be kept exactly as written.
 *
 * Clobbers %rax and %xmm0-%xmm14.
 */
ENTRY(__serpent_enc_blk_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	/* RNOT = all ones */
	pcmpeqd RNOT, RNOT;

	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	leaq (4*4*4)(%rsi), %rax;

	/* Only the low byte of %rcx is examined. */
	testb %cl, %cl;
	jnz .L__enc_xor8;

	/* Plain mode: overwrite dst with the result. */
	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;

.L__enc_xor8:
	/* XOR mode: dst ^= result. */
	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk_8way)

/*
 * serpent_dec_blk_8way: process 8 x 16 bytes from src through key mixing
 * with key group 32, then SI7..SI0 cycled four times with KL2 between them
 * (SP loads key groups 31..1 for the KL2 that follows it), and a final K2
 * with key group 0.  The result always overwrites dst; there is no XOR mode.
 *
 * The first 64 bytes of src/dst are handled in the "1" registers, the next
 * 64 bytes (address + 4*4*4, kept in %rax) in the "2" registers.  The
 * results end up in RC, RD, RB, RE (in that order), which is why those are
 * the registers written out.
 *
 * Clobbers %rax and %xmm0-%xmm14.
 */
ENTRY(serpent_dec_blk_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* RNOT = all ones */
	pcmpeqd RNOT, RNOT;

	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

						 K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	leaq (4*4*4)(%rsi), %rax;
	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(serpent_dec_blk_8way)