/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
 *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

.file "serpent-avx-x86_64-asm_64.S"
.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15

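/*
 * Bitsliced Serpent S-boxes S0..S7.  Each S-box is split into two macro
 * halves, SN_1/SN_2, so that SP() further below can interleave round-key
 * loads with the S-box computation.  Arguments x0..x4 name five xmm
 * registers, each holding one 32-bit word of four parallel blocks; tp is
 * a scratch register and RNOT is an all-ones mask (vpxor with RNOT
 * implements bitwise NOT).  Instructions use the AVX three-operand AT&T
 * form: "vpxor a, b, c" computes c = b ^ a without clobbering its sources.
 */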
#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;

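/*
 * Inverse S-boxes SI0..SI7, used by the decryption routine.  They follow
 * the same register and macro conventions as S0..S7 above.
 */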
#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;

#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

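/*
 * Key schedule access: the expanded key in the context (CTX = %rdi) is
 * laid out as 33 round subkeys of four 32-bit words each (rounds 0..32).
 * get_key(i, j, t) broadcasts word j of subkey i, at byte offset
 * (4*i + j)*4, into all four lanes of t, so a single vpxor applies that
 * subkey word to the corresponding word of four blocks at once; K2 XORs
 * a full subkey into both four-block groups.
 */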
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

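/*
 * LK2 above applies the Serpent linear transformation to both four-block
 * groups (rotations by 13, 3, 1, 7, 5 and 22 bits combined with
 * shift-by-3 and shift-by-7 mixing terms) and then XORs in subkey i; the
 * get_key() calls are interleaved with the arithmetic to hide load
 * latency.  KL2 below is the exact inverse: it first XORs in subkey i
 * (already loaded by the preceding SP()/K2()) and then undoes the linear
 * transformation, as needed for decryption.
 */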
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	vmovdqu (0*4*4)(in), x0; \
	vmovdqu (1*4*4)(in), x1; \
	vmovdqu (2*4*4)(in), x2; \
	vmovdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vmovdqu x0, (0*4*4)(out); \
	vmovdqu x1, (1*4*4)(out); \
	vmovdqu x2, (2*4*4)(out); \
	vmovdqu x3, (3*4*4)(out);

#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpxor (0*4*4)(out), x0, x0; \
	vmovdqu x0, (0*4*4)(out); \
	vpxor (1*4*4)(out), x1, x1; \
	vmovdqu x1, (1*4*4)(out); \
	vpxor (2*4*4)(out), x2, x2; \
	vmovdqu x2, (2*4*4)(out); \
	vpxor (3*4*4)(out), x3, x3; \
	vmovdqu x3, (3*4*4)(out);

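/*
 * Block layout: read_blocks() loads four 16-byte blocks and
 * transpose_4x4() converts them from one-block-per-register to
 * one-word-per-register form (x0 holds word 0 of all four blocks, and so
 * on); write_blocks()/xor_blocks() transpose back before storing or
 * XOR-ing into the destination.  Each routine below processes two such
 * groups, i.e. eight blocks (128 bytes) per call.  The C glue code is
 * expected to declare the entry points roughly as
 *
 *   asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx,
 *                                              u8 *dst, const u8 *src,
 *                                              bool xor);
 *   asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx,
 *                                            u8 *dst, const u8 *src);
 *
 * (a sketch of the expected prototypes, not copied from the glue header).
 */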
.align 8
.global __serpent_enc_blk_8way_avx
.type __serpent_enc_blk_8way_avx,@function;

__serpent_enc_blk_8way_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	leaq (4*4*4)(%rsi), %rax;

	testb %cl, %cl;
	jnz __enc_xor8;

	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;

__enc_xor8:
	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;

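/*
 * Decryption runs the rounds in reverse order: SP() interleaves the
 * subkey loads for KL2 with the inverse S-box halves, KL2 removes the
 * round key and undoes the linear transformation, and a final K2 strips
 * subkey 0.
 */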
.align 8
.global serpent_dec_blk_8way_avx
.type serpent_dec_blk_8way_avx,@function;

serpent_dec_blk_8way_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	leaq (4*4*4)(%rsi), %rax;
	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;