/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15

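/*
 * Serpent S-boxes S0..S7 (and the inverse S-boxes SI0..SI7 further
 * below), expressed as sequences of bitwise operations on 128-bit
 * registers.  After read_blocks() each register holds the same 32-bit
 * word position of four different blocks, so a single pass applies the
 * S-box to four blocks at once (eight with both register groups).
 * RNOT is kept all-ones so NOT can be computed as XOR with RNOT; tp is
 * a scratch register.  Each S-box is split into _1/_2 halves so that
 * SP() below can interleave round-key loads between the halves.
 */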
#define S0_1(x0, x1, x2, x3, x4) \
	vpor	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x4; \
	vpxor	RNOT, x4, x4; \
	vpxor	x1, tp, x3; \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpxor	x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x0, x0; \
	vpor	x0, x4, x4; \
	vpxor	x2, x0, x0; \
	vpand	x1, x2, x2; \
	vpxor	x2, x3, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x4, x2, x2; \
	vpxor	x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, tp; \
	vpxor	x3, x0, x0; \
	vpxor	RNOT, x3, x3; \
	vpand	tp, x1, x4; \
	vpor	tp, x0, x0; \
	vpxor	x2, x3, x3; \
	vpxor	x3, x0, x0; \
	vpxor	x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpor	x4, x1, x1; \
	vpxor	x2, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x2, x2; \
	vpor	x0, x1, x1; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x0, x0; \
	vpxor	x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, tp; \
	vpxor	x3, tp, tp; \
	vpor	x0, x3, x3; \
	vpxor	x1, x2, x2; \
	vpxor	x1, x3, x3; \
	vpand	tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor	x2, tp, tp; \
	vpand	x3, x2, x2; \
	vpor	x1, x3, x3; \
	vpxor	RNOT, tp, tp; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x0, x4; \
	vpxor	x2, tp, x0; \
	vpor	x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, tp; \
	vpor	x0, x3, x3; \
	vpand	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpxor	tp, x2, x2; \
	vpand	x3, tp, x1; \
	vpxor	x3, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpand	x3, x0, x0; \
	vpand	x4, x3, x3; \
	vpxor	x2, x3, x3; \
	vpor	x1, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x4, x4; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	x1, x0, x0; \
	vpxor	tp, x3, x4; \
	vpor	x0, x2, x2; \
	vpxor	x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpand	x2, x4, x4; \
	vpxor	tp, x2, x2; \
	vpxor	x0, x4, x4; \
	vpor	x1, tp, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor	x0, x1, tp; \
	vpxor	tp, x2, x2; \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpand	x4, tp, x1; \
	vpor	x3, x4, x4; \
	vpxor	x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand	x3, x0, x0; \
	vpxor	x3, x1, x1; \
	vpxor	x2, x3, x3; \
	vpxor	x1, x0, x0; \
	vpand	x4, x2, x2; \
	vpxor	x2, x1, x1; \
	vpand	x0, x2, x2; \
	vpxor	x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, tp; \
	vpxor	x0, x2, x2; \
	vpand	x3, x0, x0; \
	vpor	x3, tp, tp; \
	vpxor	RNOT, x1, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x1, tp; \
	vpxor	RNOT, x0, x0; \
	vpand	x2, tp, x1; \
	vpxor	x3, x1, x1; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x3; \
	vpor	x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand	x0, x2, x2; \
	vpxor	x4, x0, x0; \
	vpxor	x3, x4, x4; \
	vpand	x0, x3, x3; \
	vpxor	x1, x4, x4; \
	vpxor	x4, x2, x2; \
	vpxor	x1, x3, x3; \
	vpor	x0, x4, x4; \
	vpxor	x1, x4, x4;

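/*
 * Inverse S-boxes SI0..SI7, used by the decryption path below; same
 * conventions as the forward S-boxes above.
 */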
#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpor	x1, x3, tp; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpxor	tp, x2, x2; \
	vpxor	x0, tp, x3; \
	vpand	x1, x0, x0; \
	vpxor	x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x3, x2, x2; \
	vpxor	x3, x1, x1; \
	vpand	x0, x3, x3; \
	vpxor	x0, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, tp; \
	vpxor	RNOT, x2, x2; \
	vpor	x1, x0, x4; \
	vpxor	x3, x4, x4; \
	vpand	x1, x3, x3; \
	vpxor	x2, x1, x1; \
	vpand	x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor	x1, x4, x4; \
	vpor	x3, x1, x1; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x2, x2; \
	vpor	x4, tp, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x0, x1, x1; \
	vpxor	x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpxor	RNOT, x3, tp; \
	vpor	x2, tp, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x4; \
	vpxor	x1, tp, x3; \
	vpor	x2, x1, x1; \
	vpxor	x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x1, x1; \
	vpor	x3, x4, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpand	x2, x1, tp; \
	vpxor	x0, tp, tp; \
	vpor	x1, x0, x0; \
	vpxor	x3, x1, x4; \
	vpxor	x3, x0, x0; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x4, x4; \
	vpxor	x0, x3, x3; \
	vpxor	x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x2, x2; \
	vpand	x1, x0, tp; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	RNOT, x0, x4; \
	vpxor	tp, x1, x1; \
	vpxor	x2, tp, x0; \
	vpand	x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x0, x0; \
	vpand	x2, x3, x3; \
	vpxor	x3, x4, x4; \
	vpxor	x1, x3, x3; \
	vpand	x0, x1, x1; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor	x2, x1, tp; \
	vpxor	x1, x2, x2; \
	vpxor	x3, tp, tp; \
	vpand	x1, x3, x3; \
	vpxor	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x3, x3; \
	vpor	x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor	tp, x1, x4; \
	vpxor	x4, x2, x2; \
	vpand	x0, x4, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x3, tp, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x0, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor	x2, x0, x0; \
	vpand	x3, x0, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x2, tp, tp; \
	vpxor	x1, x3, x3; \
	vpor	x0, x2, x2; \
	vpxor	x3, x2, x2; \
	vpand	tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor	RNOT, tp, tp; \
	vpxor	x1, x3, x3; \
	vpand	x2, x1, x1; \
	vpxor	tp, x0, x4; \
	vpxor	x4, x3, x3; \
	vpxor	x2, x4, x4; \
	vpxor	x1, tp, x0; \
	vpxor	x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x2, x0, x0; \
	vpor	x3, x2, x2; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpor	tp, x1, x1; \
	vpxor	x0, x4, x4; \
	vpand	x2, x0, x0; \
	vpxor	x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand	x2, x1, x1; \
	vpxor	x2, tp, x3; \
	vpxor	x3, x4, x4; \
	vpand	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	x4, x1, x1; \
	vpxor	x4, x3, x3; \
	vpand	x0, x4, x4; \
	vpxor	x2, x4, x4;

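/*
 * Round-key handling and the Serpent linear transformation.
 *
 * get_key(i, j, t) broadcasts 32-bit word 4*i+j of the expanded key
 * (relative to CTX) into all four lanes of t.  K2() XORs round key i
 * into both four-block groups.  LK2() applies the linear transformation
 * (the fixed rotate/shift/XOR network) to both groups and then mixes in
 * round key i, interleaving the get_key() loads with the arithmetic.
 * KL2() is the inverse used for decryption: it first removes round key
 * i and then undoes the linear transformation.  Each vpslld/vpsrld/vpor
 * triple implements a 32-bit rotate, since AVX has no vector rotate
 * instruction.
 */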
#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

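/*
 * S() applies one S-box to both four-block groups.  SP() does the same
 * but interleaves the key loads for the following KL2() between the
 * S-box halves.  transpose_4x4() converts four consecutive 128-bit
 * blocks held in four registers into the word-sliced layout used by
 * the round macros (and back again); read_blocks()/write_blocks() are
 * aliases for it.
 */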
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

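/*
 * Eight-block encryption/decryption cores.  Encryption runs the 32
 * Serpent rounds as key mixing + S-box + linear transformation, with
 * the last round replacing the linear transformation by a final key
 * mixing (K2 with index 32).  Decryption applies the inverse S-boxes
 * and KL2() in reverse order.  No data is moved between registers:
 * the register argument order of each S()/SP()/LK2()/KL2() call simply
 * tracks where the previous step left each value.
 */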
.align 8
__serpent_enc_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk8_avx)

.align 8
__serpent_dec_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk8_avx)

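/*
 * Externally visible entry points, called from the C glue code.  Each
 * one handles eight 16-byte blocks (128 bytes) per call; the load/store
 * helpers (load_8way, store_cbc_8way, load_ctr_8way, load_xts_8way,
 * ...) come from glue_helper-asm-avx.S.  The CTR helper uses
 * .Lbswap128_mask above to byte-swap the 128-bit counter, and the XTS
 * helpers use .Lxts_gf128mul_and_shl1_mask to multiply the tweak by x
 * in GF(2^128).
 */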
ENTRY(serpent_ecb_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_enc_8way_avx)

ENTRY(serpent_ecb_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_dec_8way_avx)

ENTRY(serpent_cbc_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_cbc_dec_8way_avx)

ENTRY(serpent_ctr_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ctr_8way_avx)

ENTRY(serpent_xts_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_enc_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_enc_8way_avx)

ENTRY(serpent_xts_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_dec_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_dec_8way_avx)