/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.data
.align 16

.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
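
/*
 * .Lbswap128_mask is a vpshufb shuffle mask that reverses the byte order
 * of a 128-bit lane; serpent_ctr_8way_avx() passes it to load_ctr_8way()
 * so the big-endian block counter can be handled in little-endian
 * registers.
 *
 * .Lxts_gf128mul_and_shl1_mask is passed to load_xts_8way() for computing
 * consecutive XTS tweaks, i.e. multiplying the tweak by x (alpha) in
 * GF(2^128).  In the usual construction (implemented by the helper in
 * glue_helper-asm-avx.S, not shown here) the tweak is shifted left by one
 * bit per 64-bit half, the 1 at byte 8 injects the carry from the low half
 * into the high half, and the reduction constant 0x87 is xored into the
 * low byte when the top bit of the tweak falls off.
 */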

.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15


#define S0_1(x0, x1, x2, x3, x4) \
	vpor	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x4; \
	vpxor	RNOT, x4, x4; \
	vpxor	x1, tp, x3; \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpxor	x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x0, x0; \
	vpor	x0, x4, x4; \
	vpxor	x2, x0, x0; \
	vpand	x1, x2, x2; \
	vpxor	x2, x3, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x4, x2, x2; \
	vpxor	x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, tp; \
	vpxor	x3, x0, x0; \
	vpxor	RNOT, x3, x3; \
	vpand	tp, x1, x4; \
	vpor	tp, x0, x0; \
	vpxor	x2, x3, x3; \
	vpxor	x3, x0, x0; \
	vpxor	x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpor	x4, x1, x1; \
	vpxor	x2, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x2, x2; \
	vpor	x0, x1, x1; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x0, x0; \
	vpxor	x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, tp; \
	vpxor	x3, tp, tp; \
	vpor	x0, x3, x3; \
	vpxor	x1, x2, x2; \
	vpxor	x1, x3, x3; \
	vpand	tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor	x2, tp, tp; \
	vpand	x3, x2, x2; \
	vpor	x1, x3, x3; \
	vpxor	RNOT, tp, tp; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x0, x4; \
	vpxor	x2, tp, x0; \
	vpor	x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, tp; \
	vpor	x0, x3, x3; \
	vpand	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpxor	tp, x2, x2; \
	vpand	x3, tp, x1; \
	vpxor	x3, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpand	x3, x0, x0; \
	vpand	x4, x3, x3; \
	vpxor	x2, x3, x3; \
	vpor	x1, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x4, x4; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	x1, x0, x0; \
	vpxor	tp, x3, x4; \
	vpor	x0, x2, x2; \
	vpxor	x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpand	x2, x4, x4; \
	vpxor	tp, x2, x2; \
	vpxor	x0, x4, x4; \
	vpor	x1, tp, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor	x0, x1, tp; \
	vpxor	tp, x2, x2; \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpand	x4, tp, x1; \
	vpor	x3, x4, x4; \
	vpxor	x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand	x3, x0, x0; \
	vpxor	x3, x1, x1; \
	vpxor	x2, x3, x3; \
	vpxor	x1, x0, x0; \
	vpand	x4, x2, x2; \
	vpxor	x2, x1, x1; \
	vpand	x0, x2, x2; \
	vpxor	x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, tp; \
	vpxor	x0, x2, x2; \
	vpand	x3, x0, x0; \
	vpor	x3, tp, tp; \
	vpxor	RNOT, x1, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x1, tp; \
	vpxor	RNOT, x0, x0; \
	vpand	x2, tp, x1; \
	vpxor	x3, x1, x1; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x3; \
	vpor	x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand	x0, x2, x2; \
	vpxor	x4, x0, x0; \
	vpxor	x3, x4, x4; \
	vpand	x0, x3, x3; \
	vpxor	x1, x4, x4; \
	vpxor	x4, x2, x2; \
	vpxor	x1, x3, x3; \
	vpor	x0, x4, x4; \
	vpxor	x1, x4, x4;
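
/*
 * The S0_1/S0_2 .. S7_1/S7_2 macros above implement the eight Serpent
 * S-boxes as Boolean-circuit ("bitsliced") instruction sequences over whole
 * 128-bit registers, so a single expansion applies the S-box to the same
 * word of four blocks at once.  tp is a scratch register and RNOT (set to
 * all ones by the block functions below) turns vpxor into a bitwise NOT.
 * Each S-box is split into two halves so that the SP() macro further down
 * can interleave round-key loads with the S-box computation.
 *
 * The SI0_* .. SI7_* macros below are the corresponding inverse S-boxes
 * used by the decryption path.
 */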

#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpor	x1, x3, tp; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpxor	tp, x2, x2; \
	vpxor	x0, tp, x3; \
	vpand	x1, x0, x0; \
	vpxor	x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x3, x2, x2; \
	vpxor	x3, x1, x1; \
	vpand	x0, x3, x3; \
	vpxor	x0, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, tp; \
	vpxor	RNOT, x2, x2; \
	vpor	x1, x0, x4; \
	vpxor	x3, x4, x4; \
	vpand	x1, x3, x3; \
	vpxor	x2, x1, x1; \
	vpand	x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor	x1, x4, x4; \
	vpor	x3, x1, x1; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x2, x2; \
	vpor	x4, tp, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x0, x1, x1; \
	vpxor	x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpxor	RNOT, x3, tp; \
	vpor	x2, tp, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x4; \
	vpxor	x1, tp, x3; \
	vpor	x2, x1, x1; \
	vpxor	x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x1, x1; \
	vpor	x3, x4, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpand	x2, x1, tp; \
	vpxor	x0, tp, tp; \
	vpor	x1, x0, x0; \
	vpxor	x3, x1, x4; \
	vpxor	x3, x0, x0; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x4, x4; \
	vpxor	x0, x3, x3; \
	vpxor	x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x2, x2; \
	vpand	x1, x0, tp; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	RNOT, x0, x4; \
	vpxor	tp, x1, x1; \
	vpxor	x2, tp, x0; \
	vpand	x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x0, x0; \
	vpand	x2, x3, x3; \
	vpxor	x3, x4, x4; \
	vpxor	x1, x3, x3; \
	vpand	x0, x1, x1; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor	x2, x1, tp; \
	vpxor	x1, x2, x2; \
	vpxor	x3, tp, tp; \
	vpand	x1, x3, x3; \
	vpxor	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x3, x3; \
	vpor	x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor	tp, x1, x4; \
	vpxor	x4, x2, x2; \
	vpand	x0, x4, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x3, tp, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x0, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor	x2, x0, x0; \
	vpand	x3, x0, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x2, tp, tp; \
	vpxor	x1, x3, x3; \
	vpor	x0, x2, x2; \
	vpxor	x3, x2, x2; \
	vpand	tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor	RNOT, tp, tp; \
	vpxor	x1, x3, x3; \
	vpand	x2, x1, x1; \
	vpxor	tp, x0, x4; \
	vpxor	x4, x3, x3; \
	vpxor	x2, x4, x4; \
	vpxor	x1, tp, x0; \
	vpxor	x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x2, x0, x0; \
	vpor	x3, x2, x2; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpor	tp, x1, x1; \
	vpxor	x0, x4, x4; \
	vpand	x2, x0, x0; \
	vpxor	x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand	x2, x1, x1; \
	vpxor	x2, tp, x3; \
	vpxor	x3, x4, x4; \
	vpand	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	x4, x1, x1; \
	vpxor	x4, x3, x3; \
	vpand	x0, x4, x4; \
	vpxor	x2, x4, x4;

#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor	RK0, x0 ## 1, x0 ## 1; \
	vpxor	RK1, x1 ## 1, x1 ## 1; \
	vpxor	RK2, x2 ## 1, x2 ## 1; \
	vpxor	RK3, x3 ## 1, x3 ## 1; \
	vpxor	RK0, x0 ## 2, x0 ## 2; \
	vpxor	RK1, x1 ## 2, x1 ## 2; \
	vpxor	RK2, x2 ## 2, x2 ## 2; \
	vpxor	RK3, x3 ## 2, x3 ## 2;

#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld	$13, x0 ## 1, x4 ## 1; \
	vpsrld	$(32 - 13), x0 ## 1, x0 ## 1; \
	vpor	x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld	$3, x2 ## 1, x4 ## 1; \
	vpsrld	$(32 - 3), x2 ## 1, x2 ## 1; \
	vpor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld	$13, x0 ## 2, x4 ## 2; \
	vpsrld	$(32 - 13), x0 ## 2, x0 ## 2; \
	vpor	x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld	$3, x2 ## 2, x4 ## 2; \
	vpsrld	$(32 - 3), x2 ## 2, x2 ## 2; \
	vpor	x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld	$1, x1 ## 1, x4 ## 1; \
	vpsrld	$(32 - 1), x1 ## 1, x1 ## 1; \
	vpor	x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld	$3, x0 ## 1, x4 ## 1; \
	vpxor	x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor	x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld	$1, x1 ## 2, x4 ## 2; \
	vpsrld	$(32 - 1), x1 ## 2, x1 ## 2; \
	vpor	x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld	$3, x0 ## 2, x4 ## 2; \
	vpxor	x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor	x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld	$7, x3 ## 1, x4 ## 1; \
	vpsrld	$(32 - 7), x3 ## 1, x3 ## 1; \
	vpor	x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld	$7, x1 ## 1, x4 ## 1; \
	vpxor	x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld	$7, x3 ## 2, x4 ## 2; \
	vpsrld	$(32 - 7), x3 ## 2, x3 ## 2; \
	vpor	x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld	$7, x1 ## 2, x4 ## 2; \
	vpxor	x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor	RK1, x1 ## 1, x1 ## 1; \
	vpxor	RK3, x3 ## 1, x3 ## 1; \
	vpslld	$5, x0 ## 1, x4 ## 1; \
	vpsrld	$(32 - 5), x0 ## 1, x0 ## 1; \
	vpor	x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld	$22, x2 ## 1, x4 ## 1; \
	vpsrld	$(32 - 22), x2 ## 1, x2 ## 1; \
	vpor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	RK0, x0 ## 1, x0 ## 1; \
	vpxor	RK2, x2 ## 1, x2 ## 1; \
	vpxor	RK1, x1 ## 2, x1 ## 2; \
	vpxor	RK3, x3 ## 2, x3 ## 2; \
	vpslld	$5, x0 ## 2, x4 ## 2; \
	vpsrld	$(32 - 5), x0 ## 2, x0 ## 2; \
	vpor	x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld	$22, x2 ## 2, x4 ## 2; \
	vpsrld	$(32 - 22), x2 ## 2, x2 ## 2; \
	vpor	x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	RK0, x0 ## 2, x0 ## 2; \
	vpxor	RK2, x2 ## 2, x2 ## 2;

#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor	RK0, x0 ## 1, x0 ## 1; \
	vpxor	RK2, x2 ## 1, x2 ## 1; \
	vpsrld	$5, x0 ## 1, x4 ## 1; \
	vpslld	$(32 - 5), x0 ## 1, x0 ## 1; \
	vpor	x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	RK3, x3 ## 1, x3 ## 1; \
	vpxor	RK1, x1 ## 1, x1 ## 1; \
	vpsrld	$22, x2 ## 1, x4 ## 1; \
	vpslld	$(32 - 22), x2 ## 1, x2 ## 1; \
	vpor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	RK0, x0 ## 2, x0 ## 2; \
	vpxor	RK2, x2 ## 2, x2 ## 2; \
	vpsrld	$5, x0 ## 2, x4 ## 2; \
	vpslld	$(32 - 5), x0 ## 2, x0 ## 2; \
	vpor	x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	RK3, x3 ## 2, x3 ## 2; \
	vpxor	RK1, x1 ## 2, x1 ## 2; \
	vpsrld	$22, x2 ## 2, x4 ## 2; \
	vpslld	$(32 - 22), x2 ## 2, x2 ## 2; \
	vpor	x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld	$7, x1 ## 1, x4 ## 1; \
	vpxor	x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld	$1, x1 ## 1, x4 ## 1; \
	vpslld	$(32 - 1), x1 ## 1, x1 ## 1; \
	vpor	x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor	x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld	$7, x1 ## 2, x4 ## 2; \
	vpxor	x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld	$1, x1 ## 2, x4 ## 2; \
	vpslld	$(32 - 1), x1 ## 2, x1 ## 2; \
	vpor	x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld	$7, x3 ## 1, x4 ## 1; \
	vpslld	$(32 - 7), x3 ## 1, x3 ## 1; \
	vpor	x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor	x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld	$3, x0 ## 1, x4 ## 1; \
	vpxor	x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld	$7, x3 ## 2, x4 ## 2; \
	vpslld	$(32 - 7), x3 ## 2, x3 ## 2; \
	vpor	x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor	x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld	$3, x0 ## 2, x4 ## 2; \
	vpxor	x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld	$13, x0 ## 1, x4 ## 1; \
	vpslld	$(32 - 13), x0 ## 1, x0 ## 1; \
	vpor	x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor	x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld	$3, x2 ## 1, x4 ## 1; \
	vpslld	$(32 - 3), x2 ## 1, x2 ## 1; \
	vpor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld	$13, x0 ## 2, x4 ## 2; \
	vpslld	$(32 - 13), x0 ## 2, x0 ## 2; \
	vpor	x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor	x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld	$3, x2 ## 2, x4 ## 2; \
	vpslld	$(32 - 3), x2 ## 2, x2 ## 2; \
	vpor	x4 ## 2, x2 ## 2, x2 ## 2;
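
/*
 * K2() xors round key i into both four-block groups.  LK2() applies the
 * Serpent linear transformation and then xors in round key i; KL2() is
 * its counterpart for decryption, xoring the round key out (loaded by the
 * preceding SP() call) and undoing the linear transformation.  As a
 * scalar sketch (not part of the original source), the forward
 * transformation per 32-bit word is:
 *
 *	x0 = rol32(x0, 13);   x2 = rol32(x2, 3);
 *	x1 ^= x0 ^ x2;        x3 ^= x2 ^ (x0 << 3);
 *	x1 = rol32(x1, 1);    x3 = rol32(x3, 7);
 *	x0 ^= x1 ^ x3;        x2 ^= x3 ^ (x1 << 7);
 *	x0 = rol32(x0, 5);    x2 = rol32(x2, 22);
 *
 * Each vpslld/vpsrld/vpor triple above builds one of these rotates, since
 * AVX provides no vector rotate instruction.
 */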

#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq	x1, x0, t0; \
	vpunpckhdq	x1, x0, t2; \
	vpunpckldq	x3, x2, t1; \
	vpunpckhdq	x3, x2, x3; \
	\
	vpunpcklqdq	t1, t0, x0; \
	vpunpckhqdq	t1, t0, x1; \
	vpunpcklqdq	x3, t2, x2; \
	vpunpckhqdq	x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

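/*
 * transpose_4x4() is a 4x4 32-bit matrix transpose built from punpck
 * instructions.  read_blocks() uses it to turn four loaded blocks (one per
 * register) into the sliced layout the S-box and linear-transformation
 * macros above operate on, i.e. each register then holds the same word
 * position of all four blocks.  Since transposing twice restores the
 * original layout, write_blocks() is the identical macro and converts the
 * state back to one-block-per-register form before it is stored.
 */
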
.align 8
__serpent_enc_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

					K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk8_avx)

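/*
 * The decryption core below mirrors __serpent_enc_blk8_avx: it undoes the
 * 32 rounds in reverse order using the inverse S-boxes SI7 .. SI0 and KL2.
 * The register renaming through the rounds leaves the decrypted words in
 * RC1, RD1, RB1, RE1 and RC2, RD2, RB2, RE2, which is why the callers
 * further below store from that register order.
 */
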
.align 8
__serpent_dec_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

						K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk8_avx)

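/*
 * The ENTRY points below are called from the C glue code and follow the
 * SysV x86_64 calling convention: %rdi = context (expanded key), %rsi =
 * destination, %rdx = source, and %rcx = IV/tweak for the CTR and XTS
 * variants.  Each call processes eight 16-byte blocks (128 bytes).  The
 * load_8way/store_8way, load_ctr_8way/store_ctr_8way and
 * load_xts_8way/store_xts_8way helpers used here come from
 * glue_helper-asm-avx.S.
 */
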
ENTRY(serpent_ecb_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_enc_8way_avx)

ENTRY(serpent_ecb_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_dec_8way_avx)

ENTRY(serpent_cbc_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_cbc_dec_8way_avx)

ENTRY(serpent_ctr_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ctr_8way_avx)

ENTRY(serpent_xts_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_enc_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_enc_8way_avx)

ENTRY(serpent_xts_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_dec_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_dec_8way_avx)