/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

#define RNOT %ymm0
#define tp %ymm1

#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15
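
/*
 * Serpent S-boxes, computed as sequences of bitwise vector operations.
 * After the transpose in read_blocks(), each ymm register holds one
 * 32-bit state word from eight different blocks, so a single macro
 * invocation applies an S-box to eight blocks in parallel. S0..S7 are
 * the forward S-boxes, SI0..SI7 their inverses used for decryption.
 * NOT is expressed as XOR with RNOT, which the block functions preload
 * with all-ones (vpcmpeqd). Each S-box is split into _1/_2 halves so
 * that the SP macro can interleave round-key broadcasts between them.
 */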

#define S0_1(x0, x1, x2, x3, x4) \
	vpor	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x4; \
	vpxor	RNOT, x4, x4; \
	vpxor	x1, tp, x3; \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpxor	x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x0, x0; \
	vpor	x0, x4, x4; \
	vpxor	x2, x0, x0; \
	vpand	x1, x2, x2; \
	vpxor	x2, x3, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x4, x2, x2; \
	vpxor	x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, tp; \
	vpxor	x3, x0, x0; \
	vpxor	RNOT, x3, x3; \
	vpand	tp, x1, x4; \
	vpor	tp, x0, x0; \
	vpxor	x2, x3, x3; \
	vpxor	x3, x0, x0; \
	vpxor	x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpor	x4, x1, x1; \
	vpxor	x2, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x2, x2; \
	vpor	x0, x1, x1; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x0, x0; \
	vpxor	x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, tp; \
	vpxor	x3, tp, tp; \
	vpor	x0, x3, x3; \
	vpxor	x1, x2, x2; \
	vpxor	x1, x3, x3; \
	vpand	tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor	x2, tp, tp; \
	vpand	x3, x2, x2; \
	vpor	x1, x3, x3; \
	vpxor	RNOT, tp, tp; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x0, x4; \
	vpxor	x2, tp, x0; \
	vpor	x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, tp; \
	vpor	x0, x3, x3; \
	vpand	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpxor	tp, x2, x2; \
	vpand	x3, tp, x1; \
	vpxor	x3, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpand	x3, x0, x0; \
	vpand	x4, x3, x3; \
	vpxor	x2, x3, x3; \
	vpor	x1, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x4, x4; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	x1, x0, x0; \
	vpxor	tp, x3, x4; \
	vpor	x0, x2, x2; \
	vpxor	x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpand	x2, x4, x4; \
	vpxor	tp, x2, x2; \
	vpxor	x0, x4, x4; \
	vpor	x1, tp, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor	x0, x1, tp; \
	vpxor	tp, x2, x2; \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpand	x4, tp, x1; \
	vpor	x3, x4, x4; \
	vpxor	x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand	x3, x0, x0; \
	vpxor	x3, x1, x1; \
	vpxor	x2, x3, x3; \
	vpxor	x1, x0, x0; \
	vpand	x4, x2, x2; \
	vpxor	x2, x1, x1; \
	vpand	x0, x2, x2; \
	vpxor	x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, tp; \
	vpxor	x0, x2, x2; \
	vpand	x3, x0, x0; \
	vpor	x3, tp, tp; \
	vpxor	RNOT, x1, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x1, tp; \
	vpxor	RNOT, x0, x0; \
	vpand	x2, tp, x1; \
	vpxor	x3, x1, x1; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x3; \
	vpor	x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand	x0, x2, x2; \
	vpxor	x4, x0, x0; \
	vpxor	x3, x4, x4; \
	vpand	x0, x3, x3; \
	vpxor	x1, x4, x4; \
	vpxor	x4, x2, x2; \
	vpxor	x1, x3, x3; \
	vpor	x0, x4, x4; \
	vpxor	x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpor	x1, x3, tp; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpxor	tp, x2, x2; \
	vpxor	x0, tp, x3; \
	vpand	x1, x0, x0; \
	vpxor	x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x3, x2, x2; \
	vpxor	x3, x1, x1; \
	vpand	x0, x3, x3; \
	vpxor	x0, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, tp; \
	vpxor	RNOT, x2, x2; \
	vpor	x1, x0, x4; \
	vpxor	x3, x4, x4; \
	vpand	x1, x3, x3; \
	vpxor	x2, x1, x1; \
	vpand	x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor	x1, x4, x4; \
	vpor	x3, x1, x1; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x2, x2; \
	vpor	x4, tp, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x0, x1, x1; \
	vpxor	x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpxor	RNOT, x3, tp; \
	vpor	x2, tp, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x4; \
	vpxor	x1, tp, x3; \
	vpor	x2, x1, x1; \
	vpxor	x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x1, x1; \
	vpor	x3, x4, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpand	x2, x1, tp; \
	vpxor	x0, tp, tp; \
	vpor	x1, x0, x0; \
	vpxor	x3, x1, x4; \
	vpxor	x3, x0, x0; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x4, x4; \
	vpxor	x0, x3, x3; \
	vpxor	x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x2, x2; \
	vpand	x1, x0, tp; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	RNOT, x0, x4; \
	vpxor	tp, x1, x1; \
	vpxor	x2, tp, x0; \
	vpand	x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x0, x0; \
	vpand	x2, x3, x3; \
	vpxor	x3, x4, x4; \
	vpxor	x1, x3, x3; \
	vpand	x0, x1, x1; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor	x2, x1, tp; \
	vpxor	x1, x2, x2; \
	vpxor	x3, tp, tp; \
	vpand	x1, x3, x3; \
	vpxor	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x3, x3; \
	vpor	x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor	tp, x1, x4; \
	vpxor	x4, x2, x2; \
	vpand	x0, x4, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x3, tp, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x0, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor	x2, x0, x0; \
	vpand	x3, x0, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x2, tp, tp; \
	vpxor	x1, x3, x3; \
	vpor	x0, x2, x2; \
	vpxor	x3, x2, x2; \
	vpand	tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor	RNOT, tp, tp; \
	vpxor	x1, x3, x3; \
	vpand	x2, x1, x1; \
	vpxor	tp, x0, x4; \
	vpxor	x4, x3, x3; \
	vpxor	x2, x4, x4; \
	vpxor	x1, tp, x0; \
	vpxor	x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x2, x0, x0; \
	vpor	x3, x2, x2; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpor	tp, x1, x1; \
	vpxor	x0, x4, x4; \
	vpand	x2, x0, x0; \
	vpxor	x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand	x2, x1, x1; \
	vpxor	x2, tp, x3; \
	vpxor	x3, x4, x4; \
	vpand	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	x4, x1, x1; \
	vpxor	x4, x3, x3; \
	vpand	x0, x4, x4; \
	vpxor	x2, x4, x4;
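
/*
 * Round-key access: the expanded key at CTX is indexed as 33 subkeys
 * of four 32-bit words, so word j of subkey i lives at byte offset
 * (4*i + j)*4. get_key() broadcasts that word into every dword lane
 * of t. K2() XORs a whole subkey into both eight-block register sets;
 * the "## 1"/"## 2" token pasting selects the RA1../RA2.. set.
 */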

#define get_key(i,j,t) \
	vpbroadcastd (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor	RK0, x0 ## 1, x0 ## 1; \
	vpxor	RK1, x1 ## 1, x1 ## 1; \
	vpxor	RK2, x2 ## 1, x2 ## 1; \
	vpxor	RK3, x3 ## 1, x3 ## 1; \
	vpxor	RK0, x0 ## 2, x0 ## 2; \
	vpxor	RK1, x1 ## 2, x1 ## 2; \
	vpxor	RK2, x2 ## 2, x2 ## 2; \
	vpxor	RK3, x3 ## 2, x3 ## 2;
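
/*
 * LK2 is Serpent's linear transformation (left rotations by 13, 3, 1,
 * 7, 5 and 22, shifts by 3 and 7, plus XOR mixing) followed by XOR
 * with subkey i, applied to both register sets. The rotations are
 * built from vpslld/vpsrld/vpor pairs, as AVX2 has no dword rotate
 * instruction, and the get_key() broadcasts are interleaved with the
 * arithmetic. KL2 is the decryption counterpart: it first XORs in the
 * subkey (preloaded by the preceding SP invocation), then applies the
 * inverse linear transformation.
 */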

#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld	$13, x0 ## 1, x4 ## 1; \
	vpsrld	$(32 - 13), x0 ## 1, x0 ## 1; \
	vpor	x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld	$3, x2 ## 1, x4 ## 1; \
	vpsrld	$(32 - 3), x2 ## 1, x2 ## 1; \
	vpor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld	$13, x0 ## 2, x4 ## 2; \
	vpsrld	$(32 - 13), x0 ## 2, x0 ## 2; \
	vpor	x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld	$3, x2 ## 2, x4 ## 2; \
	vpsrld	$(32 - 3), x2 ## 2, x2 ## 2; \
	vpor	x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld	$1, x1 ## 1, x4 ## 1; \
	vpsrld	$(32 - 1), x1 ## 1, x1 ## 1; \
	vpor	x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld	$3, x0 ## 1, x4 ## 1; \
	vpxor	x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor	x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld	$1, x1 ## 2, x4 ## 2; \
	vpsrld	$(32 - 1), x1 ## 2, x1 ## 2; \
	vpor	x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld	$3, x0 ## 2, x4 ## 2; \
	vpxor	x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor	x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld	$7, x3 ## 1, x4 ## 1; \
	vpsrld	$(32 - 7), x3 ## 1, x3 ## 1; \
	vpor	x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld	$7, x1 ## 1, x4 ## 1; \
	vpxor	x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld	$7, x3 ## 2, x4 ## 2; \
	vpsrld	$(32 - 7), x3 ## 2, x3 ## 2; \
	vpor	x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld	$7, x1 ## 2, x4 ## 2; \
	vpxor	x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor	RK1, x1 ## 1, x1 ## 1; \
	vpxor	RK3, x3 ## 1, x3 ## 1; \
	vpslld	$5, x0 ## 1, x4 ## 1; \
	vpsrld	$(32 - 5), x0 ## 1, x0 ## 1; \
	vpor	x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld	$22, x2 ## 1, x4 ## 1; \
	vpsrld	$(32 - 22), x2 ## 1, x2 ## 1; \
	vpor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	RK0, x0 ## 1, x0 ## 1; \
	vpxor	RK2, x2 ## 1, x2 ## 1; \
	vpxor	RK1, x1 ## 2, x1 ## 2; \
	vpxor	RK3, x3 ## 2, x3 ## 2; \
	vpslld	$5, x0 ## 2, x4 ## 2; \
	vpsrld	$(32 - 5), x0 ## 2, x0 ## 2; \
	vpor	x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld	$22, x2 ## 2, x4 ## 2; \
	vpsrld	$(32 - 22), x2 ## 2, x2 ## 2; \
	vpor	x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	RK0, x0 ## 2, x0 ## 2; \
	vpxor	RK2, x2 ## 2, x2 ## 2;

#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor	RK0, x0 ## 1, x0 ## 1; \
	vpxor	RK2, x2 ## 1, x2 ## 1; \
	vpsrld	$5, x0 ## 1, x4 ## 1; \
	vpslld	$(32 - 5), x0 ## 1, x0 ## 1; \
	vpor	x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	RK3, x3 ## 1, x3 ## 1; \
	vpxor	RK1, x1 ## 1, x1 ## 1; \
	vpsrld	$22, x2 ## 1, x4 ## 1; \
	vpslld	$(32 - 22), x2 ## 1, x2 ## 1; \
	vpor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor	RK0, x0 ## 2, x0 ## 2; \
	vpxor	RK2, x2 ## 2, x2 ## 2; \
	vpsrld	$5, x0 ## 2, x4 ## 2; \
	vpslld	$(32 - 5), x0 ## 2, x0 ## 2; \
	vpor	x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	RK3, x3 ## 2, x3 ## 2; \
	vpxor	RK1, x1 ## 2, x1 ## 2; \
	vpsrld	$22, x2 ## 2, x4 ## 2; \
	vpslld	$(32 - 22), x2 ## 2, x2 ## 2; \
	vpor	x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor	x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld	$7, x1 ## 1, x4 ## 1; \
	vpxor	x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld	$1, x1 ## 1, x4 ## 1; \
	vpslld	$(32 - 1), x1 ## 1, x1 ## 1; \
	vpor	x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor	x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld	$7, x1 ## 2, x4 ## 2; \
	vpxor	x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld	$1, x1 ## 2, x4 ## 2; \
	vpslld	$(32 - 1), x1 ## 2, x1 ## 2; \
	vpor	x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld	$7, x3 ## 1, x4 ## 1; \
	vpslld	$(32 - 7), x3 ## 1, x3 ## 1; \
	vpor	x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor	x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld	$3, x0 ## 1, x4 ## 1; \
	vpxor	x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld	$7, x3 ## 2, x4 ## 2; \
	vpslld	$(32 - 7), x3 ## 2, x3 ## 2; \
	vpor	x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor	x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld	$3, x0 ## 2, x4 ## 2; \
	vpxor	x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld	$13, x0 ## 1, x4 ## 1; \
	vpslld	$(32 - 13), x0 ## 1, x0 ## 1; \
	vpor	x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor	x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor	x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld	$3, x2 ## 1, x4 ## 1; \
	vpslld	$(32 - 3), x2 ## 1, x2 ## 1; \
	vpor	x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld	$13, x0 ## 2, x4 ## 2; \
	vpslld	$(32 - 13), x0 ## 2, x0 ## 2; \
	vpor	x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor	x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor	x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld	$3, x2 ## 2, x4 ## 2; \
	vpslld	$(32 - 3), x2 ## 2, x2 ## 2; \
	vpor	x4 ## 2, x2 ## 2, x2 ## 2;
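
/*
 * S() applies one S-box to both eight-block sets. SP() does the same
 * but interleaves the get_key() broadcasts for subkey i between the
 * S-box halves; the RK0..RK3 it loads are consumed by the KL2 that
 * follows in the decryption round sequence. transpose_4x4 is a 4x4
 * 32-bit matrix transpose (performed independently in each 128-bit
 * lane), used by read_blocks()/write_blocks() to convert between
 * block order and the word-sliced order the round macros expect.
 */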

#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq	x1, x0, t0; \
	vpunpckhdq	x1, x0, t2; \
	vpunpckldq	x3, x2, t1; \
	vpunpckhdq	x3, x2, x3; \
	\
	vpunpcklqdq	t1, t0, x0; \
	vpunpckhqdq	t1, t0, x1; \
	vpunpcklqdq	x3, t2, x2; \
	vpunpckhqdq	x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
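
/*
 * 32-round Serpent on 16 blocks. Register renaming between rounds is
 * done by rotating macro arguments rather than by moving data: the
 * argument order passed to each LK2/SP invocation selects which
 * logical registers the next round sees. Per the Serpent
 * specification, the final round replaces the linear transformation
 * with an extra subkey addition (K2 with index 32).
 */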

.align 8
__serpent_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk16)

.align 8
__serpent_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

						 K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk16)

ENTRY(serpent_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk16;

	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_ecb_enc_16way)

ENTRY(serpent_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_ecb_dec_16way)

ENTRY(serpent_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
			RK0);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_cbc_dec_16way)

ENTRY(serpent_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	vzeroupper;

	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       tp);

	call __serpent_enc_blk16;

	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_ctr_16way)

ENTRY(serpent_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	vzeroupper;

	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call __serpent_enc_blk16;

	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_xts_enc_16way)

ENTRY(serpent_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	vzeroupper;

	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call __serpent_dec_blk16;

	store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_xts_dec_16way)