/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

.data
.align 16

.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask_0:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

#define RNOT %ymm0
#define tp %ymm1

/*
 * State registers: the sixteen blocks are processed as two sets of eight.
 * After read_blocks(), each ymm register holds one 32-bit Serpent state word
 * from eight different blocks; the *1 registers carry the first set of eight
 * blocks and the *2 registers the second set.
 */
#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

/*
 * Serpent S-boxes S0..S7 and their inverses SI0..SI7, expressed as sequences
 * of boolean vector operations on whole registers.  Each S-box is split into
 * two halves (*_1 and *_2) so that the SP macro can interleave round-key
 * loads with the S-box computation.
 */
#define S0_1(x0, x1, x2, x3, x4) \
        vpor x0, x3, tp; \
        vpxor x3, x0, x0; \
        vpxor x2, x3, x4; \
        vpxor RNOT, x4, x4; \
        vpxor x1, tp, x3; \
        vpand x0, x1, x1; \
        vpxor x4, x1, x1; \
        vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
        vpxor x3, x0, x0; \
        vpor x0, x4, x4; \
        vpxor x2, x0, x0; \
        vpand x1, x2, x2; \
        vpxor x2, x3, x3; \
        vpxor RNOT, x1, x1; \
        vpxor x4, x2, x2; \
        vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
        vpxor x0, x1, tp; \
        vpxor x3, x0, x0; \
        vpxor RNOT, x3, x3; \
        vpand tp, x1, x4; \
        vpor tp, x0, x0; \
        vpxor x2, x3, x3; \
        vpxor x3, x0, x0; \
        vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
        vpxor x4, x3, x3; \
        vpor x4, x1, x1; \
        vpxor x2, x4, x4; \
        vpand x0, x2, x2; \
        vpxor x1, x2, x2; \
        vpor x0, x1, x1; \
        vpxor RNOT, x0, x0; \
        vpxor x2, x0, x0; \
        vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
        vpxor RNOT, x3, x3; \
        vpxor x0, x1, x1; \
        vpand x2, x0, tp; \
        vpxor x3, tp, tp; \
        vpor x0, x3, x3; \
        vpxor x1, x2, x2; \
        vpxor x1, x3, x3; \
        vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
        vpxor x2, tp, tp; \
        vpand x3, x2, x2; \
        vpor x1, x3, x3; \
        vpxor RNOT, tp, tp; \
        vpxor tp, x3, x3; \
        vpxor tp, x0, x4; \
        vpxor x2, tp, x0; \
        vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
        vpxor x3, x1, tp; \
        vpor x0, x3, x3; \
        vpand x0, x1, x4; \
        vpxor x2, x0, x0; \
        vpxor tp, x2, x2; \
        vpand x3, tp, x1; \
        vpxor x3, x2, x2; \
        vpor x4, x0, x0; \
        vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
        vpxor x0, x1, x1; \
        vpand x3, x0, x0; \
        vpand x4, x3, x3; \
        vpxor x2, x3, x3; \
        vpor x1, x4, x4; \
        vpand x1, x2, x2; \
        vpxor x3, x4, x4; \
        vpxor x3, x0, x0; \
        vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
        vpand x0, x3, tp; \
        vpxor x3, x0, x0; \
        vpxor x2, tp, tp; \
        vpor x3, x2, x2; \
        vpxor x1, x0, x0; \
        vpxor tp, x3, x4; \
        vpor x0, x2, x2; \
        vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
        vpand x0, x1, x1; \
        vpxor x4, x1, x1; \
        vpand x2, x4, x4; \
        vpxor tp, x2, x2; \
        vpxor x0, x4, x4; \
        vpor x1, tp, x3; \
        vpxor RNOT, x1, x1; \
        vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
        vpor x0, x1, tp; \
        vpxor tp, x2, x2; \
        vpxor RNOT, x3, x3; \
        vpxor x0, x1, x4; \
        vpxor x2, x0, x0; \
        vpand x4, tp, x1; \
        vpor x3, x4, x4; \
        vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
        vpand x3, x0, x0; \
        vpxor x3, x1, x1; \
        vpxor x2, x3, x3; \
        vpxor x1, x0, x0; \
        vpand x4, x2, x2; \
        vpxor x2, x1, x1; \
        vpand x0, x2, x2; \
        vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
        vpxor x0, x3, x3; \
        vpxor x2, x1, tp; \
        vpxor x0, x2, x2; \
        vpand x3, x0, x0; \
        vpor x3, tp, tp; \
        vpxor RNOT, x1, x4; \
        vpxor tp, x0, x0; \
        vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
        vpxor x4, x3, x3; \
        vpxor x0, x4, x4; \
        vpand x0, x2, x2; \
        vpxor x1, x4, x4; \
        vpxor x3, x2, x2; \
        vpand x1, x3, x3; \
        vpxor x0, x3, x3; \
        vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
        vpxor RNOT, x1, tp; \
        vpxor RNOT, x0, x0; \
        vpand x2, tp, x1; \
        vpxor x3, x1, x1; \
        vpor tp, x3, x3; \
        vpxor x2, tp, x4; \
        vpxor x3, x2, x2; \
        vpxor x0, x3, x3; \
        vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
        vpand x0, x2, x2; \
        vpxor x4, x0, x0; \
        vpxor x3, x4, x4; \
        vpand x0, x3, x3; \
        vpxor x1, x4, x4; \
        vpxor x4, x2, x2; \
        vpxor x1, x3, x3; \
        vpor x0, x4, x4; \
        vpxor x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
        vpxor x0, x1, x1; \
        vpor x1, x3, tp; \
        vpxor x1, x3, x4; \
        vpxor RNOT, x0, x0; \
        vpxor tp, x2, x2; \
        vpxor x0, tp, x3; \
        vpand x1, x0, x0; \
        vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
        vpand x3, x2, x2; \
        vpxor x4, x3, x3; \
        vpxor x3, x2, x2; \
        vpxor x3, x1, x1; \
        vpand x0, x3, x3; \
        vpxor x0, x1, x1; \
        vpxor x2, x0, x0; \
        vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
        vpxor x3, x1, x1; \
        vpxor x2, x0, tp; \
        vpxor RNOT, x2, x2; \
        vpor x1, x0, x4; \
        vpxor x3, x4, x4; \
        vpand x1, x3, x3; \
        vpxor x2, x1, x1; \
        vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
        vpxor x1, x4, x4; \
        vpor x3, x1, x1; \
        vpxor tp, x3, x3; \
        vpxor tp, x2, x2; \
        vpor x4, tp, x0; \
        vpxor x4, x2, x2; \
        vpxor x0, x1, x1; \
        vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
        vpxor x1, x2, x2; \
        vpxor RNOT, x3, tp; \
        vpor x2, tp, tp; \
        vpxor x3, x2, x2; \
        vpxor x0, x3, x4; \
        vpxor x1, tp, x3; \
        vpor x2, x1, x1; \
        vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
        vpxor x4, x1, x1; \
        vpor x3, x4, x4; \
        vpxor x3, x2, x2; \
        vpxor x2, x4, x4; \
        vpand x1, x2, x2; \
        vpxor x3, x2, x2; \
        vpxor x4, x3, x3; \
        vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
        vpxor x1, x2, x2; \
        vpand x2, x1, tp; \
        vpxor x0, tp, tp; \
        vpor x1, x0, x0; \
        vpxor x3, x1, x4; \
        vpxor x3, x0, x0; \
        vpor tp, x3, x3; \
        vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
        vpxor x3, x1, x1; \
        vpxor x2, x0, x0; \
        vpxor x3, x2, x2; \
        vpand x1, x3, x3; \
        vpxor x0, x1, x1; \
        vpand x2, x0, x0; \
        vpxor x3, x4, x4; \
        vpxor x0, x3, x3; \
        vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
        vpxor x3, x2, x2; \
        vpand x1, x0, tp; \
        vpxor x2, tp, tp; \
        vpor x3, x2, x2; \
        vpxor RNOT, x0, x4; \
        vpxor tp, x1, x1; \
        vpxor x2, tp, x0; \
        vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
        vpxor x0, x2, x2; \
        vpor x4, x0, x0; \
        vpxor x3, x0, x0; \
        vpand x2, x3, x3; \
        vpxor x3, x4, x4; \
        vpxor x1, x3, x3; \
        vpand x0, x1, x1; \
        vpxor x1, x4, x4; \
        vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
        vpor x2, x1, tp; \
        vpxor x1, x2, x2; \
        vpxor x3, tp, tp; \
        vpand x1, x3, x3; \
        vpxor x3, x2, x2; \
        vpor x0, x3, x3; \
        vpxor RNOT, x0, x0; \
        vpxor x2, x3, x3; \
        vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
        vpxor tp, x1, x4; \
        vpxor x4, x2, x2; \
        vpand x0, x4, x4; \
        vpxor tp, x0, x0; \
        vpxor x3, tp, x1; \
        vpand x2, x0, x0; \
        vpxor x3, x2, x2; \
        vpxor x2, x0, x0; \
        vpxor x4, x2, x2; \
        vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
        vpxor x2, x0, x0; \
        vpand x3, x0, tp; \
        vpxor x3, x2, x2; \
        vpxor x2, tp, tp; \
        vpxor x1, x3, x3; \
        vpor x0, x2, x2; \
        vpxor x3, x2, x2; \
        vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
        vpxor RNOT, tp, tp; \
        vpxor x1, x3, x3; \
        vpand x2, x1, x1; \
        vpxor tp, x0, x4; \
        vpxor x4, x3, x3; \
        vpxor x2, x4, x4; \
        vpxor x1, tp, x0; \
        vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
        vpand x0, x3, tp; \
        vpxor x2, x0, x0; \
        vpor x3, x2, x2; \
        vpxor x1, x3, x4; \
        vpxor RNOT, x0, x0; \
        vpor tp, x1, x1; \
        vpxor x0, x4, x4; \
        vpand x2, x0, x0; \
        vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
        vpand x2, x1, x1; \
        vpxor x2, tp, x3; \
        vpxor x3, x4, x4; \
        vpand x3, x2, x2; \
        vpor x0, x3, x3; \
        vpxor x4, x1, x1; \
        vpxor x4, x3, x3; \
        vpand x0, x4, x4; \
        vpxor x2, x4, x4;

/* Broadcast 32-bit word j of round key i from the expanded key in CTX. */
#define get_key(i,j,t) \
        vpbroadcastd (4*(i)+(j))*4(CTX), t;

/*
 * Round-key mixing and the Serpent linear transformation, applied to both
 * eight-block register sets:
 *   K2  - XOR in round key i
 *   LK2 - linear transformation, then XOR in round key i (encryption)
 *   KL2 - XOR in round key i, then inverse linear transformation (decryption)
 */
#define K2(x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        get_key(i, 1, RK1); \
        get_key(i, 2, RK2); \
        get_key(i, 3, RK3); \
        vpxor RK0, x0 ## 1, x0 ## 1; \
        vpxor RK1, x1 ## 1, x1 ## 1; \
        vpxor RK2, x2 ## 1, x2 ## 1; \
        vpxor RK3, x3 ## 1, x3 ## 1; \
        vpxor RK0, x0 ## 2, x0 ## 2; \
        vpxor RK1, x1 ## 2, x1 ## 2; \
        vpxor RK2, x2 ## 2, x2 ## 2; \
        vpxor RK3, x3 ## 2, x3 ## 2;

#define LK2(x0, x1, x2, x3, x4, i) \
        vpslld $13, x0 ## 1, x4 ## 1; \
        vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
        vpor x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3, x2 ## 1, x4 ## 1; \
        vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
        vpor x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $13, x0 ## 2, x4 ## 2; \
        vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
        vpor x4 ## 2, x0 ## 2, x0 ## 2; \
        vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $3, x2 ## 2, x4 ## 2; \
        vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
        vpor x4 ## 2, x2 ## 2, x2 ## 2; \
        vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $1, x1 ## 1, x4 ## 1; \
        vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
        vpor x4 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3, x0 ## 1, x4 ## 1; \
        vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
        vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
        get_key(i, 1, RK1); \
        vpslld $1, x1 ## 2, x4 ## 2; \
        vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
        vpor x4 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $3, x0 ## 2, x4 ## 2; \
        vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
        vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
        get_key(i, 3, RK3); \
        vpslld $7, x3 ## 1, x4 ## 1; \
        vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
        vpor x4 ## 1, x3 ## 1, x3 ## 1; \
        vpslld $7, x1 ## 1, x4 ## 1; \
        vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
        get_key(i, 0, RK0); \
        vpslld $7, x3 ## 2, x4 ## 2; \
        vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
        vpor x4 ## 2, x3 ## 2, x3 ## 2; \
        vpslld $7, x1 ## 2, x4 ## 2; \
        vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
        vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
        vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
        get_key(i, 2, RK2); \
        vpxor RK1, x1 ## 1, x1 ## 1; \
        vpxor RK3, x3 ## 1, x3 ## 1; \
        vpslld $5, x0 ## 1, x4 ## 1; \
        vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
        vpor x4 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $22, x2 ## 1, x4 ## 1; \
        vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
        vpor x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor RK0, x0 ## 1, x0 ## 1; \
        vpxor RK2, x2 ## 1, x2 ## 1; \
        vpxor RK1, x1 ## 2, x1 ## 2; \
        vpxor RK3, x3 ## 2, x3 ## 2; \
        vpslld $5, x0 ## 2, x4 ## 2; \
        vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
        vpor x4 ## 2, x0 ## 2, x0 ## 2; \
        vpslld $22, x2 ## 2, x4 ## 2; \
        vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
        vpor x4 ## 2, x2 ## 2, x2 ## 2; \
        vpxor RK0, x0 ## 2, x0 ## 2; \
        vpxor RK2, x2 ## 2, x2 ## 2;

#define KL2(x0, x1, x2, x3, x4, i) \
        vpxor RK0, x0 ## 1, x0 ## 1; \
        vpxor RK2, x2 ## 1, x2 ## 1; \
        vpsrld $5, x0 ## 1, x4 ## 1; \
        vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
        vpor x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor RK3, x3 ## 1, x3 ## 1; \
        vpxor RK1, x1 ## 1, x1 ## 1; \
        vpsrld $22, x2 ## 1, x4 ## 1; \
        vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
        vpor x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor RK0, x0 ## 2, x0 ## 2; \
        vpxor RK2, x2 ## 2, x2 ## 2; \
        vpsrld $5, x0 ## 2, x4 ## 2; \
        vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
        vpor x4 ## 2, x0 ## 2, x0 ## 2; \
        vpxor RK3, x3 ## 2, x3 ## 2; \
        vpxor RK1, x1 ## 2, x1 ## 2; \
        vpsrld $22, x2 ## 2, x4 ## 2; \
        vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
        vpor x4 ## 2, x2 ## 2, x2 ## 2; \
        vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $7, x1 ## 1, x4 ## 1; \
        vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $1, x1 ## 1, x4 ## 1; \
        vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
        vpor x4 ## 1, x1 ## 1, x1 ## 1; \
        vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
        vpslld $7, x1 ## 2, x4 ## 2; \
        vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
        vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
        vpsrld $1, x1 ## 2, x4 ## 2; \
        vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
        vpor x4 ## 2, x1 ## 2, x1 ## 2; \
        vpsrld $7, x3 ## 1, x4 ## 1; \
        vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
        vpor x4 ## 1, x3 ## 1, x3 ## 1; \
        vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3, x0 ## 1, x4 ## 1; \
        vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $7, x3 ## 2, x4 ## 2; \
        vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
        vpor x4 ## 2, x3 ## 2, x3 ## 2; \
        vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $3, x0 ## 2, x4 ## 2; \
        vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $13, x0 ## 1, x4 ## 1; \
        vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
        vpor x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
        vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $3, x2 ## 1, x4 ## 1; \
        vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
        vpor x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $13, x0 ## 2, x4 ## 2; \
        vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
        vpor x4 ## 2, x0 ## 2, x0 ## 2; \
        vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
        vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $3, x2 ## 2, x4 ## 2; \
        vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
        vpor x4 ## 2, x2 ## 2, x2 ## 2;

/*
 * S applies an S-box to both register sets; SP additionally preloads round
 * key i for the following KL2.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 2, RK2); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 3, RK3); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

/*
 * Transpose a 4x4 matrix of 32-bit words within each 128-bit lane; used by
 * read_blocks/write_blocks to convert between per-block order and the
 * word-sliced layout the round macros operate on.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        vpunpckldq x1, x0, t0; \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x3; \
        \
        vpunpcklqdq t1, t0, x0; \
        vpunpckhqdq t1, t0, x1; \
        vpunpcklqdq x3, t2, x2; \
        vpunpckhqdq x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
__serpent_enc_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
         * output:
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        K2(RA, RB, RC, RD, RE, 0);
        S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
        S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
        S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
        S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
        S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
        S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
        S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
        S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
        S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
        S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
        S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
        S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
        S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
        S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
        S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
        S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
        S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
        S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
        S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
        S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
        S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
        S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
        S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
        S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
        S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
        S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
        S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
        S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
        S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
        S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
        S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
        S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);

        write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_enc_blk16)

.align 8
__serpent_dec_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         * output:
         *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        K2(RA, RB, RC, RD, RE, 32);
        SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
        SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
        SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
        SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
        SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
        SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
        SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
        SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
        SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
        SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
        SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
        SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
        SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
        SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
        SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
        SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
        SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
        SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
        SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
        SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
        SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
        SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
        SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
        SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
        SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
        SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
        SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
        SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
        SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
        SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
        SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
        S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);

        write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
        write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_dec_blk16)

ENTRY(serpent_ecb_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_enc_blk16;

        store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_enc_16way)

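/*
 * __serpent_dec_blk16 leaves the plaintext in RC1, RD1, RB1, RE1 and
 * RC2, RD2, RB2, RE2, which is why the decrypting entry points below store
 * from those registers rather than RA*, RB*, RC*, RD*.
 */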
ENTRY(serpent_ecb_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_dec_16way)

ENTRY(serpent_cbc_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
                        RK0);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_cbc_dec_16way)

ENTRY(serpent_ctr_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (little endian, 128bit)
         */
        FRAME_BEGIN

        vzeroupper;

        load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       tp);

        call __serpent_enc_blk16;

        store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ctr_16way)

ENTRY(serpent_xts_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_enc_blk16;

        store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_enc_16way)

ENTRY(serpent_xts_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_dec_blk16;

        store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_dec_16way)