/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/*
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
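
/*
 * filter_8bit() above evaluates an 8-bit -> 8-bit lookup (the affine
 * pre-/post-SubByte filters below) on every byte of x: the low nibble
 * indexes the 16-entry vpshufb table in lo_t, the high nibble (shifted
 * down by 4) indexes the table in hi_t, and the two results are XORed
 * together.  mask4bit must hold 0x0f in every byte (.L0f0f0f0f,
 * broadcast by the caller).
 */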

/*
 * IN:
 *  x0..x7: byte-sliced AB state
 *  mem_cd: register pointer storing CD state
 *  key: index for key material
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;
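
/*
 * Note on the S-function above: the four Camellia s-boxes are computed with
 * AES-NI.  vaesenclast with an all-zero round key performs AES ShiftRows +
 * SubBytes, and the .Linv_shift_row shuffle applied beforehand undoes
 * ShiftRows, so only SubBytes remains.  The pre-/post-SubByte filter tables
 * (.Lpre_tf_*, .Lpost_tf_*) map between the Camellia and AES s-box
 * representations and fold in the rotations that distinguish sbox2/3/4 from
 * sbox1.  The vmovq/vpsrldq/vpshufb sequence at the end broadcasts each byte
 * of the 64-bit round subkey into its own vector so it can be XORed into the
 * byte-sliced state.
 */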

/*
 * Size optimization... with inlined roundsm16, the binary would be over 5
 * times larger and only 0.5% faster (on Sandy Bridge).
 */
.align 8
roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	ret;
ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	ret;
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
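
/*
 * enc_rounds16()/dec_rounds16() above each run six Feistel rounds (three
 * calls to two_roundsm16).  The dir argument of two_roundsm16 selects the
 * key-table walk direction: +1 steps forward through the subkeys for
 * encryption, -1 steps backward for decryption, which is why dec_rounds16
 * starts at (i) + 7 and counts down.
 */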

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;
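
/*
 * How rol32_1_16() rotates the byte-sliced 32-bit words left by one bit:
 * for each slice, "vpcmpgtb vN, zero, tN" (signed compare 0 > vN) sets tN
 * to 0xff wherever the byte's MSB is set, vpabsb turns that into 0x01, and
 * "vpaddb vN, vN, vN" shifts each byte left by one.  OR-ing each slice's
 * carry byte into the neighbouring slice (v0 -> v1, v1 -> v2, v2 -> v3,
 * v3 -> v0) completes the 32-bit rotation.
 */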

/*
 * IN:
 *  l: byte-sliced AB state in memory
 *  r: byte-sliced CD state in memory
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */
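
/*
 * byteslice_16x16b() above transposes the 16 loaded blocks, viewed as a
 * 16x16 byte matrix, so that byte n of every block ends up in the same
 * xmm register.  It is built from 4x4 dword transposes (transpose_4x4)
 * combined with the .Lshufb_16x16b byte shuffle; as noted, the byte order
 * inside each output vector is left as-is.
 */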

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);
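
/*
 * Whitening note: inpack16_pre() loads the 64-bit input whitening key,
 * rearranges and replicates it with .Lpack_bswap, and XORs it into all 16
 * blocks while reading them from rio; outunpack16() applies the output
 * whitening key in the same way after de-byteslicing.
 */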

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
__camellia_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	FRAME_END
	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;
ENDPROC(__camellia_enc_blk16)
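
/*
 * Key-length dispatch: Camellia-128 runs 3 x 6 rounds with two FL/FL^-1
 * layers in between, so __camellia_enc_blk16 falls straight through to
 * .Lenc_done; 192/256-bit keys take the .Lenc_max32 path for one more
 * FL/FL^-1 layer and six extra rounds.  %r8d (24 or 32) selects the offset
 * of the post-whitening key used by outunpack16.
 */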

.align 8
__camellia_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	FRAME_END
	ret;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk16)
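
/*
 * Decryption runs the same schedule in reverse: the caller passes %r8d = 24
 * (128-bit key) or 32 (192/256-bit key) so that the larger keys take the
 * extra .Ldec_max32 round block and FL/FL^-1 layer first, dec_rounds16 walks
 * the subkeys backwards (dir = -1), and the kl subkey halves are handed to
 * fls16 in swapped order compared to encryption.
 */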

ENTRY(camellia_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_ecb_enc_16way)

ENTRY(camellia_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_ecb_dec_16way)

ENTRY(camellia_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_cbc_dec_16way)

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;
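
/*
 * inc_le128() above increments a 128-bit little-endian counter held in an
 * xmm register.  minus_one must hold {low: -1, high: 0}: vpsubq then adds 1
 * to the low qword, while vpcmpeqq + vpslldq detect the "low qword was all
 * ones" case and propagate the carry into the high qword.
 * camellia_ctr_16way below uses it to generate the 16 counter blocks.
 */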

ENTRY(camellia_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack16_pre: */
	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_ctr_16way)

#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

.align 8
camellia_xts_crypt_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: index for input whitening key
	 *	%r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
	 */
	FRAME_BEGIN

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;

	/* load IV */
	vmovdqu (%rcx), %xmm0;
	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);
	vmovdqu %xmm0, 0 * 16(%rsi);

	/* construct IVs */
	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 14 * 16(%rax);
	vmovdqu %xmm0, 1 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
	vmovdqu %xmm0, 2 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
	vmovdqu %xmm0, 3 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
	vmovdqu %xmm0, 4 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
	vmovdqu %xmm0, 5 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
	vmovdqu %xmm0, 6 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
	vmovdqu %xmm0, 7 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
	vmovdqu %xmm0, 8 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
	vmovdqu %xmm0, 9 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
	vmovdqu %xmm0, 10 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
	vmovdqu %xmm0, 11 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
	vmovdqu %xmm0, 12 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
	vmovdqu %xmm0, 13 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
	vmovdqu %xmm0, 14 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 0 * 16(%rax);
	vmovdqu %xmm0, 15 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vmovdqu %xmm0, (%rcx);

	/* inpack16_pre: */
	vmovq (key_table)(CTX, %r8, 8), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor 0 * 16(%rax), %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor %xmm13, %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	CALL_NOSPEC %r9;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_xts_crypt_16way)

ENTRY(camellia_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_enc_16way)

ENTRY(camellia_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_dec_16way)