/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0; \
        vpandn x, mask4bit, x; \
        vpsrld $4, x, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;
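/*
 * Rough C model of filter_8bit above (illustrative sketch, not kernel
 * code): each pre/post filter is an affine 8-bit transform, so it can be
 * evaluated as two 16-entry nibble lookups XORed together, which is
 * exactly what the vpand/vpsrld/vpshufb/vpxor sequence does for all 32
 * bytes of a ymm register at once:
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit(uint8_t x, const uint8_t lo_t[16],
 *				   const uint8_t hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 *
 * The split works because the transform is affine over GF(2): the
 * contributions of the low and high nibbles are independent up to XOR.
 */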
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: memory operand with the 64-bit round key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
                  t7, mem_cd, key) \
        /* \
         * S-function with AES subbytes \
         */ \
        vbroadcasti128 .Linv_shift_row, t4; \
        vpbroadcastd .L0f0f0f0f, t7; \
        vbroadcasti128 .Lpre_tf_lo_s1, t5; \
        vbroadcasti128 .Lpre_tf_hi_s1, t6; \
        vbroadcasti128 .Lpre_tf_lo_s4, t2; \
        vbroadcasti128 .Lpre_tf_hi_s4, t3; \
        \
        /* AES inverse shift rows */ \
        vpshufb t4, x0, x0; \
        vpshufb t4, x7, x7; \
        vpshufb t4, x3, x3; \
        vpshufb t4, x6, x6; \
        vpshufb t4, x2, x2; \
        vpshufb t4, x5, x5; \
        vpshufb t4, x1, x1; \
        vpshufb t4, x4, x4; \
        \
        /* prefilter sboxes 1, 2 and 3 */ \
        /* prefilter sbox 4 */ \
        filter_8bit(x0, t5, t6, t7, t4); \
        filter_8bit(x7, t5, t6, t7, t4); \
        vextracti128 $1, x0, t0##_x; \
        vextracti128 $1, x7, t1##_x; \
        filter_8bit(x3, t2, t3, t7, t4); \
        filter_8bit(x6, t2, t3, t7, t4); \
        vextracti128 $1, x3, t3##_x; \
        vextracti128 $1, x6, t2##_x; \
        filter_8bit(x2, t5, t6, t7, t4); \
        filter_8bit(x5, t5, t6, t7, t4); \
        filter_8bit(x1, t5, t6, t7, t4); \
        filter_8bit(x4, t5, t6, t7, t4); \
        \
        vpxor t4##_x, t4##_x, t4##_x; \
        \
        /* AES subbytes + AES shift rows */ \
        vextracti128 $1, x2, t6##_x; \
        vextracti128 $1, x5, t5##_x; \
        vaesenclast t4##_x, x0##_x, x0##_x; \
        vaesenclast t4##_x, t0##_x, t0##_x; \
        vinserti128 $1, t0##_x, x0, x0; \
        vaesenclast t4##_x, x7##_x, x7##_x; \
        vaesenclast t4##_x, t1##_x, t1##_x; \
        vinserti128 $1, t1##_x, x7, x7; \
        vaesenclast t4##_x, x3##_x, x3##_x; \
        vaesenclast t4##_x, t3##_x, t3##_x; \
        vinserti128 $1, t3##_x, x3, x3; \
        vaesenclast t4##_x, x6##_x, x6##_x; \
        vaesenclast t4##_x, t2##_x, t2##_x; \
        vinserti128 $1, t2##_x, x6, x6; \
        vextracti128 $1, x1, t3##_x; \
        vextracti128 $1, x4, t2##_x; \
        vbroadcasti128 .Lpost_tf_lo_s1, t0; \
        vbroadcasti128 .Lpost_tf_hi_s1, t1; \
        vaesenclast t4##_x, x2##_x, x2##_x; \
        vaesenclast t4##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x2, x2; \
        vaesenclast t4##_x, x5##_x, x5##_x; \
        vaesenclast t4##_x, t5##_x, t5##_x; \
        vinserti128 $1, t5##_x, x5, x5; \
        vaesenclast t4##_x, x1##_x, x1##_x; \
        vaesenclast t4##_x, t3##_x, t3##_x; \
        vinserti128 $1, t3##_x, x1, x1; \
        vaesenclast t4##_x, x4##_x, x4##_x; \
        vaesenclast t4##_x, t2##_x, t2##_x; \
        vinserti128 $1, t2##_x, x4, x4; \
        \
        /* postfilter sboxes 1 and 4 */ \
        vbroadcasti128 .Lpost_tf_lo_s3, t2; \
        vbroadcasti128 .Lpost_tf_hi_s3, t3; \
        filter_8bit(x0, t0, t1, t7, t6); \
        filter_8bit(x7, t0, t1, t7, t6); \
        filter_8bit(x3, t0, t1, t7, t6); \
        filter_8bit(x6, t0, t1, t7, t6); \
        \
        /* postfilter sbox 3 */ \
        vbroadcasti128 .Lpost_tf_lo_s2, t4; \
        vbroadcasti128 .Lpost_tf_hi_s2, t5; \
        filter_8bit(x2, t2, t3, t7, t6); \
        filter_8bit(x5, t2, t3, t7, t6); \
        \
        vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
        \
        /* postfilter sbox 2 */ \
        filter_8bit(x1, t4, t5, t7, t2); \
        filter_8bit(x4, t4, t5, t7, t2); \
        vpxor t7, t7, t7; \
        \
        vpsrldq $1, t0, t1; \
        vpsrldq $2, t0, t2; \
        vpshufb t7, t1, t1; \
        vpsrldq $3, t0, t3; \
        \
        /* P-function */ \
        vpxor x5, x0, x0; \
        vpxor x6, x1, x1; \
        vpxor x7, x2, x2; \
        vpxor x4, x3, x3; \
        \
        vpshufb t7, t2, t2; \
        vpsrldq $4, t0, t4; \
        vpshufb t7, t3, t3; \
        vpsrldq $5, t0, t5; \
        vpshufb t7, t4, t4; \
        \
        vpxor x2, x4, x4; \
        vpxor x3, x5, x5; \
        vpxor x0, x6, x6; \
        vpxor x1, x7, x7; \
        \
        vpsrldq $6, t0, t6; \
        vpshufb t7, t5, t5; \
        vpshufb t7, t6, t6; \
        \
        vpxor x7, x0, x0; \
        vpxor x4, x1, x1; \
        vpxor x5, x2, x2; \
        vpxor x6, x3, x3; \
        \
        vpxor x3, x4, x4; \
        vpxor x0, x5, x5; \
        vpxor x1, x6, x6; \
        vpxor x2, x7, x7; /* note: high and low parts swapped */ \
        \
        /* Add key material and result to CD (x becomes new CD) */ \
        \
        vpxor t6, x1, x1; \
        vpxor 5 * 32(mem_cd), x1, x1; \
        \
        vpsrldq $7, t0, t6; \
        vpshufb t7, t0, t0; \
        vpshufb t7, t6, t7; \
        \
        vpxor t7, x0, x0; \
        vpxor 4 * 32(mem_cd), x0, x0; \
        \
        vpxor t5, x2, x2; \
        vpxor 6 * 32(mem_cd), x2, x2; \
        \
        vpxor t4, x3, x3; \
        vpxor 7 * 32(mem_cd), x3, x3; \
        \
        vpxor t3, x4, x4; \
        vpxor 0 * 32(mem_cd), x4, x4; \
        \
        vpxor t2, x5, x5; \
        vpxor 1 * 32(mem_cd), x5, x5; \
        \
        vpxor t1, x6, x6; \
        vpxor 2 * 32(mem_cd), x6, x6; \
        \
        vpxor t0, x7, x7; \
        vpxor 3 * 32(mem_cd), x7, x7;
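/*
 * Per-block data flow of one roundsm32 invocation, as illustrative C-style
 * pseudocode (S() and P() are stand-ins for the S-function and P-function
 * realised by the macro above, not real functions):
 *
 *	t      = S(x0..x7);           // AES SubBytes via vaesenclast plus the
 *	                              // .Lpre_tf_* / .Lpost_tf_* filter tables
 *	t      = P(t);                // Camellia's linear byte-XOR network
 *	x0..x7 = t ^ bytes(key) ^ CD; // add round key material and mem_cd
 *
 * The all-zero round key passed to vaesenclast reduces that instruction to
 * SubBytes plus ShiftRows; the .Linv_shift_row shuffle applied beforehand
 * cancels the ShiftRows part, leaving a bare AES S-box lookup on 16 bytes
 * per 128-bit lane.
 */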
/*
 * Size optimization... with roundsm32 inlined, the binary would be over 5
 * times larger and only marginally faster.
 */
.align 8
roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
        roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
                  %rcx, (%r9));
        ret;
ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
        roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
                  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
                  %rax, (%r9));
        ret;
ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
        leaq (key_table + (i) * 8)(CTX), %r9; \
        call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
        \
        vmovdqu x0, 4 * 32(mem_cd); \
        vmovdqu x1, 5 * 32(mem_cd); \
        vmovdqu x2, 6 * 32(mem_cd); \
        vmovdqu x3, 7 * 32(mem_cd); \
        vmovdqu x4, 0 * 32(mem_cd); \
        vmovdqu x5, 1 * 32(mem_cd); \
        vmovdqu x6, 2 * 32(mem_cd); \
        vmovdqu x7, 3 * 32(mem_cd); \
        \
        leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
        call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
        \
        store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
        /* Store new AB state */ \
        vmovdqu x4, 4 * 32(mem_ab); \
        vmovdqu x5, 5 * 32(mem_ab); \
        vmovdqu x6, 6 * 32(mem_ab); \
        vmovdqu x7, 7 * 32(mem_ab); \
        vmovdqu x0, 0 * 32(mem_ab); \
        vmovdqu x1, 1 * 32(mem_ab); \
        vmovdqu x2, 2 * 32(mem_ab); \
        vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, mem_ab, mem_cd, i) \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, mem_ab, mem_cd, i) \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
        vpcmpgtb v0, zero, t0; \
        vpaddb v0, v0, v0; \
        vpabsb t0, t0; \
        \
        vpcmpgtb v1, zero, t1; \
        vpaddb v1, v1, v1; \
        vpabsb t1, t1; \
        \
        vpcmpgtb v2, zero, t2; \
        vpaddb v2, v2, v2; \
        vpabsb t2, t2; \
        \
        vpor t0, v1, v1; \
        \
        vpcmpgtb v3, zero, t0; \
        vpaddb v3, v3, v3; \
        vpabsb t0, t0; \
        \
        vpor t1, v2, v2; \
        vpor t2, v3, v3; \
        vpor t0, v0, v0;
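/*
 * Illustrative C model of rol32_1_32 (not kernel code): each of v0..v3
 * holds one byte slice of a 32-bit word, v0 being the least significant
 * slice.  vpcmpgtb against zero plus vpabsb extracts each byte's top bit,
 * vpaddb doubles (shifts) each byte, and vpor feeds the carries into the
 * next slice, wrapping from v3 back into v0:
 *
 *	#include <stdint.h>
 *
 *	static void rol32_1_bytesliced(uint8_t b[4])
 *	{
 *		uint8_t carry[4];
 *		int i;
 *
 *		for (i = 0; i < 4; i++)
 *			carry[i] = b[i] >> 7;          // vpcmpgtb + vpabsb
 *		for (i = 0; i < 4; i++)
 *			b[i] = (uint8_t)(b[i] << 1);   // vpaddb v, v, v
 *		for (i = 0; i < 4; i++)
 *			b[(i + 1) & 3] |= carry[i];    // vpor into next slice
 *	}
 */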
/*
 * IN:
 *  l0..l7: byte-sliced AB state preloaded in registers
 *  r: byte-sliced CD state in memory
 * OUT:
 *  l0..l7: new byte-sliced AB state (also stored to memory at l)
 *  r: new byte-sliced CD state in memory
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
              tt1, tt2, tt3, kll, klr, krl, krr) \
        /* \
         * t0 = kll; \
         * t0 &= ll; \
         * lr ^= rol32(t0, 1); \
         */ \
        vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
        vpxor tt0, tt0, tt0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand l0, t0, t0; \
        vpand l1, t1, t1; \
        vpand l2, t2, t2; \
        vpand l3, t3, t3; \
        \
        rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor l4, t0, l4; \
        vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
        vmovdqu l4, 4 * 32(l); \
        vpxor l5, t1, l5; \
        vmovdqu l5, 5 * 32(l); \
        vpxor l6, t2, l6; \
        vmovdqu l6, 6 * 32(l); \
        vpxor l7, t3, l7; \
        vmovdqu l7, 7 * 32(l); \
        \
        /* \
         * t2 = krr; \
         * t2 |= rr; \
         * rl ^= t2; \
         */ \
        \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor 4 * 32(r), t0, t0; \
        vpor 5 * 32(r), t1, t1; \
        vpor 6 * 32(r), t2, t2; \
        vpor 7 * 32(r), t3, t3; \
        \
        vpxor 0 * 32(r), t0, t0; \
        vpxor 1 * 32(r), t1, t1; \
        vpxor 2 * 32(r), t2, t2; \
        vpxor 3 * 32(r), t3, t3; \
        vmovdqu t0, 0 * 32(r); \
        vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
        vmovdqu t1, 1 * 32(r); \
        vmovdqu t2, 2 * 32(r); \
        vmovdqu t3, 3 * 32(r); \
        \
        /* \
         * t2 = krl; \
         * t2 &= rl; \
         * rr ^= rol32(t2, 1); \
         */ \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand 0 * 32(r), t0, t0; \
        vpand 1 * 32(r), t1, t1; \
        vpand 2 * 32(r), t2, t2; \
        vpand 3 * 32(r), t3, t3; \
        \
        rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor 4 * 32(r), t0, t0; \
        vpxor 5 * 32(r), t1, t1; \
        vpxor 6 * 32(r), t2, t2; \
        vpxor 7 * 32(r), t3, t3; \
        vmovdqu t0, 4 * 32(r); \
        vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
        vmovdqu t1, 5 * 32(r); \
        vmovdqu t2, 6 * 32(r); \
        vmovdqu t3, 7 * 32(r); \
        \
        /* \
         * t0 = klr; \
         * t0 |= lr; \
         * ll ^= t0; \
         */ \
        \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor l4, t0, t0; \
        vpor l5, t1, t1; \
        vpor l6, t2, t2; \
        vpor l7, t3, t3; \
        \
        vpxor l0, t0, l0; \
        vmovdqu l0, 0 * 32(l); \
        vpxor l1, t1, l1; \
        vmovdqu l1, 1 * 32(l); \
        vpxor l2, t2, l2; \
        vmovdqu l2, 2 * 32(l); \
        vpxor l3, t3, l3; \
        vmovdqu l3, 3 * 32(l);

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;
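/*
 * Scalar reference for what fls32 computes on each block (illustrative C,
 * mirroring the pseudocode comments inside the macro; rol32() is a 32-bit
 * rotate left):
 *
 *	FL on the left half (ll || lr) with subkey kll || klr:
 *		lr ^= rol32(ll & kll, 1);
 *		ll ^= (lr | klr);
 *
 *	FL^-1 on the right half (rl || rr) with subkey krl || krr:
 *		rl ^= (rr | krr);
 *		rr ^= rol32(rl & krl, 1);
 *
 * The macro performs both on 32 blocks at once, with every 32-bit word
 * byte-sliced across four ymm registers or four 32-byte memory slots.
 */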
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
                              a3, b3, c3, d3, st0, st1) \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vbroadcasti128 .Lshufb_16x16b, a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(a0, b0, c0, d0, d2, d3); \
        transpose_4x4(a1, b1, c1, d1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(a2, b2, c2, d2, b0, b1); \
        transpose_4x4(a3, b3, c3, d3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio, key) \
        vpbroadcastq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor 0 * 32(rio), x0, y7; \
        vpxor 1 * 32(rio), x0, y6; \
        vpxor 2 * 32(rio), x0, y5; \
        vpxor 3 * 32(rio), x0, y4; \
        vpxor 4 * 32(rio), x0, y3; \
        vpxor 5 * 32(rio), x0, y2; \
        vpxor 6 * 32(rio), x0, y1; \
        vpxor 7 * 32(rio), x0, y0; \
        vpxor 8 * 32(rio), x0, x7; \
        vpxor 9 * 32(rio), x0, x6; \
        vpxor 10 * 32(rio), x0, x5; \
        vpxor 11 * 32(rio), x0, x4; \
        vpxor 12 * 32(rio), x0, x3; \
        vpxor 13 * 32(rio), x0, x2; \
        vpxor 14 * 32(rio), x0, x1; \
        vpxor 15 * 32(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd) \
        byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
                              y4, y5, y6, y7, (mem_ab), (mem_cd)); \
        \
        vmovdqu x0, 0 * 32(mem_ab); \
        vmovdqu x1, 1 * 32(mem_ab); \
        vmovdqu x2, 2 * 32(mem_ab); \
        vmovdqu x3, 3 * 32(mem_ab); \
        vmovdqu x4, 4 * 32(mem_ab); \
        vmovdqu x5, 5 * 32(mem_ab); \
        vmovdqu x6, 6 * 32(mem_ab); \
        vmovdqu x7, 7 * 32(mem_ab); \
        vmovdqu y0, 0 * 32(mem_cd); \
        vmovdqu y1, 1 * 32(mem_cd); \
        vmovdqu y2, 2 * 32(mem_cd); \
        vmovdqu y3, 3 * 32(mem_cd); \
        vmovdqu y4, 4 * 32(mem_cd); \
        vmovdqu y5, 5 * 32(mem_cd); \
        vmovdqu y6, 6 * 32(mem_cd); \
        vmovdqu y7, 7 * 32(mem_cd);
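/*
 * Conceptual C model of the byte-sliced layout produced by
 * byteslice_16x16b_fast / inpack32_post (illustrative only; the exact block
 * ordering inside each vector is an implementation detail):
 *
 *	// blocks[b][j]: byte j of 16-byte block b, 32 blocks in total
 *	// sliced[j]:    the 32-byte vector kept in register / memory slot j
 *	for (int j = 0; j < 16; j++)
 *		for (int b = 0; b < 32; b++)
 *			sliced[j][b] = blocks[b][j];
 *
 * With one byte position of every block gathered into a single register, a
 * single vpshufb table lookup or vaesenclast applies the same S-box step to
 * all 32 blocks at once.
 */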
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
                    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
        byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
                              y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
        \
        vmovdqu x0, stack_tmp0; \
        \
        vpbroadcastq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor x0, y7, y7; \
        vpxor x0, y6, y6; \
        vpxor x0, y5, y5; \
        vpxor x0, y4, y4; \
        vpxor x0, y3, y3; \
        vpxor x0, y2, y2; \
        vpxor x0, y1, y1; \
        vpxor x0, y0, y0; \
        vpxor x0, x7, x7; \
        vpxor x0, x6, x6; \
        vpxor x0, x5, x5; \
        vpxor x0, x4, x4; \
        vpxor x0, x3, x3; \
        vpxor x0, x2, x2; \
        vpxor x0, x1, x1; \
        vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio) \
        vmovdqu x0, 0 * 32(rio); \
        vmovdqu x1, 1 * 32(rio); \
        vmovdqu x2, 2 * 32(rio); \
        vmovdqu x3, 3 * 32(rio); \
        vmovdqu x4, 4 * 32(rio); \
        vmovdqu x5, 5 * 32(rio); \
        vmovdqu x6, 6 * 32(rio); \
        vmovdqu x7, 7 * 32(rio); \
        vmovdqu y0, 8 * 32(rio); \
        vmovdqu y1, 9 * 32(rio); \
        vmovdqu y2, 10 * 32(rio); \
        vmovdqu y3, 11 * 32(rio); \
        vmovdqu y4, 12 * 32(rio); \
        vmovdqu y5, 13 * 32(rio); \
        vmovdqu y6, 14 * 32(rio); \
        vmovdqu y7, 15 * 32(rio);

.data
.align 32

#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.Lpack_bswap:
        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
        .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
        .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
        .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
        .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
        .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
        .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
        .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
        .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
        .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
        .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
        .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
        .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
        .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
        .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
        .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
        .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
        .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
        .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
        .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
        .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.align 4
/* 4-bit mask */
.L0f0f0f0f:
        .long 0x0f0f0f0f
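/*
 * Illustrative user-space C sketch (AES-NI intrinsics, not kernel code) of
 * the trick behind .Linv_shift_row above and the zeroed round key used with
 * vaesenclast in roundsm32: AESENCLAST performs ShiftRows, SubBytes and
 * AddRoundKey, so undoing ShiftRows in advance and supplying an all-zero
 * round key leaves a bare SubBytes, which the pre/post filter tables then
 * map between the AES and Camellia S-box representations.
 *
 *	#include <immintrin.h>
 *
 *	static __m128i subbytes_only(__m128i x, __m128i inv_shift_row_mask)
 *	{
 *		x = _mm_shuffle_epi8(x, inv_shift_row_mask); // pre-apply InvShiftRows
 *		return _mm_aesenclast_si128(x, _mm_setzero_si128());
 *	}
 */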
.text

.align 8
__camellia_enc_blk32:
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 512 bytes
         *      %ymm0..%ymm15: 32 plaintext blocks
         * output:
         *      %ymm0..%ymm15: 32 encrypted blocks, order swapped:
         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
         */
        FRAME_BEGIN

        leaq 8 * 32(%rax), %rcx;

        inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %rcx);

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 0);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX),
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX));

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 8);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX),
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX));

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 16);

        movl $24, %r8d;
        cmpl $16, key_length(CTX);
        jne .Lenc_max32;

.Lenc_done:
        /* load CD for output */
        vmovdqu 0 * 32(%rcx), %ymm8;
        vmovdqu 1 * 32(%rcx), %ymm9;
        vmovdqu 2 * 32(%rcx), %ymm10;
        vmovdqu 3 * 32(%rcx), %ymm11;
        vmovdqu 4 * 32(%rcx), %ymm12;
        vmovdqu 5 * 32(%rcx), %ymm13;
        vmovdqu 6 * 32(%rcx), %ymm14;
        vmovdqu 7 * 32(%rcx), %ymm15;

        outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

        FRAME_END
        ret;

.align 8
.Lenc_max32:
        movl $32, %r8d;

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX),
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX));

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 24);

        jmp .Lenc_done;
ENDPROC(__camellia_enc_blk32)

.align 8
__camellia_dec_blk32:
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 512 bytes
         *      %r8d: 24 for 16-byte key, 32 for larger
         *      %ymm0..%ymm15: 32 encrypted blocks
         * output:
         *      %ymm0..%ymm15: 32 plaintext blocks, order swapped:
         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
         */
        FRAME_BEGIN

        leaq 8 * 32(%rax), %rcx;

        inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %rcx);

        cmpl $32, %r8d;
        je .Ldec_max32;

.Ldec_max24:
        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 16);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX),
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX));

        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 8);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX),
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX));

        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 0);

        /* load CD for output */
        vmovdqu 0 * 32(%rcx), %ymm8;
        vmovdqu 1 * 32(%rcx), %ymm9;
        vmovdqu 2 * 32(%rcx), %ymm10;
        vmovdqu 3 * 32(%rcx), %ymm11;
        vmovdqu 4 * 32(%rcx), %ymm12;
        vmovdqu 5 * 32(%rcx), %ymm13;
        vmovdqu 6 * 32(%rcx), %ymm14;
        vmovdqu 7 * 32(%rcx), %ymm15;

        outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

        FRAME_END
        ret;

.align 8
.Ldec_max32:
        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 24);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX),
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX));

        jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk32)
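/*
 * Subkey usage, as driven by the enc_rounds32/fls32 calls above (8-byte
 * key_table slots; the shorter schedule is for 128-bit keys, the longer
 * one for 192/256-bit keys):
 *
 *	slot 0        input whitening (inpack32_pre)
 *	slots 2..7    6 rounds        (enc_rounds32, i = 0)
 *	slots 8..9    FL/FL^-1        (fls32)
 *	slots 10..15  6 rounds        (enc_rounds32, i = 8)
 *	slots 16..17  FL/FL^-1        (fls32)
 *	slots 18..23  6 rounds        (enc_rounds32, i = 16)
 *	slots 24..25  FL/FL^-1        (fls32, 192/256-bit keys only)
 *	slots 26..31  6 rounds        (enc_rounds32, i = 24, 192/256-bit only)
 *	slot 24 / 32  output whitening (outunpack32, selected via %r8)
 *
 * __camellia_dec_blk32 walks the same slots in reverse (dec_rounds32 with
 * dir = -1) and passes the kl/kr subkey halves to fls32 swapped.
 */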
ENTRY(camellia_ecb_enc_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         */
        FRAME_BEGIN

        vzeroupper;

        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq %rsi, %rax;

        call __camellia_enc_blk32;

        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(camellia_ecb_enc_32way)

ENTRY(camellia_ecb_dec_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         */
        FRAME_BEGIN

        vzeroupper;

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq %rsi, %rax;

        call __camellia_dec_blk32;

        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(camellia_ecb_dec_32way)
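/*
 * Illustrative C model of what camellia_cbc_dec_32way below computes per
 * 16-byte block (names are placeholders, not kernel API; decrypt_block()
 * stands in for one block's worth of __camellia_dec_blk32):
 *
 *	// dst[0] gets only decrypt_block(src[0]); the IV XOR for block 0
 *	// is left to the caller.
 *	for (int i = 31; i > 0; i--)
 *		dst[i] = decrypt_block(src[i]) ^ src[i - 1];
 *	dst[0] = decrypt_block(src[0]);
 *
 * Each output depends on the previous ciphertext block, so src must stay
 * readable until the final XOR; that is why the routine falls back to
 * stack storage for the intermediate results when dst == src.
 */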
ENTRY(camellia_cbc_dec_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         */
        FRAME_BEGIN

        vzeroupper;

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

        movq %rsp, %r10;
        cmpq %rsi, %rdx;
        je .Lcbc_dec_use_stack;

        /* dst can be used as temporary storage, src is not overwritten. */
        movq %rsi, %rax;
        jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
        /*
         * dst still in-use (because dst == src), so use stack for temporary
         * storage.
         */
        subq $(16 * 32), %rsp;
        movq %rsp, %rax;

.Lcbc_dec_continue:
        call __camellia_dec_blk32;

        vmovdqu %ymm7, (%rax);
        vpxor %ymm7, %ymm7, %ymm7;
        vinserti128 $1, (%rdx), %ymm7, %ymm7;
        vpxor (%rax), %ymm7, %ymm7;
        movq %r10, %rsp;
        vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
        vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
        vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
        vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
        vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
        vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
        vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
        vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
        vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
        vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
        vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
        vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
        vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
        vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
        vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(camellia_cbc_dec_32way)

#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;

#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
        vpcmpeqq minus_one, x, tmp1; \
        vpcmpeqq minus_two, x, tmp2; \
        vpsubq minus_two, x, x; \
        vpor tmp2, tmp1, tmp1; \
        vpslldq $8, tmp1, tmp1; \
        vpsubq tmp1, x, x;
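/*
 * C model of inc_le128/add2_le128 above (illustrative): the CTR counter is
 * a 128-bit little-endian integer held as two 64-bit words, and the
 * vpcmpeqq/vpslldq/vpsubq sequence propagates the carry from the low word
 * into the high word:
 *
 *	#include <stdint.h>
 *
 *	struct le128 { uint64_t lo, hi; };
 *
 *	static void inc_le128(struct le128 *x)
 *	{
 *		uint64_t carry = (x->lo == UINT64_MAX); // vpcmpeqq vs. -1:0
 *		x->lo += 1;                             // vpsubq by -1:0
 *		x->hi += carry;                         // vpslldq + vpsubq
 *	}
 *
 *	static void add2_le128(struct le128 *x)
 *	{
 *		uint64_t carry = (x->lo >= UINT64_MAX - 1);
 *		x->lo += 2;
 *		x->hi += carry;
 *	}
 */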
ENTRY(camellia_ctr_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         *      %rcx: iv (little endian, 128bit)
         */
        FRAME_BEGIN

        vzeroupper;

        movq %rsp, %r10;
        cmpq %rsi, %rdx;
        je .Lctr_use_stack;

        /* dst can be used as temporary storage, src is not overwritten. */
        movq %rsi, %rax;
        jmp .Lctr_continue;

.Lctr_use_stack:
        subq $(16 * 32), %rsp;
        movq %rsp, %rax;

.Lctr_continue:
        vpcmpeqd %ymm15, %ymm15, %ymm15;
        vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
        vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */

        /* load IV and byteswap */
        vmovdqu (%rcx), %xmm0;
        vmovdqa %xmm0, %xmm1;
        inc_le128(%xmm0, %xmm15, %xmm14);
        vbroadcasti128 .Lbswap128_mask, %ymm14;
        vinserti128 $1, %xmm0, %ymm1, %ymm0;
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 15 * 32(%rax);

        /* construct IVs */
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 14 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 13 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 12 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 11 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm10;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm9;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm8;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm7;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm6;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm5;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm4;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm3;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm2;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm1;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vextracti128 $1, %ymm0, %xmm13;
        vpshufb %ymm14, %ymm0, %ymm0;
        inc_le128(%xmm13, %xmm15, %xmm14);
        vmovdqu %xmm13, (%rcx);

        /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX), %ymm15;
        vpshufb .Lpack_bswap, %ymm15, %ymm15;
        vpxor %ymm0, %ymm15, %ymm0;
        vpxor %ymm1, %ymm15, %ymm1;
        vpxor %ymm2, %ymm15, %ymm2;
        vpxor %ymm3, %ymm15, %ymm3;
        vpxor %ymm4, %ymm15, %ymm4;
        vpxor %ymm5, %ymm15, %ymm5;
        vpxor %ymm6, %ymm15, %ymm6;
        vpxor %ymm7, %ymm15, %ymm7;
        vpxor %ymm8, %ymm15, %ymm8;
        vpxor %ymm9, %ymm15, %ymm9;
        vpxor %ymm10, %ymm15, %ymm10;
        vpxor 11 * 32(%rax), %ymm15, %ymm11;
        vpxor 12 * 32(%rax), %ymm15, %ymm12;
        vpxor 13 * 32(%rax), %ymm15, %ymm13;
        vpxor 14 * 32(%rax), %ymm15, %ymm14;
        vpxor 15 * 32(%rax), %ymm15, %ymm15;

        call __camellia_enc_blk32;

        movq %r10, %rsp;

        vpxor 0 * 32(%rdx), %ymm7, %ymm7;
        vpxor 1 * 32(%rdx), %ymm6, %ymm6;
        vpxor 2 * 32(%rdx), %ymm5, %ymm5;
        vpxor 3 * 32(%rdx), %ymm4, %ymm4;
        vpxor 4 * 32(%rdx), %ymm3, %ymm3;
        vpxor 5 * 32(%rdx), %ymm2, %ymm2;
        vpxor 6 * 32(%rdx), %ymm1, %ymm1;
        vpxor 7 * 32(%rdx), %ymm0, %ymm0;
        vpxor 8 * 32(%rdx), %ymm15, %ymm15;
        vpxor 9 * 32(%rdx), %ymm14, %ymm14;
        vpxor 10 * 32(%rdx), %ymm13, %ymm13;
        vpxor 11 * 32(%rdx), %ymm12, %ymm12;
        vpxor 12 * 32(%rdx), %ymm11, %ymm11;
        vpxor 13 * 32(%rdx), %ymm10, %ymm10;
        vpxor 14 * 32(%rdx), %ymm9, %ymm9;
        vpxor 15 * 32(%rdx), %ymm8, %ymm8;
        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(camellia_ctr_32way)
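/*
 * C model of the XTS tweak update implemented by gf128mul_x_ble below
 * (illustrative, intended to match the generic gf128mul_x_ble semantics):
 * the tweak is a GF(2^128) element stored as two little-endian 64-bit
 * words; multiplying by x (alpha) shifts the whole value left by one bit
 * and, if a bit falls off the top, XORs in the low part of the reduction
 * polynomial x^128 + x^7 + x^2 + x + 1, i.e. 0x87:
 *
 *	#include <stdint.h>
 *
 *	struct le128 { uint64_t lo, hi; };
 *
 *	static void gf128mul_x_ble(struct le128 *t)
 *	{
 *		uint64_t carry = t->hi >> 63;
 *		t->hi = (t->hi << 1) | (t->lo >> 63);
 *		t->lo = (t->lo << 1) ^ (carry ? 0x87 : 0);
 *	}
 *
 * gf128mul_x2_ble applies the same update twice in one go, using the
 * second mask (.Lxts_gf128mul_and_shl1_mask_1) to fold both possible
 * reduction steps into a single XOR.
 */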
#define gf128mul_x_ble(iv, mask, tmp) \
        vpsrad $31, iv, tmp; \
        vpaddq iv, iv, iv; \
        vpshufd $0x13, tmp, tmp; \
        vpand mask, tmp, tmp; \
        vpxor tmp, iv, iv;

#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
        vpsrad $31, iv, tmp0; \
        vpaddq iv, iv, tmp1; \
        vpsllq $2, iv, iv; \
        vpshufd $0x13, tmp0, tmp0; \
        vpsrad $31, tmp1, tmp1; \
        vpand mask2, tmp0, tmp0; \
        vpshufd $0x13, tmp1, tmp1; \
        vpxor tmp0, iv, iv; \
        vpand mask1, tmp1, tmp1; \
        vpxor tmp1, iv, iv;

.align 8
camellia_xts_crypt_32way:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         *      %r8: index for input whitening key
         *      %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
         */
        FRAME_BEGIN

        vzeroupper;

        subq $(16 * 32), %rsp;
        movq %rsp, %rax;

        vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;

        /* load IV and construct second IV */
        vmovdqu (%rcx), %xmm0;
        vmovdqa %xmm0, %xmm15;
        gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
        vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
        vinserti128 $1, %xmm0, %ymm15, %ymm0;
        vpxor 0 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 15 * 32(%rax);
        vmovdqu %ymm0, 0 * 32(%rsi);

        /* construct IVs */
        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 1 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 14 * 32(%rax);
        vmovdqu %ymm0, 1 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 2 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 13 * 32(%rax);
        vmovdqu %ymm0, 2 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 3 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 12 * 32(%rax);
        vmovdqu %ymm0, 3 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 4 * 32(%rdx), %ymm0, %ymm11;
        vmovdqu %ymm0, 4 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 5 * 32(%rdx), %ymm0, %ymm10;
        vmovdqu %ymm0, 5 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 6 * 32(%rdx), %ymm0, %ymm9;
        vmovdqu %ymm0, 6 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 7 * 32(%rdx), %ymm0, %ymm8;
        vmovdqu %ymm0, 7 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 8 * 32(%rdx), %ymm0, %ymm7;
        vmovdqu %ymm0, 8 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 9 * 32(%rdx), %ymm0, %ymm6;
        vmovdqu %ymm0, 9 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 10 * 32(%rdx), %ymm0, %ymm5;
        vmovdqu %ymm0, 10 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 11 * 32(%rdx), %ymm0, %ymm4;
        vmovdqu %ymm0, 11 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 12 * 32(%rdx), %ymm0, %ymm3;
        vmovdqu %ymm0, 12 * 32(%rsi);
        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 13 * 32(%rdx), %ymm0, %ymm2;
        vmovdqu %ymm0, 13 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 14 * 32(%rdx), %ymm0, %ymm1;
        vmovdqu %ymm0, 14 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 15 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 0 * 32(%rax);
        vmovdqu %ymm0, 15 * 32(%rsi);

        vextracti128 $1, %ymm0, %xmm0;
        gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
        vmovdqu %xmm0, (%rcx);

        /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
        vpshufb .Lpack_bswap, %ymm15, %ymm15;
        vpxor 0 * 32(%rax), %ymm15, %ymm0;
        vpxor %ymm1, %ymm15, %ymm1;
        vpxor %ymm2, %ymm15, %ymm2;
        vpxor %ymm3, %ymm15, %ymm3;
        vpxor %ymm4, %ymm15, %ymm4;
        vpxor %ymm5, %ymm15, %ymm5;
        vpxor %ymm6, %ymm15, %ymm6;
        vpxor %ymm7, %ymm15, %ymm7;
        vpxor %ymm8, %ymm15, %ymm8;
        vpxor %ymm9, %ymm15, %ymm9;
        vpxor %ymm10, %ymm15, %ymm10;
        vpxor %ymm11, %ymm15, %ymm11;
        vpxor 12 * 32(%rax), %ymm15, %ymm12;
        vpxor 13 * 32(%rax), %ymm15, %ymm13;
        vpxor 14 * 32(%rax), %ymm15, %ymm14;
        vpxor 15 * 32(%rax), %ymm15, %ymm15;

        call *%r9;

        addq $(16 * 32), %rsp;

        vpxor 0 * 32(%rsi), %ymm7, %ymm7;
        vpxor 1 * 32(%rsi), %ymm6, %ymm6;
        vpxor 2 * 32(%rsi), %ymm5, %ymm5;
        vpxor 3 * 32(%rsi), %ymm4, %ymm4;
        vpxor 4 * 32(%rsi), %ymm3, %ymm3;
        vpxor 5 * 32(%rsi), %ymm2, %ymm2;
        vpxor 6 * 32(%rsi), %ymm1, %ymm1;
        vpxor 7 * 32(%rsi), %ymm0, %ymm0;
        vpxor 8 * 32(%rsi), %ymm15, %ymm15;
        vpxor 9 * 32(%rsi), %ymm14, %ymm14;
        vpxor 10 * 32(%rsi), %ymm13, %ymm13;
        vpxor 11 * 32(%rsi), %ymm12, %ymm12;
        vpxor 12 * 32(%rsi), %ymm11, %ymm11;
        vpxor 13 * 32(%rsi), %ymm10, %ymm10;
        vpxor 14 * 32(%rsi), %ymm9, %ymm9;
        vpxor 15 * 32(%rsi), %ymm8, %ymm8;
        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(camellia_xts_crypt_32way)

ENTRY(camellia_xts_enc_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */

        xorl %r8d, %r8d; /* input whitening key, 0 for enc */

        leaq __camellia_enc_blk32, %r9;

        jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_enc_32way)

ENTRY(camellia_xts_dec_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* input whitening key, last for dec */

        leaq __camellia_dec_blk32, %r9;

        jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_dec_32way)