1/* 2 * x86_64/AVX2/AES-NI assembler implementation of Camellia 3 * 4 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 */ 12 13#include <linux/linkage.h> 14 15#define CAMELLIA_TABLE_BYTE_LEN 272 16 17/* struct camellia_ctx: */ 18#define key_table 0 19#define key_length CAMELLIA_TABLE_BYTE_LEN 20 21/* register macros */ 22#define CTX %rdi 23#define RIO %r8 24 25/********************************************************************** 26 helper macros 27 **********************************************************************/ 28#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ 29 vpand x, mask4bit, tmp0; \ 30 vpandn x, mask4bit, x; \ 31 vpsrld $4, x, x; \ 32 \ 33 vpshufb tmp0, lo_t, tmp0; \ 34 vpshufb x, hi_t, x; \ 35 vpxor tmp0, x, x; 36 37#define ymm0_x xmm0 38#define ymm1_x xmm1 39#define ymm2_x xmm2 40#define ymm3_x xmm3 41#define ymm4_x xmm4 42#define ymm5_x xmm5 43#define ymm6_x xmm6 44#define ymm7_x xmm7 45#define ymm8_x xmm8 46#define ymm9_x xmm9 47#define ymm10_x xmm10 48#define ymm11_x xmm11 49#define ymm12_x xmm12 50#define ymm13_x xmm13 51#define ymm14_x xmm14 52#define ymm15_x xmm15 53 54/********************************************************************** 55 32-way camellia 56 **********************************************************************/ 57 58/* 59 * IN: 60 * x0..x7: byte-sliced AB state 61 * mem_cd: register pointer storing CD state 62 * key: index for key material 63 * OUT: 64 * x0..x7: new byte-sliced CD state 65 */ 66#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ 67 t7, mem_cd, key) \ 68 /* \ 69 * S-function with AES subbytes \ 70 */ \ 71 vbroadcasti128 .Linv_shift_row, t4; \ 72 vpbroadcastd .L0f0f0f0f, t7; \ 73 vbroadcasti128 
.Lpre_tf_lo_s1, t5; \ 74 vbroadcasti128 .Lpre_tf_hi_s1, t6; \ 75 vbroadcasti128 .Lpre_tf_lo_s4, t2; \ 76 vbroadcasti128 .Lpre_tf_hi_s4, t3; \ 77 \ 78 /* AES inverse shift rows */ \ 79 vpshufb t4, x0, x0; \ 80 vpshufb t4, x7, x7; \ 81 vpshufb t4, x3, x3; \ 82 vpshufb t4, x6, x6; \ 83 vpshufb t4, x2, x2; \ 84 vpshufb t4, x5, x5; \ 85 vpshufb t4, x1, x1; \ 86 vpshufb t4, x4, x4; \ 87 \ 88 /* prefilter sboxes 1, 2 and 3 */ \ 89 /* prefilter sbox 4 */ \ 90 filter_8bit(x0, t5, t6, t7, t4); \ 91 filter_8bit(x7, t5, t6, t7, t4); \ 92 vextracti128 $1, x0, t0##_x; \ 93 vextracti128 $1, x7, t1##_x; \ 94 filter_8bit(x3, t2, t3, t7, t4); \ 95 filter_8bit(x6, t2, t3, t7, t4); \ 96 vextracti128 $1, x3, t3##_x; \ 97 vextracti128 $1, x6, t2##_x; \ 98 filter_8bit(x2, t5, t6, t7, t4); \ 99 filter_8bit(x5, t5, t6, t7, t4); \ 100 filter_8bit(x1, t5, t6, t7, t4); \ 101 filter_8bit(x4, t5, t6, t7, t4); \ 102 \ 103 vpxor t4##_x, t4##_x, t4##_x; \ 104 \ 105 /* AES subbytes + AES shift rows */ \ 106 vextracti128 $1, x2, t6##_x; \ 107 vextracti128 $1, x5, t5##_x; \ 108 vaesenclast t4##_x, x0##_x, x0##_x; \ 109 vaesenclast t4##_x, t0##_x, t0##_x; \ 110 vinserti128 $1, t0##_x, x0, x0; \ 111 vaesenclast t4##_x, x7##_x, x7##_x; \ 112 vaesenclast t4##_x, t1##_x, t1##_x; \ 113 vinserti128 $1, t1##_x, x7, x7; \ 114 vaesenclast t4##_x, x3##_x, x3##_x; \ 115 vaesenclast t4##_x, t3##_x, t3##_x; \ 116 vinserti128 $1, t3##_x, x3, x3; \ 117 vaesenclast t4##_x, x6##_x, x6##_x; \ 118 vaesenclast t4##_x, t2##_x, t2##_x; \ 119 vinserti128 $1, t2##_x, x6, x6; \ 120 vextracti128 $1, x1, t3##_x; \ 121 vextracti128 $1, x4, t2##_x; \ 122 vbroadcasti128 .Lpost_tf_lo_s1, t0; \ 123 vbroadcasti128 .Lpost_tf_hi_s1, t1; \ 124 vaesenclast t4##_x, x2##_x, x2##_x; \ 125 vaesenclast t4##_x, t6##_x, t6##_x; \ 126 vinserti128 $1, t6##_x, x2, x2; \ 127 vaesenclast t4##_x, x5##_x, x5##_x; \ 128 vaesenclast t4##_x, t5##_x, t5##_x; \ 129 vinserti128 $1, t5##_x, x5, x5; \ 130 vaesenclast t4##_x, x1##_x, x1##_x; \ 131 vaesenclast 
t4##_x, t3##_x, t3##_x; \ 132 vinserti128 $1, t3##_x, x1, x1; \ 133 vaesenclast t4##_x, x4##_x, x4##_x; \ 134 vaesenclast t4##_x, t2##_x, t2##_x; \ 135 vinserti128 $1, t2##_x, x4, x4; \ 136 \ 137 /* postfilter sboxes 1 and 4 */ \ 138 vbroadcasti128 .Lpost_tf_lo_s3, t2; \ 139 vbroadcasti128 .Lpost_tf_hi_s3, t3; \ 140 filter_8bit(x0, t0, t1, t7, t6); \ 141 filter_8bit(x7, t0, t1, t7, t6); \ 142 filter_8bit(x3, t0, t1, t7, t6); \ 143 filter_8bit(x6, t0, t1, t7, t6); \ 144 \ 145 /* postfilter sbox 3 */ \ 146 vbroadcasti128 .Lpost_tf_lo_s2, t4; \ 147 vbroadcasti128 .Lpost_tf_hi_s2, t5; \ 148 filter_8bit(x2, t2, t3, t7, t6); \ 149 filter_8bit(x5, t2, t3, t7, t6); \ 150 \ 151 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ 152 \ 153 /* postfilter sbox 2 */ \ 154 filter_8bit(x1, t4, t5, t7, t2); \ 155 filter_8bit(x4, t4, t5, t7, t2); \ 156 vpxor t7, t7, t7; \ 157 \ 158 vpsrldq $1, t0, t1; \ 159 vpsrldq $2, t0, t2; \ 160 vpshufb t7, t1, t1; \ 161 vpsrldq $3, t0, t3; \ 162 \ 163 /* P-function */ \ 164 vpxor x5, x0, x0; \ 165 vpxor x6, x1, x1; \ 166 vpxor x7, x2, x2; \ 167 vpxor x4, x3, x3; \ 168 \ 169 vpshufb t7, t2, t2; \ 170 vpsrldq $4, t0, t4; \ 171 vpshufb t7, t3, t3; \ 172 vpsrldq $5, t0, t5; \ 173 vpshufb t7, t4, t4; \ 174 \ 175 vpxor x2, x4, x4; \ 176 vpxor x3, x5, x5; \ 177 vpxor x0, x6, x6; \ 178 vpxor x1, x7, x7; \ 179 \ 180 vpsrldq $6, t0, t6; \ 181 vpshufb t7, t5, t5; \ 182 vpshufb t7, t6, t6; \ 183 \ 184 vpxor x7, x0, x0; \ 185 vpxor x4, x1, x1; \ 186 vpxor x5, x2, x2; \ 187 vpxor x6, x3, x3; \ 188 \ 189 vpxor x3, x4, x4; \ 190 vpxor x0, x5, x5; \ 191 vpxor x1, x6, x6; \ 192 vpxor x2, x7, x7; /* note: high and low parts swapped */ \ 193 \ 194 /* Add key material and result to CD (x becomes new CD) */ \ 195 \ 196 vpxor t6, x1, x1; \ 197 vpxor 5 * 32(mem_cd), x1, x1; \ 198 \ 199 vpsrldq $7, t0, t6; \ 200 vpshufb t7, t0, t0; \ 201 vpshufb t7, t6, t7; \ 202 \ 203 vpxor t7, x0, x0; \ 204 vpxor 4 * 32(mem_cd), x0, x0; \ 205 \ 206 vpxor t5, x2, x2; \ 207 
vpxor 6 * 32(mem_cd), x2, x2; \ 208 \ 209 vpxor t4, x3, x3; \ 210 vpxor 7 * 32(mem_cd), x3, x3; \ 211 \ 212 vpxor t3, x4, x4; \ 213 vpxor 0 * 32(mem_cd), x4, x4; \ 214 \ 215 vpxor t2, x5, x5; \ 216 vpxor 1 * 32(mem_cd), x5, x5; \ 217 \ 218 vpxor t1, x6, x6; \ 219 vpxor 2 * 32(mem_cd), x6, x6; \ 220 \ 221 vpxor t0, x7, x7; \ 222 vpxor 3 * 32(mem_cd), x7, x7; 223 224/* 225 * Size optimization... with inlined roundsm32 binary would be over 5 times 226 * larger and would only marginally faster. 227 */ 228.align 8 229roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: 230 roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 231 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 232 %rcx, (%r9)); 233 ret; 234ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) 235 236.align 8 237roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: 238 roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, 239 %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, 240 %rax, (%r9)); 241 ret; 242ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) 243 244/* 245 * IN/OUT: 246 * x0..x7: byte-sliced AB state preloaded 247 * mem_ab: byte-sliced AB state in memory 248 * mem_cb: byte-sliced CD state in memory 249 */ 250#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 251 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ 252 leaq (key_table + (i) * 8)(CTX), %r9; \ 253 call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ 254 \ 255 vmovdqu x0, 4 * 32(mem_cd); \ 256 vmovdqu x1, 5 * 32(mem_cd); \ 257 vmovdqu x2, 6 * 32(mem_cd); \ 258 vmovdqu x3, 7 * 32(mem_cd); \ 259 vmovdqu x4, 0 * 32(mem_cd); \ 260 vmovdqu x5, 1 * 32(mem_cd); \ 261 vmovdqu x6, 2 * 32(mem_cd); \ 262 vmovdqu x7, 3 * 32(mem_cd); \ 263 \ 264 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ 265 call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ 266 \ 267 store_ab(x0, x1, x2, x3, x4, 
x5, x6, x7, mem_ab); 268 269#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ 270 271#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ 272 /* Store new AB state */ \ 273 vmovdqu x4, 4 * 32(mem_ab); \ 274 vmovdqu x5, 5 * 32(mem_ab); \ 275 vmovdqu x6, 6 * 32(mem_ab); \ 276 vmovdqu x7, 7 * 32(mem_ab); \ 277 vmovdqu x0, 0 * 32(mem_ab); \ 278 vmovdqu x1, 1 * 32(mem_ab); \ 279 vmovdqu x2, 2 * 32(mem_ab); \ 280 vmovdqu x3, 3 * 32(mem_ab); 281 282#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 283 y6, y7, mem_ab, mem_cd, i) \ 284 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 285 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ 286 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 287 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ 288 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 289 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); 290 291#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 292 y6, y7, mem_ab, mem_cd, i) \ 293 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 294 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ 295 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 296 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ 297 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 298 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); 299 300/* 301 * IN: 302 * v0..3: byte-sliced 32-bit integers 303 * OUT: 304 * v0..3: (IN <<< 1) 305 */ 306#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ 307 vpcmpgtb v0, zero, t0; \ 308 vpaddb v0, v0, v0; \ 309 vpabsb t0, t0; \ 310 \ 311 vpcmpgtb v1, zero, t1; \ 312 vpaddb v1, v1, v1; \ 313 vpabsb t1, t1; \ 314 \ 315 vpcmpgtb v2, zero, t2; \ 316 vpaddb v2, v2, v2; \ 317 vpabsb t2, t2; \ 318 \ 319 vpor t0, v1, v1; \ 320 \ 321 vpcmpgtb v3, zero, t0; \ 322 vpaddb v3, v3, 
v3; \ 323 vpabsb t0, t0; \ 324 \ 325 vpor t1, v2, v2; \ 326 vpor t2, v3, v3; \ 327 vpor t0, v0, v0; 328 329/* 330 * IN: 331 * r: byte-sliced AB state in memory 332 * l: byte-sliced CD state in memory 333 * OUT: 334 * x0..x7: new byte-sliced CD state 335 */ 336#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ 337 tt1, tt2, tt3, kll, klr, krl, krr) \ 338 /* \ 339 * t0 = kll; \ 340 * t0 &= ll; \ 341 * lr ^= rol32(t0, 1); \ 342 */ \ 343 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ 344 vpxor tt0, tt0, tt0; \ 345 vpshufb tt0, t0, t3; \ 346 vpsrldq $1, t0, t0; \ 347 vpshufb tt0, t0, t2; \ 348 vpsrldq $1, t0, t0; \ 349 vpshufb tt0, t0, t1; \ 350 vpsrldq $1, t0, t0; \ 351 vpshufb tt0, t0, t0; \ 352 \ 353 vpand l0, t0, t0; \ 354 vpand l1, t1, t1; \ 355 vpand l2, t2, t2; \ 356 vpand l3, t3, t3; \ 357 \ 358 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ 359 \ 360 vpxor l4, t0, l4; \ 361 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ 362 vmovdqu l4, 4 * 32(l); \ 363 vpxor l5, t1, l5; \ 364 vmovdqu l5, 5 * 32(l); \ 365 vpxor l6, t2, l6; \ 366 vmovdqu l6, 6 * 32(l); \ 367 vpxor l7, t3, l7; \ 368 vmovdqu l7, 7 * 32(l); \ 369 \ 370 /* \ 371 * t2 = krr; \ 372 * t2 |= rr; \ 373 * rl ^= t2; \ 374 */ \ 375 \ 376 vpshufb tt0, t0, t3; \ 377 vpsrldq $1, t0, t0; \ 378 vpshufb tt0, t0, t2; \ 379 vpsrldq $1, t0, t0; \ 380 vpshufb tt0, t0, t1; \ 381 vpsrldq $1, t0, t0; \ 382 vpshufb tt0, t0, t0; \ 383 \ 384 vpor 4 * 32(r), t0, t0; \ 385 vpor 5 * 32(r), t1, t1; \ 386 vpor 6 * 32(r), t2, t2; \ 387 vpor 7 * 32(r), t3, t3; \ 388 \ 389 vpxor 0 * 32(r), t0, t0; \ 390 vpxor 1 * 32(r), t1, t1; \ 391 vpxor 2 * 32(r), t2, t2; \ 392 vpxor 3 * 32(r), t3, t3; \ 393 vmovdqu t0, 0 * 32(r); \ 394 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ 395 vmovdqu t1, 1 * 32(r); \ 396 vmovdqu t2, 2 * 32(r); \ 397 vmovdqu t3, 3 * 32(r); \ 398 \ 399 /* \ 400 * t2 = krl; \ 401 * t2 &= rl; \ 402 * rr ^= rol32(t2, 1); \ 403 */ \ 404 vpshufb tt0, t0, t3; \ 405 vpsrldq $1, 
t0, t0; \ 406 vpshufb tt0, t0, t2; \ 407 vpsrldq $1, t0, t0; \ 408 vpshufb tt0, t0, t1; \ 409 vpsrldq $1, t0, t0; \ 410 vpshufb tt0, t0, t0; \ 411 \ 412 vpand 0 * 32(r), t0, t0; \ 413 vpand 1 * 32(r), t1, t1; \ 414 vpand 2 * 32(r), t2, t2; \ 415 vpand 3 * 32(r), t3, t3; \ 416 \ 417 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ 418 \ 419 vpxor 4 * 32(r), t0, t0; \ 420 vpxor 5 * 32(r), t1, t1; \ 421 vpxor 6 * 32(r), t2, t2; \ 422 vpxor 7 * 32(r), t3, t3; \ 423 vmovdqu t0, 4 * 32(r); \ 424 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ 425 vmovdqu t1, 5 * 32(r); \ 426 vmovdqu t2, 6 * 32(r); \ 427 vmovdqu t3, 7 * 32(r); \ 428 \ 429 /* \ 430 * t0 = klr; \ 431 * t0 |= lr; \ 432 * ll ^= t0; \ 433 */ \ 434 \ 435 vpshufb tt0, t0, t3; \ 436 vpsrldq $1, t0, t0; \ 437 vpshufb tt0, t0, t2; \ 438 vpsrldq $1, t0, t0; \ 439 vpshufb tt0, t0, t1; \ 440 vpsrldq $1, t0, t0; \ 441 vpshufb tt0, t0, t0; \ 442 \ 443 vpor l4, t0, t0; \ 444 vpor l5, t1, t1; \ 445 vpor l6, t2, t2; \ 446 vpor l7, t3, t3; \ 447 \ 448 vpxor l0, t0, l0; \ 449 vmovdqu l0, 0 * 32(l); \ 450 vpxor l1, t1, l1; \ 451 vmovdqu l1, 1 * 32(l); \ 452 vpxor l2, t2, l2; \ 453 vmovdqu l2, 2 * 32(l); \ 454 vpxor l3, t3, l3; \ 455 vmovdqu l3, 3 * 32(l); 456 457#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ 458 vpunpckhdq x1, x0, t2; \ 459 vpunpckldq x1, x0, x0; \ 460 \ 461 vpunpckldq x3, x2, t1; \ 462 vpunpckhdq x3, x2, x2; \ 463 \ 464 vpunpckhqdq t1, x0, x1; \ 465 vpunpcklqdq t1, x0, x0; \ 466 \ 467 vpunpckhqdq x2, t2, x3; \ 468 vpunpcklqdq x2, t2, x2; 469 470#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ 471 a3, b3, c3, d3, st0, st1) \ 472 vmovdqu d2, st0; \ 473 vmovdqu d3, st1; \ 474 transpose_4x4(a0, a1, a2, a3, d2, d3); \ 475 transpose_4x4(b0, b1, b2, b3, d2, d3); \ 476 vmovdqu st0, d2; \ 477 vmovdqu st1, d3; \ 478 \ 479 vmovdqu a0, st0; \ 480 vmovdqu a1, st1; \ 481 transpose_4x4(c0, c1, c2, c3, a0, a1); \ 482 transpose_4x4(d0, d1, d2, d3, a0, a1); \ 483 \ 484 vbroadcasti128 
.Lshufb_16x16b, a0; \ 485 vmovdqu st1, a1; \ 486 vpshufb a0, a2, a2; \ 487 vpshufb a0, a3, a3; \ 488 vpshufb a0, b0, b0; \ 489 vpshufb a0, b1, b1; \ 490 vpshufb a0, b2, b2; \ 491 vpshufb a0, b3, b3; \ 492 vpshufb a0, a1, a1; \ 493 vpshufb a0, c0, c0; \ 494 vpshufb a0, c1, c1; \ 495 vpshufb a0, c2, c2; \ 496 vpshufb a0, c3, c3; \ 497 vpshufb a0, d0, d0; \ 498 vpshufb a0, d1, d1; \ 499 vpshufb a0, d2, d2; \ 500 vpshufb a0, d3, d3; \ 501 vmovdqu d3, st1; \ 502 vmovdqu st0, d3; \ 503 vpshufb a0, d3, a0; \ 504 vmovdqu d2, st0; \ 505 \ 506 transpose_4x4(a0, b0, c0, d0, d2, d3); \ 507 transpose_4x4(a1, b1, c1, d1, d2, d3); \ 508 vmovdqu st0, d2; \ 509 vmovdqu st1, d3; \ 510 \ 511 vmovdqu b0, st0; \ 512 vmovdqu b1, st1; \ 513 transpose_4x4(a2, b2, c2, d2, b0, b1); \ 514 transpose_4x4(a3, b3, c3, d3, b0, b1); \ 515 vmovdqu st0, b0; \ 516 vmovdqu st1, b1; \ 517 /* does not adjust output bytes inside vectors */ 518 519/* load blocks to registers and apply pre-whitening */ 520#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 521 y6, y7, rio, key) \ 522 vpbroadcastq key, x0; \ 523 vpshufb .Lpack_bswap, x0, x0; \ 524 \ 525 vpxor 0 * 32(rio), x0, y7; \ 526 vpxor 1 * 32(rio), x0, y6; \ 527 vpxor 2 * 32(rio), x0, y5; \ 528 vpxor 3 * 32(rio), x0, y4; \ 529 vpxor 4 * 32(rio), x0, y3; \ 530 vpxor 5 * 32(rio), x0, y2; \ 531 vpxor 6 * 32(rio), x0, y1; \ 532 vpxor 7 * 32(rio), x0, y0; \ 533 vpxor 8 * 32(rio), x0, x7; \ 534 vpxor 9 * 32(rio), x0, x6; \ 535 vpxor 10 * 32(rio), x0, x5; \ 536 vpxor 11 * 32(rio), x0, x4; \ 537 vpxor 12 * 32(rio), x0, x3; \ 538 vpxor 13 * 32(rio), x0, x2; \ 539 vpxor 14 * 32(rio), x0, x1; \ 540 vpxor 15 * 32(rio), x0, x0; 541 542/* byteslice pre-whitened blocks and store to temporary memory */ 543#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 544 y6, y7, mem_ab, mem_cd) \ 545 byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ 546 y4, y5, y6, y7, (mem_ab), (mem_cd)); \ 547 \ 548 
vmovdqu x0, 0 * 32(mem_ab); \ 549 vmovdqu x1, 1 * 32(mem_ab); \ 550 vmovdqu x2, 2 * 32(mem_ab); \ 551 vmovdqu x3, 3 * 32(mem_ab); \ 552 vmovdqu x4, 4 * 32(mem_ab); \ 553 vmovdqu x5, 5 * 32(mem_ab); \ 554 vmovdqu x6, 6 * 32(mem_ab); \ 555 vmovdqu x7, 7 * 32(mem_ab); \ 556 vmovdqu y0, 0 * 32(mem_cd); \ 557 vmovdqu y1, 1 * 32(mem_cd); \ 558 vmovdqu y2, 2 * 32(mem_cd); \ 559 vmovdqu y3, 3 * 32(mem_cd); \ 560 vmovdqu y4, 4 * 32(mem_cd); \ 561 vmovdqu y5, 5 * 32(mem_cd); \ 562 vmovdqu y6, 6 * 32(mem_cd); \ 563 vmovdqu y7, 7 * 32(mem_cd); 564 565/* de-byteslice, apply post-whitening and store blocks */ 566#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ 567 y5, y6, y7, key, stack_tmp0, stack_tmp1) \ 568 byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ 569 y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ 570 \ 571 vmovdqu x0, stack_tmp0; \ 572 \ 573 vpbroadcastq key, x0; \ 574 vpshufb .Lpack_bswap, x0, x0; \ 575 \ 576 vpxor x0, y7, y7; \ 577 vpxor x0, y6, y6; \ 578 vpxor x0, y5, y5; \ 579 vpxor x0, y4, y4; \ 580 vpxor x0, y3, y3; \ 581 vpxor x0, y2, y2; \ 582 vpxor x0, y1, y1; \ 583 vpxor x0, y0, y0; \ 584 vpxor x0, x7, x7; \ 585 vpxor x0, x6, x6; \ 586 vpxor x0, x5, x5; \ 587 vpxor x0, x4, x4; \ 588 vpxor x0, x3, x3; \ 589 vpxor x0, x2, x2; \ 590 vpxor x0, x1, x1; \ 591 vpxor stack_tmp0, x0, x0; 592 593#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 594 y6, y7, rio) \ 595 vmovdqu x0, 0 * 32(rio); \ 596 vmovdqu x1, 1 * 32(rio); \ 597 vmovdqu x2, 2 * 32(rio); \ 598 vmovdqu x3, 3 * 32(rio); \ 599 vmovdqu x4, 4 * 32(rio); \ 600 vmovdqu x5, 5 * 32(rio); \ 601 vmovdqu x6, 6 * 32(rio); \ 602 vmovdqu x7, 7 * 32(rio); \ 603 vmovdqu y0, 8 * 32(rio); \ 604 vmovdqu y1, 9 * 32(rio); \ 605 vmovdqu y2, 10 * 32(rio); \ 606 vmovdqu y3, 11 * 32(rio); \ 607 vmovdqu y4, 12 * 32(rio); \ 608 vmovdqu y5, 13 * 32(rio); \ 609 vmovdqu y6, 14 * 32(rio); \ 610 vmovdqu y7, 15 * 32(rio); 611 612.data 613.align 32 614 615#define 
SHUFB_BYTES(idx) \ 616 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) 617 618.Lshufb_16x16b: 619 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) 620 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) 621 622.Lpack_bswap: 623 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 624 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 625 626/* For CTR-mode IV byteswap */ 627.Lbswap128_mask: 628 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 629 630/* For XTS mode */ 631.Lxts_gf128mul_and_shl1_mask_0: 632 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 633.Lxts_gf128mul_and_shl1_mask_1: 634 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 635 636/* 637 * pre-SubByte transform 638 * 639 * pre-lookup for sbox1, sbox2, sbox3: 640 * swap_bitendianness( 641 * isom_map_camellia_to_aes( 642 * camellia_f( 643 * swap_bitendianess(in) 644 * ) 645 * ) 646 * ) 647 * 648 * (note: '⊕ 0xc5' inside camellia_f()) 649 */ 650.Lpre_tf_lo_s1: 651 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 652 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 653.Lpre_tf_hi_s1: 654 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a 655 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 656 657/* 658 * pre-SubByte transform 659 * 660 * pre-lookup for sbox4: 661 * swap_bitendianness( 662 * isom_map_camellia_to_aes( 663 * camellia_f( 664 * swap_bitendianess(in <<< 1) 665 * ) 666 * ) 667 * ) 668 * 669 * (note: '⊕ 0xc5' inside camellia_f()) 670 */ 671.Lpre_tf_lo_s4: 672 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 673 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 674.Lpre_tf_hi_s4: 675 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 676 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf 677 678/* 679 * post-SubByte transform 680 * 681 * post-lookup for sbox1, sbox4: 682 * swap_bitendianness( 683 * camellia_h( 684 * isom_map_aes_to_camellia( 685 * swap_bitendianness( 686 * aes_inverse_affine_transform(in) 687 * 
) 688 * ) 689 * ) 690 * ) 691 * 692 * (note: '⊕ 0x6e' inside camellia_h()) 693 */ 694.Lpost_tf_lo_s1: 695 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 696 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 697.Lpost_tf_hi_s1: 698 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 699 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c 700 701/* 702 * post-SubByte transform 703 * 704 * post-lookup for sbox2: 705 * swap_bitendianness( 706 * camellia_h( 707 * isom_map_aes_to_camellia( 708 * swap_bitendianness( 709 * aes_inverse_affine_transform(in) 710 * ) 711 * ) 712 * ) 713 * ) <<< 1 714 * 715 * (note: '⊕ 0x6e' inside camellia_h()) 716 */ 717.Lpost_tf_lo_s2: 718 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 719 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 720.Lpost_tf_hi_s2: 721 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 722 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 723 724/* 725 * post-SubByte transform 726 * 727 * post-lookup for sbox3: 728 * swap_bitendianness( 729 * camellia_h( 730 * isom_map_aes_to_camellia( 731 * swap_bitendianness( 732 * aes_inverse_affine_transform(in) 733 * ) 734 * ) 735 * ) 736 * ) >>> 1 737 * 738 * (note: '⊕ 0x6e' inside camellia_h()) 739 */ 740.Lpost_tf_lo_s3: 741 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 742 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 743.Lpost_tf_hi_s3: 744 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 745 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 746 747/* For isolating SubBytes from AESENCLAST, inverse shift row */ 748.Linv_shift_row: 749 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b 750 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 751 752.align 4 753/* 4-bit mask */ 754.L0f0f0f0f: 755 .long 0x0f0f0f0f 756 757.text 758 759.align 8 760__camellia_enc_blk32: 761 /* input: 762 * %rdi: ctx, CTX 763 * %rax: temporary storage, 512 bytes 764 * %ymm0..%ymm15: 32 plaintext blocks 765 * output: 766 * %ymm0..%ymm15: 32 encrypted 
blocks, order swapped: 767 * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 768 */ 769 770 leaq 8 * 32(%rax), %rcx; 771 772 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 773 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 774 %ymm15, %rax, %rcx); 775 776 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 777 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 778 %ymm15, %rax, %rcx, 0); 779 780 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 781 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 782 %ymm15, 783 ((key_table + (8) * 8) + 0)(CTX), 784 ((key_table + (8) * 8) + 4)(CTX), 785 ((key_table + (8) * 8) + 8)(CTX), 786 ((key_table + (8) * 8) + 12)(CTX)); 787 788 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 789 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 790 %ymm15, %rax, %rcx, 8); 791 792 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 793 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 794 %ymm15, 795 ((key_table + (16) * 8) + 0)(CTX), 796 ((key_table + (16) * 8) + 4)(CTX), 797 ((key_table + (16) * 8) + 8)(CTX), 798 ((key_table + (16) * 8) + 12)(CTX)); 799 800 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 801 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 802 %ymm15, %rax, %rcx, 16); 803 804 movl $24, %r8d; 805 cmpl $16, key_length(CTX); 806 jne .Lenc_max32; 807 808.Lenc_done: 809 /* load CD for output */ 810 vmovdqu 0 * 32(%rcx), %ymm8; 811 vmovdqu 1 * 32(%rcx), %ymm9; 812 vmovdqu 2 * 32(%rcx), %ymm10; 813 vmovdqu 3 * 32(%rcx), %ymm11; 814 vmovdqu 4 * 32(%rcx), %ymm12; 815 vmovdqu 5 * 32(%rcx), %ymm13; 816 vmovdqu 6 * 32(%rcx), %ymm14; 817 vmovdqu 7 * 32(%rcx), %ymm15; 818 819 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 820 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 821 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); 822 823 ret; 824 
825.align 8 826.Lenc_max32: 827 movl $32, %r8d; 828 829 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 830 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 831 %ymm15, 832 ((key_table + (24) * 8) + 0)(CTX), 833 ((key_table + (24) * 8) + 4)(CTX), 834 ((key_table + (24) * 8) + 8)(CTX), 835 ((key_table + (24) * 8) + 12)(CTX)); 836 837 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 838 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 839 %ymm15, %rax, %rcx, 24); 840 841 jmp .Lenc_done; 842ENDPROC(__camellia_enc_blk32) 843 844.align 8 845__camellia_dec_blk32: 846 /* input: 847 * %rdi: ctx, CTX 848 * %rax: temporary storage, 512 bytes 849 * %r8d: 24 for 16 byte key, 32 for larger 850 * %ymm0..%ymm15: 16 encrypted blocks 851 * output: 852 * %ymm0..%ymm15: 16 plaintext blocks, order swapped: 853 * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 854 */ 855 856 leaq 8 * 32(%rax), %rcx; 857 858 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 859 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 860 %ymm15, %rax, %rcx); 861 862 cmpl $32, %r8d; 863 je .Ldec_max32; 864 865.Ldec_max24: 866 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 867 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 868 %ymm15, %rax, %rcx, 16); 869 870 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 871 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 872 %ymm15, 873 ((key_table + (16) * 8) + 8)(CTX), 874 ((key_table + (16) * 8) + 12)(CTX), 875 ((key_table + (16) * 8) + 0)(CTX), 876 ((key_table + (16) * 8) + 4)(CTX)); 877 878 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 879 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 880 %ymm15, %rax, %rcx, 8); 881 882 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 883 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 884 %ymm15, 885 ((key_table + (8) * 8) + 
8)(CTX), 886 ((key_table + (8) * 8) + 12)(CTX), 887 ((key_table + (8) * 8) + 0)(CTX), 888 ((key_table + (8) * 8) + 4)(CTX)); 889 890 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 891 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 892 %ymm15, %rax, %rcx, 0); 893 894 /* load CD for output */ 895 vmovdqu 0 * 32(%rcx), %ymm8; 896 vmovdqu 1 * 32(%rcx), %ymm9; 897 vmovdqu 2 * 32(%rcx), %ymm10; 898 vmovdqu 3 * 32(%rcx), %ymm11; 899 vmovdqu 4 * 32(%rcx), %ymm12; 900 vmovdqu 5 * 32(%rcx), %ymm13; 901 vmovdqu 6 * 32(%rcx), %ymm14; 902 vmovdqu 7 * 32(%rcx), %ymm15; 903 904 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 905 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 906 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); 907 908 ret; 909 910.align 8 911.Ldec_max32: 912 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 913 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 914 %ymm15, %rax, %rcx, 24); 915 916 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 917 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 918 %ymm15, 919 ((key_table + (24) * 8) + 8)(CTX), 920 ((key_table + (24) * 8) + 12)(CTX), 921 ((key_table + (24) * 8) + 0)(CTX), 922 ((key_table + (24) * 8) + 4)(CTX)); 923 924 jmp .Ldec_max24; 925ENDPROC(__camellia_dec_blk32) 926 927ENTRY(camellia_ecb_enc_32way) 928 /* input: 929 * %rdi: ctx, CTX 930 * %rsi: dst (32 blocks) 931 * %rdx: src (32 blocks) 932 */ 933 934 vzeroupper; 935 936 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 937 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 938 %ymm15, %rdx, (key_table)(CTX)); 939 940 /* now dst can be used as temporary buffer (even in src == dst case) */ 941 movq %rsi, %rax; 942 943 call __camellia_enc_blk32; 944 945 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, 946 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, 947 %ymm8, %rsi); 948 949 vzeroupper; 950 951 
ret; 952ENDPROC(camellia_ecb_enc_32way) 953 954ENTRY(camellia_ecb_dec_32way) 955 /* input: 956 * %rdi: ctx, CTX 957 * %rsi: dst (32 blocks) 958 * %rdx: src (32 blocks) 959 */ 960 961 vzeroupper; 962 963 cmpl $16, key_length(CTX); 964 movl $32, %r8d; 965 movl $24, %eax; 966 cmovel %eax, %r8d; /* max */ 967 968 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 969 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 970 %ymm15, %rdx, (key_table)(CTX, %r8, 8)); 971 972 /* now dst can be used as temporary buffer (even in src == dst case) */ 973 movq %rsi, %rax; 974 975 call __camellia_dec_blk32; 976 977 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, 978 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, 979 %ymm8, %rsi); 980 981 vzeroupper; 982 983 ret; 984ENDPROC(camellia_ecb_dec_32way) 985 986ENTRY(camellia_cbc_dec_32way) 987 /* input: 988 * %rdi: ctx, CTX 989 * %rsi: dst (32 blocks) 990 * %rdx: src (32 blocks) 991 */ 992 993 vzeroupper; 994 995 cmpl $16, key_length(CTX); 996 movl $32, %r8d; 997 movl $24, %eax; 998 cmovel %eax, %r8d; /* max */ 999 1000 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 1001 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 1002 %ymm15, %rdx, (key_table)(CTX, %r8, 8)); 1003 1004 movq %rsp, %r10; 1005 cmpq %rsi, %rdx; 1006 je .Lcbc_dec_use_stack; 1007 1008 /* dst can be used as temporary storage, src is not overwritten. */ 1009 movq %rsi, %rax; 1010 jmp .Lcbc_dec_continue; 1011 1012.Lcbc_dec_use_stack: 1013 /* 1014 * dst still in-use (because dst == src), so use stack for temporary 1015 * storage. 
1016 */
     /*
      * Tail of camellia_cbc_dec_32way (its entry lies above this excerpt).
      * This path reserves 16 * 32 bytes of stack and points %rax at it as
      * the scratch buffer.
      */
1017 subq $(16 * 32), %rsp;
1018 movq %rsp, %rax;
1019
1020.Lcbc_dec_continue:
1021 call __camellia_dec_blk32;
1022
     /*
      * Park %ymm7 at (%rax), rebuild %ymm7 as { low lane: zero, high lane:
      * the 16 bytes at (%rdx) } and XOR the parked value back in.  Net
      * effect: only the high 128-bit lane of %ymm7 is XORed with the first
      * 16 source bytes; the low lane is left exactly as returned by
      * __camellia_dec_blk32.
      */
1023 vmovdqu %ymm7, (%rax);
1024 vpxor %ymm7, %ymm7, %ymm7;
1025 vinserti128 $1, (%rdx), %ymm7, %ymm7;
1026 vpxor (%rax), %ymm7, %ymm7;
     /*
      * Scratch is not read again, so %rsp is restored from %r10.
      * NOTE(review): %r10 is loaded above this excerpt - presumably with
      * the entry %rsp, as camellia_ctr_32way does; confirm.
      */
1027 movq %r10, %rsp;
     /*
      * Every remaining register is XORed with 32 source bytes read at an
      * extra +16 byte offset, i.e. source data that starts one 128-bit
      * block into the corresponding 32-byte slot.
      */
1028 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1029 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1030 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1031 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1032 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1033 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1034 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1035 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1036 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1037 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1038 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1039 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1040 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1041 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1042 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
     /* write_output is defined elsewhere in the file; it receives the 16
      * registers and dst (%rsi). */
1043 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1044 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1045 %ymm8, %rsi);
1046
1047 vzeroupper;
1048
1049 ret;
1050ENDPROC(camellia_cbc_dec_32way)
1051
     /*
      * inc_le128: add 1 to a 128-bit little-endian integer held in each
      * 128-bit lane of x.
      *   x:         value, updated in place
      *   minus_one: constant whose low qword is -1 and high qword is 0
      *              (callers below build it as "ab: -1:0")
      *   tmp:       clobbered
      * The compare flags a low qword equal to -1 (about to wrap); the byte
      * shift moves that flag into the high qword, where subtracting it adds
      * the carry.
      */
1052#define inc_le128(x, minus_one, tmp) \
1053 vpcmpeqq minus_one, x, tmp; \
1054 vpsubq minus_one, x, x; \
1055 vpslldq $8, tmp, tmp; \
1056 vpsubq tmp, x, x;
1057
     /*
      * add2_le128: add 2 to a 128-bit little-endian integer held in each
      * 128-bit lane of x.
      *   x:          value, updated in place
      *   minus_one:  low qword -1, high qword 0
      *   minus_two:  low qword -2, high qword 0
      *   tmp1, tmp2: clobbered
      * A carry into the high qword is needed when the low qword was -1 or
      * -2, hence the two compares OR-ed together before the byte shift.
      */
1058#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
1059 vpcmpeqq minus_one, x, tmp1; \
1060 vpcmpeqq minus_two, x, tmp2; \
1061 vpsubq minus_two, x, x; \
1062 vpor tmp2, tmp1, tmp1; \
1063 vpslldq $8, tmp1, tmp1; \
1064 vpsubq tmp1, x, x;
1065
     /*
      * camellia_ctr_32way: counter-mode processing of 32 blocks.
      *
      * Derives 32 consecutive counter values from the IV at (%rcx), runs
      * them through __camellia_enc_blk32, XORs the result with src and
      * passes it to write_output together with dst.  The IV advanced by 32
      * is stored back to (%rcx).
      *
      * %rax addresses a scratch buffer: dst itself when dst != src,
      * otherwise 16 * 32 bytes reserved on the stack.  %r10 keeps the
      * entry %rsp so the reservation can be dropped afterwards.
      */
1066ENTRY(camellia_ctr_32way)
1067 /* input:
1068 * %rdi: ctx, CTX
1069 * %rsi: dst (32 blocks)
1070 * %rdx: src (32 blocks)
1071 * %rcx: iv (little endian, 128bit)
1072 */
1073
1074 vzeroupper;
1075
1076 movq %rsp, %r10;
1077 cmpq %rsi, %rdx;
1078 je .Lctr_use_stack;
1079
1080 /* dst can be used as temporary storage, src is not overwritten. */
1081 movq %rsi, %rax;
1082 jmp .Lctr_continue;
1083
1084.Lctr_use_stack:
     /* dst == src: keep the scratch data on the stack instead. */
1085 subq $(16 * 32), %rsp;
1086 movq %rsp, %rax;
1087
1088.Lctr_continue:
     /* All-ones shifted right by 8 bytes per lane leaves -1 in the low
      * qword and 0 in the high qword; doubling it gives the -2 constant. */
1089 vpcmpeqd %ymm15, %ymm15, %ymm15;
1090 vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1091 vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1092
1093 /* load IV and byteswap */
     /*
      * %ymm0 becomes { low lane: IV, high lane: IV + 1 }.  The copy
      * shuffled through .Lbswap128_mask (table defined elsewhere) is the
      * first counter pair and is spilled to scratch slot 15.
      */
1094 vmovdqu (%rcx), %xmm0;
1095 vmovdqa %xmm0, %xmm1;
1096 inc_le128(%xmm0, %xmm15, %xmm14);
1097 vbroadcasti128 .Lbswap128_mask, %ymm14;
1098 vinserti128 $1, %xmm0, %ymm1, %ymm0;
1099 vpshufb %ymm14, %ymm0, %ymm13;
1100 vmovdqu %ymm13, 15 * 32(%rax);
1101
1102 /* construct IVs */
     /*
      * Each add2_le128 advances both lanes by 2, producing the next counter
      * pair.  The next four pairs are spilled to scratch slots 14..11
      * (%ymm11..%ymm13 are busy as temporaries); the following pairs are
      * shuffled directly into %ymm10 down to %ymm1, and the last one into
      * %ymm0 itself.
      */
1103 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
1104 vpshufb %ymm14, %ymm0, %ymm13;
1105 vmovdqu %ymm13, 14 * 32(%rax);
1106 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1107 vpshufb %ymm14, %ymm0, %ymm13;
1108 vmovdqu %ymm13, 13 * 32(%rax);
1109 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1110 vpshufb %ymm14, %ymm0, %ymm13;
1111 vmovdqu %ymm13, 12 * 32(%rax);
1112 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1113 vpshufb %ymm14, %ymm0, %ymm13;
1114 vmovdqu %ymm13, 11 * 32(%rax);
1115 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1116 vpshufb %ymm14, %ymm0, %ymm10;
1117 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1118 vpshufb %ymm14, %ymm0, %ymm9;
1119 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1120 vpshufb %ymm14, %ymm0, %ymm8;
1121 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1122 vpshufb %ymm14, %ymm0, %ymm7;
1123 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1124 vpshufb %ymm14, %ymm0, %ymm6;
1125 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1126 vpshufb %ymm14, %ymm0, %ymm5;
1127 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1128 vpshufb %ymm14, %ymm0, %ymm4;
1129 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1130 vpshufb %ymm14, %ymm0, %ymm3;
1131 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1132 vpshufb %ymm14, %ymm0, %ymm2;
1133 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1134 vpshufb %ymm14, %ymm0, %ymm1;
1135 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
     /*
      * The high lane of %ymm0 is the last counter (IV + 31).  It is copied
      * out before %ymm0 is shuffled, incremented once more and stored as
      * the IV for the next call.  %xmm14 is reused as the temporary only
      * after the final vpshufb has consumed the shuffle mask.
      */
1136 vextracti128 $1, %ymm0, %xmm13;
1137 vpshufb %ymm14, %ymm0, %ymm0;
1138 inc_le128(%xmm13, %xmm15, %xmm14);
1139 vmovdqu %xmm13, (%rcx);
1140
1141 /* inpack32_pre: */
     /*
      * Broadcast the first 8 bytes of the key table, shuffle them through
      * .Lpack_bswap (defined elsewhere) and XOR the result into all 16
      * counter registers; the five spilled pairs are reloaded from scratch
      * slots 11..15 by the same XOR.  %ymm15 is overwritten last because
      * it holds the key material.
      */
1142 vpbroadcastq (key_table)(CTX), %ymm15;
1143 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1144 vpxor %ymm0, %ymm15, %ymm0;
1145 vpxor %ymm1, %ymm15, %ymm1;
1146 vpxor %ymm2, %ymm15, %ymm2;
1147 vpxor %ymm3, %ymm15, %ymm3;
1148 vpxor %ymm4, %ymm15, %ymm4;
1149 vpxor %ymm5, %ymm15, %ymm5;
1150 vpxor %ymm6, %ymm15, %ymm6;
1151 vpxor %ymm7, %ymm15, %ymm7;
1152 vpxor %ymm8, %ymm15, %ymm8;
1153 vpxor %ymm9, %ymm15, %ymm9;
1154 vpxor %ymm10, %ymm15, %ymm10;
1155 vpxor 11 * 32(%rax), %ymm15, %ymm11;
1156 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1157 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1158 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1159 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1160
1161 call __camellia_enc_blk32;
1162
     /* Drop the stack scratch area, if one was reserved. */
1163 movq %r10, %rsp;
1164
     /* XOR the 16 result registers with the 16 source slots. */
1165 vpxor 0 * 32(%rdx), %ymm7, %ymm7;
1166 vpxor 1 * 32(%rdx), %ymm6, %ymm6;
1167 vpxor 2 * 32(%rdx), %ymm5, %ymm5;
1168 vpxor 3 * 32(%rdx), %ymm4, %ymm4;
1169 vpxor 4 * 32(%rdx), %ymm3, %ymm3;
1170 vpxor 5 * 32(%rdx), %ymm2, %ymm2;
1171 vpxor 6 * 32(%rdx), %ymm1, %ymm1;
1172 vpxor 7 * 32(%rdx), %ymm0, %ymm0;
1173 vpxor 8 * 32(%rdx), %ymm15, %ymm15;
1174 vpxor 9 * 32(%rdx), %ymm14, %ymm14;
1175 vpxor 10 * 32(%rdx), %ymm13, %ymm13;
1176 vpxor 11 * 32(%rdx), %ymm12, %ymm12;
1177 vpxor 12 * 32(%rdx), %ymm11, %ymm11;
1178 vpxor 13 * 32(%rdx), %ymm10, %ymm10;
1179 vpxor 14 * 32(%rdx), %ymm9, %ymm9;
1180 vpxor 15 * 32(%rdx), %ymm8, %ymm8;
1181 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1182 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1183 %ymm8, %rsi);
1184
1185 vzeroupper;
1186
1187 ret;
1188ENDPROC(camellia_ctr_32way)
1189
     /*
      * gf128mul_x_ble: shift the 128-bit value in each lane of iv left by
      * one bit and fold the bits that fall out of each 64-bit half back in.
      *   iv:   value, updated in place
      *   mask: constants AND-ed with the carry flags
      *         NOTE(review): the mask tables are defined elsewhere;
      *         presumably they hold the carry/reduction constants of the
      *         XTS tweak multiplication - confirm.
      *   tmp:  clobbered
      * vpsrad $31 turns the top bit of every dword into an all-ones or
      * all-zero dword; vpaddq doubles each qword.
      */
1190#define gf128mul_x_ble(iv, mask, tmp) \
1191 vpsrad $31, iv, tmp; \
1192 vpaddq
iv, iv, iv; \
1193 vpshufd $0x13, tmp, tmp; \
1194 vpand mask, tmp, tmp; \
1195 vpxor tmp, iv, iv;
     /*
      * (End of gf128mul_x_ble.)  The $0x13 shuffle places the flag of
      * dword 3 (bit 127) in dword 0 and the flag of dword 1 (bit 63) in
      * dword 2, so after masking they can be XOR-ed into the doubled value
      * as reduction term and inter-qword carry respectively.
      */
1196
     /*
      * gf128mul_x2_ble: same idea as gf128mul_x_ble, but advances iv by two
      * bit positions in one step.
      *   iv:           value, updated in place (each qword shifted left 2)
      *   mask1, mask2: constants AND-ed with the carry flags
      *                 NOTE(review): defined elsewhere; presumably the
      *                 single-shift and double-shift carry/reduction
      *                 constants - confirm.
      *   tmp0, tmp1:   clobbered
      * tmp0 flags the top bit of each dword of the original iv; tmp1 is
      * iv doubled, so its sign flags correspond to the second-highest bit
      * of each original dword.  Both flag sets go through the same $0x13
      * shuffle, are masked (tmp0 with mask2, tmp1 with mask1) and XOR-ed
      * into the shifted value.
      */
1197#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
1198 vpsrad $31, iv, tmp0; \
1199 vpaddq iv, iv, tmp1; \
1200 vpsllq $2, iv, iv; \
1201 vpshufd $0x13, tmp0, tmp0; \
1202 vpsrad $31, tmp1, tmp1; \
1203 vpand mask2, tmp0, tmp0; \
1204 vpshufd $0x13, tmp1, tmp1; \
1205 vpxor tmp0, iv, iv; \
1206 vpand mask1, tmp1, tmp1; \
1207 vpxor tmp1, iv, iv;
1208
     /*
      * camellia_xts_crypt_32way: shared XTS body, reached by jmp from
      * camellia_xts_enc_32way and camellia_xts_dec_32way.
      *
      * For each of the 16 slots (two blocks per 256-bit register) it
      * derives the tweak pair, stores the tweaks into dst, and keeps
      * src XOR tweak in a register or in the stack scratch buffer.  After
      * the block function selected by %r9 has run, the result is XOR-ed
      * with the tweaks read back from dst and passed to write_output.  The
      * tweak following the last block is written back to (%rcx).
      *
      * Source slot i is always read before dst slot i is written.
      */
1209.align 8
1210camellia_xts_crypt_32way:
1211 /* input:
1212 * %rdi: ctx, CTX
1213 * %rsi: dst (32 blocks)
1214 * %rdx: src (32 blocks)
1215 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1216 * %r8: index for input whitening key
1217 * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
1218 */
1219
1220 vzeroupper;
1221
     /* Reserve 16 * 32 bytes of stack scratch; %rax points at it. */
1222 subq $(16 * 32), %rsp;
1223 movq %rsp, %rax;
1224
1225 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
1226
1227 /* load IV and construct second IV */
     /*
      * %ymm0 becomes { low lane: tweak, high lane: tweak advanced once }.
      * %xmm13 is only a temporary for the macro here; it is loaded with
      * the second mask immediately afterwards.  Slot 0 of src XOR tweak is
      * spilled to scratch slot 15, the tweak pair itself goes to dst
      * slot 0.
      */
1228 vmovdqu (%rcx), %xmm0;
1229 vmovdqa %xmm0, %xmm15;
1230 gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
1231 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
1232 vinserti128 $1, %xmm0, %ymm15, %ymm0;
1233 vpxor 0 * 32(%rdx), %ymm0, %ymm15;
1234 vmovdqu %ymm15, 15 * 32(%rax);
1235 vmovdqu %ymm0, 0 * 32(%rsi);
1236
1237 /* construct IVs */
     /*
      * Each gf128mul_x2_ble advances both lanes by two steps, giving the
      * next tweak pair.  Slots 1..3 are spilled to scratch slots 14..12
      * because %ymm12..%ymm15 are in use as masks and temporaries; slots
      * 4..14 land directly in %ymm11 down to %ymm1.
      */
1238 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1239 vpxor 1 * 32(%rdx), %ymm0, %ymm15;
1240 vmovdqu %ymm15, 14 * 32(%rax);
1241 vmovdqu %ymm0, 1 * 32(%rsi);
1242
1243 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1244 vpxor 2 * 32(%rdx), %ymm0, %ymm15;
1245 vmovdqu %ymm15, 13 * 32(%rax);
1246 vmovdqu %ymm0, 2 * 32(%rsi);
1247
1248 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1249 vpxor 3 * 32(%rdx), %ymm0, %ymm15;
1250 vmovdqu %ymm15, 12 * 32(%rax);
1251 vmovdqu %ymm0, 3 * 32(%rsi);
1252
1253 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1254 vpxor 4 * 32(%rdx), %ymm0, %ymm11;
1255 vmovdqu %ymm0, 4 * 32(%rsi);
1256
1257 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1258 vpxor 5 * 32(%rdx), %ymm0, %ymm10;
1259 vmovdqu %ymm0, 5 * 32(%rsi);
1260
1261 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1262 vpxor 6 * 32(%rdx), %ymm0, %ymm9;
1263 vmovdqu %ymm0, 6 * 32(%rsi);
1264
1265 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1266 vpxor 7 * 32(%rdx), %ymm0, %ymm8;
1267 vmovdqu %ymm0, 7 * 32(%rsi);
1268
1269 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1270 vpxor 8 * 32(%rdx), %ymm0, %ymm7;
1271 vmovdqu %ymm0, 8 * 32(%rsi);
1272
1273 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1274 vpxor 9 * 32(%rdx), %ymm0, %ymm6;
1275 vmovdqu %ymm0, 9 * 32(%rsi);
1276
1277 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1278 vpxor 10 * 32(%rdx), %ymm0, %ymm5;
1279 vmovdqu %ymm0, 10 * 32(%rsi);
1280
1281 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1282 vpxor 11 * 32(%rdx), %ymm0, %ymm4;
1283 vmovdqu %ymm0, 11 * 32(%rsi);
1284
1285 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1286 vpxor 12 * 32(%rdx), %ymm0, %ymm3;
1287 vmovdqu %ymm0, 12 * 32(%rsi);
1288
1289 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1290 vpxor 13 * 32(%rdx), %ymm0, %ymm2;
1291 vmovdqu %ymm0, 13 * 32(%rsi);
1292
1293 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1294 vpxor 14 * 32(%rdx), %ymm0, %ymm1;
1295 vmovdqu %ymm0, 14 * 32(%rsi);
1296
     /* The last slot is spilled to scratch slot 0: %ymm0, its eventual
      * home, still holds the running tweak. */
1297 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1298 vpxor 15 * 32(%rdx), %ymm0, %ymm15;
1299 vmovdqu %ymm15, 0 * 32(%rax);
1300 vmovdqu %ymm0, 15 * 32(%rsi);
1301
     /* Advance the tweak of the final block once more and store it as the
      * IV for the next call. */
1302 vextracti128 $1, %ymm0, %xmm0;
1303 gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
1304 vmovdqu %xmm0, (%rcx);
1305
1306 /* inpack32_pre: */
     /*
      * Broadcast the 8 bytes at key_table + %r8 * 8, shuffle them through
      * .Lpack_bswap (defined elsewhere) and XOR the result into all 16
      * inputs; the spilled slots are reloaded from scratch by the same
      * XOR.  %ymm15 is overwritten last because it holds the key material.
      */
1307 vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
1308 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1309 vpxor 0 * 32(%rax), %ymm15, %ymm0;
1310 vpxor %ymm1, %ymm15, %ymm1;
1311 vpxor %ymm2, %ymm15, %ymm2;
1312 vpxor %ymm3, %ymm15, %ymm3;
1313 vpxor %ymm4, %ymm15, %ymm4;
1314 vpxor %ymm5, %ymm15, %ymm5;
1315 vpxor %ymm6, %ymm15, %ymm6;
1316 vpxor %ymm7, %ymm15, %ymm7;
1317 vpxor %ymm8, %ymm15, %ymm8;
1318 vpxor %ymm9, %ymm15, %ymm9;
1319 vpxor %ymm10, %ymm15, %ymm10;
1320 vpxor %ymm11, %ymm15, %ymm11;
1321 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1322 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1323 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1324 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1325
     /* Indirect call to the block function supplied by the entry stub. */
1326 call *%r9;
1327
     /* Release the stack scratch area. */
1328 addq $(16 * 32), %rsp;
1329
     /* XOR each result register with the tweak pair stored in dst. */
1330 vpxor 0 * 32(%rsi), %ymm7, %ymm7;
1331 vpxor 1 * 32(%rsi), %ymm6, %ymm6;
1332 vpxor 2 * 32(%rsi), %ymm5, %ymm5;
1333 vpxor 3 * 32(%rsi), %ymm4, %ymm4;
1334 vpxor 4 * 32(%rsi), %ymm3, %ymm3;
1335 vpxor 5 * 32(%rsi), %ymm2, %ymm2;
1336 vpxor 6 * 32(%rsi), %ymm1, %ymm1;
1337 vpxor 7 * 32(%rsi), %ymm0, %ymm0;
1338 vpxor 8 * 32(%rsi), %ymm15, %ymm15;
1339 vpxor 9 * 32(%rsi), %ymm14, %ymm14;
1340 vpxor 10 * 32(%rsi), %ymm13, %ymm13;
1341 vpxor 11 * 32(%rsi), %ymm12, %ymm12;
1342 vpxor 12 * 32(%rsi), %ymm11, %ymm11;
1343 vpxor 13 * 32(%rsi), %ymm10, %ymm10;
1344 vpxor 14 * 32(%rsi), %ymm9, %ymm9;
1345 vpxor 15 * 32(%rsi), %ymm8, %ymm8;
1346 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1347 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1348 %ymm8, %rsi);
1349
1350 vzeroupper;
1351
1352 ret;
1353ENDPROC(camellia_xts_crypt_32way)
1354
     /*
      * camellia_xts_enc_32way: XTS encryption entry point.  Selects
      * whitening-key index 0 and __camellia_enc_blk32, then jumps (no
      * call) into the shared body, whose ret returns to this function's
      * caller.
      */
1355ENTRY(camellia_xts_enc_32way)
1356 /* input:
1357 * %rdi: ctx, CTX
1358 * %rsi: dst (32 blocks)
1359 * %rdx: src (32 blocks)
1360 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1361 */
1362
1363 xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1364
1365 leaq __camellia_enc_blk32, %r9;
1366
1367 jmp camellia_xts_crypt_32way;
1368ENDPROC(camellia_xts_enc_32way)
1369
     /*
      * camellia_xts_dec_32way: XTS decryption entry point.  Selects the
      * whitening-key index from the context's key length and
      * __camellia_dec_blk32, then jumps into the shared body.
      */
1370ENTRY(camellia_xts_dec_32way)
1371 /* input:
1372 * %rdi: ctx, CTX
1373 * %rsi: dst (32 blocks)
1374 * %rdx: src (32 blocks)
1375 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1376 */
1377
     /* %r8d = 24 when the 32-bit field at key_length(CTX) equals 16,
      * otherwise 32.  movl does not alter the flags set by cmpl. */
1378 cmpl $16, key_length(CTX);
1379 movl $32, %r8d;
1380 movl $24, %eax;
1381 cmovel %eax, %r8d; /* input whitening key, last for dec */
1382
1383 leaq __camellia_dec_blk32,
%r9;
     /*
      * The operand above completes the leaq begun on the previous line,
      * leaving the address of __camellia_dec_blk32 in %r9.  The jmp below
      * is a plain jump, not a call, so the ret at the end of
      * camellia_xts_crypt_32way returns to this function's caller.
      */
1384
1385 jmp camellia_xts_crypt_32way;
1386ENDPROC(camellia_xts_dec_32way)
1387