/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * IN:
 *  x0..x7: byte-sliced AB state
 *  mem_cd: register pointer storing CD state
 *  key: index for key material
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;

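/*
 * Layout note: in this byte-sliced representation each ymm register holds
 * one byte position of all 32 blocks, so x0..x7 together carry the 64-bit
 * AB halves of the 32 blocks while the CD halves stay in memory at mem_cd.
 */
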
/*
 * Size optimization... with inlined roundsm32 binary would be over 5 times
 * larger and would be only marginally faster.
 */
.align 8
roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	ret;
ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	ret;
ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

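/*
 * The rotate above avoids variable shift instructions: vpcmpgtb against
 * zero extracts the top bit of every byte (0 or -1), vpaddb doubles each
 * byte (a shift left by one), vpabsb turns the -1 carry masks into 1, and
 * each carry is OR'd into the neighbouring slice, with v3's carry wrapping
 * around into v0.
 */
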
/*
 * IN:
 *  r: byte-sliced AB state in memory
 *  l: byte-sliced CD state in memory
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

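/*
 * transpose_4x4 above is a plain 4x4 transpose of 32-bit words (within each
 * 128-bit lane); byteslice_16x16b_fast below combines it with the
 * .Lshufb_16x16b byte shuffle to move between the normal block layout and
 * the byte-sliced layout used by the round macros.
 */
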
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

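/*
 * inpack32_pre above broadcasts the 64-bit whitening key, rearranges it
 * with .Lpack_bswap and XORs it into all 32 input blocks; inpack32_post
 * then byte-slices the pre-whitened blocks and spills them to the AB/CD
 * work areas.
 */
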
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);

.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

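/*
 * The two masks above are consumed by gf128mul_x_ble()/gf128mul_x2_ble()
 * further down: 0x87 reflects the GF(2^128) reduction polynomial used by
 * XTS, and the second mask appears to be its shifted-by-one counterpart
 * for doubling the tweak twice in one step.
 */
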
/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

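/*
 * The two cipher cores below expect %rax to point at a 512-byte scratch
 * area: the first 8 * 32 bytes hold the byte-sliced AB state and %rcx is
 * pointed at the second half for the CD state.
 */
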
.align 8
__camellia_enc_blk32:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	FRAME_END
	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	jmp .Lenc_done;
ENDPROC(__camellia_enc_blk32)

.align 8
__camellia_dec_blk32:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	FRAME_END
	ret;

.align 8
.Ldec_max32:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk32)

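/*
 * The exported *_32way helpers below take ctx, dst and src in %rdi, %rsi
 * and %rdx (plus an IV in %rcx where applicable) and reuse dst, or a stack
 * buffer when that is unsafe, as the temporary storage required by the
 * internal 32-block routines.
 */
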
ENTRY(camellia_ecb_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_ecb_enc_32way)

ENTRY(camellia_ecb_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_ecb_dec_32way)

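/*
 * CBC decryption below XORs every decrypted block with the preceding
 * ciphertext block read straight from src; the first block is XORed with
 * zero here, so applying the IV to it is left to the caller.
 */
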
ENTRY(camellia_cbc_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	movq %rsp, %r10;
	cmpq %rsi, %rdx;
	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	/*
	 * dst still in-use (because dst == src), so use stack for temporary
	 * storage.
	 */
	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

.Lcbc_dec_continue:
	call __camellia_dec_blk32;

	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	movq %r10, %rsp;
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_cbc_dec_32way)

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;

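/*
 * inc_le128()/add2_le128() increment 128-bit little-endian counters without
 * a full 128-bit add: vpcmpeqq flags the low qwords that are about to wrap,
 * subtracting the -1/-2 constant adds 1 or 2 to the low qword, and the
 * shifted compare mask is subtracted again to carry into the high qword.
 */
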
ENTRY(camellia_ctr_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	vzeroupper;

	movq %rsp, %r10;
	cmpq %rsi, %rdx;
	je .Lctr_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lctr_continue;

.Lctr_use_stack:
	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

.Lctr_continue:
	vpcmpeqd %ymm15, %ymm15, %ymm15;
	vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vmovdqa %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm14);
	vbroadcasti128 .Lbswap128_mask, %ymm14;
	vinserti128 $1, %xmm0, %ymm1, %ymm0;
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 15 * 32(%rax);

	/* construct IVs */
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 14 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 13 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 12 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 11 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm10;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm9;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm8;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm7;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm6;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm5;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm4;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm3;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm2;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm1;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vextracti128 $1, %ymm0, %xmm13;
	vpshufb %ymm14, %ymm0, %ymm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack32_pre: */
	vpbroadcastq (key_table)(CTX), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor %ymm0, %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor 11 * 32(%rax), %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

	call __camellia_enc_blk32;

	movq %r10, %rsp;

	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
	vpxor 3 * 32(%rdx), %ymm4, %ymm4;
	vpxor 4 * 32(%rdx), %ymm3, %ymm3;
	vpxor 5 * 32(%rdx), %ymm2, %ymm2;
	vpxor 6 * 32(%rdx), %ymm1, %ymm1;
	vpxor 7 * 32(%rdx), %ymm0, %ymm0;
	vpxor 8 * 32(%rdx), %ymm15, %ymm15;
	vpxor 9 * 32(%rdx), %ymm14, %ymm14;
	vpxor 10 * 32(%rdx), %ymm13, %ymm13;
	vpxor 11 * 32(%rdx), %ymm12, %ymm12;
	vpxor 12 * 32(%rdx), %ymm11, %ymm11;
	vpxor 13 * 32(%rdx), %ymm10, %ymm10;
	vpxor 14 * 32(%rdx), %ymm9, %ymm9;
	vpxor 15 * 32(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_ctr_32way)

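/*
 * Note that the CTR code above keeps the counter little endian internally
 * and only byte-swaps each value with .Lbswap128_mask before it is fed to
 * the cipher, which keeps the increment macros cheap.
 */
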
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;

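/*
 * gf128mul_x_ble()/gf128mul_x2_ble() double the XTS tweak once or twice in
 * GF(2^128): roughly, vpsrad $31 captures the high bits that fall out of
 * the shift, vpaddq/vpsllq perform the shift itself, and the shuffled,
 * masked sign words fold the reduction constant (and the qword-boundary
 * carry) back in with vpxor.
 */
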
.align 8
camellia_xts_crypt_32way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: index for input whitening key
	 *	%r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
	 */
	FRAME_BEGIN

	vzeroupper;

	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;

	/* load IV and construct second IV */
	vmovdqu (%rcx), %xmm0;
	vmovdqa %xmm0, %xmm15;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
	vinserti128 $1, %xmm0, %ymm15, %ymm0;
	vpxor 0 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 15 * 32(%rax);
	vmovdqu %ymm0, 0 * 32(%rsi);

	/* construct IVs */
	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 1 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 14 * 32(%rax);
	vmovdqu %ymm0, 1 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 2 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 13 * 32(%rax);
	vmovdqu %ymm0, 2 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 3 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 12 * 32(%rax);
	vmovdqu %ymm0, 3 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 4 * 32(%rdx), %ymm0, %ymm11;
	vmovdqu %ymm0, 4 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 5 * 32(%rdx), %ymm0, %ymm10;
	vmovdqu %ymm0, 5 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 6 * 32(%rdx), %ymm0, %ymm9;
	vmovdqu %ymm0, 6 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 7 * 32(%rdx), %ymm0, %ymm8;
	vmovdqu %ymm0, 7 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 8 * 32(%rdx), %ymm0, %ymm7;
	vmovdqu %ymm0, 8 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 9 * 32(%rdx), %ymm0, %ymm6;
	vmovdqu %ymm0, 9 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 10 * 32(%rdx), %ymm0, %ymm5;
	vmovdqu %ymm0, 10 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 11 * 32(%rdx), %ymm0, %ymm4;
	vmovdqu %ymm0, 11 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 12 * 32(%rdx), %ymm0, %ymm3;
	vmovdqu %ymm0, 12 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 13 * 32(%rdx), %ymm0, %ymm2;
	vmovdqu %ymm0, 13 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 14 * 32(%rdx), %ymm0, %ymm1;
	vmovdqu %ymm0, 14 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 15 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 0 * 32(%rax);
	vmovdqu %ymm0, 15 * 32(%rsi);

	vextracti128 $1, %ymm0, %xmm0;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
	vmovdqu %xmm0, (%rcx);

	/* inpack32_pre: */
	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor 0 * 32(%rax), %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor %ymm11, %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

	call *%r9;

	addq $(16 * 32), %rsp;

	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_xts_crypt_32way)

ENTRY(camellia_xts_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk32, %r9;

	jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_enc_32way)

ENTRY(camellia_xts_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d;  /* input whitening key, last for dec */

	leaq __camellia_dec_blk32, %r9;

	jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_dec_32way)