/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

/*
 * Syntax: GNU as, AT&T operand order (source(s) first, destination last),
 * run through the C preprocessor; all "functions" below the .text marker
 * are built from the cpp macros defined in the first half of the file.
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

/* The four s-box tables are defined outside this file. */
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/*
 * structure of crypto context
 *
 * km: byte offset 0. get_round_keys() reads one 32-bit word per round
 *     from km + 4*round.
 * kr: byte offset 12*4*4 (= 192). preload_rkr() reads 16 bytes at a time
 *     from kr + 16*n; get_round_keys() then consumes one byte per round.
 */
#define km 0
#define kr (12*4*4)

/* s-boxes */
#define s1 cast_s1
#define s2 cast_s2
#define s3 cast_s3
#define s4 cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/

/*
 * Copy of the context pointer. %rdi cannot be used for this because it is
 * reused as RID1 (s-box index scratch) by lookup_32bit().
 */
#define CTX %r15

/*
 * Block data: two groups (suffix 1 and 2) of four registers each.
 * After inpack_blocks() each register of a group holds the same 32-bit
 * word position taken from the four registers of that group (see
 * transpose_4x4()).
 */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

/* F-function working register (input of F_head, result of F_tail). */
#define RX %xmm8

/*
 * Per-round key material:
 *   RKM  - 32-bit word from km, broadcast to all four lanes
 *   RKR  - bytes from kr still to be consumed, next one in the lowest byte
 *   RKRF - left-shift count for the current round (lowest byte of RKR & 0x1f)
 *   RKRR - right-shift count for the current round (32 - RKRF)
 *   R32  - constant 32 in the low dword (loaded from .L32_mask)
 *   R1ST - constant 0x1f in the low dword (loaded from .Lfirst_mask)
 */
#define RKM %xmm9
#define RKR %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32 %xmm13
#define R1ST %xmm14

/* Vector scratch; also carries the F result for the second block group. */
#define RTMP %xmm15

/* S-box index scratch, overwritten by every lookup_32bit(). */
#define RID1 %rdi
#define RID1d %edi
#define RID2 %rsi
#define RID2d %esi

/*
 * General-purpose copies of the rotated F input, 64 bits each.
 * lookup_32bit() pastes "bl"/"bh" onto the macro argument to address the
 * two low bytes, so only registers that have such byte forms are used.
 */
#define RGI1 %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2 %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3 %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4 %rbx
#define RGI4bl %bl
#define RGI4bh %bh

/*
 * S-box result accumulators; lookup_32bit() pastes "d" onto the macro
 * argument to write the 32-bit form.
 */
#define RFS1 %r8
#define RFS1d %r8d
#define RFS2 %r9
#define RFS2d %r9d
#define RFS3 %r10
#define RFS3d %r10d


/*
 * lookup_32bit - combine four s-box entries selected by the low 32 bits
 * of src.
 *
 * With b0..b3 the bytes of src at entry (b0 = bits 7:0), the result is
 *     dst = ((s1[b1] op1 s2[b0]) op2 s3[b3]) op3 s4[b2]
 * written through the 32-bit form of dst (upper half of dst is zeroed).
 *
 * src is shifted right by 16 as a side effect. interleave_op(il_reg) is
 * expanded between the index extraction and the last two table reads;
 * callers pass either shr_next (to shift il_reg by a further 16) or dummy.
 *
 * Clobbers: RID1, RID2, flags.
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	shrq $16, src; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	interleave_op(il_reg); \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;

/* No-op placeholder for macro arguments that expect an operation. */
#define dummy(d) /* do nothing */

/* Advance reg to its next 16-bit chunk. */
#define shr_next(reg) \
	shrq $16, reg;

/*
 * F_head - first half of the round function for one group of four blocks.
 *
 *   x   = RKM op0 a            (per 32-bit lane)
 *   x   = x rotated left by RKRF (shift left by RKRF, right by RKRR, OR)
 *   gi1 = low 64 bits of x, gi2 = high 64 bits of x
 *
 * Clobbers: x, RTMP, gi1, gi2.
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vmovq x, gi1; \
	vpextrq $1, x, gi2;

/*
 * F_tail - second half of the round function: s-box substitution of the
 * four 32-bit lanes held in gi1 (lanes 0,1) and gi2 (lanes 2,3).
 *
 * The first two lookups handle the low dword of gi1 and gi2 and use
 * shr_next so that the high dword ends up in the low 32 bits for the
 * following two lookups. The four 32-bit results are packed back into x
 * as RFS3:RFS2.
 *
 * Parameter a is not referenced by the expansion.
 * Clobbers: gi1, gi2, RFS1, RFS2, RFS3, RID1, RID2, flags.
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32, RFS1; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

/*
 * F_2 - one round applied to both block groups: a1 ^= F(b1), a2 ^= F(b2).
 *
 * Both F_head expansions run first and both use RX as their vector
 * scratch; this is safe because each F_head leaves its result in general
 * purpose registers (RGI1/RGI2 and RGI3/RGI4) before the next one starts.
 * The two F_tail results are returned in RX and RTMP respectively.
 */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor a1, RX, a1; \
	vpxor a2, RTMP, a2;

/*
 * The three round function flavours differ only in the operator used to
 * mix in RKM (op0) and in the operators chaining the s-box entries.
 */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

/*
 * qop(in, out, f): out ^= F<f>(in) for both groups, e.g. qop(RD, RC, 1)
 * expands to F1_2(RC1, RD1, RC2, RD2).
 */
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

/*
 * get_round_keys(nn) - set up RKM, RKRF and RKRR for round nn.
 *
 * RKM is loaded from km word nn. The rotation count is NOT indexed by nn:
 * it is always taken from the lowest byte of RKR, which is then shifted
 * out. RKR must therefore have been ordered by preload_rkr() to match the
 * sequence in which rounds are executed.
 */
#define get_round_keys(nn) \
	vbroadcastss (km+(4*(nn)))(CTX), RKM; \
	vpand R1ST, RKR, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	vpsrldq $1, RKR, RKR;

/*
 * Q(n) - four rounds using key words 4n .. 4n+3 in ascending order:
 *   C ^= F1(D);  B ^= F2(C);  A ^= F3(B);  D ^= F1(A)
 */
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

/*
 * QBAR(n) - the same four steps as Q(n) in the opposite order, using key
 * words 4n+3 .. 4n in descending order:
 *   D ^= F1(A);  A ^= F3(B);  B ^= F2(C);  C ^= F1(D)
 */
#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

/* Reorder the bytes of RKR according to the 16-byte table at mask. */
#define shuffle(mask) \
	vpshufb mask, RKR, RKR;

/*
 * preload_rkr(n, do_mask, mask) - load the n-th 16-byte group of kr into
 * RKR (enough for the 16 rounds of four Q/QBAR expansions).
 *
 * Every byte is XORed with 16 (see the inline comment), then do_mask(mask)
 * is expanded: pass shuffle plus one of the .Lrkr_* tables to put the
 * bytes into consumption order, or dummy/none to keep memory order.
 */
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss .L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor (kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);

/*
 * transpose_4x4 - transpose the 4x4 matrix of 32-bit words whose rows are
 * x0..x3. The result replaces x0..x3; t0, t1 and t2 are clobbered.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

/*
 * inpack_blocks - byte-shuffle each of x0..x3 with rmask, then transpose.
 * Both callers pass .Lbswap_mask (loaded into RKM) as rmask.
 */
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * outunpack_blocks - inverse of inpack_blocks: transpose first, then
 * byte-shuffle each of x0..x3 with rmask.
 */
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;

/* 16-byte constants; the section is 16-byte aligned for vmovdqa/vpshufb. */
.section .rodata.cst16, "aM", @progbits, 16
.align 16
/* Passed to load_xts_8way() (defined in glue_helper-asm-avx.S). */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
/* vpshufb table: reverse the bytes inside each 32-bit word. */
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
/* vpshufb table: reverse all 16 bytes; passed to load_ctr_8way(). */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * vpshufb tables for preload_rkr(). Each is named after the sequence of
 * four Q/QBAR expansions that follows and lists the kr bytes of the group
 * in the order those rounds consume them (ascending inside Q, descending
 * inside QBAR; for decryption the groups themselves are visited in
 * descending order).
 */
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* Value 16 in every byte; broadcast and XORed into RKR by preload_rkr(). */
.section .rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

/* Constant 32, loaded into R32 to derive the right-shift count. */
.section .rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

/* Constant 0x1f, loaded into R1ST to isolate a 5-bit rotation count. */
.section .rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

/*
 * __cast6_enc_blk8 - run the 12 quad-rounds (Q 0..5, then QBAR 6..11) on
 * eight blocks held in registers. File-local helper reached via call from
 * the entry points below.
 *
 * Saves/restores: %r15 (CTX), %rbx (RGI4).
 * Overwrites without saving: %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r10,
 * %xmm8-%xmm15, flags.
 */
.align 8
__cast6_enc_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	/* %rdi is about to be reused as RID1; keep ctx in CTX. */
	movq %rdi, CTX;

	/* RKM is free until the first get_round_keys(); use it for the mask. */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* kr group 0 is consumed in memory order, so no shuffle is needed. */
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	/* RKM was overwritten by the rounds; reload the byte-swap mask. */
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_enc_blk8)

/*
 * __cast6_dec_blk8 - counterpart of __cast6_enc_blk8: visits the
 * quad-rounds in descending order (Q 11..6, then QBAR 5..0) and the kr
 * groups in the order 2, 1, 0.
 *
 * Saves/restores: %r15 (CTX), %rbx (RGI4).
 * Overwrites without saving: %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r10,
 * %xmm8-%xmm15, flags.
 */
.align 8
__cast6_dec_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	/* %rdi is about to be reused as RID1; keep ctx in CTX. */
	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	/* RKM was overwritten by the rounds; reload the byte-swap mask. */
	vmovdqa .Lbswap_mask, RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_dec_blk8)

/*
 * Exported entry points.
 *
 * Common pattern: dst is copied to %r11 (and src to %r12 where it is
 * needed after the cipher call) because %rsi and %rdx are overwritten
 * inside __cast6_{enc,dec}_blk8 (they are RID2 and RGI1).
 *
 * The helpers reload CTX from %rdi on entry, so %rdi must still hold ctx
 * at the call; this relies on the load_* macros leaving %rdi intact --
 * TODO confirm against glue_helper-asm-avx.S.
 *
 * FRAME_BEGIN/FRAME_END come from <asm/frame.h> and the load_ and store_
 * macros from glue_helper-asm-avx.S; neither is visible in this file.
 */

/* ECB encryption: load from src, encrypt, store to dst. */
ENTRY(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_ecb_enc_8way)

/* ECB decryption: load from src, decrypt, store to dst. */
ENTRY(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_ecb_dec_8way)

/*
 * CBC decryption: decrypt the blocks, then hand both src (saved in %r12)
 * and dst to store_cbc_8way(), which presumably performs the chaining XOR
 * with the source ciphertext -- verify in glue_helper-asm-avx.S.
 */
ENTRY(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	ret;
ENDPROC(cast6_cbc_dec_8way)

/*
 * CTR mode: load_ctr_8way() fills the block registers from the iv, they
 * are run through the encryption helper, and store_ctr_8way() receives
 * src (saved in %r12) and dst to produce the output.
 */
ENTRY(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	/* RX, RKR and RKM are free before the cipher call; lent as scratch. */
	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	ret;
ENDPROC(cast6_ctr_8way)

/*
 * XTS encryption. dst doubles as temporary storage for the per-block
 * tweaks between the load and store macros (see the inline comments).
 */
ENTRY(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_xts_enc_8way)

/*
 * XTS decryption. Identical to cast6_xts_enc_8way except that the
 * decryption helper is called.
 */
ENTRY(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_xts_dec_8way)