/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *	<Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "twofish-avx-x86_64-asm_64.S"

.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

/* structure of crypto context */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  8-way AVX twofish
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX0 %xmm8
#define RY0 %xmm9

#define RX1 %xmm10
#define RY1 %xmm11

#define RK1 %xmm12
#define RK2 %xmm13

#define RT  %xmm14
#define RR  %xmm15

#define RID1  %r13
#define RID1d %r13d
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RGS1  %r8
#define RGS1d %r8d
#define RGS2  %r9
#define RGS2d %r9d
#define RGS3  %r10
#define RGS3d %r10d


#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
	movzbl src ## bl, RID1d; \
	movzbl src ## bh, RID2d; \
	shrq $16, src; \
	movl t0(CTX, RID1, 4), dst ## d; \
	movl t1(CTX, RID2, 4), RID2d; \
	movzbl src ## bl, RID1d; \
	xorl RID2d, dst ## d; \
	movzbl src ## bh, RID2d; \
	interleave_op(il_reg); \
	xorl t2(CTX, RID1, 4), dst ## d; \
	xorl t3(CTX, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16, reg;

#define G(gi1, gi2, x, t0, t1, t2, t3) \
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \
	\
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \
	shlq $32, RGS2; \
	orq RGS1, RGS2; \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \
	shlq $32, RGS1; \
	orq RGS1, RGS3;

#define round_head_2(a, b, x1, y1, x2, y2) \
	vmovq b ## 1, RGI3; \
	vpextrq $1, b ## 1, RGI4; \
	\
	G(RGI1, RGI2, x1, s0, s1, s2, s3); \
	vmovq a ## 2, RGI1; \
	vpextrq $1, a ## 2, RGI2; \
	vmovq RGS2, x1; \
	vpinsrq $1, RGS3, x1, x1; \
	\
	G(RGI3, RGI4, y1, s1, s2, s3, s0); \
	vmovq b ## 2, RGI3; \
	vpextrq $1, b ## 2, RGI4; \
	vmovq RGS2, y1; \
	vpinsrq $1, RGS3, y1, y1; \
	\
	G(RGI1, RGI2, x2, s0, s1, s2, s3); \
	vmovq RGS2, x2; \
	vpinsrq $1, RGS3, x2, x2; \
	\
	G(RGI3, RGI4, y2, s1, s2, s3, s0); \
	vmovq RGS2, y2; \
	vpinsrq $1, RGS3, y2, y2;
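/*
 * Twofish g-function and round tail.
 *
 * s0..s3 index the per-key tables in the context (see the s0..s3 offsets
 * above): 256 u32 entries each. In the usual full-keyed Twofish layout
 * the MDS matrix is folded into the table entries, so g(X) reduces to
 * four table loads XORed together, which is what lookup_32bit computes.
 * round_head_2 evaluates g() for both halves of both 4-block groups.
 *
 * encround_tail below then combines the two g() results x and y with the
 * round keys RK1/RK2 via the pseudo-Hadamard transform and mixes them
 * into c/d. As a scalar sketch (rotates written rol/ror; AVX has no
 * 32-bit vector rotate, hence the vpsrld/vpslld/vpor triples):
 *
 *	X = x + y   + RK1;	c = ror(c ^ X, 1);
 *	Y = x + 2*y + RK2;	d = rol(d, 1) ^ Y;
 *
 * The rol(d, 1) is hoisted out as prerotate() on the register that acts
 * as d in the next half-round (and done once up front by rotate_1l).
 */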
#define encround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd x, y, x; \
	vpaddd x, RK1, RT; \
	prerotate(b); \
	vpxor RT, c, c; \
	vpaddd y, x, y; \
	vpaddd y, RK2, y; \
	vpsrld $1, c, RT; \
	vpslld $(32 - 1), c, c; \
	vpor c, RT, c; \
	vpxor d, y, d;

#define decround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd x, y, x; \
	vpaddd x, RK1, RT; \
	prerotate(a); \
	vpxor RT, c, c; \
	vpaddd y, x, y; \
	vpaddd y, RK2, y; \
	vpxor d, y, d; \
	vpsrld $1, d, y; \
	vpslld $(32 - 1), d, d; \
	vpor d, y, d;

#define rotate_1l(x) \
	vpslld $1, x, RR; \
	vpsrld $(32 - 1), x, x; \
	vpor x, RR, x;

#define preload_rgi(c) \
	vmovq c, RGI1; \
	vpextrq $1, c, RGI2;

#define encrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
	round_head_2(a, b, RX0, RY0, RX1, RY1); \
	encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1); \
	encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

#define decrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
	round_head_2(a, b, RX0, RY0, RX1, RY1); \
	decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1); \
	decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

#define encrypt_cycle(n) \
	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);

#define encrypt_cycle_last(n) \
	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);

#define decrypt_cycle(n) \
	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);

#define decrypt_cycle_last(n) \
	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	vpxor x0, wkey, x0; \
	vpxor x1, wkey, x1; \
	vpxor x2, wkey, x2; \
	vpxor x3, wkey, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpxor x0, wkey, x0; \
	vpxor x1, wkey, x1; \
	vpxor x2, wkey, x2; \
	vpxor x3, wkey, x3;
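/*
 * Block (un)packing. inpack_blocks XORs the whitening key into four
 * blocks and transposes them from one-block-per-register layout into
 * one-word-per-register layout; outunpack_blocks is the inverse. After
 * transpose_4x4, register xN holds word N of all four blocks, e.g.
 * (a sketch for blocks B0..B3, each made of 32-bit words w0..w3):
 *
 *	before:	x0 = { B0.w0, B0.w1, B0.w2, B0.w3 }
 *	after:	x0 = { B0.w0, B1.w0, B2.w0, B3.w0 }
 *
 * so each vpaddd/vpxor in the round macros processes the same Twofish
 * word of four blocks at once (eight, across the two register groups).
 */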
.align 8
__twofish_enc_blk8:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 */

	vmovdqu w(CTX), RK1;

	pushq %r13;
	pushq %rbx;
	pushq %rcx;

	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	preload_rgi(RA1);
	rotate_1l(RD1);
	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
	rotate_1l(RD2);

	encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	encrypt_cycle_last(7);

	vmovdqu (w+4*4)(CTX), RK1;

	popq %rcx;
	popq %rbx;
	popq %r13;

	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

	ret;
ENDPROC(__twofish_enc_blk8)

.align 8
__twofish_dec_blk8:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	vmovdqu (w+4*4)(CTX), RK1;

	pushq %r13;
	pushq %rbx;

	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	preload_rgi(RC1);
	rotate_1l(RA1);
	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
	rotate_1l(RA2);

	decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	decrypt_cycle_last(0);

	vmovdqu (w)(CTX), RK1;

	popq %rbx;
	popq %r13;

	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

	ret;
ENDPROC(__twofish_dec_blk8)

ENTRY(twofish_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __twofish_enc_blk8;

	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	ret;
ENDPROC(twofish_ecb_enc_8way)

ENTRY(twofish_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	movq %rsi, %r11;

	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(twofish_ecb_dec_8way)

ENTRY(twofish_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	FRAME_END
	ret;
ENDPROC(twofish_cbc_dec_8way)

ENTRY(twofish_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX0, RX1, RY0);

	call __twofish_enc_blk8;

	store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	popq %r12;

	FRAME_END
	ret;
ENDPROC(twofish_ctr_8way)

ENTRY(twofish_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);

	call __twofish_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	ret;
ENDPROC(twofish_xts_enc_8way)
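/*
 * Both XTS entry points consume the tweak through load_xts_8way, which
 * derives the per-block tweaks by repeated multiplication by α in
 * GF(2¹²⁸) using .Lxts_gf128mul_and_shl1_mask. Per 128-bit tweak t this
 * amounts to (a sketch, little-endian block convention):
 *
 *	carry = t >> 127;
 *	t = (t << 1) ^ (carry ? 0x87 : 0);
 *
 * i.e. shift the whole 128-bit value left by one and fold the reduction
 * byte 0x87 into the low byte on carry-out.
 */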
ENTRY(twofish_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);

	call __twofish_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(twofish_xts_dec_8way)
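/*
 * Calling convention note: the ENTRY points above are reached from the
 * C glue code with (ctx, dst, src[, iv]) in %rdi/%rsi/%rdx/%rcx per the
 * SysV ABI. The matching C-side declarations presumably look like the
 * following (an assumption based on the register comments above, not
 * copied from the glue code):
 *
 *	asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx,
 *					     u8 *dst, const u8 *src);
 *	asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx,
 *					 u8 *dst, const u8 *src, le128 *iv);
 */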