/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 3-way parallel algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

/*
 * File conventions:
 *  - AT&T (GAS) syntax, passed through the C preprocessor; ';' separates
 *    statements so that multi-instruction macros can be written with '\'
 *    line continuations.
 *  - Arguments arrive in %rdi, %rsi, %rdx, %rcx (see each entry point).
 *  - %rbx, %r12 and %r13 are used as working registers and are therefore
 *    saved on entry and restored before every ret.
 *  - Both entry points process three blocks at once; the three blocks are
 *    read/written at byte offsets 0, 16 and 32 from the src/dst pointers.
 */

#include <linux/linkage.h>

.file "twofish-x86_64-asm-3way.S"
.text

/*
 * structure of crypto context
 *
 * Byte offsets relative to CTX.  s0..s3 are four lookup tables placed 1024
 * bytes apart; they are indexed below with a byte value scaled by 4.  w and
 * k are accessed as arrays of 32-bit words (w+4*i, k+4*i).
 * NOTE(review): presumably mirrors the C-side Twofish context struct --
 * confirm the layout against its definition.
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  3-way twofish
 **********************************************************************/
#define CTX %rdi
#define RIO %rdx

/*
 * One 64-bit "AB" register per block.  These must be %rax/%rbx/%rcx because
 * the round code reads both the low byte (%al..) and the second byte (%ah..)
 * of each of them.
 */
#define RAB0 %rax
#define RAB1 %rbx
#define RAB2 %rcx

#define RAB0d %eax
#define RAB1d %ebx
#define RAB2d %ecx

#define RAB0bh %ah
#define RAB1bh %bh
#define RAB2bh %ch

#define RAB0bl %al
#define RAB1bl %bl
#define RAB2bl %cl

/*
 * Stack slots holding the half of each block that is not currently in the
 * RAB registers.  Valid only between push_cd() and pop_cd(), while no other
 * push/pop has moved %rsp.
 */
#define CD0 0x0(%rsp)
#define CD1 0x8(%rsp)
#define CD2 0x10(%rsp)

/* used only before/after all rounds */
#define RCD0 %r8
#define RCD1 %r9
#define RCD2 %r10

/* used only during rounds (same registers as RCD0..RCD2) */
#define RX0 %r8
#define RX1 %r9
#define RX2 %r10

#define RX0d %r8d
#define RX1d %r9d
#define RX2d %r10d

#define RY0 %r11
#define RY1 %r12
#define RY2 %r13

#define RY0d %r11d
#define RY1d %r12d
#define RY2d %r13d

/*
 * Scratch registers for the rounds.  RT0 aliases RIO (%rdx) and RT1 aliases
 * the dst argument (%rsi), which is why the entry points finish reading the
 * input and park dst on the stack before the first round.
 */
#define RT0 %rdx
#define RT1 %rsi

#define RT0d %edx
#define RT1d %esi

#define RT1bl %sil

/*
 * do16bit_ror: consume the low 16 bits of 'ab' as two table indices.
 *
 *   tmp2 = byte 0 of ab, tmp1 = byte 1 of ab (both zero-extended)
 *   ab   = ab rotated right by 'rot' bits
 *   dst  = dst <op1> T0[tmp2]   (32-bit table entry at T0 + CTX + tmp2*4)
 *   dst  = dst <op2> T1[tmp1]
 *
 * op1/op2 are mnemonic stems (mov, xor); 'l' is appended here.  tmp1 is
 * written from a high-byte register (%ah/%bh/%ch), which cannot be encoded
 * together with a REX-only register, so tmp1 has to be a legacy register
 * (callers pass RT0).  tmp2 may be the same register as dst.
 */
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $(rot),			ab; \
	op1##l T0(CTX, tmp2, 4),	dst ## d; \
	op2##l T1(CTX, tmp1, 4),	dst ## d;

/* Exchange register 'ab' with memory slot 'cd', using 'tmp' as scratch. */
#define swap_ab_with_cd(ab, cd, tmp) \
	movq cd,	tmp; \
	movq ab,	cd; \
	movq tmp,	ab;

/*
 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 * at beginning.
 *
 * For each of the three blocks (suffix 0, 1, 2):
 *   x = Tx0[b0] ^ Tx1[b1] ^ Tx2[b2] ^ Tx3[b3], b0..b3 = bytes of low32(ab)
 *   y = Ty1[c0] ^ Ty2[c1] ^ Ty3[c2] ^ Ty0[c3], c0..c3 = bytes of high32(ab)
 * The rotate counts per block are 32 + 48 + 32 + 16 = 128, so 'ab' is back
 * in its original bit order when it is finally swapped with its 'cd' stack
 * slot.  After this macro the ab registers hold the other half of each
 * block and x/y hold the two lookup results.
 *
 * Clobbers RT0 and RT1.
 */
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
	/* G1,1 && G2,1 */ \
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
	\
	/* G1,2 && G2,2 */ \
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
	swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
	swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
	swap_ab_with_cd(ab ## 2, cd ## 2, RT0);

/*
 * enc_round_end: finish encryption round 'n' for one block.
 *
 * 'ab' is the 64-bit register to update (after g1g2_3 it holds the half
 * that was parked on the stack); x and y are the g1g2_3 results.
 *
 *   x += y; y += x;
 *   x += k[2n]; y += k[2n+1];
 *   low32(ab)  = ror32(low32(ab) ^ x, 1)
 *   high32(ab) = rol32(high32(ab), 1) ^ y
 *
 * The closing orq depends on the 32-bit operations having left bits 63:32
 * of x zero.
 */
#define enc_round_end(ab, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	xorl ab ## d,			x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	shrq $32,			ab; \
	roll $1,			ab ## d; \
	xorl y ## d,			ab ## d; \
	shlq $32,			ab; \
	rorl $1,			x ## d; \
	orq x,				ab;

/*
 * dec_round_end: finish decryption round 'n' for one block.
 *
 * Same shape as enc_round_end with the roles of x and y exchanged in the
 * final mixing:
 *
 *   x += y; y += x;
 *   x += k[2n]; y += k[2n+1];
 *   low32(ba)  = ror32(low32(ba) ^ y, 1)
 *   high32(ba) = rol32(high32(ba), 1) ^ x
 *
 * The closing orq depends on bits 63:32 of y being zero.
 */
#define dec_round_end(ba, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	xorl ba ## d,			y ## d; \
	shrq $32,			ba; \
	roll $1,			ba ## d; \
	xorl x ## d,			ba ## d; \
	shlq $32,			ba; \
	rorl $1,			y ## d; \
	orq y,				ba;

/* One encryption round (number n) over all three blocks. */
#define encrypt_round3(ab, cd, n) \
	g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(ab ## 0, RX0, RY0, n); \
	enc_round_end(ab ## 1, RX1, RY1, n); \
	enc_round_end(ab ## 2, RX2, RY2, n);

/*
 * One decryption round (number n) over all three blocks.  Relative to
 * encrypt_round3 the table order is shifted and the x/y result registers
 * are passed to g1g2_3 in the opposite order.
 */
#define decrypt_round3(ba, dc, n) \
	g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(ba ## 0, RX0, RY0, n); \
	dec_round_end(ba ## 1, RX1, RY1, n); \
	dec_round_end(ba ## 2, RX2, RY2, n);

/* Cycle n = rounds 2n and 2n+1, in that order. */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, (n)*2); \
	encrypt_round3(ab, cd, ((n)*2)+1);

/* Cycle n = rounds 2n+1 and 2n, in that order. */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));

/*
 * Park RCD0..RCD2 on the stack.  Pushing in the order 2, 1, 0 leaves them
 * at 0x0/0x8/0x10(%rsp), i.e. exactly at CD0/CD1/CD2.
 */
#define push_cd() \
	pushq RCD2; \
	pushq RCD1; \
	pushq RCD0;

/* Reload RCD0..RCD2 from the CD0..CD2 stack slots and release them. */
#define pop_cd() \
	popq RCD0; \
	popq RCD1; \
	popq RCD2;

/*
 * inpack3: load 8 bytes per block from 'in' and whiten them.
 *   n: 32-bit word index inside a block (the three blocks are 4 words apart)
 *   xy: register name stem; xy0/xy1/xy2 receive blocks 0/1/2
 *   m: 32-bit word index into w; 8 bytes starting there are XORed in
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*(m)(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*(m)(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*(m)(CTX),		xy ## 2;

/*
 * outunpack3: whiten 8 bytes per block and emit them to 'out'.
 *   op: mnemonic stem, 'mov' stores the result, 'xor' XORs it into memory
 *   n, xy, m: as for inpack3
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);

/* Encryption input: words 0-1 -> RAB (w[0..1]), words 2-3 -> RCD (w[2..3]). */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0); \
	inpack3(RIO, 2, RCD, 2);

/* Encryption output: RAB -> words 2-3 (w[6..7]), RCD -> words 0-1 (w[4..5]). */
#define outunpack_enc3(op) \
	outunpack3(op, RIO, 2, RAB, 6); \
	outunpack3(op, RIO, 0, RCD, 4);

/*
 * Decryption input: words 0-1 -> RAB (w[4..5]), words 2-3 -> RCD (w[6..7]);
 * each register then has its 32-bit halves exchanged.
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	inpack3(RIO, 2, RCD, 6); \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2;

/*
 * Decryption output: exchange the 32-bit halves back, then
 * RCD -> words 0-1 (w[0..1]), RAB -> words 2-3 (w[2..3]).  Always stores.
 */
#define outunpack_dec3() \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);

/*
 * __twofish_enc_blk_3way: encrypt three blocks.
 *
 * Runs 16 rounds (8 cycles) on three blocks read from src and either stores
 * the result to dst or XORs it into dst, depending on the low byte of the
 * bool argument.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 * Stack: 8 pushes (64 bytes) at peak; no calls are made.
 */
ENTRY(__twofish_enc_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 *	%rcx: bool, if true: xor output
	 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/* %rcx is RAB2 and %rsi is RT1, so both arguments must be parked */
	pushq %rcx; /* bool xor */
	pushq %rsi; /* dst */

	inpack_enc3();

	/* from here until pop_cd() %rsp must not move: CD0..CD2 are live */
	push_cd();
	encrypt_cycle3(RAB, CD, 0);
	encrypt_cycle3(RAB, CD, 1);
	encrypt_cycle3(RAB, CD, 2);
	encrypt_cycle3(RAB, CD, 3);
	encrypt_cycle3(RAB, CD, 4);
	encrypt_cycle3(RAB, CD, 5);
	encrypt_cycle3(RAB, CD, 6);
	encrypt_cycle3(RAB, CD, 7);
	pop_cd();

	/* src is no longer needed; RIO now addresses the output */
	popq RIO; /* dst */
	popq RT1; /* bool xor */

	testb RT1bl, RT1bl;
	jnz .L__enc_xor3;

	outunpack_enc3(mov);

	popq %rbx;
	popq %r12;
	popq %r13;
	ret;

.L__enc_xor3:
	outunpack_enc3(xor);

	popq %rbx;
	popq %r12;
	popq %r13;
	ret;
ENDPROC(__twofish_enc_blk_3way)

/*
 * twofish_dec_blk_3way: decrypt three blocks.
 *
 * Runs the 8 cycles in descending order on three blocks read from src and
 * stores the result to dst.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 * Stack: 7 pushes (56 bytes) at peak; no calls are made.
 */
ENTRY(twofish_dec_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/* %rsi doubles as RT1 during the rounds */
	pushq %rsi; /* dst */

	inpack_dec3();

	/* from here until pop_cd() %rsp must not move: CD0..CD2 are live */
	push_cd();
	decrypt_cycle3(RAB, CD, 7);
	decrypt_cycle3(RAB, CD, 6);
	decrypt_cycle3(RAB, CD, 5);
	decrypt_cycle3(RAB, CD, 4);
	decrypt_cycle3(RAB, CD, 3);
	decrypt_cycle3(RAB, CD, 2);
	decrypt_cycle3(RAB, CD, 1);
	decrypt_cycle3(RAB, CD, 0);
	pop_cd();

	popq RIO; /* dst */

	outunpack_dec3();

	popq %rbx;
	popq %r12;
	popq %r13;
	ret;
ENDPROC(twofish_dec_blk_3way)