/*
 * Twofish Cipher 3-way parallel algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include <linux/linkage.h>

.file "twofish-x86_64-asm-3way.S"
.text

/*
 * Byte offsets into the crypto context addressed through CTX.
 *   s0..s3 : four lookup tables, 1024 bytes apart; each is indexed with a
 *            byte value scaled by 4
 *   w      : key words XORed into the blocks when they are loaded/stored
 *   k      : key words added at the end of every round
 */
#define s0 0
#define s1 1024
#define s2 2048
#define s3 3072
#define w 4096
#define k 4128

/**********************************************************************
  3-way twofish
 **********************************************************************/

/*
 * Register map.  Three 16-byte blocks are processed side by side; the
 * numeric suffix (0, 1, 2) selects the block and a trailing d/bl/bh
 * selects the 32-bit, low-byte or second-byte view of the same register.
 */

/* pointer arguments */
#define CTX %rdi
#define RIO %rdx

/*
 * RAB: first 8 bytes of each block.  The round macros read %ah/%bh/%ch
 * style byte views, which is why these live in rax/rbx/rcx.
 */
#define RAB0 %rax
#define RAB0d %eax
#define RAB0bl %al
#define RAB0bh %ah

#define RAB1 %rbx
#define RAB1d %ebx
#define RAB1bl %bl
#define RAB1bh %bh

#define RAB2 %rcx
#define RAB2d %ecx
#define RAB2bl %cl
#define RAB2bh %ch

/* RCD: second 8 bytes of each block */
#define RCD0 %r8
#define RCD0d %r8d

#define RCD1 %r9
#define RCD1d %r9d

#define RCD2 %r10
#define RCD2d %r10d

/* RX / RY: the two 32-bit table-lookup results computed per block */
#define RX0 %rbp
#define RX0d %ebp

#define RX1 %r11
#define RX1d %r11d

#define RX2 %r12
#define RX2d %r12d

#define RY0 %r13
#define RY0d %r13d

#define RY1 %r14
#define RY1d %r14d

#define RY2 %r15
#define RY2d %r15d

/*
 * Scratch registers.  RT0 is the same physical register as RIO (%rdx) and
 * RT1 is %rsi, so neither pointer survives the round macros.
 */
#define RT0 %rdx
#define RT1 %rsi

#define RT0d %edx
#define RT1d %esi

/*
 * do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst)
 *
 * Consume the two lowest bytes of the 64-bit register 'ab':
 *   tmp2d = byte 0 of ab, used as index into table T0, combined into dst
 *           with op1 (mov or xor)
 *   tmp1d = byte 1 of ab, used as index into table T1, combined into dst
 *           with op2
 * and then rotate 'ab' right by 'rot' bits so the next pair of bytes moves
 * into the low 16 bits.
 *
 * Notes:
 *  - 'ab' must have bl/bh byte views (RAB0..RAB2).
 *  - tmp1 is loaded from the high-byte view (ah/bh/ch); such a move cannot
 *    be encoded together with a REX prefix, so tmp1 has to be a legacy
 *    register.  Every caller in this file passes RT0.
 *  - tmp2 may be the same register as dst when op1 is mov: the index is
 *    read by the very instruction that overwrites dst.
 */
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl, tmp2 ## d; \
	movzbl ab ## bh, tmp1 ## d; \
	rorq $(rot), ab; \
	op1##l T0(CTX, tmp2, 4), dst ## d; \
	op2##l T1(CTX, tmp1, 4), dst ## d;

/*
 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 * at beginning.
 *
 * With b0 (least significant) .. b7 (most significant) being the bytes of
 * ab ## N on entry, this computes for each of the three blocks N:
 *
 *   x ## N = Tx0[b0] ^ Tx1[b1] ^ Tx2[b2] ^ Tx3[b3]
 *   y ## N = Ty1[b4] ^ Ty2[b5] ^ Ty3[b6] ^ Ty0[b7]
 *
 * The four rotates applied to each ab ## N add up to 32+48+32+16 = 128
 * bits, i.e. two full turns, so the register holds its entry value again
 * when it is exchanged with cd ## N at the end.  After the macro, ab ## N
 * therefore contains the former cd ## N and vice versa.
 *
 * Clobbers RT0 and RT1.
 */
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
	/* G1,1 && G2,1 */ \
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
	\
	/* G1,2 && G2,2 */ \
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
	xchgq cd ## 0, ab ## 0; \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
	xchgq cd ## 1, ab ## 1; \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
	xchgq cd ## 2, ab ## 2;

/*
 * enc_round_end(ab, x, y, n)
 *
 * Mix the lookup results x/y and the key words of round n into the 64-bit
 * register 'ab' (lo = bits 0..31, hi = bits 32..63):
 *
 *   x  = x + y
 *   y  = x + y
 *   x += k[2n]
 *   y += k[2n+1]
 *   lo = ror32(lo ^ x, 1)
 *   hi = rol32(hi, 1) ^ y
 *
 * All arithmetic is 32-bit.  The final rorl is a 32-bit write, which clears
 * the upper half of x, so the 64-bit orq only fills in the low half of ab.
 * x and y are destroyed.
 */
#define enc_round_end(ab, x, y, n) \
	addl y ## d, x ## d; \
	addl x ## d, y ## d; \
	addl k+4*(2*(n))(CTX), x ## d; \
	xorl ab ## d, x ## d; \
	addl k+4*(2*(n)+1)(CTX), y ## d; \
	shrq $32, ab; \
	roll $1, ab ## d; \
	xorl y ## d, ab ## d; \
	shlq $32, ab; \
	rorl $1, x ## d; \
	orq x, ab;

/*
 * dec_round_end(ba, x, y, n)
 *
 * Counterpart of enc_round_end used on the decryption path, where the two
 * 32-bit halves of each register are stored swapped (see inpack_dec3):
 *
 *   x  = x + y
 *   y  = x + y
 *   x += k[2n]
 *   y += k[2n+1]
 *   lo = ror32(lo ^ y, 1)
 *   hi = rol32(hi, 1) ^ x
 *
 * x and y are destroyed.
 */
#define dec_round_end(ba, x, y, n) \
	addl y ## d, x ## d; \
	addl x ## d, y ## d; \
	addl k+4*(2*(n))(CTX), x ## d; \
	addl k+4*(2*(n)+1)(CTX), y ## d; \
	xorl ba ## d, y ## d; \
	shrq $32, ba; \
	roll $1, ba ## d; \
	xorl x ## d, ba ## d; \
	shlq $32, ba; \
	rorl $1, y ## d; \
	orq y, ba;

/*
 * One encryption round for all three blocks.  g1g2_3 leaves the register
 * pairs exchanged, so the enc_round_end calls below update what was 'cd'
 * on entry.
 */
#define encrypt_round3(ab, cd, n) \
	g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(ab ## 0, RX0, RY0, n); \
	enc_round_end(ab ## 1, RX1, RY1, n); \
	enc_round_end(ab ## 2, RX2, RY2, n);

/*
 * One decryption round for all three blocks.  Compared with the encryption
 * round, the lookups use a rotated table order and RY/RX trade places as
 * the x/y outputs of g1g2_3.
 */
#define decrypt_round3(ba, dc, n) \
	g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(ba ## 0, RX0, RY0, n); \
	dec_round_end(ba ## 1, RX1, RY1, n); \
	dec_round_end(ba ## 2, RX2, RY2, n);

/*
 * A cycle is two consecutive rounds: 2n and 2n+1 for encryption, and the
 * same two rounds in reverse order for decryption.  The argument is
 * parenthesized so that an expression can be passed for n.
 */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, (n)*2); \
	encrypt_round3(ab, cd, ((n)*2)+1);

#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));

/*
 * inpack3(in, n, xy, m)
 *
 * Load 8 bytes from byte offset 4*n of each of three consecutive 16-byte
 * blocks at 'in' into xy ## 0..2 and XOR each with the 8 bytes found at
 * w + 4*m in the context.
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in), xy ## 0; \
	xorq w+4*(m)(CTX), xy ## 0; \
	\
	movq 4*(4+(n))(in), xy ## 1; \
	xorq w+4*(m)(CTX), xy ## 1; \
	\
	movq 4*(8+(n))(in), xy ## 2; \
	xorq w+4*(m)(CTX), xy ## 2;

/*
 * outunpack3(op, out, n, xy, m)
 *
 * XOR xy ## 0..2 with the 8 bytes at w + 4*m in the context, then apply
 * op ## q with the register as source and byte offset 4*n of each of the
 * three 16-byte blocks at 'out' as destination: op = mov stores the value,
 * op = xor XORs it into what is already in memory.
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX), xy ## 0; \
	op ## q xy ## 0, 4*(n)(out); \
	\
	xorq w+4*(m)(CTX), xy ## 1; \
	op ## q xy ## 1, 4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX), xy ## 2; \
	op ## q xy ## 2, 4*(8+(n))(out);

/* Encryption input: RAB <- bytes 0..7 ^ w[0..1], RCD <- bytes 8..15 ^ w[2..3] */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0); \
	inpack3(RIO, 2, RCD, 2);

/* Encryption output: bytes 0..7 <- RCD ^ w[4..5], bytes 8..15 <- RAB ^ w[6..7] */
#define outunpack_enc3(op) \
	outunpack3(op, RIO, 2, RAB, 6); \
	outunpack3(op, RIO, 0, RCD, 4);

/*
 * Decryption input: same loads as for encryption but keyed with w[4..7],
 * and the two 32-bit halves of every register are swapped afterwards.
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32, RAB0; \
	rorq $32, RAB1; \
	rorq $32, RAB2; \
	inpack3(RIO, 2, RCD, 6); \
	rorq $32, RCD0; \
	rorq $32, RCD1; \
	rorq $32, RCD2;

/*
 * Decryption output: swap the 32-bit halves back, then store
 * bytes 0..7 <- RCD ^ w[0..1] and bytes 8..15 <- RAB ^ w[2..3].
 */
#define outunpack_dec3() \
	rorq $32, RCD0; \
	rorq $32, RCD1; \
	rorq $32, RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	rorq $32, RAB0; \
	rorq $32, RAB1; \
	rorq $32, RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);

/*
 * __twofish_enc_blk_3way
 *
 * Encrypt three consecutive 16-byte blocks.  Depending on the flag in
 * %rcx the result is either stored to dst or XORed into the data already
 * present at dst.
 *
 * Saves and restores %rbx, %rbp and %r12-%r15.  The remaining registers
 * named in the alias table at the top of this file are used as working
 * registers and are not restored.
 *
 * NOTE(review): %rbp is used as a data register (RX0) and to test the xor
 * flag, so it does not hold a frame pointer between prologue and
 * epilogue - confirm this is acceptable for the unwinder in use.
 */
ENTRY(__twofish_enc_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 *	%rcx: bool, if true: xor output
	 */
	pushq %r15;
	pushq %r14;
	pushq %r13;
	pushq %r12;
	pushq %rbp;
	pushq %rbx;

	/* %rcx doubles as RAB2 and %rsi as RT1, so park both on the stack */
	pushq %rcx; /* bool xor */
	pushq %rsi; /* dst */

	/* src (RIO) is read here, before the rounds reuse %rdx as RT0 */
	inpack_enc3();

	encrypt_cycle3(RAB, RCD, 0);
	encrypt_cycle3(RAB, RCD, 1);
	encrypt_cycle3(RAB, RCD, 2);
	encrypt_cycle3(RAB, RCD, 3);
	encrypt_cycle3(RAB, RCD, 4);
	encrypt_cycle3(RAB, RCD, 5);
	encrypt_cycle3(RAB, RCD, 6);
	encrypt_cycle3(RAB, RCD, 7);

	popq RIO; /* dst */
	popq %rbp; /* bool xor */

	/* only the low byte of the flag is examined */
	testb %bpl, %bpl;
	jnz .L__enc_xor3;

	outunpack_enc3(mov);

	popq %rbx;
	popq %rbp;
	popq %r12;
	popq %r13;
	popq %r14;
	popq %r15;
	ret;

.L__enc_xor3:
	outunpack_enc3(xor);

	popq %rbx;
	popq %rbp;
	popq %r12;
	popq %r13;
	popq %r14;
	popq %r15;
	ret;
ENDPROC(__twofish_enc_blk_3way)

/*
 * twofish_dec_blk_3way
 *
 * Decrypt three consecutive 16-byte blocks from src and store the result
 * to dst.  The eight cycles run in the opposite order to encryption.
 *
 * Saves and restores %rbx, %rbp and %r12-%r15.  The remaining registers
 * named in the alias table at the top of this file are used as working
 * registers and are not restored.
 *
 * NOTE(review): %rbp is used as a data register (RX0) here as well -
 * confirm this is acceptable for the unwinder in use.
 */
ENTRY(twofish_dec_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 */
	pushq %r15;
	pushq %r14;
	pushq %r13;
	pushq %r12;
	pushq %rbp;
	pushq %rbx;

	/* %rsi doubles as RT1, so park dst on the stack */
	pushq %rsi; /* dst */

	/* src (RIO) is read here, before the rounds reuse %rdx as RT0 */
	inpack_dec3();

	decrypt_cycle3(RAB, RCD, 7);
	decrypt_cycle3(RAB, RCD, 6);
	decrypt_cycle3(RAB, RCD, 5);
	decrypt_cycle3(RAB, RCD, 4);
	decrypt_cycle3(RAB, RCD, 3);
	decrypt_cycle3(RAB, RCD, 2);
	decrypt_cycle3(RAB, RCD, 1);
	decrypt_cycle3(RAB, RCD, 0);

	popq RIO; /* dst */

	outunpack_dec3();

	popq %rbx;
	popq %rbp;
	popq %r12;
	popq %r13;
	popq %r14;
	popq %r15;
	ret;
ENDPROC(twofish_dec_blk_3way)