/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *	<Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

.file "twofish-avx-x86_64-asm_64.S"
.text

/* structure of crypto context */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  8-way AVX twofish
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8
#define RY %xmm9

#define RK1 %xmm10
#define RK2 %xmm11

#define RID1  %rax
#define RID1b %al
#define RID2  %rbx
#define RID2b %bl

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGS1  %r8
#define RGS1d %r8d
#define RGS2  %r9
#define RGS2d %r9d
#define RGS3  %r10
#define RGS3d %r10d


/*
 * dst = t0[src byte 0] ^ t1[src byte 1] ^ t2[src byte 2] ^ t3[src byte 3];
 * shifts src right by 16 bits as a side effect.
 */
#define lookup_32bit(t0, t1, t2, t3, src, dst) \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	movl t0(CTX, RID1, 4), dst ## d; \
	xorl t1(CTX, RID2, 4), dst ## d; \
	shrq $16, src; \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	xorl t2(CTX, RID1, 4), dst ## d; \
	xorl t3(CTX, RID2, 4), dst ## d;

/*
 * Key-dependent g-function: apply lookup_32bit() to each of the four
 * 32-bit words held in vector register a, collecting the results in x.
 */
#define G(a, x, t0, t1, t2, t3) \
	vmovq a, RGI1; \
	vpsrldq $8, a, x; \
	vmovq x, RGI2; \
	\
	lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
	shrq $16, RGI1; \
	lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
	shlq $32, RGS2; \
	orq RGS1, RGS2; \
	\
	lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
	shrq $16, RGI2; \
	lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
	shlq $32, RGS3; \
	orq RGS1, RGS3; \
	\
	vmovq RGS2, x; \
	vpinsrq $1, RGS3, x, x;

/*
 * One encryption round on four blocks in parallel: g-functions on a
 * and b, pseudo-Hadamard transform plus round subkeys, then
 * c = (c ^ x) >>> 1 and d = (d <<< 1) ^ y.
 */
#define encround(a, b, c, d, x, y) \
	G(a, x, s0, s1, s2, s3); \
	G(b, y, s1, s2, s3, s0); \
	vpaddd x, y, x; \
	vpaddd y, x, y; \
	vpaddd x, RK1, x; \
	vpaddd y, RK2, y; \
	vpxor x, c, c; \
	vpsrld $1, c, x; \
	vpslld $(32 - 1), c, c; \
	vpor c, x, c; \
	vpslld $1, d, x; \
	vpsrld $(32 - 1), d, d; \
	vpor d, x, d; \
	vpxor d, y, d;

/*
 * One decryption round: same g-functions and PHT, with the rotations
 * applied to the opposite halves: d = (d ^ y) >>> 1, c = (c <<< 1) ^ x.
 */
#define decround(a, b, c, d, x, y) \
	G(a, x, s0, s1, s2, s3); \
	G(b, y, s1, s2, s3, s0); \
	vpaddd x, y, x; \
	vpaddd y, x, y; \
	vpaddd y, RK2, y; \
	vpxor d, y, d; \
	vpsrld $1, d, y; \
	vpslld $(32 - 1), d, d; \
	vpor d, y, d; \
	vpslld $1, c, y; \
	vpsrld $(32 - 1), c, c; \
	vpor c, y, c; \
	vpaddd x, RK1, x; \
	vpxor x, c, c;

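/*
 * encrypt_round()/decrypt_round() broadcast the two 32-bit subkeys of
 * round n, k[2n] and k[2n+1], from the context into RK1/RK2 and then
 * run the round on both 4-block groups.
 */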
#define encrypt_round(n, a, b, c, d) \
	vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
	encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
	encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);

#define decrypt_round(n, a, b, c, d) \
	vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
	decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
	decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);

#define encrypt_cycle(n) \
	encrypt_round((2*n), RA, RB, RC, RD); \
	encrypt_round(((2*n) + 1), RC, RD, RA, RB);

#define decrypt_cycle(n) \
	decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
	decrypt_round((2*n), RA, RB, RC, RD);


/* transpose a 4x4 matrix of 32-bit words held in x0..x3 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

/* load four blocks, apply whitening and transpose into word slices */
#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
	vpxor (0*4*4)(in), wkey, x0; \
	vpxor (1*4*4)(in), wkey, x1; \
	vpxor (2*4*4)(in), wkey, x2; \
	vpxor (3*4*4)(in), wkey, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* transpose back, apply whitening and store four blocks */
#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpxor x0, wkey, x0; \
	vmovdqu x0, (0*4*4)(out); \
	vpxor x1, wkey, x1; \
	vmovdqu x1, (1*4*4)(out); \
	vpxor x2, wkey, x2; \
	vmovdqu x2, (2*4*4)(out); \
	vpxor x3, wkey, x3; \
	vmovdqu x3, (3*4*4)(out);

/* as outunpack_blocks(), but xor the result over the contents of out */
#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpxor x0, wkey, x0; \
	vpxor (0*4*4)(out), x0, x0; \
	vmovdqu x0, (0*4*4)(out); \
	vpxor x1, wkey, x1; \
	vpxor (1*4*4)(out), x1, x1; \
	vmovdqu x1, (1*4*4)(out); \
	vpxor x2, wkey, x2; \
	vpxor (2*4*4)(out), x2, x2; \
	vmovdqu x2, (2*4*4)(out); \
	vpxor x3, wkey, x3; \
	vpxor (3*4*4)(out), x3, x3; \
	vmovdqu x3, (3*4*4)(out);

.align 8
.global __twofish_enc_blk_8way
.type   __twofish_enc_blk_8way,@function;

__twofish_enc_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	pushq %rbx;
	pushq %rcx;

	vmovdqu w(CTX), RK1;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);

	xorq RID1, RID1;
	xorq RID2, RID2;

	encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	encrypt_cycle(7);

	vmovdqu (w+4*4)(CTX), RK1;

	popq %rcx;
	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;

	testb %cl, %cl;
	jnz __enc_xor8;

	outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);

	ret;

__enc_xor8:
	outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);

	ret;

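/*
 * Decryption mirrors encryption: blocks are loaded and whitened with
 * the output whitening words (w+16), the eight cycles run in reverse
 * order, and the input whitening words (w) are applied on the way out.
 */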
.align 8
.global twofish_dec_blk_8way
.type   twofish_dec_blk_8way,@function;

twofish_dec_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %rbx;

	vmovdqu (w+4*4)(CTX), RK1;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);

	xorq RID1, RID1;
	xorq RID2, RID2;

	decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	decrypt_cycle(0);

	vmovdqu (w)(CTX), RK1;

	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;
	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);

	ret;
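
/*
 * For reference, a minimal sketch of the C-side declarations these
 * entry points pair with (an assumption based on the usual kernel glue
 * conventions, matching the register usage documented above; the real
 * context struct layout must agree with the s0..k offsets defined at
 * the top of this file):
 *
 *	asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx,
 *					       u8 *dst, const u8 *src,
 *					       bool xor);
 *	asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx,
 *					     u8 *dst, const u8 *src);
 */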