/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                          *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                          *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                          *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
***************************************************************************/

.file "twofish-i586-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

/* return address at 0 */

#define in_blk	12	/* input byte array address parameter */
#define out_blk	8	/* output byte array address parameter */
#define ctx	4	/* Twofish context structure */

#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context */

#define s0	0	/* S0 Array 256 words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (words) */
#define k	4128	/* key 1-32 (words) */

/* define a few register aliases to allow macro substitution */

#define R0D %eax
#define R0B %al
#define R0H %ah

#define R1D %ebx
#define R1B %bl
#define R1H %bh

#define R2D %ecx
#define R2B %cl
#define R2H %ch

#define R3D %edx
#define R3B %dl
#define R3H %dh


/* performs input whitening */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/* performs output whitening */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;

/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 */
#define encrypt_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;
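/*
 * For orientation only, not assembled: one encrypt_round application is
 * roughly equivalent to the pseudo-C below. a0..a3 and b0..b3 are the bytes
 * of a and b (a0 least significant), s0..s3 and k refer to the context
 * arrays defined above, and r is round/8; t0/t1 and rol32/ror32 are just
 * illustrative names. The rol $15/ror $16 steps in the macro only keep
 * registers in the "rotated 16" form the next round expects.
 *
 *	t0 = s0[a0] ^ s1[a1] ^ s2[a2] ^ s3[a3];
 *	t1 = s0[b3] ^ s1[b0] ^ s2[b1] ^ s3[b2];
 *	c  = ror32(c ^ (t0 + t1 + k[2*r]), 1);
 *	d  = rol32(d, 1) ^ (t0 + 2*t1 + k[2*r + 1]);
 */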
/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation
 */
#define encrypt_last_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 */
#define decrypt_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	rol	$15,		d ## D;
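/*
 * For orientation only, not assembled: one decrypt_round application is
 * roughly the inverse of the encrypt sketch above, with the same
 * illustrative naming (a0..a3/b0..b3 bytes of a and b, s0..s3/k the context
 * arrays, r = round/8):
 *
 *	t0 = s0[a0] ^ s1[a1] ^ s2[a2] ^ s3[a3];
 *	t1 = s0[b3] ^ s1[b0] ^ s2[b1] ^ s3[b2];
 *	c  = rol32(c, 1) ^ (t0 + t1 + k[2*r]);
 *	d  = ror32(d ^ (t0 + 2*t1 + k[2*r + 1]), 1);
 */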
/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation
 */
#define decrypt_last_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	ror	$1,		d ## D;

ENTRY(twofish_enc_blk)
	push	%ebp			/* save registers according to calling convention */
	push	%ebx
	push	%esi
	push	%edi

	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base
					 * pointer to the ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	input_whitening(%eax,%ebp,a_offset)
	ror	$16,	%eax
	input_whitening(%ebx,%ebp,b_offset)
	input_whitening(%ecx,%ebp,c_offset)
	input_whitening(%edx,%ebp,d_offset)
	rol	$1,	%edx

	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

	output_whitening(%eax,%ebp,c_offset)
	output_whitening(%ebx,%ebp,d_offset)
	output_whitening(%ecx,%ebp,a_offset)
	output_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi
	mov	%eax,	c_offset(%edi)
	mov	%ebx,	d_offset(%edi)
	mov	%ecx,	(%edi)
	mov	%edx,	b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	ret
ENDPROC(twofish_enc_blk)

ENTRY(twofish_dec_blk)
	push	%ebp			/* save registers according to calling convention */
	push	%ebx
	push	%esi
	push	%edi

	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base
					 * pointer to the ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	output_whitening(%eax,%ebp,a_offset)
	output_whitening(%ebx,%ebp,b_offset)
	ror	$16,	%ebx
	output_whitening(%ecx,%ebp,c_offset)
	output_whitening(%edx,%ebp,d_offset)
	rol	$1,	%ecx

	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	input_whitening(%eax,%ebp,c_offset)
	input_whitening(%ebx,%ebp,d_offset)
	input_whitening(%ecx,%ebp,a_offset)
	input_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi
	mov	%eax,	c_offset(%edi)
	mov	%ebx,	d_offset(%edi)
	mov	%ecx,	(%edi)
	mov	%edx,	b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	ret
ENDPROC(twofish_dec_blk)
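/*
 * For reference: the C glue code is expected to declare the two entry
 * points above roughly as follows (an assumption about the caller; the
 * authoritative prototypes live on the C side, not in this file):
 *
 *	asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
 *					const u8 *src);
 *	asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
 *					const u8 *src);
 *
 * asmlinkage keeps all three arguments on the stack, which is what the
 * ctx, out_blk and in_blk offsets at the top of this file rely on.
 */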