/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), run through the C
 * preprocessor.  Arguments arrive in %rdi, %rsi, %rdx, %rcx, i.e. the
 * System V AMD64 integer argument order.
 */

.file "blowfish-x86_64-asm.S"
.text

/*
 * structure of crypto context (byte offsets from CTX)
 *
 * p is the start of the 32-bit round-key words; the code below reads
 * words 0..17.  s0..s3 are four consecutive lookup tables of 256 32-bit
 * words each, starting right after word 17.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * register macros
 *
 * RX0..RX3 hold block state and are deliberately %rax/%rbx/%rcx/%rdx:
 * F()/F4() need the legacy high-byte registers (%ah, %bh, %ch, %dh).
 * A high-byte source is only ever paired with RT0d/RT1d (%ebp/%esi),
 * because %ah-style registers cannot be encoded together with %r8-%r15.
 *
 * RIO and RT1 are both %rsi: F()/F4() overwrite the I/O pointer, so every
 * entry point parks dst in another register and reloads RIO before it
 * writes the output.
 */
#define CTX	%rdi
#define RIO	%rsi

#define RX0	%rax
#define RX1	%rbx
#define RX2	%rcx
#define RX3	%rdx

#define RX0d	%eax
#define RX1d	%ebx
#define RX2d	%ecx
#define RX3d	%edx

#define RX0bl	%al
#define RX1bl	%bl
#define RX2bl	%cl
#define RX3bl	%dl

#define RX0bh	%ah
#define RX1bh	%bh
#define RX2bh	%ch
#define RX3bh	%dh

#define RT0	%rbp
#define RT1	%rsi
#define RT2	%r8
#define RT3	%r9

#define RT0d	%ebp
#define RT1d	%esi
#define RT2d	%r8d
#define RT3d	%r9d

#define RKEY	%r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on RX0.
 *
 * With a = bits 31..24, b = bits 23..16, c = bits 15..8, d = bits 7..0 of
 * the low half of RX0, computes
 *	t = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]		(32-bit arithmetic)
 * then swaps the two 32-bit halves of RX0 and xors t into the new low
 * half.  RT0d is written with 32-bit operations, so the upper half of RT0
 * is zero and the final 64-bit xor leaves the high half of RX0 untouched.
 *
 * Clobbers: RT0, RT1 (and therefore RIO), RT2, flags.
 */
#define F() \
	rorq $16, RX0; \
	movzbl RX0bh, RT0d; \
	movzbl RX0bl, RT1d; \
	rolq $16, RX0; \
	movl s0(CTX,RT0,4), RT0d; \
	addl s1(CTX,RT1,4), RT0d; \
	movzbl RX0bh, RT1d; \
	movzbl RX0bl, RT2d; \
	rolq $32, RX0; \
	xorl s2(CTX,RT1,4), RT0d; \
	addl s3(CTX,RT2,4), RT0d; \
	xorq RT0, RX0;

/* xor round-key words n (low half) and n+1 (high half) into RX0 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX), RX0;

/* one key addition followed by two Feistel steps */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * xor round-key words n (low half) and n-1 (high half) into RX0: the
 * 64-bit load fetches words n-1 and n, the rotate swaps them.
 * Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX), RT0; \
	rorq $32, RT0; \
	xorq RT0, RX0;

#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load 8 bytes from (RIO) so that the first four input bytes, read as a
 * big-endian word, end up in the low half of RX0 and the last four in the
 * high half.
 */
#define read_block() \
	movq (RIO), RX0; \
	rorq $32, RX0; \
	bswapq RX0;

/* byte-reverse RX0 and store it to the 8 bytes at (RIO) */
#define write_block() \
	bswapq RX0; \
	movq RX0, (RIO);

/* byte-reverse RX0 and xor it into the 8 bytes at (RIO) */
#define xor_block() \
	bswapq RX0; \
	xorq RX0, (RIO);

/*
 * __blowfish_enc_blk: encrypt one 8-byte block.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool, if true: xor output (only %cl is tested)
 *
 * output:
 *	8 bytes at dst are overwritten with the result, or xored with it
 *	when the bool is nonzero.
 *
 * Clobbers: %rax, %rsi, %r8, %r10, %r11, flags.  %rbp is used as scratch
 * (RT0) but is parked in %r11 and restored; no stack space is used.
 */
.align 8
.global __blowfish_enc_blk
.type   __blowfish_enc_blk,@function;

__blowfish_enc_blk:
	movq %rbp, %r11;

	/* F() destroys %rsi, so keep dst in %r10 while src is read via RIO */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r11, %rbp;

	movq %r10, RIO;
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	ret;
.L__enc_xor:
	xor_block();
	ret;
.size __blowfish_enc_blk, .-__blowfish_enc_blk

/*
 * blowfish_dec_blk: decrypt one 8-byte block.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *
 * output:
 *	8 bytes at dst are overwritten with the result.
 *
 * Clobbers: %rax, %rsi, %r8, %r10, %r11, flags.  %rbp is used as scratch
 * (RT0) but is parked in %r11 and restored; no stack space is used.
 */
.align 8
.global blowfish_dec_blk
.type   blowfish_dec_blk,@function;

blowfish_dec_blk:
	movq %rbp, %r11;

	/* F() destroys %rsi, so keep dst in %r10 while src is read via RIO */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	movq %r11, %rbp;

	ret;
.size blowfish_dec_blk, .-blowfish_dec_blk

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * Same result as F(), applied to register x (one of RX0..RX3): all four
 * index bytes are extracted first, and the two 16-bit rotates together
 * swap the halves of x before the table value is xored into the low half.
 *
 * Clobbers: RT0, RT1 (and therefore RIO), RT2, RT3, flags.
 */
#define F4(x) \
	movzbl x ## bh, RT1d; \
	movzbl x ## bl, RT3d; \
	rorq $16, x; \
	movzbl x ## bh, RT0d; \
	movzbl x ## bl, RT2d; \
	rorq $16, x; \
	movl s0(CTX,RT0,4), RT0d; \
	addl s1(CTX,RT2,4), RT0d; \
	xorl s2(CTX,RT1,4), RT0d; \
	addl s3(CTX,RT3,4), RT0d; \
	xorq RT0, x;

/* xor the round-key pair currently held in RKEY into all four blocks */
#define add_preloaded_roundkey4() \
	xorq RKEY, RX0; \
	xorq RKEY, RX1; \
	xorq RKEY, RX2; \
	xorq RKEY, RX3;

/* RKEY = round-key words n (low half) and n+1 (high half) */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX), RKEY;

/* apply the preloaded key pair, then fetch the pair for the next round */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = round-key words n (low half) and n-1 (high half) */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX), RKEY; \
	rorq $32, RKEY;

/* apply the preloaded key pair, then fetch the pair for the next round */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* read_block() for four consecutive 8-byte blocks at (RIO) into RX0..RX3 */
#define read_block4() \
	movq (RIO), RX0; \
	rorq $32, RX0; \
	bswapq RX0; \
	\
	movq 8(RIO), RX1; \
	rorq $32, RX1; \
	bswapq RX1; \
	\
	movq 16(RIO), RX2; \
	rorq $32, RX2; \
	bswapq RX2; \
	\
	movq 24(RIO), RX3; \
	rorq $32, RX3; \
	bswapq RX3;

/* write_block() for RX0..RX3 to four consecutive 8-byte blocks at (RIO) */
#define write_block4() \
	bswapq RX0; \
	movq RX0, (RIO); \
	\
	bswapq RX1; \
	movq RX1, 8(RIO); \
	\
	bswapq RX2; \
	movq RX2, 16(RIO); \
	\
	bswapq RX3; \
	movq RX3, 24(RIO);

/* xor_block() for RX0..RX3 into four consecutive 8-byte blocks at (RIO) */
#define xor_block4() \
	bswapq RX0; \
	xorq RX0, (RIO); \
	\
	bswapq RX1; \
	xorq RX1, 8(RIO); \
	\
	bswapq RX2; \
	xorq RX2, 16(RIO); \
	\
	bswapq RX3; \
	xorq RX3, 24(RIO);

/*
 * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool, if true: xor output (only the low byte is tested)
 *
 * output:
 *	32 bytes at dst are overwritten with the result, or xored with it
 *	when the bool is nonzero.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
 * %rbx and %rbp are saved on the stack and restored.
 */
.align 8
.global __blowfish_enc_blk_4way
.type   __blowfish_enc_blk_4way,@function;

__blowfish_enc_blk_4way:
	pushq %rbp;
	pushq %rbx;
	/* %rcx becomes RX2 below, so the xor flag has to live on the stack */
	pushq %rcx;

	preload_roundkey_enc(0);

	/* F4() destroys %rsi, so keep dst in %r11 while src is read via RIO */
	movq %rsi, %r11;
	movq %rdx, RIO;

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/* RT0 (%rbp) is free again: pop the saved xor flag into it */
	popq %rbp;
	movq %r11, RIO;

	test %bpl, %bpl;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %rbp;
	ret;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %rbp;
	ret;
.size __blowfish_enc_blk_4way, .-__blowfish_enc_blk_4way

/*
 * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *
 * output:
 *	32 bytes at dst are overwritten with the result.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
 * %rbx and %rbp are saved on the stack and restored.
 */
.align 8
.global blowfish_dec_blk_4way
.type   blowfish_dec_blk_4way,@function;

blowfish_dec_blk_4way:
	pushq %rbp;
	pushq %rbx;
	preload_roundkey_dec(17);

	/* F4() destroys %rsi, so keep dst in %r11 while src is read via RIO */
	movq %rsi, %r11;
	movq %rdx, RIO;

	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %rbp;

	ret;
.size blowfish_dec_blk_4way, .-blowfish_dec_blk_4way