/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), run through the C
 * preprocessor.  Every entry point receives its arguments in
 * %rdi, %rsi, %rdx (and %rcx for the encrypt variants), as listed in the
 * "input:" comment at the top of each function.
 */

/* structure of crypto context
 *
 * Byte offsets from CTX: the P-array starts at 0 and occupies (16 + 2)
 * 4-byte words; the four S-boxes follow back to back, 256 4-byte words each.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/* register macros */
#define CTX	%rdi

/* RIO is the block I/O pointer.  It is the same register as RT1 below, so
 * it is only valid up to the first F()/F4() and must be reloaded before the
 * output is written.
 */
#define RIO	%rsi

/* RX0..RX3: one 64-bit block each.  The 1-way code uses RX0 only. */
#define RX0	%rax
#define RX1	%rbx
#define RX2	%rcx
#define RX3	%rdx

#define RX0d	%eax
#define RX1d	%ebx
#define RX2d	%ecx
#define RX3d	%edx

#define RX0bl	%al
#define RX1bl	%bl
#define RX2bl	%cl
#define RX3bl	%dl

#define RX0bh	%ah
#define RX1bh	%bh
#define RX2bh	%ch
#define RX3bh	%dh

/* RT0..RT3: scratch registers used for S-box indices and the F result. */
#define RT0	%rbp
#define RT1	%rsi
#define RT2	%r8
#define RT3	%r9

#define RT0d	%ebp
#define RT1d	%esi
#define RT2d	%r8d
#define RT3d	%r9d

/* RKEY: round key preloaded for the 4-way code. */
#define RKEY	%r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on RX0.
 *
 * The four bytes of the low 32-bit half of RX0 index s0..s3 (most
 * significant byte -> s0, least significant byte -> s3) and are combined as
 * ((s0[a] + s1[b]) ^ s2[c]) + s3[d] in RT0d.  RX0 is rotated by 32 in total
 * (-16, +16, +32), i.e. its halves are swapped, and the result is XORed in.
 * RT0d is written with 32-bit operations, so the upper half of RT0 is zero
 * and the final xorq only changes the low half of RX0.
 *
 * Clobbers: RT0, RT1, RT2, flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/* XOR the 8 bytes at P word index n (words n and n+1) into RX0. */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Key addition for words n and n+1, followed by two Feistel steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Load the 8 bytes at P word index n-1 (words n-1 and n), swap the two
 * words with a 32-bit rotate, and XOR them into RX0.  Clobbers RT0.
 * The argument is parenthesised so that an expression argument keeps its
 * intended value, matching preload_roundkey_dec().
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* Decrypt counterpart of round_enc(): reversed key words, two Feistel steps. */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/* Load one 8-byte block from (RIO): swap the 32-bit halves, then byte-swap. */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/* Byte-swap RX0 and store it to (RIO). */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Byte-swap RX0 and XOR it into the 8 bytes already at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);

/*
 * __blowfish_enc_blk: process one 8-byte block with the P words in
 * ascending order.
 *
 * Clobbers: %rax, %rsi, %r8, %r10, %r11, flags.  %rbp is used as scratch
 * by F() and is preserved through %r11 rather than the stack.
 */
ENTRY(__blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	movq %rbp, %r11;

	/* %rsi is both RIO and RT1: keep dst in %r10 while F() runs. */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r11, %rbp;

	movq %r10, RIO;
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	ret;
.L__enc_xor:
	xor_block();
	ret;
ENDPROC(__blowfish_enc_blk)

/*
 * blowfish_dec_blk: process one 8-byte block with the P words in
 * descending order and store the result to dst.
 *
 * Clobbers: %rax, %rsi, %r8, %r10, %r11, flags.  %rbp is preserved
 * through %r11.
 */
ENTRY(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %rbp, %r11;

	/* %rsi is both RIO and RT1: keep dst in %r10 while F() runs. */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	movq %r11, %rbp;

	ret;
ENDPROC(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x must be one of the RX0..RX3 macro names, not a raw register: the byte
 * aliases are formed by pasting "bh"/"bl" onto the name.  The two 16-bit
 * rotates leave x rotated by 32 (halves swapped) before the result is
 * XORed into its low half.
 *
 * Clobbers: RT0, RT1, RT2, RT3, flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* XOR the round key currently held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = the 8 bytes at P word index n (words n and n+1). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* Apply the key already in RKEY, then fetch the key for the next round. */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/* Key addition plus two Feistel steps, interleaved across the four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = the 8 bytes at P word index n-1 with the two words swapped. */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* Apply the key already in RKEY, then fetch the key for the next round. */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* Decrypt counterpart of round_enc4(). */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* Load four consecutive 8-byte blocks from (RIO) into RX0..RX3, each with
 * its halves swapped and then byte-swapped.  RX3 is %rdx, so the source
 * pointer must already be in RIO.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* Byte-swap RX0..RX3 and store them to four consecutive blocks at (RIO). */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/* Byte-swap RX0..RX3 and XOR them into four consecutive blocks at (RIO). */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);

/*
 * __blowfish_enc_blk_4way: process four consecutive 8-byte blocks with the
 * P words in ascending order.
 *
 * %rbp and %rbx are saved on the stack and restored.  %rcx (the xor flag)
 * is also pushed because %rcx is RX2; it is popped back into %rbp once the
 * rounds are finished.
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
 */
ENTRY(__blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	pushq %rbp;
	pushq %rbx;
	pushq %rcx;

	preload_roundkey_enc(0);

	/* %rsi is both RIO and RT1: keep dst in %r11 while F4() runs. */
	movq %rsi, %r11;
	movq %rdx, RIO;

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/* Pairs with "pushq %rcx" above: %rbp now holds the xor flag. */
	popq %rbp;
	movq %r11, RIO;

	test %bpl, %bpl;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %rbp;
	ret;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %rbp;
	ret;
ENDPROC(__blowfish_enc_blk_4way)

/*
 * blowfish_dec_blk_4way: process four consecutive 8-byte blocks with the
 * P words in descending order and store the results to dst.
 *
 * %rbp and %rbx are saved on the stack and restored.
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
 */
ENTRY(blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %rbp;
	pushq %rbx;
	preload_roundkey_dec(17);

	/* %rsi is both RIO and RT1: keep dst in %r11 while F4() runs. */
	movq %rsi, %r11;
	movq %rdx, RIO;

	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %rbp;

	ret;
ENDPROC(blowfish_dec_blk_4way)