/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 *  USA
 *
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), preprocessed
 * by cpp.  Every instruction is terminated with ';' so that the multi-line
 * macros below expand into valid statement sequences.
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * structure of crypto context
 *
 * Byte offsets from CTX: the subkey array p starts at 0 and occupies
 * (16 + 2) entries of 4 bytes; the four S-boxes s0..s3 follow, each
 * 256 entries of 4 bytes.
 * NOTE(review): must match the C-side context layout - confirm there.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * register macros
 *
 * CTX is the callee-saved %r12, so every entry point saves and restores it.
 * RIO and RT1 are both %rsi: RIO is only valid until the first F()/F4()
 * runs, which is why each entry point stashes dst in a spare register and
 * reloads RIO just before writing the result.
 */
#define CTX %r12
#define RIO %rsi

/* block registers; RX1..RX3 are only used by the 4-way code */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/* temporaries: S-box indices and the running F result (in RT0) */
#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/* preloaded 64-bit round key pair (4-way code only) */
#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): with b3..b0 the bytes of the low 32 bits of RX0 (b3 most
 * significant), compute
 *	((s0[b3] + s1[b2]) ^ s2[b1]) + s3[b0]
 * in RT0d, swap the two 32-bit halves of RX0 (rolq $32) and XOR the result
 * into the new low half.  The 32-bit operations on RT0d leave the upper
 * half of RT0 zero, so the final xorq does not touch the high half of RX0.
 * Clobbers RT0, RT1 (and therefore RIO), RT2 and flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/*
 * XOR the 8 bytes at p + 4*n into RX0: the low half of RX0 gets entry n,
 * the high half gets entry n + 1.
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* One key addition followed by two applications of F(). */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Load the 8 bytes at p + 4*(n-1) and rotate by 32 so that entry n lands
 * in the low half and entry n - 1 in the high half, then XOR into RX0.
 * Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* One (reversed) key addition followed by two applications of F(). */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load 8 bytes from (RIO).  After the rotate and byte swap the low half of
 * RX0 holds bytes 0-3 read as a big-endian word and the high half holds
 * bytes 4-7 read as a big-endian word.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/*
 * Store RX0 to (RIO).  There is no rotate here: the high half of RX0 goes
 * to bytes 0-3 and the low half to bytes 4-7, both big-endian.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Same byte order as write_block(), but XORs into the 8 bytes at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);

/*
 * __blowfish_enc_blk: encrypt one 8-byte block.
 *
 * In:    %rdi = ctx, %rsi = dst, %rdx = src,
 *        %rcx = bool (only %cl is tested); if non-zero the result is XORed
 *        into dst instead of being stored.
 * Clobb: %rax, %rdi, %rsi, %r8, %r10, %r11, flags.
 *        %r12 is used as CTX and restored from %r11 before returning.
 */
ENTRY(__blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	movq %r12, %r11;

	movq %rdi, CTX;
	/* F() clobbers %rsi, keep dst in %r10 until the output stage */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r11, %r12;

	movq %r10, RIO;
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	ret;
.L__enc_xor:
	xor_block();
	ret;
ENDPROC(__blowfish_enc_blk)

/*
 * blowfish_dec_blk: decrypt one 8-byte block.
 *
 * In:    %rdi = ctx, %rsi = dst, %rdx = src.
 * Clobb: %rax, %rdi, %rsi, %r8, %r10, %r11, flags.
 *        %r12 is used as CTX and restored from %r11 before returning.
 */
ENTRY(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %r12, %r11;

	movq %rdi, CTX;
	/* F() clobbers %rsi, keep dst in %r10 until the output stage */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	movq %r11, %r12;

	ret;
ENDPROC(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * Same result as F(), applied to block register x: all four byte indices
 * are extracted first (the two rorq $16 together swap the 32-bit halves),
 * then the S-box lookups are combined in RT0d and XORed into x.
 * Clobbers RT0, RT1 (and therefore RIO), RT2, RT3 and flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* XOR the key pair currently held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = 8 bytes at p + 4*n (entry n in the low half, n + 1 in the high). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* Apply the already loaded key pair, then fetch the pair for the next round. */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/* Key addition plus two applications of F4() on each of the four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * RKEY = 8 bytes at p + 4*(n-1), rotated by 32 so that entry n is in the
 * low half and entry n - 1 in the high half.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* Apply the already loaded key pair, then fetch the pair for the next round. */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/* Key addition plus two applications of F4() on each of the four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* read_block() applied to four consecutive 8-byte blocks at (RIO). */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* write_block() applied to four consecutive 8-byte blocks at (RIO). */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/* xor_block() applied to four consecutive 8-byte blocks at (RIO). */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);

/*
 * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks (32 bytes).
 *
 * In:    %rdi = ctx, %rsi = dst, %rdx = src,
 *        %rcx = bool (only the low byte is tested); if non-zero the results
 *        are XORed into dst instead of being stored.
 * Clobb: %rax, %rcx, %rdx, %rdi, %rsi, %r8, %r9, %r10, %r11, flags.
 *        %rbx and %r12 are saved on the stack and restored.
 */
ENTRY(__blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	pushq %r12;
	pushq %rbx;
	/* %rcx doubles as RX2, so park the xor flag on the stack */
	pushq %rcx;

	movq %rdi, CTX;
	/* F4() clobbers %rsi, keep dst in %r11 until the output stage */
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/* CTX is no longer needed: reuse %r12 for the saved xor flag */
	popq %r12;
	movq %r11, RIO;

	test %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	ret;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	ret;
ENDPROC(__blowfish_enc_blk_4way)

/*
 * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks (32 bytes).
 *
 * In:    %rdi = ctx, %rsi = dst, %rdx = src.
 * Clobb: %rax, %rcx, %rdx, %rdi, %rsi, %r8, %r9, %r10, %r11, flags.
 *        %rbx and %r12 are saved on the stack and restored.
 */
ENTRY(blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	/* F4() clobbers %rsi, keep dst in %r11 until the output stage */
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	ret;
ENDPROC(blowfish_dec_blk_4way)