/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * structure of crypto context
 *
 * Byte offsets from the context pointer (CTX): the round-key array p
 * (16 + 2 entries) comes first, followed by the four lookup tables
 * s0..s3 of 256 entries each.  Every entry is 4 bytes wide.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * register macros
 *
 * NOTE: RIO and RT1 are both %rsi.  F() and F4() overwrite RT1, so RIO
 * is only valid until the first round and has to be reloaded before
 * the result is written back.
 */
#define CTX %r12
#define RIO %rsi

/* RX0..RX3: one 64-bit data block each (only RX0 is used by 1-way code) */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/* RT0..RT3: temporaries (table indexes and the F() accumulator) */
#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/* preloaded round-key pair for the 4-way code */
#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on RX0.
 *
 * Computes ((s0[b3] + s1[b2]) ^ s2[b1]) + s3[b0] in 32-bit arithmetic,
 * where b3..b0 are the bytes of the low 32 bits of RX0 (b3 = bits
 * 31..24), then swaps the two 32-bit halves of RX0 and xors the result
 * into the new low half.  The 32-bit ops on RT0d leave the upper half
 * of RT0 zero, so the final 64-bit xor only touches the low half.
 *
 * Clobbers: RT0, RT1 (= RIO), RT2, flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/* xor the 8 bytes at p[n] (entries n and n+1) into RX0 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* two Feistel steps, preceded by the round-key pair starting at p[n] */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Load the 8 bytes at p[n-1] (entries n-1 and n), swap the two 32-bit
 * halves and xor them into RX0.  Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* two Feistel steps, preceded by the swapped round-key pair ending at p[n] */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/* load 8 bytes from (RIO) into RX0: swap 32-bit halves, then byte-swap */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/* byte-swap RX0 and store it to (RIO) */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

SYM_FUNC_START(blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %r12 (CTX) is preserved by parking it in %r11 instead of on
	 * the stack; dst is parked in %r10 because F() overwrites %rsi.
	 */
	movq %r12, %r11;

	movq %rdi, CTX;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* RIO was clobbered by F(); point it at dst for the store */
	movq %r10, RIO;
	write_block();

	movq %r11, %r12;

	RET;
SYM_FUNC_END(blowfish_enc_blk)

SYM_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Same register use as blowfish_enc_blk: %r11 keeps the caller's
	 * %r12, %r10 keeps dst.  Round keys are applied from p[17] down.
	 */
	movq %r12, %r11;

	movq %rdi, CTX;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	/* RIO was clobbered by F(); point it at dst for the store */
	movq %r10, RIO;
	write_block();

	movq %r11, %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x is one of RX0..RX3.  All four byte indexes are extracted before
 * any table is read; the two 16-bit rotates add up to a swap of the
 * 32-bit halves of x, after which the 32-bit result is xored into the
 * new low half.
 *
 * Clobbers: RT0, RT1 (= RIO), RT2, RT3, flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* xor the round-key pair held in RKEY into all four blocks */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = the 8 bytes at p[n] (entries n and n+1) */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* apply the current RKEY, then fetch the pair for the next round */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = the 8 bytes at p[n-1] (entries n-1 and n), 32-bit halves swapped */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* apply the current RKEY, then fetch the pair for the next round */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* load four consecutive 8-byte blocks from (RIO) into RX0..RX3 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* byte-swap RX0..RX3 and store them as four consecutive blocks at (RIO) */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/*
 * xor the blocks at (RIO), 8(RIO) and 16(RIO) into RX1, RX2 and RX3
 * respectively, i.e. each block into the *following* result.  RX0 is
 * left untouched.  Clobbers RT0, RT2, RT3.
 */
#define xor_block4() \
	movq (RIO),		RT0; \
	bswapq			RT0; \
	xorq RT0,		RX1; \
	\
	movq 8(RIO),		RT2; \
	bswapq			RT2; \
	xorq RT2,		RX2; \
	\
	movq 16(RIO),		RT3; \
	bswapq			RT3; \
	xorq RT3,		RX3;

SYM_FUNC_START(blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %r12 (CTX) and %rbx (RX1) are saved on the stack.  dst is
	 * parked in %r11 because %r10 is RKEY here and F4() overwrites
	 * %rsi.
	 */
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	/* RKEY now holds p[16]/p[17], preloaded by round_enc4(14) */
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(blowfish_enc_blk_4way)

SYM_FUNC_START(__blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: cbc (bool)
	 *
	 * %rcx and %rdx double as RX2 and RX3, so cbc and src are kept
	 * on the stack until the rounds are done.  Stack, top first:
	 * src, cbc, caller's %rbx, caller's %r12.
	 */
	pushq %r12;
	pushq %rbx;
	pushq %rcx;
	pushq %rdx;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	/* RKEY now holds the swapped p[0]/p[1] pair from round_dec4(3) */
	add_preloaded_roundkey4();

	/* CTX is no longer needed: reuse %r12 for the cbc flag */
	popq RIO;		/* src */
	popq %r12;		/* cbc */
	testq %r12, %r12;
	jz .L_no_cbc_xor;

	/* src is read here, before anything is stored to dst */
	xor_block4();

.L_no_cbc_xor:
	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(__blowfish_dec_blk_4way)