/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

/* Byte offsets of the four 32-bit words (a, b, c, d) within a 16-byte block */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct (byte offsets into the ctx) */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */

/*
 * define a few register aliases to allow macro substitution
 *
 * For each alias Rn, the macros below form Rn ## D (32-bit view),
 * Rn ## B (low 8 bits) and Rn ## H (bits 8-15) via token pasting,
 * which is why only non-REX registers with an H form are used here.
 */

#define R0     %rax
#define R0D    %eax
#define R0B    %al
#define R0H    %ah

#define R1     %rbx
#define R1D    %ebx
#define R1B    %bl
#define R1H    %bh

#define R2     %rcx
#define R2D    %ecx
#define R2B    %cl
#define R2H    %ch

#define R3     %rdx
#define R3D    %edx
#define R3B    %dl
#define R3H    %dh


/* performs input whitening: src ^= ctx->w[offset/4] */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/* performs output whitening: src ^= ctx->w[4 + offset/4] */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;


/*
 * One Feistel round of the encryption path.
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 *
 * %r11 = ctx base, %rdi/%edi = scratch index for the s-box lookups,
 * %r8d/%r9d accumulate the g() results for b and a respectively.
 * Note the compound rotations: b sees ror $16 then ror $15 (net rol $1),
 * and c is rotated rol $15 after the key add/xor, pre-rotating it for
 * the role it plays in the following round.
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;

/*
 * Final encryption round.
 *
 * a input register containing a(rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * Differs from encrypt_round in that b:a are packed into %r10
 * (b in the high 32 bits, a in the low 32 bits) for the caller's
 * output whitening, and c gets the plain ror $1 instead of the
 * pre-rotation for a next round.
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D

/*
 * One Feistel round of the decryption path (inverse of encrypt_round).
 *
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 *
 * Mirror of encrypt_round: here a takes the compound ror $16 + ror $15
 * (net rol $1) and d gets the rol $15 pre-rotation for the next round.
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;

/*
 * Final decryption round.
 *
 * a input register containing a
 * b input register containing b
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * As in encrypt_last_round, b:a are packed into %r10 for the caller's
 * whitening, and d gets the plain ror $1 instead of the pre-rotation.
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;

/*
 * twofish_enc_blk - encrypt one 16-byte block
 *
 * In:    %rdi = ctx, %rsi = dst (output block), %rdx = src (input block)
 * Out:   %eax = 1; 16 encrypted bytes stored at (%rsi)
 * Saves: %rbx (callee-saved, used as R1); clobbers %rcx, %rdx, %rdi,
 *        %r8-%r11 and flags.
 * NOTE(review): the constant 1 return presumably signals success to the
 * C caller - confirm against the C prototype declaring this function.
 */
ENTRY(twofish_enc_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two 64-bit halves: R1 = b:a, R3 = d:c */
	movq	(R3),	R1
	movq	8(R3),	R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
	/* split into 32-bit words and pre-rotate a (rol 16 via the
	   32-bit rol on R0D) and d (rol 1) as the round macros expect */
	mov	R1D,	R0D
	rol	$16,	R0D
	shr	$32,	R1
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R3D

	/* 16 rounds; word roles swap between (a,b,c,d) and (c,d,a,b)
	   each round, round-key offset advances by 8 bytes per round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);


	/* %r10 holds the packed b:a pair prepared by encrypt_last_round */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* pack d:c the same way for the second half */
	shl	$32,	R1
	xor	R0,	R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	ret
ENDPROC(twofish_enc_blk)

/*
 * twofish_dec_blk - decrypt one 16-byte block
 *
 * In:    %rdi = ctx, %rsi = dst (output block), %rdx = src (input block)
 * Out:   %eax = 1; 16 decrypted bytes stored at (%rsi)
 * Saves: %rbx (callee-saved, used as R1); clobbers %rcx, %rdx, %rdi,
 *        %r8-%r11 and flags.
 * Inverse of twofish_enc_blk: output whitening is applied first, the
 * rounds run with the key offsets in reverse order (15*8 down to 0),
 * and input whitening is applied last.
 */
ENTRY(twofish_dec_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two 64-bit halves: R1 = b:a, R3 = d:c */
	movq	(R3),	R1
	movq	8(R3),	R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
	/* split into 32-bit words; here b gets the rol 16 and c the
	   rol 1 pre-rotation, as decrypt_round expects */
	mov	R1D,	R0D
	shr	$32,	R1
	rol	$16,	R1D
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R2D

	/* 16 rounds with the key schedule walked backwards */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* %r10 holds the packed b:a pair prepared by decrypt_last_round */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* pack d:c the same way for the second half */
	shl	$32,	R1
	xor	R0,	R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	ret
ENDPROC(twofish_dec_blk)