/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

/*
 * Conventions used in this file:
 *  - AT&T operand order (source first, destination last); the file is run
 *    through the C preprocessor for the #define macros below.
 *  - Entry points receive their arguments in %rdi, %rsi, %rdx, %rcx.
 *  - The camellia_sp* tables are defined elsewhere.  They are indexed with a
 *    scale of 8 (64-bit entries) and addressed without a base register, so
 *    the table references are absolute, not RIP-relative.
 */

#include <linux/linkage.h>

.file "camellia-x86_64-asm_64.S"
.text

.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* short aliases for the external lookup tables */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
/* byte offsets from CTX; key_length is located directly after the table */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
/* RIO shares %rsi with RT0, so it is only valid until RT0 is next written */
#define RIO %rsi
#define RIOd %esi

/* block state: RAB0/RCD0 hold block 0, RAB1/RCD1 hold block 1 (2-way only) */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/* scratch registers; RT1 is %r12, which the entry points save in RR12 */
#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

/* RXOR: saved "xor" flag, RR12: saved %r12, RDST: saved dst pointer */
#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Consume the two lowest bytes of `ab`:
 *	dst ^= T0[byte 0 of ab] ^ T1[byte 1 of ab]	(64-bit table entries)
 *	ab   = ab rotated right by 16 bits
 *
 * `ab` must name a register that has `bl`/`bh` aliases defined above, since
 * the macro pastes those suffixes onto it.  Note that tmp2 receives the low
 * byte and tmp1 the second byte; both are overwritten.  Four consecutive
 * uses on the same `ab` rotate it by 64 bits in total, i.e. leave it
 * unchanged.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,	tmp2 ## d; \
	movzbl ab ## bh,	tmp1 ## d; \
	rorq $16,		ab; \
	xorq T0(, tmp2, 8),	dst; \
	xorq T1(, tmp1, 8),	dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/

/*
 * roundsm(ab, subkey, cd)
 *
 * One round on block 0.  Loads the 64-bit key_table entry number `subkey`
 * into RT2, accumulates the eight table lookups selected by the bytes of
 * ab0 alternately into cd0 and RT2, then folds RT2 into cd0.  Net effect:
 *	cd0 ^= key_table[subkey] ^ (XOR of the eight lookups)
 * ab0 is rotated through a full 64 bits and ends up unchanged.
 * Overwrites RT0, RT1, RT2 and the flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;

/*
 * fls(l, r, kl, kr)
 *
 * Key-dependent mixing of block 0 between round groups (presumably the
 * Camellia FL / FL^-1 layer; confirm against the specification).  With kl
 * and kr being 64-bit key_table entries, the four steps are, in order:
 *	high32(l0) ^= rol32(low32(l0) & low32(kl), 1)
 *	low32(r0)  ^= high32(r0 | kr)
 *	low32(l0)  ^= high32(l0 | kl)		(uses the updated l0)
 *	high32(r0) ^= rol32(low32(r0) & low32(kr), 1)	(uses the updated r0)
 * Overwrites RT0, RT1, RT2 and the flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;

/*
 * Six rounds with key_table entries i+2 .. i+7 in ascending order; the
 * roles of RAB (lookup source) and RCD (target) swap after every round.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* fls step for encryption: kl = entry i, kr = entry i+1 */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * Load one 16-byte block from (RIO): each 8-byte half is byte-swapped and
 * has its 32-bit words exchanged (a 64-bit rotate by 32 in either
 * direction).  The first half goes to RAB0 and is XORed with key_table
 * entry 0; the second half goes to RCD0.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;

/*
 * Write one 16-byte block to (RIO).  RCD0 is first XORed with key_table
 * entry `max` (a register holding the entry index), then both halves get
 * the inverse word/byte swap.  RCD0 is emitted at offset 0 and RAB0 at
 * offset 8.  `op` is the mnemonic stem used for the write: `mov` stores,
 * `xor` XORs into the existing memory contents.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO);

/* Same six rounds as enc_rounds, with the entries in descending order */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/* fls step for decryption: kl = entry i+1, kr = entry i (swapped vs. enc) */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/*
 * Load one 16-byte block from (RIO) for decryption: same byte/word swapping
 * as the encryption load, but RAB0 is XORed with key_table entry `max`
 * (a register holding the entry index) instead of entry 0.
 */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;

/*
 * Store the decrypted block to (RIO): RCD0 is XORed with key_table entry 0,
 * both halves get the inverse word/byte swap, RCD0 is written at offset 0
 * and RAB0 at offset 8.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO);

/*
 * __camellia_enc_blk: encrypt one 16-byte block.
 *
 * Reads 16 bytes from src and writes 16 bytes to dst.  If the low byte of
 * the `xor` argument is non-zero, the result is XORed into dst instead of
 * being stored.
 *
 * Writes %rax, %rcx, %rsi, %r8-%r11 and the flags.  %r12 is used as scratch
 * (RT1) but is parked in %r10 on entry and restored before returning.  No
 * stack is used.
 */
ENTRY(__camellia_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	movq %r12, RR12;

	/* %rcx and %rsi are about to be reused as RCD0 and RIO/RT0 */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/* only the low byte of key_length is compared against 16 */
	cmpb $16, key_length(CTX);
	je .L__enc_done;

	/* key_length != 16: one more fls step and six more rounds */
	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;	/* movq leaves the flags set by testb intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	ret;
ENDPROC(__camellia_enc_blk)

/*
 * camellia_dec_blk: decrypt one 16-byte block.
 *
 * Reads 16 bytes from src and stores 16 bytes to dst.
 *
 * Writes %rax, %rcx, %rsi, %r8-%r11 and the flags.  %r12 is used as scratch
 * (RT1) but is parked in %r10 and restored before returning.  No stack is
 * used.
 */
ENTRY(camellia_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; RXORd is only a temporary here */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* RT2 is overwritten by the round macros, so test max before them */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	ret;
ENDPROC(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/

/*
 * roundsm2(ab, subkey, cd)
 *
 * One round applied to two blocks at once.  Net effect:
 *	cd0 ^= key_table[subkey] ^ (XOR of the eight lookups selected by ab0)
 *	cd1 ^= key_table[subkey] ^ (XOR of the eight lookups selected by ab1)
 * The subkey is XORed into cd1 up front.  For block 0 it stays in RT2,
 * which also collects half of the lookups and is folded into cd0 after the
 * first block-1 lookup pair has been issued.  ab0 and ab1 are each rotated
 * through a full 64 bits and end up unchanged.
 * Overwrites RT0, RT1, RT2 and the flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2(l, r, kl, kr)
 *
 * Two-block version of the fls step.  For each block n in {0, 1}, with kl
 * and kr being 64-bit key_table entries:
 *	high32(l_n) ^= rol32(low32(l_n) & low32(kl), 1)
 *	low32(r_n)  ^= high32(r_n | kr)
 *	low32(l_n)  ^= high32(l_n | kl)		(uses the updated l_n)
 *	high32(r_n) ^= rol32(low32(r_n) & low32(kr), 1)	(uses the updated r_n)
 * The first two steps are issued for block 0 and then block 1, followed by
 * the last two steps for block 0 and then block 1 (extra-indented groups
 * below belong to block 1).
 * Overwrites RT0, RT1, RT2 and the flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
		andl l ## 1d,					RT2d; \
		roll $1,					RT2d; \
		shlq $32,					RT2; \
		xorq RT2,					l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
		orq r ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
		orq l ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
		andl r ## 1d,					RT1d; \
		roll $1,					RT1d; \
		shlq $32,					RT1; \
		xorq RT1,					r ## 1;

/*
 * Six two-block rounds with key_table entries i+2 .. i+7 in ascending
 * order; RAB and RCD swap roles after every round.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* two-block fls step for encryption: kl = entry i, kr = entry i+1 */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * Load two consecutive 16-byte blocks (32 bytes) from (RIO).  Each 8-byte
 * half is byte-swapped and has its 32-bit words exchanged.  Offsets 0 and 8
 * go to RAB0/RCD0, offsets 16 and 24 to RAB1/RCD1.  Both RAB registers are
 * XORed with key_table entry 0.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
	movq 8*2(RIO),			RAB1; \
	bswapq				RAB1; \
	rorq $32,			RAB1; \
	movq 12*2(RIO),			RCD1; \
	bswapq				RCD1; \
	rolq $32,			RCD1; \
	xorq key_table(CTX),		RAB1;

/*
 * Write two 16-byte blocks to (RIO).  Each RCD register is XORed with
 * key_table entry `max` (a register holding the entry index), all four
 * halves get the inverse word/byte swap, and they are emitted in the order
 * RCD0, RAB0, RCD1, RAB1 at offsets 0, 8, 16, 24.  `op` is the mnemonic
 * stem used for the write: `mov` stores, `xor` XORs into memory.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO); \
	\
	xorq key_table(CTX, max, 8),	RCD1; \
	rolq $32,			RCD1; \
	bswapq				RCD1; \
	op ## q RCD1,			8*2(RIO); \
	rorq $32,			RAB1; \
	bswapq				RAB1; \
	op ## q RAB1,			12*2(RIO);

/* Same six two-block rounds, with the entries in descending order */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/* two-block fls step for decryption: kl = entry i+1, kr = entry i */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/*
 * Load two 16-byte blocks from (RIO) for decryption: same layout and
 * swapping as the encryption load, but both RAB registers are XORed with
 * key_table entry `max` (a register holding the entry index).
 */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
	movq 8*2(RIO),			RAB1; \
	bswapq				RAB1; \
	rorq $32,			RAB1; \
	movq 12*2(RIO),			RCD1; \
	bswapq				RCD1; \
	rolq $32,			RCD1; \
	xorq key_table(CTX, max, 8),	RAB1;

/*
 * Store two decrypted blocks to (RIO): each RCD register is XORed with
 * key_table entry 0, all halves get the inverse word/byte swap, and they
 * are written in the order RCD0, RAB0, RCD1, RAB1 at offsets 0, 8, 16, 24.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO); \
	\
	xorq key_table(CTX),		RCD1; \
	rolq $32,			RCD1; \
	bswapq				RCD1; \
	movq RCD1,			8*2(RIO); \
	rorq $32,			RAB1; \
	bswapq				RAB1; \
	movq RAB1,			12*2(RIO);

/*
 * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src and writes 32 bytes to dst.  If the low byte of
 * the `xor` argument is non-zero, the result is XORed into dst instead of
 * being stored.
 *
 * Writes %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.  %rbx (RAB1) is
 * saved on the stack and %r12 (RT1) in %r10; both are restored on every
 * return path.
 */
ENTRY(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;

	movq %r12, RR12;
	/* %rcx, %rsi and %rdx are about to be reused as RCD0, RIO/RT0, RCD1 */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/* only the low byte of key_length is compared against 16 */
	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	/* key_length != 16: one more fls step and six more rounds */
	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	/* explicit byte suffix, matching the 1-way routine */
	testb RXORbl, RXORbl;
	movq RDST, RIO;	/* movq leaves the flags set by testb intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;
ENDPROC(__camellia_enc_blk_2way)

/*
 * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src and stores 32 bytes to dst.
 *
 * Writes %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.  %rbx (RAB1) is
 * parked in %r9 and %r12 (RT1) in %r10; both are restored before returning.
 * No stack is used.
 */
ENTRY(camellia_dec_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; RXORd is only a temporary here */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* the temporary in RXOR is dead now, so RXOR can hold the saved %rbx */
	movq %rbx, RXOR;
	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* RT2 is overwritten by the round macros, so test max before them */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	ret;
ENDPROC(camellia_dec_blk_2way)