/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include <linux/linkage.h>

.file "camellia-x86_64-asm_64.S"
.text

/* Lookup tables (8-byte entries, indexed by one byte); defined elsewhere. */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/*
 * register macros
 *
 * Syntax: AT&T (GAS), run through the C preprocessor.
 * ABI:    System V AMD64.
 *
 * RAB0/RCD0 hold the two 64-bit halves of the first block, RAB1/RCD1
 * those of the second block (2-way code only).  All four must be legacy
 * registers with an addressable high byte (%ah/%bh/%ch/%dh) because
 * xor2ror16() reads "ab ## bh".
 *
 * RIO and RT0 are both %rsi: the I/O pointer is consumed by the
 * *_inpack macros, %rsi then serves as scratch, and the destination
 * pointer kept in RDST is copied back into RIO before *_outunpack.
 *
 * RT1 is %r12 rather than %rbp: using the frame pointer as a scratch
 * register breaks frame-pointer based stack unwinding when this code is
 * interrupted.  %r12 is callee-saved as well, so every entry point parks
 * the caller's value in RR12 (%r10) and restores it before returning.
 */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16: consume the two lowest bytes of 'ab'.
 *
 *   dst ^= T0[ab byte 0] ^ T1[ab byte 1]	(8-byte table entries)
 *   ab   = ror64(ab, 16)			(exposes the next two bytes)
 *
 * Clobbers tmp1 and tmp2.  Four consecutive invocations on the same
 * 'ab' rotate it by a full 64 bits, i.e. leave it unchanged.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl, tmp2 ## d; \
	movzbl ab ## bh, tmp1 ## d; \
	rorq $16, ab; \
	xorq T0(, tmp2, 8), dst; \
	xorq T1(, tmp1, 8), dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/

/*
 * roundsm: one round for a single block.
 *
 * Loads the 8-byte subkey number 'subkey' from the key table into RT2,
 * then XORs the eight table lookups selected by the bytes of ab ## 0
 * alternately into cd ## 0 and RT2, and finally folds RT2 (subkey plus
 * half of the lookups) into cd ## 0.  ab ## 0 is preserved.
 * Clobbers RT0, RT1, RT2.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2, cd ## 0;

/*
 * fls: key-dependent mixing layer applied between groups of six rounds.
 * With hi/lo denoting the upper/lower 32 bits of a 64-bit value and
 * kl/kr the 8-byte key table entries number 'kl' and 'kr':
 *
 *   l.hi ^= rol32(l.lo & kl.lo, 1);
 *   r.lo ^= r.hi | kr.hi;
 *   l.lo ^= l.hi | kl.hi;
 *   r.hi ^= rol32(r.lo & kr.lo, 1);
 *
 * The steps are order dependent (each uses results of earlier ones).
 * Clobbers RT0, RT1, RT2.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
	orq l ## 0, RT2; \
	shrq $32, RT2; \
	xorq RT2, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
	andl r ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, r ## 0;

/* Six encryption rounds using subkeys i + 2 .. i + 7, in ascending order. */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* Mixing layer for encryption: key entries i (left) and i + 1 (right). */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * Load one 16-byte block from (RIO) into RAB0/RCD0: each 8-byte half is
 * byte-swapped and has its 32-bit words exchanged; key table entry 0 is
 * XORed into RAB0.
 */
#define enc_inpack() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX), RAB0;

/*
 * Write one block to (RIO): key table entry 'max' is XORed into RCD0,
 * both halves are converted back to memory byte order, and RCD0 goes to
 * offset 0, RAB0 to offset 8.  'op' is the mnemonic stem used for the
 * store: "mov" overwrites the destination, "xor" XORs into it.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO);

/* Six decryption rounds using subkeys i + 7 .. i + 2, in descending order. */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/* Mixing layer for decryption: key entries i + 1 (left) and i (right). */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/* Same as enc_inpack(), but XORs key table entry 'max' into RAB0. */
#define dec_inpack(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0;

/* Same as enc_outunpack(mov, ...), but XORs key table entry 0 into RCD0. */
#define dec_outunpack() \
	xorq key_table(CTX), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO);

/*
 * __camellia_enc_blk: encrypt one 16-byte block.
 *
 * Runs 18 rounds when key_length(CTX) is 16, 24 rounds otherwise.
 * If 'xor' is non-zero the result is XORed into dst instead of stored.
 *
 * Clobbers: %rax, %rcx, %rsi, %r8-%r11, flags.
 * %r12 is used as scratch but preserved (saved in %r10).
 */
ENTRY(__camellia_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	movq %r12, RR12;

	/* %rcx and %rsi are reused below (RCD0, RT0/RIO); move args away */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	cmpb $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO; /* mov leaves the flags from testb intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	ret;
ENDPROC(__camellia_enc_blk)

/*
 * camellia_dec_blk: decrypt one 16-byte block.
 *
 * Runs 18 rounds when key_length(CTX) is 16, 24 rounds otherwise.
 *
 * Clobbers: %rax, %rcx, %rsi, %r8-%r11, flags.
 * %r12 is used as scratch but preserved (saved in %r10).
 */
ENTRY(camellia_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* RT2 is clobbered by the rounds, so test 'max' before them */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	ret;
ENDPROC(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/

/*
 * roundsm2: one round for two blocks at once.
 *
 * Block 0 is handled exactly as in roundsm().  For block 1 the subkey is
 * XORed into cd ## 1 up front and all lookups accumulate directly into
 * cd ## 1, so RT2 can be folded into cd ## 0 while block 1 is still in
 * progress.  ab ## 0 and ab ## 1 are preserved.
 * Clobbers RT0, RT1, RT2.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	xorq RT2, cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2, cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2: the fls() mixing layer applied to both blocks, with the steps
 * for block 0 and block 1 (indented) interleaved.
 * Clobbers RT0, RT1, RT2.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
		andl l ## 1d, RT2d; \
		roll $1, RT2d; \
		shlq $32, RT2; \
		xorq RT2, l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
		orq r ## 1, RT0; \
		shrq $32, RT0; \
		xorq RT0, r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
	orq l ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
	andl r ## 0d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
		orq l ## 1, RT0; \
		shrq $32, RT0; \
		xorq RT0, l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
		andl r ## 1d, RT1d; \
		roll $1, RT1d; \
		shlq $32, RT1; \
		xorq RT1, r ## 1;

/* Six 2-way encryption rounds using subkeys i + 2 .. i + 7. */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* 2-way mixing layer for encryption: key entries i (left), i + 1 (right). */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * Load two consecutive 16-byte blocks from (RIO): bytes 0..15 into
 * RAB0/RCD0 and bytes 16..31 into RAB1/RCD1, converted as in
 * enc_inpack(); key table entry 0 is XORed into RAB0 and RAB1.
 */
#define enc_inpack2() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX), RAB0; \
	\
		movq 8*2(RIO), RAB1; \
		bswapq RAB1; \
		rorq $32, RAB1; \
		movq 12*2(RIO), RCD1; \
		bswapq RCD1; \
		rolq $32, RCD1; \
		xorq key_table(CTX), RAB1;

/*
 * Write two blocks to (RIO): key table entry 'max' is XORed into RCD0
 * and RCD1; RCD0/RAB0 go to offsets 0/8 and RCD1/RAB1 to offsets 16/24.
 * 'op' is "mov" (store) or "xor" (XOR into the destination).
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO); \
	\
		xorq key_table(CTX, max, 8), RCD1; \
		rolq $32, RCD1; \
		bswapq RCD1; \
		op ## q RCD1, 8*2(RIO); \
		rorq $32, RAB1; \
		bswapq RAB1; \
		op ## q RAB1, 12*2(RIO);

/* Six 2-way decryption rounds using subkeys i + 7 .. i + 2. */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/* 2-way mixing layer for decryption: key entries i + 1 (left), i (right). */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/* Same as enc_inpack2(), but XORs key table entry 'max' into RAB0/RAB1. */
#define dec_inpack2(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0; \
	\
		movq 8*2(RIO), RAB1; \
		bswapq RAB1; \
		rorq $32, RAB1; \
		movq 12*2(RIO), RCD1; \
		bswapq RCD1; \
		rolq $32, RCD1; \
		xorq key_table(CTX, max, 8), RAB1;

/* Same as enc_outunpack2(mov, ...), but XORs key table entry 0 instead. */
#define dec_outunpack2() \
	xorq key_table(CTX), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO); \
	\
		xorq key_table(CTX), RCD1; \
		rolq $32, RCD1; \
		bswapq RCD1; \
		movq RCD1, 8*2(RIO); \
		rorq $32, RAB1; \
		bswapq RAB1; \
		movq RAB1, 12*2(RIO);

/*
 * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
 *
 * Runs 18 rounds when key_length(CTX) is 16, 24 rounds otherwise.
 * If 'xor' is non-zero the results are XORed into dst instead of stored.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 * %rbx (RAB1) is preserved on the stack, %r12 (RT1) in %r10.
 */
ENTRY(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;

	movq %r12, RR12;
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO; /* mov leaves the flags from testb intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;
ENDPROC(__camellia_enc_blk_2way)

/*
 * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks.
 *
 * Runs 18 rounds when key_length(CTX) is 16, 24 rounds otherwise.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 * %rbx (RAB1) is preserved in %r9, %r12 (RT1) in %r10.
 */
ENTRY(camellia_dec_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* RXOR is free again after the cmov; use it to hold the caller's %rbx */
	movq %rbx, RXOR;
	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* RT2 is clobbered by the rounds, so test 'max' before them */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	ret;
ENDPROC(camellia_dec_blk_2way)