/*
 * AES-NI + SSE2 implementation of AEGIS-128
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/* The five 128-bit words of the cipher state. */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4
/* KEY and MSG alias the same register; KEY is only used by the init code. */
#define KEY	%xmm5
#define MSG	%xmm5
/* Scratch registers. */
#define T0	%xmm6
#define T1	%xmm7

/* Argument registers shared by the ad/enc/dec entry points. */
#define STATEP	%rdi
#define LEN	%rsi
#define SRC	%rdx
#define DST	%rcx

.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/* Byte indices 0..15; compared against the byte count to build a mask. */
.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
.align 16
.Laegis128_counter:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f

.text

/*
 * aegis128_update
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state (shifted positions)
 * changed:
 *   T0
 *
 * Each register receives one AES round of itself xored with the next
 * register, so the new state word 0 ends up in STATE4, word 1 in STATE0,
 * and so on.  The message block is NOT absorbed here; callers xor it into
 * the register that now holds word 0 right after invoking the macro.
 */
.macro aegis128_update
	movdqa STATE4, T0		/* keep the old STATE4 for the last round */
	aesenc STATE0, STATE4
	aesenc STATE1, STATE0
	aesenc STATE2, STATE1
	aesenc STATE3, STATE2
	aesenc T0,     STATE3
.endm

/*
 * __load_partial: internal ABI
 * input:
 *   LEN - bytes
 *   SRC - src
 * output:
 *   MSG  - message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 *
 * Assembles a zero-padded block from the trailing bytes at SRC.  Bits 0-3
 * of LEN select a 1/2/4/8-byte piece each; the pieces are read from the
 * highest offset down so that shifting the accumulator left before each
 * load leaves every byte at its own position.  Bit 4 of LEN is never tested
 * but does contribute to the load offsets.
 */
__load_partial:
	xor %r9d, %r9d			/* 32-bit form also clears the upper half */
	pxor MSG, MSG

	/* 1 byte, at offset LEN & ~1: */
	mov LEN, %r8
	and $0x1, %r8
	jz .Lld_partial_1

	mov LEN, %r8
	and $0x1E, %r8
	add SRC, %r8
	mov (%r8), %r9b

.Lld_partial_1:
	/* 2 bytes, at offset LEN & ~3: */
	mov LEN, %r8
	and $0x2, %r8
	jz .Lld_partial_2

	mov LEN, %r8
	and $0x1C, %r8
	add SRC, %r8
	shl $0x10, %r9
	mov (%r8), %r9w

.Lld_partial_2:
	/* 4 bytes, at offset LEN & ~7: */
	mov LEN, %r8
	and $0x4, %r8
	jz .Lld_partial_4

	mov LEN, %r8
	and $0x18, %r8
	add SRC, %r8
	shl $32, %r9
	mov (%r8), %r8d			/* zero-extends, so the xor only fills the low half */
	xor %r8, %r9

.Lld_partial_4:
	movq %r9, MSG

	/* 8 bytes, at offset LEN & 0x10: */
	mov LEN, %r8
	and $0x8, %r8
	jz .Lld_partial_8

	mov LEN, %r8
	and $0x10, %r8
	add SRC, %r8
	pslldq $8, MSG			/* move the pieces gathered so far to the high qword */
	movq (%r8), T0
	pxor T0, MSG

.Lld_partial_8:
	ret
ENDPROC(__load_partial)

/*
 * __store_partial: internal ABI
 * input:
 *   LEN - bytes
 *   DST - dst
 *   T0  - message block
 * changed:
 *   T0 (shifted right by 8 bytes when LEN >= 8)
 *   %r8
 *   %r9
 *   %r10
 *
 * Writes the first LEN bytes of T0 to DST in 8/4/2/1-byte pieces, lowest
 * offset first.  %r8 counts the bytes still to write, %r9 is the write
 * cursor and %r10 holds the not-yet-written low bytes.
 */
__store_partial:
	mov LEN, %r8
	mov DST, %r9

	movq T0, %r10

	cmp $8, %r8
	jl .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0			/* continue with the high qword */
	movq T0, %r10

	sub $8, %r8
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8
	jl .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8
	jl .Lst_partial_2

	mov %r10w, (%r9)
	shr $0x10, %r10

	sub $2, %r8
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8
	jl .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)

/*
 * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
 *
 * The key is read with an aligned load (movdqa), so it must be 16-byte
 * aligned; the IV and the state buffer are accessed with unaligned moves.
 */
ENTRY(crypto_aegis128_aesni_init)
	FRAME_BEGIN

	/* load IV: */
	movdqu (%rdx), T1

	/* load key: */
	movdqa (%rsi), KEY
	pxor KEY, T1			/* T1 = KEY xor IV */
	movdqa T1, STATE0
	movdqa KEY, STATE3
	movdqa KEY, STATE4

	/* load the constants (RIP-relative keeps the code position-independent): */
	movdqa .Laegis128_const_0(%rip), STATE2
	movdqa .Laegis128_const_1(%rip), STATE1
	pxor STATE2, STATE3
	pxor STATE1, STATE4

	/* update 10 times with KEY / KEY xor IV: */
	aegis128_update; pxor KEY, STATE4
	aegis128_update; pxor T1,  STATE3
	aegis128_update; pxor KEY, STATE2
	aegis128_update; pxor T1,  STATE1
	aegis128_update; pxor KEY, STATE0
	aegis128_update; pxor T1,  STATE4
	aegis128_update; pxor KEY, STATE3
	aegis128_update; pxor T1,  STATE2
	aegis128_update; pxor KEY, STATE1
	aegis128_update; pxor T1,  STATE0

	/* store the state (10 updates = two full rotations, so no reordering): */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_init)

/*
 * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
 *                               const void *data);
 *
 * Absorbs whole 16-byte blocks of associated data into the state.  Returns
 * without touching the state when length < 16; any trailing bytes beyond
 * the last whole block are not read.
 *
 * The loop is unrolled five times so that the register rotation performed
 * by aegis128_update comes back to its starting point; the .Lad_out_N exits
 * store the registers back in the order matching how far the rotation got.
 */
ENTRY(crypto_aegis128_aesni_ad)
	FRAME_BEGIN

	cmp $0x10, LEN
	jb .Lad_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* pick the aligned or the unaligned loop depending on SRC: */
	mov SRC,  %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 8
.Lad_a_loop:
	movdqa 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_1

	movdqa 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_2

	movdqa 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_3

	movdqa 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_4

	movdqa 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_0

	add $0x50, SRC
	jmp .Lad_a_loop

.align 8
.Lad_u_loop:
	movdqu 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_1

	movdqu 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_2

	movdqu 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_3

	movdqu 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_4

	movdqu 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lad_out_0

	add $0x50, SRC
	jmp .Lad_u_loop

	/* store the state: */
.Lad_out_0:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_1:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_2:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_3:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_4:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_ad)

/*
 * encrypt_block: encrypt block number \i of the current group of five.
 *   \a        - 'a' for aligned or 'u' for unaligned SRC/DST accesses
 *   \s0..\s4  - registers currently holding state words 0..4
 *   \i        - block index; selects the SRC/DST offset and the exit label
 *
 * Writes MSG xor \s1 xor \s4 xor (\s2 and \s3) to DST, advances the state,
 * absorbs the plaintext block and leaves via .Lenc_out_\i once fewer than
 * 16 bytes remain.  Changes MSG, T0, T1 and LEN.
 */
.macro encrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	movdqa MSG, T0
	pxor \s1, T0
	pxor \s4, T0
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, T0
	movdq\a T0, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4			/* \s4 now holds the new state word 0 */

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lenc_out_\i
.endm

/*
 * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Encrypts whole 16-byte blocks from src to dst.  Returns without touching
 * the state when length < 16; trailing bytes beyond the last whole block
 * are not processed here.
 */
ENTRY(crypto_aegis128_aesni_enc)
	FRAME_BEGIN

	cmp $0x10, LEN
	jb .Lenc_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* the aligned loop needs both SRC and DST 16-byte aligned: */
	mov  SRC,  %r8
	or   DST,  %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 8
.Lenc_a_loop:
	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_a_loop

.align 8
.Lenc_u_loop:
	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_u_loop

	/* store the state (label N is taken after N + 1 updates): */
.Lenc_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc)

/*
 * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Encrypts a final partial block: the input is gathered zero-padded by
 * __load_partial, only the first `length` ciphertext bytes are written by
 * __store_partial, and the zero-padded plaintext is absorbed into the state.
 */
ENTRY(crypto_aegis128_aesni_enc_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* encrypt message: */
	call __load_partial

	movdqa MSG, T0
	pxor STATE1, T0
	pxor STATE4, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0

	call __store_partial

	aegis128_update
	pxor MSG, STATE4

	/* store the state (rotated by the single update above): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	/* explicit return: without it execution falls through into the next function */
	ret
ENDPROC(crypto_aegis128_aesni_enc_tail)

/*
 * decrypt_block: decrypt block number \i of the current group of five.
 *   \a        - 'a' for aligned or 'u' for unaligned SRC/DST accesses
 *   \s0..\s4  - registers currently holding state words 0..4
 *   \i        - block index; selects the SRC/DST offset and the exit label
 *
 * Recovers the plaintext in MSG, writes it to DST, advances the state,
 * absorbs the plaintext and leaves via .Ldec_out_\i once fewer than 16
 * bytes remain.  Changes MSG, T0 (via aegis128_update), T1 and LEN.
 */
.macro decrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	pxor \s1, MSG
	pxor \s4, MSG
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, MSG
	movdq\a MSG, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4			/* \s4 now holds the new state word 0 */

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Ldec_out_\i
.endm

/*
 * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Decrypts whole 16-byte blocks from src to dst.  Returns without touching
 * the state when length < 16; trailing bytes beyond the last whole block
 * are not processed here.
 */
ENTRY(crypto_aegis128_aesni_dec)
	FRAME_BEGIN

	cmp $0x10, LEN
	jb .Ldec_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* the aligned loop needs both SRC and DST 16-byte aligned: */
	mov  SRC,  %r8
	or   DST,  %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 8
.Ldec_a_loop:
	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_a_loop

.align 8
.Ldec_u_loop:
	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_u_loop

	/* store the state (label N is taken after N + 1 updates): */
.Ldec_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec)

/*
 * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 */
/*
 * Decrypts a final partial block: the input is gathered zero-padded by
 * __load_partial, the first LEN bytes of the result are written out by
 * __store_partial, and the recovered plaintext - with everything past LEN
 * bytes cleared again - is absorbed into the state.
 */
ENTRY(crypto_aegis128_aesni_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* decrypt message: */
	call __load_partial

	pxor STATE1, MSG
	pxor STATE4, MSG
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, MSG

	movdqa MSG, T0			/* __store_partial takes its block in T0 */
	call __store_partial

	/*
	 * mask with byte count: the four unpacks replicate the low byte of
	 * LEN into all 16 lanes, the signed byte compare against 0..15 then
	 * yields 0xff exactly in the lanes whose index is below LEN.
	 */
	movq LEN, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	/* RIP-relative keeps the code position-independent */
	movdqa .Laegis128_counter(%rip), T1
	pcmpgtb T1, T0
	pand T0, MSG			/* clear the bytes past LEN */

	aegis128_update
	pxor MSG, STATE4

	/* store the state (rotated by the single update above): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec_tail)

/*
 * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Runs seven state updates keyed by the length block and xors all five
 * resulting state words into the 16 bytes at tag_xor.  The state is only
 * read; the updated state is not written back to memory.
 */
ENTRY(crypto_aegis128_aesni_final)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* prepare length block (assoclen in the low qword, cryptlen in the high): */
	movq %rdx, MSG
	movq %rcx, T0
	pslldq $8, T0
	pxor T0, MSG
	psllq $3, MSG /* multiply by 8 (to get bit count) */

	pxor STATE3, MSG

	/* update state: */
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3
	aegis128_update; pxor MSG, STATE2
	aegis128_update; pxor MSG, STATE1
	aegis128_update; pxor MSG, STATE0
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3

	/* xor tag (all five words are combined, so their rotation is irrelevant): */
	movdqu (%rsi), MSG

	pxor STATE0, MSG
	pxor STATE1, MSG
	pxor STATE2, MSG
	pxor STATE3, MSG
	pxor STATE4, MSG

	movdqu MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_final)