/*
 * AES-NI + SSE2 implementation of AEGIS-128
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/* The five 128-bit words of the cipher state. */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4
/* KEY (used by init only) and MSG (used everywhere else) share a register. */
#define KEY	%xmm5
#define MSG	%xmm5
#define T0	%xmm6
#define T1	%xmm7

#define STATEP	%rdi
#define LEN	%rsi
/*
 * The C prototypes declare 'length' as unsigned int, so only the low 32 bits
 * of LEN are defined on entry.  All length arithmetic below therefore uses
 * LEN32; 32-bit operations also zero the upper half of the register.
 */
#define LEN32	%esi
#define SRC	%rdx
#define DST	%rcx

/* Constants XORed into the key by crypto_aegis128_aesni_init. */
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/* Byte indices 0..15, compared against the byte count to build a tail mask. */
.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
.align 16
.Laegis128_counter:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f

.text

/*
 * aegis128_update
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state (shifted positions)
 * changed:
 *   T0
 *
 * Every register Rj becomes AESRound(Rj) ^ R(j+1 mod 5).  Consequently the
 * logical state word held by each register advances by one per call: after
 * one update word 0 lives in STATE4, after two updates in STATE3, and so on.
 * Callers XOR the message block into whichever register holds word 0.
 */
.macro aegis128_update
	movdqa STATE4, T0
	aesenc STATE0, STATE4
	aesenc STATE1, STATE0
	aesenc STATE2, STATE1
	aesenc STATE3, STATE2
	aesenc T0,     STATE3
.endm

/*
 * aegis128_load_state: read state words 0..4 from STATEP into STATE0..STATE4.
 * Unaligned loads are used, so STATEP needs no particular alignment.
 */
.macro aegis128_load_state
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4
.endm

/*
 * aegis128_store_state: write the registers w0..w4 to STATEP as state words
 * 0..4.  The argument order undoes the register rotation introduced by
 * aegis128_update.
 */
.macro aegis128_store_state w0 w1 w2 w3 w4
	movdqu \w0, 0x00(STATEP)
	movdqu \w1, 0x10(STATEP)
	movdqu \w2, 0x20(STATEP)
	movdqu \w3, 0x30(STATEP)
	movdqu \w4, 0x40(STATEP)
.endm

/*
 * __load_partial: internal ABI
 * input:
 *   LEN - bytes (only the low 32 bits are read)
 *   SRC - src
 * output:
 *   MSG  - message block, zero-padded above the loaded bytes
 * changed:
 *   T0
 *   %r8
 *   %r9
 *
 * The block is assembled from the top down: each set bit of LEN selects a
 * 1/2/4/8-byte piece, and the bits of LEN above it give that piece's offset.
 */
__load_partial:
	xor %r9d, %r9d
	pxor MSG, MSG

	/* bit 0: one trailing byte at offset (LEN & 0x1E) */
	test $0x1, LEN32
	jz .Lld_partial_1

	mov LEN32, %r8d
	and $0x1E, %r8d
	add SRC, %r8
	mov (%r8), %r9b

.Lld_partial_1:
	/* bit 1: two bytes at offset (LEN & 0x1C) */
	test $0x2, LEN32
	jz .Lld_partial_2

	mov LEN32, %r8d
	and $0x1C, %r8d
	add SRC, %r8
	shl $0x10, %r9
	mov (%r8), %r9w

.Lld_partial_2:
	/* bit 2: four bytes at offset (LEN & 0x18) */
	test $0x4, LEN32
	jz .Lld_partial_4

	mov LEN32, %r8d
	and $0x18, %r8d
	add SRC, %r8
	shl $32, %r9
	mov (%r8), %r8d		/* 32-bit load zero-extends into %r8 */
	xor %r8, %r9

.Lld_partial_4:
	movq %r9, MSG

	/* bit 3: eight bytes at offset (LEN & 0x10) */
	test $0x8, LEN32
	jz .Lld_partial_8

	mov LEN32, %r8d
	and $0x10, %r8d
	add SRC, %r8
	pslldq $8, MSG		/* move the pieces gathered so far to the high half */
	movq (%r8), T0
	pxor T0, MSG

.Lld_partial_8:
	ret
ENDPROC(__load_partial)

/*
 * __store_partial: internal ABI
 * input:
 *   LEN - bytes (only the low 32 bits are read)
 *   DST - dst
 *   T0  - message block
 * changed:
 *   T0 (shifted right by 8 bytes when at least 8 bytes are stored)
 *   %r8
 *   %r9
 *   %r10
 *
 * Writes the low LEN bytes of T0 to DST in 8/4/2/1-byte pieces.
 */
__store_partial:
	mov LEN32, %r8d		/* %r8d = bytes still to store */
	mov DST, %r9		/* %r9  = write cursor */

	movq T0, %r10		/* %r10 = pending bytes, lowest first */

	cmp $8, %r8d
	jb .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0
	movq T0, %r10

	sub $8, %r8d
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8d
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8d
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8d
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $0x10, %r10

	sub $2, %r8d
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8d
	jb .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)

/*
 * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
 *
 * Derives the initial state from key and iv and writes it to state.
 * The key is read with movdqa and so must be 16-byte aligned; the iv and the
 * state may be unaligned.
 */
ENTRY(crypto_aegis128_aesni_init)
	FRAME_BEGIN

	/* load IV: */
	movdqu (%rdx), T1

	/* load key: */
	movdqa (%rsi), KEY
	pxor KEY, T1		/* T1 = KEY xor IV */
	movdqa T1, STATE0
	movdqa KEY, STATE3
	movdqa KEY, STATE4

	/* load the constants: */
	movdqa .Laegis128_const_0(%rip), STATE2
	movdqa .Laegis128_const_1(%rip), STATE1
	pxor STATE2, STATE3
	pxor STATE1, STATE4

	/* update 10 times with KEY / KEY xor IV: */
	aegis128_update; pxor KEY, STATE4
	aegis128_update; pxor T1,  STATE3
	aegis128_update; pxor KEY, STATE2
	aegis128_update; pxor T1,  STATE1
	aegis128_update; pxor KEY, STATE0
	aegis128_update; pxor T1,  STATE4
	aegis128_update; pxor KEY, STATE3
	aegis128_update; pxor T1,  STATE2
	aegis128_update; pxor KEY, STATE1
	aegis128_update; pxor T1,  STATE0

	/* ten updates bring the registers back to their natural order */
	aegis128_store_state STATE0 STATE1 STATE2 STATE3 STATE4

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_init)

/*
 * ad_block: absorb one 16-byte block of associated data.
 *   a   - 'a' for aligned loads (movdqa), 'u' for unaligned loads (movdqu)
 *   w0  - register that holds state word 0 after this block's update
 *   i   - index of the block within the 5-block unrolled loop body
 *   out - label to jump to when fewer than 16 bytes remain afterwards
 */
.macro ad_block a w0 i out
	movdq\a (\i * 0x10)(SRC), MSG
	aegis128_update
	pxor MSG, \w0

	sub $0x10, LEN32
	cmp $0x10, LEN32
	jb \out
.endm

/*
 * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
 *                               const void *data);
 *
 * Absorbs the whole 16-byte blocks of data into the state.  Returns without
 * touching the state when length < 16; any trailing (length % 16) bytes are
 * not consumed.
 */
ENTRY(crypto_aegis128_aesni_ad)
	FRAME_BEGIN

	cmp $0x10, LEN32
	jb .Lad_out

	aegis128_load_state

	/* take the movdqa path only when the data is 16-byte aligned */
	mov SRC, %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 8
.Lad_a_loop:
	ad_block a STATE4 0 .Lad_out_1
	ad_block a STATE3 1 .Lad_out_2
	ad_block a STATE2 2 .Lad_out_3
	ad_block a STATE1 3 .Lad_out_4
	ad_block a STATE0 4 .Lad_out_0

	add $0x50, SRC
	jmp .Lad_a_loop

.align 8
.Lad_u_loop:
	ad_block u STATE4 0 .Lad_out_1
	ad_block u STATE3 1 .Lad_out_2
	ad_block u STATE2 2 .Lad_out_3
	ad_block u STATE1 3 .Lad_out_4
	ad_block u STATE0 4 .Lad_out_0

	add $0x50, SRC
	jmp .Lad_u_loop

	/* store the state (.Lad_out_N: N updates since the last full rotation): */
.Lad_out_0:
	aegis128_store_state STATE0 STATE1 STATE2 STATE3 STATE4
	FRAME_END
	ret

.Lad_out_1:
	aegis128_store_state STATE4 STATE0 STATE1 STATE2 STATE3
	FRAME_END
	ret

.Lad_out_2:
	aegis128_store_state STATE3 STATE4 STATE0 STATE1 STATE2
	FRAME_END
	ret

.Lad_out_3:
	aegis128_store_state STATE2 STATE3 STATE4 STATE0 STATE1
	FRAME_END
	ret

.Lad_out_4:
	aegis128_store_state STATE1 STATE2 STATE3 STATE4 STATE0
	FRAME_END
	ret

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_ad)

/*
 * encrypt_block: encrypt one 16-byte block and absorb the plaintext.
 *   a       - 'a' for aligned, 'u' for unaligned loads/stores
 *   s0..s4  - registers holding state words 0..4 before the update
 *             (s0 is accepted for symmetry but not referenced)
 *   i       - block index within the unrolled loop; also selects the exit
 *             label .Lenc_out_<i> taken when fewer than 16 bytes remain
 *
 * ciphertext = plaintext ^ s1 ^ s4 ^ (s2 & s3)
 */
.macro encrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	movdqa MSG, T0
	pxor \s1, T0
	pxor \s4, T0
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, T0
	movdq\a T0, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4		/* after the update \s4 holds state word 0 */

	sub $0x10, LEN32
	cmp $0x10, LEN32
	jb .Lenc_out_\i
.endm

/*
 * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Encrypts the whole 16-byte blocks of src into dst and updates the state.
 * Returns without touching anything when length < 16; a trailing partial
 * block is left for crypto_aegis128_aesni_enc_tail.
 */
ENTRY(crypto_aegis128_aesni_enc)
	FRAME_BEGIN

	cmp $0x10, LEN32
	jb .Lenc_out

	aegis128_load_state

	/* the movdqa path requires both src and dst to be 16-byte aligned */
	mov  SRC,  %r8
	or   DST,  %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 8
.Lenc_a_loop:
	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_a_loop

.align 8
.Lenc_u_loop:
	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_u_loop

	/* store the state (.Lenc_out_N: exit taken after block N): */
.Lenc_out_0:
	aegis128_store_state STATE4 STATE0 STATE1 STATE2 STATE3
	FRAME_END
	ret

.Lenc_out_1:
	aegis128_store_state STATE3 STATE4 STATE0 STATE1 STATE2
	FRAME_END
	ret

.Lenc_out_2:
	aegis128_store_state STATE2 STATE3 STATE4 STATE0 STATE1
	FRAME_END
	ret

.Lenc_out_3:
	aegis128_store_state STATE1 STATE2 STATE3 STATE4 STATE0
	FRAME_END
	ret

.Lenc_out_4:
	aegis128_store_state STATE0 STATE1 STATE2 STATE3 STATE4
	FRAME_END
	ret

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc)

/*
 * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Encrypts a final partial block: 'length' bytes are gathered from src,
 * zero-padded to 16 bytes, encrypted, and the first 'length' ciphertext
 * bytes are written to dst.  The zero-padded plaintext is absorbed into
 * the state.
 */
ENTRY(crypto_aegis128_aesni_enc_tail)
	FRAME_BEGIN

	aegis128_load_state

	/* encrypt message: */
	call __load_partial

	movdqa MSG, T0
	pxor STATE1, T0
	pxor STATE4, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0

	call __store_partial

	/* MSG still holds the padded plaintext; T0 is free to be clobbered */
	aegis128_update
	pxor MSG, STATE4

	/* one update since load: word 0 is in STATE4 */
	aegis128_store_state STATE4 STATE0 STATE1 STATE2 STATE3

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc_tail)

/*
 * decrypt_block: decrypt one 16-byte block and absorb the plaintext.
 *   a       - 'a' for aligned, 'u' for unaligned loads/stores
 *   s0..s4  - registers holding state words 0..4 before the update
 *             (s0 is accepted for symmetry but not referenced)
 *   i       - block index within the unrolled loop; also selects the exit
 *             label .Ldec_out_<i> taken when fewer than 16 bytes remain
 *
 * plaintext = ciphertext ^ s1 ^ s4 ^ (s2 & s3)
 */
.macro decrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	pxor \s1, MSG
	pxor \s4, MSG
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, MSG
	movdq\a MSG, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4		/* after the update \s4 holds state word 0 */

	sub $0x10, LEN32
	cmp $0x10, LEN32
	jb .Ldec_out_\i
.endm

/*
 * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Decrypts the whole 16-byte blocks of src into dst and updates the state.
 * Returns without touching anything when length < 16; a trailing partial
 * block is left for crypto_aegis128_aesni_dec_tail.
 */
ENTRY(crypto_aegis128_aesni_dec)
	FRAME_BEGIN

	cmp $0x10, LEN32
	jb .Ldec_out

	aegis128_load_state

	/* the movdqa path requires both src and dst to be 16-byte aligned */
	mov  SRC,  %r8
	or   DST,  %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 8
.Ldec_a_loop:
	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_a_loop

.align 8
.Ldec_u_loop:
	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_u_loop

	/* store the state (.Ldec_out_N: exit taken after block N): */
.Ldec_out_0:
	aegis128_store_state STATE4 STATE0 STATE1 STATE2 STATE3
	FRAME_END
	ret

.Ldec_out_1:
	aegis128_store_state STATE3 STATE4 STATE0 STATE1 STATE2
	FRAME_END
	ret

.Ldec_out_2:
	aegis128_store_state STATE2 STATE3 STATE4 STATE0 STATE1
	FRAME_END
	ret

.Ldec_out_3:
	aegis128_store_state STATE1 STATE2 STATE3 STATE4 STATE0
	FRAME_END
	ret

.Ldec_out_4:
	aegis128_store_state STATE0 STATE1 STATE2 STATE3 STATE4
	FRAME_END
	ret

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec)

/*
 * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 */
ENTRY(crypto_aegis128_aesni_dec_tail)
	FRAME_BEGIN

	/*
	 * 'length' is an unsigned int, so the upper 32 bits of LEN (%rsi) are
	 * undefined on entry.  Zero-extend it once here: both the partial
	 * load/store helpers and the mask computation below read LEN.
	 */
	mov %esi, %esi

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* decrypt message: */
	call __load_partial

	pxor STATE1, MSG
	pxor STATE4, MSG
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, MSG

	movdqa MSG, T0
	call __store_partial

	/*
	 * mask with byte count:
	 * the ciphertext was zero-padded, so bytes LEN..15 of MSG now hold
	 * keystream rather than zeros and must be cleared before MSG is
	 * absorbed.  Broadcast the low byte of LEN to all 16 lanes and compare
	 * it against the lane indices: lane i becomes 0xff iff LEN > i.
	 */
	movq LEN, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	movdqa .Laegis128_counter(%rip), T1
	pcmpgtb T1, T0
	pand T0, MSG

	aegis128_update
	pxor MSG, STATE4

	/* store the state (one update since load: word 0 is in STATE4): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec_tail)

/*
 * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Runs seven finalization updates keyed by the two bit lengths, then XORs
 * all five state words into the 16 bytes at tag_xor (read-modify-write).
 * The state buffer itself is not written back.
 */
ENTRY(crypto_aegis128_aesni_final)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* prepare length block: low qword = assoclen, high qword = cryptlen */
	movq %rdx, MSG
	movq %rcx, T0
	pslldq $8, T0
	pxor T0, MSG
	psllq $3, MSG /* multiply by 8 (to get bit count) */

	pxor STATE3, MSG

	/* update state: */
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3
	aegis128_update; pxor MSG, STATE2
	aegis128_update; pxor MSG, STATE1
	aegis128_update; pxor MSG, STATE0
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3

	/* xor tag: */
	movdqu (%rsi), MSG

	pxor STATE0, MSG
	pxor STATE1, MSG
	pxor STATE2, MSG
	pxor STATE3, MSG
	pxor STATE4, MSG

	movdqu MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_final)