#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#               IALU/gcc-4.8(*) AVX(**)         AVX2    AVX-512
# P4            4.46/+120%      -
# Core 2        2.41/+90%       -
# Westmere      1.88/+120%      -
# Sandy Bridge  1.39/+140%      1.10
# Haswell       1.14/+175%      1.11            0.65
# Skylake[-X]   1.13/+120%      0.96            0.51    [0.35]
# Silvermont    2.83/+95%       -
# Knights L     3.60/?          1.65            1.10    0.41(***)
# Goldmont      1.70/+180%      -
# VIA Nano      1.82/+150%      -
# Sledgehammer  1.38/+160%      -
# Bulldozer     2.30/+130%      0.97
# Ryzen         1.15/+200%      1.08            1.18
#
# (*)   improvement coefficients relative to clang are more modest and
#       are ~50% on most processors, in both cases we are comparing to
#       __int128 code;
# (**)  SSE2 implementation was attempted, but among non-AVX processors
#       it was faster than integer-only code only on older Intel P4 and
#       Core processors, 50-30%, less newer processor is, but slower on
#       contemporary ones, for example almost 2x slower on Atom, and as
#       former are naturally disappearing, SSE2 is deemed unnecessary;
# (***) strangely enough performance seems to vary from core to core,
#       listed result is best case;

# Command line: [flavour] [output].  A single argument containing a dot is
# taken to be the output file name rather than a flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# $win64  - set for nasm/masm/mingw64 flavours or a ".asm" output file.
# $kernel - set when neither flavour nor output was given; in that mode the
#           translator is not spawned and no toolchain probing is done.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	# Look for the translator next to this script, then in ../../perlasm/.
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	# Everything printed to STDOUT from here on is piped through the
	# translator.  Fail loudly if the pipe cannot be started instead of
	# silently producing nothing.
	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	    or die "can't call $xlate: $!";
	*STDOUT=*OUT;

	# $avx ends up as a small capability level derived from the version
	# of whichever assembler/compiler answers first; each comparison
	# below contributes 0 or 1 to it.
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}

# declare_function($name, $align, $nargs)
#
# Appends the opening of an assembly function to $code.
#   $name  - symbol name
#   $align - alignment passed to .align
#   $nargs - argument count, only used in the ".type" line of the
#            non-kernel form
# Kernel mode emits ".align", SYM_FUNC_START($name) and a local ".L$name"
# label; otherwise ".globl", ".type", ".align" and the "$name:" label.
#
# No prototype is declared: the sub takes arguments, so an empty "()"
# prototype would be wrong for any call made without a leading "&".
sub declare_function {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= ".align $align\n";
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl $name\n";
		$code .= ".type $name,\@function,$nargs\n";
		$code .= ".align $align\n";
		$code .= "$name:\n";
	}
}

# end_function($name)
#
# Appends the closing of an assembly function to $code:
# SYM_FUNC_END($name) in kernel mode, a ".size" directive otherwise.
sub end_function {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size $name,.-$name\n";
	}
}

$code.=<<___ if $kernel;
#include <linux/linkage.h>
___

# Constant tables, emitted only when some AVX level is enabled; in kernel
# mode they are placed in .rodata.  Backtick-quoted expressions are left
# verbatim here; presumably they are evaluated before the text is printed
# (that step is not in this part of the file).
if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___ if (!$kernel);
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# Register aliases interpolated into the heredocs below.
# Note that $d3 and $ctx both name %rdi.
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

# poly1305_iteration()
#
# Appends one multiply-and-reduce step to $code.  The emitted sequence
# writes %rax, %rdx and $d1-$d3 in addition to $h0-$h2; because $d3 is
# %rdi, the register also known as $ctx is overwritten.
sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text
___
$code.=<<___ if (!$kernel);
.extern	OPENSSL_ia32cap_P

.globl	poly1305_init_x86_64
.hidden	poly1305_init_x86_64
.globl	poly1305_blocks_x86_64
.hidden	poly1305_blocks_x86_64
.globl	poly1305_emit_x86_64
.hidden	poly1305_emit_x86_64
___

# ---- poly1305_init_x86_64 ----------------------------------------------
# Emitted code zeroes h[0..2] in the context and returns with %eax == 0 if
# the key pointer ($inp) is null.  Otherwise it masks the two 64-bit key
# words, stores them at offsets 24 and 32 of the context and returns 1.
# Non-kernel builds additionally pick the blocks/emit entry points from the
# OPENSSL_ia32cap_P bits (as far as $avx allows) and store the two chosen
# pointers through %rdx: 8 bytes each, or 4 bytes each for elf32 flavours.
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key
___
$code.=<<___ if (!$kernel);
	lea	poly1305_blocks_x86_64(%rip),%r10
	lea	poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___ if (!$kernel && $avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___ if (!$kernel && $avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___ if (!$kernel && $avx>3);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	ret
___
&end_function("poly1305_init_x86_64");

# ---- poly1305_blocks_x86_64 --------------------------------------------
# Emitted code processes $len/16 blocks (returning immediately when that
# is zero).  Each pass of .Loop adds 16 input bytes plus $padbit into
# h0:h1:h2 and then runs the sequence from poly1305_iteration().  $ctx is
# pushed last and reloaded from 0(%rsp) after the loop, before the hash is
# stored back; the remaining saved registers are restored with plain loads
# followed by a single 48-byte stack adjustment.
&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	$ctx
.cfi_push	$ctx
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___

	&poly1305_iteration();

$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	0(%rsp),$ctx
.cfi_restore	$ctx

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	8(%rsp),%r15
.cfi_restore	%r15
	mov	16(%rsp),%r14
.cfi_restore	%r14
	mov	24(%rsp),%r13
.cfi_restore	%r13
	mov	32(%rsp),%r12
.cfi_restore	%r12
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	ret
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");

# ---- poly1305_emit_x86_64 ----------------------------------------------
# Emitted code loads the three hash words, computes hash+5 alongside the
# original value, keeps the incremented low 128 bits only when the top
# word shifted right by two is non-zero, then adds the two nonce words and
# writes the 16-byte result to $mac.
&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	ret
___
&end_function("poly1305_emit_x86_64");

# Start of the section that is generated only when an AVX level is
# enabled; the matching closing brace is further down the file.
if ($avx) {

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key. There are
# 5 digits, but last four are interleaved with multiples of 5, totalling
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
419 420my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = 421 map("%xmm$_",(0..15)); 422 423$code.=<<___; 424.type __poly1305_block,\@abi-omnipotent 425.align 32 426__poly1305_block: 427 push $ctx 428___ 429 &poly1305_iteration(); 430$code.=<<___; 431 pop $ctx 432 ret 433.size __poly1305_block,.-__poly1305_block 434 435.type __poly1305_init_avx,\@abi-omnipotent 436.align 32 437__poly1305_init_avx: 438 push %rbp 439 mov %rsp,%rbp 440 mov $r0,$h0 441 mov $r1,$h1 442 xor $h2,$h2 443 444 lea 48+64($ctx),$ctx # size optimization 445 446 mov $r1,%rax 447 call __poly1305_block # r^2 448 449 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 450 mov \$0x3ffffff,%edx 451 mov $h0,$d1 452 and $h0#d,%eax 453 mov $r0,$d2 454 and $r0#d,%edx 455 mov %eax,`16*0+0-64`($ctx) 456 shr \$26,$d1 457 mov %edx,`16*0+4-64`($ctx) 458 shr \$26,$d2 459 460 mov \$0x3ffffff,%eax 461 mov \$0x3ffffff,%edx 462 and $d1#d,%eax 463 and $d2#d,%edx 464 mov %eax,`16*1+0-64`($ctx) 465 lea (%rax,%rax,4),%eax # *5 466 mov %edx,`16*1+4-64`($ctx) 467 lea (%rdx,%rdx,4),%edx # *5 468 mov %eax,`16*2+0-64`($ctx) 469 shr \$26,$d1 470 mov %edx,`16*2+4-64`($ctx) 471 shr \$26,$d2 472 473 mov $h1,%rax 474 mov $r1,%rdx 475 shl \$12,%rax 476 shl \$12,%rdx 477 or $d1,%rax 478 or $d2,%rdx 479 and \$0x3ffffff,%eax 480 and \$0x3ffffff,%edx 481 mov %eax,`16*3+0-64`($ctx) 482 lea (%rax,%rax,4),%eax # *5 483 mov %edx,`16*3+4-64`($ctx) 484 lea (%rdx,%rdx,4),%edx # *5 485 mov %eax,`16*4+0-64`($ctx) 486 mov $h1,$d1 487 mov %edx,`16*4+4-64`($ctx) 488 mov $r1,$d2 489 490 mov \$0x3ffffff,%eax 491 mov \$0x3ffffff,%edx 492 shr \$14,$d1 493 shr \$14,$d2 494 and $d1#d,%eax 495 and $d2#d,%edx 496 mov %eax,`16*5+0-64`($ctx) 497 lea (%rax,%rax,4),%eax # *5 498 mov %edx,`16*5+4-64`($ctx) 499 lea (%rdx,%rdx,4),%edx # *5 500 mov %eax,`16*6+0-64`($ctx) 501 shr \$26,$d1 502 mov %edx,`16*6+4-64`($ctx) 503 shr \$26,$d2 504 505 mov $h2,%rax 506 shl \$24,%rax 507 or %rax,$d1 508 mov $d1#d,`16*7+0-64`($ctx) 509 lea 
($d1,$d1,4),$d1 # *5 510 mov $d2#d,`16*7+4-64`($ctx) 511 lea ($d2,$d2,4),$d2 # *5 512 mov $d1#d,`16*8+0-64`($ctx) 513 mov $d2#d,`16*8+4-64`($ctx) 514 515 mov $r1,%rax 516 call __poly1305_block # r^3 517 518 mov \$0x3ffffff,%eax # save r^3 base 2^26 519 mov $h0,$d1 520 and $h0#d,%eax 521 shr \$26,$d1 522 mov %eax,`16*0+12-64`($ctx) 523 524 mov \$0x3ffffff,%edx 525 and $d1#d,%edx 526 mov %edx,`16*1+12-64`($ctx) 527 lea (%rdx,%rdx,4),%edx # *5 528 shr \$26,$d1 529 mov %edx,`16*2+12-64`($ctx) 530 531 mov $h1,%rax 532 shl \$12,%rax 533 or $d1,%rax 534 and \$0x3ffffff,%eax 535 mov %eax,`16*3+12-64`($ctx) 536 lea (%rax,%rax,4),%eax # *5 537 mov $h1,$d1 538 mov %eax,`16*4+12-64`($ctx) 539 540 mov \$0x3ffffff,%edx 541 shr \$14,$d1 542 and $d1#d,%edx 543 mov %edx,`16*5+12-64`($ctx) 544 lea (%rdx,%rdx,4),%edx # *5 545 shr \$26,$d1 546 mov %edx,`16*6+12-64`($ctx) 547 548 mov $h2,%rax 549 shl \$24,%rax 550 or %rax,$d1 551 mov $d1#d,`16*7+12-64`($ctx) 552 lea ($d1,$d1,4),$d1 # *5 553 mov $d1#d,`16*8+12-64`($ctx) 554 555 mov $r1,%rax 556 call __poly1305_block # r^4 557 558 mov \$0x3ffffff,%eax # save r^4 base 2^26 559 mov $h0,$d1 560 and $h0#d,%eax 561 shr \$26,$d1 562 mov %eax,`16*0+8-64`($ctx) 563 564 mov \$0x3ffffff,%edx 565 and $d1#d,%edx 566 mov %edx,`16*1+8-64`($ctx) 567 lea (%rdx,%rdx,4),%edx # *5 568 shr \$26,$d1 569 mov %edx,`16*2+8-64`($ctx) 570 571 mov $h1,%rax 572 shl \$12,%rax 573 or $d1,%rax 574 and \$0x3ffffff,%eax 575 mov %eax,`16*3+8-64`($ctx) 576 lea (%rax,%rax,4),%eax # *5 577 mov $h1,$d1 578 mov %eax,`16*4+8-64`($ctx) 579 580 mov \$0x3ffffff,%edx 581 shr \$14,$d1 582 and $d1#d,%edx 583 mov %edx,`16*5+8-64`($ctx) 584 lea (%rdx,%rdx,4),%edx # *5 585 shr \$26,$d1 586 mov %edx,`16*6+8-64`($ctx) 587 588 mov $h2,%rax 589 shl \$24,%rax 590 or %rax,$d1 591 mov $d1#d,`16*7+8-64`($ctx) 592 lea ($d1,$d1,4),$d1 # *5 593 mov $d1#d,`16*8+8-64`($ctx) 594 595 lea -48-64($ctx),$ctx # size [de-]optimization 596 pop %rbp 597 ret 598.size __poly1305_init_avx,.-__poly1305_init_avx 
599___ 600 601&declare_function("poly1305_blocks_avx", 32, 4); 602$code.=<<___; 603.cfi_startproc 604 mov 20($ctx),%r8d # is_base2_26 605 cmp \$128,$len 606 jae .Lblocks_avx 607 test %r8d,%r8d 608 jz .Lblocks 609 610.Lblocks_avx: 611 and \$-16,$len 612 jz .Lno_data_avx 613 614 vzeroupper 615 616 test %r8d,%r8d 617 jz .Lbase2_64_avx 618 619 test \$31,$len 620 jz .Leven_avx 621 622 push %rbp 623.cfi_push %rbp 624 mov %rsp,%rbp 625 push %rbx 626.cfi_push %rbx 627 push %r12 628.cfi_push %r12 629 push %r13 630.cfi_push %r13 631 push %r14 632.cfi_push %r14 633 push %r15 634.cfi_push %r15 635.Lblocks_avx_body: 636 637 mov $len,%r15 # reassign $len 638 639 mov 0($ctx),$d1 # load hash value 640 mov 8($ctx),$d2 641 mov 16($ctx),$h2#d 642 643 mov 24($ctx),$r0 # load r 644 mov 32($ctx),$s1 645 646 ################################# base 2^26 -> base 2^64 647 mov $d1#d,$h0#d 648 and \$`-1*(1<<31)`,$d1 649 mov $d2,$r1 # borrow $r1 650 mov $d2#d,$h1#d 651 and \$`-1*(1<<31)`,$d2 652 653 shr \$6,$d1 654 shl \$52,$r1 655 add $d1,$h0 656 shr \$12,$h1 657 shr \$18,$d2 658 add $r1,$h0 659 adc $d2,$h1 660 661 mov $h2,$d1 662 shl \$40,$d1 663 shr \$24,$h2 664 add $d1,$h1 665 adc \$0,$h2 # can be partially reduced... 666 667 mov \$-4,$d2 # ... 
so reduce 668 mov $h2,$d1 669 and $h2,$d2 670 shr \$2,$d1 671 and \$3,$h2 672 add $d2,$d1 # =*5 673 add $d1,$h0 674 adc \$0,$h1 675 adc \$0,$h2 676 677 mov $s1,$r1 678 mov $s1,%rax 679 shr \$2,$s1 680 add $r1,$s1 # s1 = r1 + (r1 >> 2) 681 682 add 0($inp),$h0 # accumulate input 683 adc 8($inp),$h1 684 lea 16($inp),$inp 685 adc $padbit,$h2 686 687 call __poly1305_block 688 689 test $padbit,$padbit # if $padbit is zero, 690 jz .Lstore_base2_64_avx # store hash in base 2^64 format 691 692 ################################# base 2^64 -> base 2^26 693 mov $h0,%rax 694 mov $h0,%rdx 695 shr \$52,$h0 696 mov $h1,$r0 697 mov $h1,$r1 698 shr \$26,%rdx 699 and \$0x3ffffff,%rax # h[0] 700 shl \$12,$r0 701 and \$0x3ffffff,%rdx # h[1] 702 shr \$14,$h1 703 or $r0,$h0 704 shl \$24,$h2 705 and \$0x3ffffff,$h0 # h[2] 706 shr \$40,$r1 707 and \$0x3ffffff,$h1 # h[3] 708 or $r1,$h2 # h[4] 709 710 sub \$16,%r15 711 jz .Lstore_base2_26_avx 712 713 vmovd %rax#d,$H0 714 vmovd %rdx#d,$H1 715 vmovd $h0#d,$H2 716 vmovd $h1#d,$H3 717 vmovd $h2#d,$H4 718 jmp .Lproceed_avx 719 720.align 32 721.Lstore_base2_64_avx: 722 mov $h0,0($ctx) 723 mov $h1,8($ctx) 724 mov $h2,16($ctx) # note that is_base2_26 is zeroed 725 jmp .Ldone_avx 726 727.align 16 728.Lstore_base2_26_avx: 729 mov %rax#d,0($ctx) # store hash value base 2^26 730 mov %rdx#d,4($ctx) 731 mov $h0#d,8($ctx) 732 mov $h1#d,12($ctx) 733 mov $h2#d,16($ctx) 734.align 16 735.Ldone_avx: 736 pop %r15 737.cfi_restore %r15 738 pop %r14 739.cfi_restore %r14 740 pop %r13 741.cfi_restore %r13 742 pop %r12 743.cfi_restore %r12 744 pop %rbx 745.cfi_restore %rbx 746 pop %rbp 747.cfi_restore %rbp 748.Lno_data_avx: 749.Lblocks_avx_epilogue: 750 ret 751.cfi_endproc 752 753.align 32 754.Lbase2_64_avx: 755.cfi_startproc 756 push %rbp 757.cfi_push %rbp 758 mov %rsp,%rbp 759 push %rbx 760.cfi_push %rbx 761 push %r12 762.cfi_push %r12 763 push %r13 764.cfi_push %r13 765 push %r14 766.cfi_push %r14 767 push %r15 768.cfi_push %r15 769.Lbase2_64_avx_body: 770 771 mov 
$len,%r15 # reassign $len 772 773 mov 24($ctx),$r0 # load r 774 mov 32($ctx),$s1 775 776 mov 0($ctx),$h0 # load hash value 777 mov 8($ctx),$h1 778 mov 16($ctx),$h2#d 779 780 mov $s1,$r1 781 mov $s1,%rax 782 shr \$2,$s1 783 add $r1,$s1 # s1 = r1 + (r1 >> 2) 784 785 test \$31,$len 786 jz .Linit_avx 787 788 add 0($inp),$h0 # accumulate input 789 adc 8($inp),$h1 790 lea 16($inp),$inp 791 adc $padbit,$h2 792 sub \$16,%r15 793 794 call __poly1305_block 795 796.Linit_avx: 797 ################################# base 2^64 -> base 2^26 798 mov $h0,%rax 799 mov $h0,%rdx 800 shr \$52,$h0 801 mov $h1,$d1 802 mov $h1,$d2 803 shr \$26,%rdx 804 and \$0x3ffffff,%rax # h[0] 805 shl \$12,$d1 806 and \$0x3ffffff,%rdx # h[1] 807 shr \$14,$h1 808 or $d1,$h0 809 shl \$24,$h2 810 and \$0x3ffffff,$h0 # h[2] 811 shr \$40,$d2 812 and \$0x3ffffff,$h1 # h[3] 813 or $d2,$h2 # h[4] 814 815 vmovd %rax#d,$H0 816 vmovd %rdx#d,$H1 817 vmovd $h0#d,$H2 818 vmovd $h1#d,$H3 819 vmovd $h2#d,$H4 820 movl \$1,20($ctx) # set is_base2_26 821 822 call __poly1305_init_avx 823 824.Lproceed_avx: 825 mov %r15,$len 826 pop %r15 827.cfi_restore %r15 828 pop %r14 829.cfi_restore %r14 830 pop %r13 831.cfi_restore %r13 832 pop %r12 833.cfi_restore %r12 834 pop %rbx 835.cfi_restore %rbx 836 pop %rbp 837.cfi_restore %rbp 838.Lbase2_64_avx_epilogue: 839 jmp .Ldo_avx 840.cfi_endproc 841 842.align 32 843.Leven_avx: 844.cfi_startproc 845 vmovd 4*0($ctx),$H0 # load hash value 846 vmovd 4*1($ctx),$H1 847 vmovd 4*2($ctx),$H2 848 vmovd 4*3($ctx),$H3 849 vmovd 4*4($ctx),$H4 850 851.Ldo_avx: 852___ 853$code.=<<___ if (!$win64); 854 lea 8(%rsp),%r10 855.cfi_def_cfa_register %r10 856 and \$-32,%rsp 857 sub \$-8,%rsp 858 lea -0x58(%rsp),%r11 859 sub \$0x178,%rsp 860___ 861$code.=<<___ if ($win64); 862 lea -0xf8(%rsp),%r11 863 sub \$0x218,%rsp 864 vmovdqa %xmm6,0x50(%r11) 865 vmovdqa %xmm7,0x60(%r11) 866 vmovdqa %xmm8,0x70(%r11) 867 vmovdqa %xmm9,0x80(%r11) 868 vmovdqa %xmm10,0x90(%r11) 869 vmovdqa %xmm11,0xa0(%r11) 870 vmovdqa 
%xmm12,0xb0(%r11) 871 vmovdqa %xmm13,0xc0(%r11) 872 vmovdqa %xmm14,0xd0(%r11) 873 vmovdqa %xmm15,0xe0(%r11) 874.Ldo_avx_body: 875___ 876$code.=<<___; 877 sub \$64,$len 878 lea -32($inp),%rax 879 cmovc %rax,$inp 880 881 vmovdqu `16*3`($ctx),$D4 # preload r0^2 882 lea `16*3+64`($ctx),$ctx # size optimization 883 lea .Lconst(%rip),%rcx 884 885 ################################################################ 886 # load input 887 vmovdqu 16*2($inp),$T0 888 vmovdqu 16*3($inp),$T1 889 vmovdqa 64(%rcx),$MASK # .Lmask26 890 891 vpsrldq \$6,$T0,$T2 # splat input 892 vpsrldq \$6,$T1,$T3 893 vpunpckhqdq $T1,$T0,$T4 # 4 894 vpunpcklqdq $T1,$T0,$T0 # 0:1 895 vpunpcklqdq $T3,$T2,$T3 # 2:3 896 897 vpsrlq \$40,$T4,$T4 # 4 898 vpsrlq \$26,$T0,$T1 899 vpand $MASK,$T0,$T0 # 0 900 vpsrlq \$4,$T3,$T2 901 vpand $MASK,$T1,$T1 # 1 902 vpsrlq \$30,$T3,$T3 903 vpand $MASK,$T2,$T2 # 2 904 vpand $MASK,$T3,$T3 # 3 905 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 906 907 jbe .Lskip_loop_avx 908 909 # expand and copy pre-calculated table to stack 910 vmovdqu `16*1-64`($ctx),$D1 911 vmovdqu `16*2-64`($ctx),$D2 912 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 913 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 914 vmovdqa $D3,-0x90(%r11) 915 vmovdqa $D0,0x00(%rsp) 916 vpshufd \$0xEE,$D1,$D4 917 vmovdqu `16*3-64`($ctx),$D0 918 vpshufd \$0x44,$D1,$D1 919 vmovdqa $D4,-0x80(%r11) 920 vmovdqa $D1,0x10(%rsp) 921 vpshufd \$0xEE,$D2,$D3 922 vmovdqu `16*4-64`($ctx),$D1 923 vpshufd \$0x44,$D2,$D2 924 vmovdqa $D3,-0x70(%r11) 925 vmovdqa $D2,0x20(%rsp) 926 vpshufd \$0xEE,$D0,$D4 927 vmovdqu `16*5-64`($ctx),$D2 928 vpshufd \$0x44,$D0,$D0 929 vmovdqa $D4,-0x60(%r11) 930 vmovdqa $D0,0x30(%rsp) 931 vpshufd \$0xEE,$D1,$D3 932 vmovdqu `16*6-64`($ctx),$D0 933 vpshufd \$0x44,$D1,$D1 934 vmovdqa $D3,-0x50(%r11) 935 vmovdqa $D1,0x40(%rsp) 936 vpshufd \$0xEE,$D2,$D4 937 vmovdqu `16*7-64`($ctx),$D1 938 vpshufd \$0x44,$D2,$D2 939 vmovdqa $D4,-0x40(%r11) 940 vmovdqa $D2,0x50(%rsp) 941 vpshufd \$0xEE,$D0,$D3 942 vmovdqu `16*8-64`($ctx),$D2 
943 vpshufd \$0x44,$D0,$D0 944 vmovdqa $D3,-0x30(%r11) 945 vmovdqa $D0,0x60(%rsp) 946 vpshufd \$0xEE,$D1,$D4 947 vpshufd \$0x44,$D1,$D1 948 vmovdqa $D4,-0x20(%r11) 949 vmovdqa $D1,0x70(%rsp) 950 vpshufd \$0xEE,$D2,$D3 951 vmovdqa 0x00(%rsp),$D4 # preload r0^2 952 vpshufd \$0x44,$D2,$D2 953 vmovdqa $D3,-0x10(%r11) 954 vmovdqa $D2,0x80(%rsp) 955 956 jmp .Loop_avx 957 958.align 32 959.Loop_avx: 960 ################################################################ 961 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 962 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 963 # \___________________/ 964 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 965 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 966 # \___________________/ \____________________/ 967 # 968 # Note that we start with inp[2:3]*r^2. This is because it 969 # doesn't depend on reduction in previous iteration. 970 ################################################################ 971 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 972 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 973 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 974 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 975 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 976 # 977 # though note that $Tx and $Hx are "reversed" in this section, 978 # and $D4 is preloaded with r0^2... 
979 980 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 981 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 982 vmovdqa $H2,0x20(%r11) # offload hash 983 vpmuludq $T2,$D4,$D2 # d3 = h2*r0 984 vmovdqa 0x10(%rsp),$H2 # r1^2 985 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 986 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 987 988 vmovdqa $H0,0x00(%r11) # 989 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 990 vmovdqa $H1,0x10(%r11) # 991 vpmuludq $T3,$H2,$H1 # h3*r1 992 vpaddq $H0,$D0,$D0 # d0 += h4*s1 993 vpaddq $H1,$D4,$D4 # d4 += h3*r1 994 vmovdqa $H3,0x30(%r11) # 995 vpmuludq $T2,$H2,$H0 # h2*r1 996 vpmuludq $T1,$H2,$H1 # h1*r1 997 vpaddq $H0,$D3,$D3 # d3 += h2*r1 998 vmovdqa 0x30(%rsp),$H3 # r2^2 999 vpaddq $H1,$D2,$D2 # d2 += h1*r1 1000 vmovdqa $H4,0x40(%r11) # 1001 vpmuludq $T0,$H2,$H2 # h0*r1 1002 vpmuludq $T2,$H3,$H0 # h2*r2 1003 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1004 1005 vmovdqa 0x40(%rsp),$H4 # s2^2 1006 vpaddq $H0,$D4,$D4 # d4 += h2*r2 1007 vpmuludq $T1,$H3,$H1 # h1*r2 1008 vpmuludq $T0,$H3,$H3 # h0*r2 1009 vpaddq $H1,$D3,$D3 # d3 += h1*r2 1010 vmovdqa 0x50(%rsp),$H2 # r3^2 1011 vpaddq $H3,$D2,$D2 # d2 += h0*r2 1012 vpmuludq $T4,$H4,$H0 # h4*s2 1013 vpmuludq $T3,$H4,$H4 # h3*s2 1014 vpaddq $H0,$D1,$D1 # d1 += h4*s2 1015 vmovdqa 0x60(%rsp),$H3 # s3^2 1016 vpaddq $H4,$D0,$D0 # d0 += h3*s2 1017 1018 vmovdqa 0x80(%rsp),$H4 # s4^2 1019 vpmuludq $T1,$H2,$H1 # h1*r3 1020 vpmuludq $T0,$H2,$H2 # h0*r3 1021 vpaddq $H1,$D4,$D4 # d4 += h1*r3 1022 vpaddq $H2,$D3,$D3 # d3 += h0*r3 1023 vpmuludq $T4,$H3,$H0 # h4*s3 1024 vpmuludq $T3,$H3,$H1 # h3*s3 1025 vpaddq $H0,$D2,$D2 # d2 += h4*s3 1026 vmovdqu 16*0($inp),$H0 # load input 1027 vpaddq $H1,$D1,$D1 # d1 += h3*s3 1028 vpmuludq $T2,$H3,$H3 # h2*s3 1029 vpmuludq $T2,$H4,$T2 # h2*s4 1030 vpaddq $H3,$D0,$D0 # d0 += h2*s3 1031 1032 vmovdqu 16*1($inp),$H1 # 1033 vpaddq $T2,$D1,$D1 # d1 += h2*s4 1034 vpmuludq $T3,$H4,$T3 # h3*s4 1035 vpmuludq $T4,$H4,$T4 # h4*s4 1036 vpsrldq \$6,$H0,$H2 # splat input 1037 vpaddq $T3,$D2,$D2 # d2 += h3*s4 1038 vpaddq $T4,$D3,$D3 # d3 += h4*s4 1039 vpsrldq 
\$6,$H1,$H3 # 1040 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 1041 vpmuludq $T1,$H4,$T0 # h1*s4 1042 vpunpckhqdq $H1,$H0,$H4 # 4 1043 vpaddq $T4,$D4,$D4 # d4 += h0*r4 1044 vmovdqa -0x90(%r11),$T4 # r0^4 1045 vpaddq $T0,$D0,$D0 # d0 += h1*s4 1046 1047 vpunpcklqdq $H1,$H0,$H0 # 0:1 1048 vpunpcklqdq $H3,$H2,$H3 # 2:3 1049 1050 #vpsrlq \$40,$H4,$H4 # 4 1051 vpsrldq \$`40/8`,$H4,$H4 # 4 1052 vpsrlq \$26,$H0,$H1 1053 vpand $MASK,$H0,$H0 # 0 1054 vpsrlq \$4,$H3,$H2 1055 vpand $MASK,$H1,$H1 # 1 1056 vpand 0(%rcx),$H4,$H4 # .Lmask24 1057 vpsrlq \$30,$H3,$H3 1058 vpand $MASK,$H2,$H2 # 2 1059 vpand $MASK,$H3,$H3 # 3 1060 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1061 1062 vpaddq 0x00(%r11),$H0,$H0 # add hash value 1063 vpaddq 0x10(%r11),$H1,$H1 1064 vpaddq 0x20(%r11),$H2,$H2 1065 vpaddq 0x30(%r11),$H3,$H3 1066 vpaddq 0x40(%r11),$H4,$H4 1067 1068 lea 16*2($inp),%rax 1069 lea 16*4($inp),$inp 1070 sub \$64,$len 1071 cmovc %rax,$inp 1072 1073 ################################################################ 1074 # Now we accumulate (inp[0:1]+hash)*r^4 1075 ################################################################ 1076 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1077 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1078 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1079 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1080 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1081 1082 vpmuludq $H0,$T4,$T0 # h0*r0 1083 vpmuludq $H1,$T4,$T1 # h1*r0 1084 vpaddq $T0,$D0,$D0 1085 vpaddq $T1,$D1,$D1 1086 vmovdqa -0x80(%r11),$T2 # r1^4 1087 vpmuludq $H2,$T4,$T0 # h2*r0 1088 vpmuludq $H3,$T4,$T1 # h3*r0 1089 vpaddq $T0,$D2,$D2 1090 vpaddq $T1,$D3,$D3 1091 vpmuludq $H4,$T4,$T4 # h4*r0 1092 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 1093 vpaddq $T4,$D4,$D4 1094 1095 vpaddq $T0,$D0,$D0 # d0 += h4*s1 1096 vpmuludq $H2,$T2,$T1 # h2*r1 1097 vpmuludq $H3,$T2,$T0 # h3*r1 1098 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1099 vmovdqa -0x60(%r11),$T3 # r2^4 1100 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1101 vpmuludq 
$H1,$T2,$T1 # h1*r1 1102 vpmuludq $H0,$T2,$T2 # h0*r1 1103 vpaddq $T1,$D2,$D2 # d2 += h1*r1 1104 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1105 1106 vmovdqa -0x50(%r11),$T4 # s2^4 1107 vpmuludq $H2,$T3,$T0 # h2*r2 1108 vpmuludq $H1,$T3,$T1 # h1*r2 1109 vpaddq $T0,$D4,$D4 # d4 += h2*r2 1110 vpaddq $T1,$D3,$D3 # d3 += h1*r2 1111 vmovdqa -0x40(%r11),$T2 # r3^4 1112 vpmuludq $H0,$T3,$T3 # h0*r2 1113 vpmuludq $H4,$T4,$T0 # h4*s2 1114 vpaddq $T3,$D2,$D2 # d2 += h0*r2 1115 vpaddq $T0,$D1,$D1 # d1 += h4*s2 1116 vmovdqa -0x30(%r11),$T3 # s3^4 1117 vpmuludq $H3,$T4,$T4 # h3*s2 1118 vpmuludq $H1,$T2,$T1 # h1*r3 1119 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1120 1121 vmovdqa -0x10(%r11),$T4 # s4^4 1122 vpaddq $T1,$D4,$D4 # d4 += h1*r3 1123 vpmuludq $H0,$T2,$T2 # h0*r3 1124 vpmuludq $H4,$T3,$T0 # h4*s3 1125 vpaddq $T2,$D3,$D3 # d3 += h0*r3 1126 vpaddq $T0,$D2,$D2 # d2 += h4*s3 1127 vmovdqu 16*2($inp),$T0 # load input 1128 vpmuludq $H3,$T3,$T2 # h3*s3 1129 vpmuludq $H2,$T3,$T3 # h2*s3 1130 vpaddq $T2,$D1,$D1 # d1 += h3*s3 1131 vmovdqu 16*3($inp),$T1 # 1132 vpaddq $T3,$D0,$D0 # d0 += h2*s3 1133 1134 vpmuludq $H2,$T4,$H2 # h2*s4 1135 vpmuludq $H3,$T4,$H3 # h3*s4 1136 vpsrldq \$6,$T0,$T2 # splat input 1137 vpaddq $H2,$D1,$D1 # d1 += h2*s4 1138 vpmuludq $H4,$T4,$H4 # h4*s4 1139 vpsrldq \$6,$T1,$T3 # 1140 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 1141 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 1142 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 1143 vpmuludq $H1,$T4,$H0 1144 vpunpckhqdq $T1,$T0,$T4 # 4 1145 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1146 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1147 1148 vpunpcklqdq $T1,$T0,$T0 # 0:1 1149 vpunpcklqdq $T3,$T2,$T3 # 2:3 1150 1151 #vpsrlq \$40,$T4,$T4 # 4 1152 vpsrldq \$`40/8`,$T4,$T4 # 4 1153 vpsrlq \$26,$T0,$T1 1154 vmovdqa 0x00(%rsp),$D4 # preload r0^2 1155 vpand $MASK,$T0,$T0 # 0 1156 vpsrlq \$4,$T3,$T2 1157 vpand $MASK,$T1,$T1 # 1 1158 vpand 0(%rcx),$T4,$T4 # .Lmask24 1159 vpsrlq \$30,$T3,$T3 1160 vpand $MASK,$T2,$T2 # 2 1161 vpand $MASK,$T3,$T3 # 3 1162 vpor 32(%rcx),$T4,$T4 # 
padbit, yes, always 1163 1164 ################################################################ 1165 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 1166 # and P. Schwabe 1167 1168 vpsrlq \$26,$H3,$D3 1169 vpand $MASK,$H3,$H3 1170 vpaddq $D3,$H4,$H4 # h3 -> h4 1171 1172 vpsrlq \$26,$H0,$D0 1173 vpand $MASK,$H0,$H0 1174 vpaddq $D0,$D1,$H1 # h0 -> h1 1175 1176 vpsrlq \$26,$H4,$D0 1177 vpand $MASK,$H4,$H4 1178 1179 vpsrlq \$26,$H1,$D1 1180 vpand $MASK,$H1,$H1 1181 vpaddq $D1,$H2,$H2 # h1 -> h2 1182 1183 vpaddq $D0,$H0,$H0 1184 vpsllq \$2,$D0,$D0 1185 vpaddq $D0,$H0,$H0 # h4 -> h0 1186 1187 vpsrlq \$26,$H2,$D2 1188 vpand $MASK,$H2,$H2 1189 vpaddq $D2,$H3,$H3 # h2 -> h3 1190 1191 vpsrlq \$26,$H0,$D0 1192 vpand $MASK,$H0,$H0 1193 vpaddq $D0,$H1,$H1 # h0 -> h1 1194 1195 vpsrlq \$26,$H3,$D3 1196 vpand $MASK,$H3,$H3 1197 vpaddq $D3,$H4,$H4 # h3 -> h4 1198 1199 ja .Loop_avx 1200 1201.Lskip_loop_avx: 1202 ################################################################ 1203 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1204 1205 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 1206 add \$32,$len 1207 jnz .Long_tail_avx 1208 1209 vpaddq $H2,$T2,$T2 1210 vpaddq $H0,$T0,$T0 1211 vpaddq $H1,$T1,$T1 1212 vpaddq $H3,$T3,$T3 1213 vpaddq $H4,$T4,$T4 1214 1215.Long_tail_avx: 1216 vmovdqa $H2,0x20(%r11) 1217 vmovdqa $H0,0x00(%r11) 1218 vmovdqa $H1,0x10(%r11) 1219 vmovdqa $H3,0x30(%r11) 1220 vmovdqa $H4,0x40(%r11) 1221 1222 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1223 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1224 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1225 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1226 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1227 1228 vpmuludq $T2,$D4,$D2 # d2 = h2*r0 1229 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 1230 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 1231 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 1232 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 1233 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 1234 1235 vpmuludq $T3,$H2,$H0 # h3*r1 1236 
vpaddq $H0,$D4,$D4 # d4 += h3*r1 1237 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 1238 vpmuludq $T2,$H2,$H1 # h2*r1 1239 vpaddq $H1,$D3,$D3 # d3 += h2*r1 1240 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 1241 vpmuludq $T1,$H2,$H0 # h1*r1 1242 vpaddq $H0,$D2,$D2 # d2 += h1*r1 1243 vpmuludq $T0,$H2,$H2 # h0*r1 1244 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1245 vpmuludq $T4,$H3,$H3 # h4*s1 1246 vpaddq $H3,$D0,$D0 # d0 += h4*s1 1247 1248 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 1249 vpmuludq $T2,$H4,$H1 # h2*r2 1250 vpaddq $H1,$D4,$D4 # d4 += h2*r2 1251 vpmuludq $T1,$H4,$H0 # h1*r2 1252 vpaddq $H0,$D3,$D3 # d3 += h1*r2 1253 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 1254 vpmuludq $T0,$H4,$H4 # h0*r2 1255 vpaddq $H4,$D2,$D2 # d2 += h0*r2 1256 vpmuludq $T4,$H2,$H1 # h4*s2 1257 vpaddq $H1,$D1,$D1 # d1 += h4*s2 1258 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 1259 vpmuludq $T3,$H2,$H2 # h3*s2 1260 vpaddq $H2,$D0,$D0 # d0 += h3*s2 1261 1262 vpmuludq $T1,$H3,$H0 # h1*r3 1263 vpaddq $H0,$D4,$D4 # d4 += h1*r3 1264 vpmuludq $T0,$H3,$H3 # h0*r3 1265 vpaddq $H3,$D3,$D3 # d3 += h0*r3 1266 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 1267 vpmuludq $T4,$H4,$H1 # h4*s3 1268 vpaddq $H1,$D2,$D2 # d2 += h4*s3 1269 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 1270 vpmuludq $T3,$H4,$H0 # h3*s3 1271 vpaddq $H0,$D1,$D1 # d1 += h3*s3 1272 vpmuludq $T2,$H4,$H4 # h2*s3 1273 vpaddq $H4,$D0,$D0 # d0 += h2*s3 1274 1275 vpmuludq $T0,$H2,$H2 # h0*r4 1276 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 1277 vpmuludq $T4,$H3,$H1 # h4*s4 1278 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 1279 vpmuludq $T3,$H3,$H0 # h3*s4 1280 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 1281 vpmuludq $T2,$H3,$H1 # h2*s4 1282 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 1283 vpmuludq $T1,$H3,$H3 # h1*s4 1284 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 1285 1286 jz .Lshort_tail_avx 1287 1288 vmovdqu 16*0($inp),$H0 # load input 1289 vmovdqu 16*1($inp),$H1 1290 1291 vpsrldq \$6,$H0,$H2 # splat input 1292 vpsrldq \$6,$H1,$H3 1293 vpunpckhqdq $H1,$H0,$H4 # 4 1294 vpunpcklqdq $H1,$H0,$H0 
# 0:1 1295 vpunpcklqdq $H3,$H2,$H3 # 2:3 1296 1297 vpsrlq \$40,$H4,$H4 # 4 1298 vpsrlq \$26,$H0,$H1 1299 vpand $MASK,$H0,$H0 # 0 1300 vpsrlq \$4,$H3,$H2 1301 vpand $MASK,$H1,$H1 # 1 1302 vpsrlq \$30,$H3,$H3 1303 vpand $MASK,$H2,$H2 # 2 1304 vpand $MASK,$H3,$H3 # 3 1305 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1306 1307 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 1308 vpaddq 0x00(%r11),$H0,$H0 1309 vpaddq 0x10(%r11),$H1,$H1 1310 vpaddq 0x20(%r11),$H2,$H2 1311 vpaddq 0x30(%r11),$H3,$H3 1312 vpaddq 0x40(%r11),$H4,$H4 1313 1314 ################################################################ 1315 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate 1316 1317 vpmuludq $H0,$T4,$T0 # h0*r0 1318 vpaddq $T0,$D0,$D0 # d0 += h0*r0 1319 vpmuludq $H1,$T4,$T1 # h1*r0 1320 vpaddq $T1,$D1,$D1 # d1 += h1*r0 1321 vpmuludq $H2,$T4,$T0 # h2*r0 1322 vpaddq $T0,$D2,$D2 # d2 += h2*r0 1323 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 1324 vpmuludq $H3,$T4,$T1 # h3*r0 1325 vpaddq $T1,$D3,$D3 # d3 += h3*r0 1326 vpmuludq $H4,$T4,$T4 # h4*r0 1327 vpaddq $T4,$D4,$D4 # d4 += h4*r0 1328 1329 vpmuludq $H3,$T2,$T0 # h3*r1 1330 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1331 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 1332 vpmuludq $H2,$T2,$T1 # h2*r1 1333 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1334 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 1335 vpmuludq $H1,$T2,$T0 # h1*r1 1336 vpaddq $T0,$D2,$D2 # d2 += h1*r1 1337 vpmuludq $H0,$T2,$T2 # h0*r1 1338 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1339 vpmuludq $H4,$T3,$T3 # h4*s1 1340 vpaddq $T3,$D0,$D0 # d0 += h4*s1 1341 1342 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 1343 vpmuludq $H2,$T4,$T1 # h2*r2 1344 vpaddq $T1,$D4,$D4 # d4 += h2*r2 1345 vpmuludq $H1,$T4,$T0 # h1*r2 1346 vpaddq $T0,$D3,$D3 # d3 += h1*r2 1347 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 1348 vpmuludq $H0,$T4,$T4 # h0*r2 1349 vpaddq $T4,$D2,$D2 # d2 += h0*r2 1350 vpmuludq $H4,$T2,$T1 # h4*s2 1351 vpaddq $T1,$D1,$D1 # d1 += h4*s2 1352 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 1353 vpmuludq $H3,$T2,$T2 # h3*s2 
1354 vpaddq $T2,$D0,$D0 # d0 += h3*s2 1355 1356 vpmuludq $H1,$T3,$T0 # h1*r3 1357 vpaddq $T0,$D4,$D4 # d4 += h1*r3 1358 vpmuludq $H0,$T3,$T3 # h0*r3 1359 vpaddq $T3,$D3,$D3 # d3 += h0*r3 1360 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 1361 vpmuludq $H4,$T4,$T1 # h4*s3 1362 vpaddq $T1,$D2,$D2 # d2 += h4*s3 1363 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 1364 vpmuludq $H3,$T4,$T0 # h3*s3 1365 vpaddq $T0,$D1,$D1 # d1 += h3*s3 1366 vpmuludq $H2,$T4,$T4 # h2*s3 1367 vpaddq $T4,$D0,$D0 # d0 += h2*s3 1368 1369 vpmuludq $H0,$T2,$T2 # h0*r4 1370 vpaddq $T2,$D4,$D4 # d4 += h0*r4 1371 vpmuludq $H4,$T3,$T1 # h4*s4 1372 vpaddq $T1,$D3,$D3 # d3 += h4*s4 1373 vpmuludq $H3,$T3,$T0 # h3*s4 1374 vpaddq $T0,$D2,$D2 # d2 += h3*s4 1375 vpmuludq $H2,$T3,$T1 # h2*s4 1376 vpaddq $T1,$D1,$D1 # d1 += h2*s4 1377 vpmuludq $H1,$T3,$T3 # h1*s4 1378 vpaddq $T3,$D0,$D0 # d0 += h1*s4 1379 1380.Lshort_tail_avx: 1381 ################################################################ 1382 # horizontal addition 1383 1384 vpsrldq \$8,$D4,$T4 1385 vpsrldq \$8,$D3,$T3 1386 vpsrldq \$8,$D1,$T1 1387 vpsrldq \$8,$D0,$T0 1388 vpsrldq \$8,$D2,$T2 1389 vpaddq $T3,$D3,$D3 1390 vpaddq $T4,$D4,$D4 1391 vpaddq $T0,$D0,$D0 1392 vpaddq $T1,$D1,$D1 1393 vpaddq $T2,$D2,$D2 1394 1395 ################################################################ 1396 # lazy reduction 1397 1398 vpsrlq \$26,$D3,$H3 1399 vpand $MASK,$D3,$D3 1400 vpaddq $H3,$D4,$D4 # h3 -> h4 1401 1402 vpsrlq \$26,$D0,$H0 1403 vpand $MASK,$D0,$D0 1404 vpaddq $H0,$D1,$D1 # h0 -> h1 1405 1406 vpsrlq \$26,$D4,$H4 1407 vpand $MASK,$D4,$D4 1408 1409 vpsrlq \$26,$D1,$H1 1410 vpand $MASK,$D1,$D1 1411 vpaddq $H1,$D2,$D2 # h1 -> h2 1412 1413 vpaddq $H4,$D0,$D0 1414 vpsllq \$2,$H4,$H4 1415 vpaddq $H4,$D0,$D0 # h4 -> h0 1416 1417 vpsrlq \$26,$D2,$H2 1418 vpand $MASK,$D2,$D2 1419 vpaddq $H2,$D3,$D3 # h2 -> h3 1420 1421 vpsrlq \$26,$D0,$H0 1422 vpand $MASK,$D0,$D0 1423 vpaddq $H0,$D1,$D1 # h0 -> h1 1424 1425 vpsrlq \$26,$D3,$H3 1426 vpand $MASK,$D3,$D3 1427 vpaddq 
$H3,$D4,$D4 # h3 -> h4

	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
	vmovd		$D1,`4*1-48-64`($ctx)
	vmovd		$D2,`4*2-48-64`($ctx)
	vmovd		$D3,`4*3-48-64`($ctx)
	vmovd		$D4,`4*4-48-64`($ctx)
___
$code.=<<___	if ($win64);
	vmovdqa		0x50(%r11),%xmm6
	vmovdqa		0x60(%r11),%xmm7
	vmovdqa		0x70(%r11),%xmm8
	vmovdqa		0x80(%r11),%xmm9
	vmovdqa		0x90(%r11),%xmm10
	vmovdqa		0xa0(%r11),%xmm11
	vmovdqa		0xb0(%r11),%xmm12
	vmovdqa		0xc0(%r11),%xmm13
	vmovdqa		0xd0(%r11),%xmm14
	vmovdqa		0xe0(%r11),%xmm15
	lea		0xf8(%r11),%rsp
.Ldo_avx_epilogue:
___
$code.=<<___	if (!$win64);
	lea		-8(%r10),%rsp
.cfi_def_cfa_register	%rsp
___
$code.=<<___;
	vzeroupper
	ret
.cfi_endproc
___
&end_function("poly1305_blocks_avx");

########################################################################
# poly1305_emit_avx: write the final 16-byte tag.
#
# If the is_base2_26 flag at 20($ctx) is clear, control transfers to
# .Lemit (a label defined outside this block).  Otherwise the five
# 32-bit words at 0..16($ctx), holding the hash in base 2^26, are
# repacked into base 2^64 (h0:h1:h2), the possibly partially reduced
# top word is folded back in, h+5 is computed to test whether the value
# overflows 130 bits, the 128-bit nonce at $nonce is added, and the
# result is stored to $mac.
#
# NOTE(review): the 32 and 3 passed to declare_function are presumably
# the alignment and the argument count -- confirm against the helper.
&declare_function("poly1305_emit_avx", 32, 3);
$code.=<<___;
	cmpl	\$0,20($ctx)	# is_base2_26?
	je	.Lemit		# no, take the base 2^64 path

	mov	0($ctx),%eax	# load hash value base 2^26
	mov	4($ctx),%ecx
	mov	8($ctx),%r8d
	mov	12($ctx),%r11d
	mov	16($ctx),%r10d

	shl	\$26,%rcx	# base 2^26 -> base 2^64
	mov	%r8,%r9
	shl	\$52,%r8
	add	%rcx,%rax
	shr	\$12,%r9
	add	%rax,%r8	# h0
	adc	\$0,%r9

	shl	\$14,%r11
	mov	%r10,%rax
	shr	\$24,%r10
	add	%r11,%r9
	shl	\$40,%rax
	add	%rax,%r9	# h1
	adc	\$0,%r10	# h2

	mov	%r10,%rax	# could be partially reduced, so reduce
	mov	%r10,%rcx
	and	\$3,%r10	# keep the low two bits of h2
	shr	\$2,%rax
	and	\$-4,%rcx
	add	%rcx,%rax	# (h2>>2)*5
	add	%rax,%r8
	adc	\$0,%r9
	adc	\$0,%r10

	mov	%r8,%rax	# keep h0:h1 in case h+5 does not overflow
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax	# if so, use h+5 instead
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	ret
___
&end_function("poly1305_emit_avx");

if ($avx>1) {

# Register assignment for the %ymm code below: names map, in order, to
# %ymm0 through %ymm15.
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
    map("%ymm$_",(0..15));
# $S4 is the same register as $MASK.
my $S4=$MASK;

# Appends the block-processing code to $code.  When $avx512 is true,
# every label written with $suffix gets an "_avx512" suffix.
sub poly1305_blocks_avxN {
	my ($avx512) = @_;
	my $suffix = $avx512 ? "_avx512" : "";
$code.=<<___;
.cfi_startproc
	mov	20($ctx),%r8d		# is_base2_26
	cmp	\$128,$len
	jae	.Lblocks_avx2$suffix
	test	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx2$suffix:
	and	\$-16,$len
	jz	.Lno_data_avx2$suffix

	vzeroupper

	test	%r8d,%r8d
	jz	.Lbase2_64_avx2$suffix

	test	\$63,$len
	jz	.Leven_avx2$suffix

	push	%rbp
.cfi_push	%rbp
	mov	%rsp,%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lblocks_avx2_body$suffix:

	mov	$len,%r15		# reassign $len

	mov	0($ctx),$d1		# load hash value
	mov	8($ctx),$d2
	mov	16($ctx),$h2#d

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	################################# base 2^26 -> base 2^64
	mov	$d1#d,$h0#d
	and	\$`-1*(1<<31)`,$d1
	mov	$d2,$r1			# borrow $r1
	mov	$d2#d,$h1#d
	and	\$`-1*(1<<31)`,$d2

	shr	\$6,$d1
	shl	\$52,$r1
	add	$d1,$h0
	shr	\$12,$h1
	shr	\$18,$d2
	add	$r1,$h0
	adc	$d2,$h1

	mov	$h2,$d1
	shl	\$40,$d1
	shr	\$24,$h2
	add	$d1,$h1
	adc	\$0,$h2			# can be partially reduced...

	mov	\$-4,$d2		# ...
so reduce 1590 mov $h2,$d1 1591 and $h2,$d2 1592 shr \$2,$d1 1593 and \$3,$h2 1594 add $d2,$d1 # =*5 1595 add $d1,$h0 1596 adc \$0,$h1 1597 adc \$0,$h2 1598 1599 mov $s1,$r1 1600 mov $s1,%rax 1601 shr \$2,$s1 1602 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1603 1604.Lbase2_26_pre_avx2$suffix: 1605 add 0($inp),$h0 # accumulate input 1606 adc 8($inp),$h1 1607 lea 16($inp),$inp 1608 adc $padbit,$h2 1609 sub \$16,%r15 1610 1611 call __poly1305_block 1612 mov $r1,%rax 1613 1614 test \$63,%r15 1615 jnz .Lbase2_26_pre_avx2$suffix 1616 1617 test $padbit,$padbit # if $padbit is zero, 1618 jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format 1619 1620 ################################# base 2^64 -> base 2^26 1621 mov $h0,%rax 1622 mov $h0,%rdx 1623 shr \$52,$h0 1624 mov $h1,$r0 1625 mov $h1,$r1 1626 shr \$26,%rdx 1627 and \$0x3ffffff,%rax # h[0] 1628 shl \$12,$r0 1629 and \$0x3ffffff,%rdx # h[1] 1630 shr \$14,$h1 1631 or $r0,$h0 1632 shl \$24,$h2 1633 and \$0x3ffffff,$h0 # h[2] 1634 shr \$40,$r1 1635 and \$0x3ffffff,$h1 # h[3] 1636 or $r1,$h2 # h[4] 1637 1638 test %r15,%r15 1639 jz .Lstore_base2_26_avx2$suffix 1640 1641 vmovd %rax#d,%x#$H0 1642 vmovd %rdx#d,%x#$H1 1643 vmovd $h0#d,%x#$H2 1644 vmovd $h1#d,%x#$H3 1645 vmovd $h2#d,%x#$H4 1646 jmp .Lproceed_avx2$suffix 1647 1648.align 32 1649.Lstore_base2_64_avx2$suffix: 1650 mov $h0,0($ctx) 1651 mov $h1,8($ctx) 1652 mov $h2,16($ctx) # note that is_base2_26 is zeroed 1653 jmp .Ldone_avx2$suffix 1654 1655.align 16 1656.Lstore_base2_26_avx2$suffix: 1657 mov %rax#d,0($ctx) # store hash value base 2^26 1658 mov %rdx#d,4($ctx) 1659 mov $h0#d,8($ctx) 1660 mov $h1#d,12($ctx) 1661 mov $h2#d,16($ctx) 1662.align 16 1663.Ldone_avx2$suffix: 1664 pop %r15 1665.cfi_restore %r15 1666 pop %r14 1667.cfi_restore %r14 1668 pop %r13 1669.cfi_restore %r13 1670 pop %r12 1671.cfi_restore %r12 1672 pop %rbx 1673.cfi_restore %rbx 1674 pop %rbp 1675.cfi_restore %rbp 1676.Lno_data_avx2$suffix: 1677.Lblocks_avx2_epilogue$suffix: 1678 ret 1679.cfi_endproc 
1680 1681.align 32 1682.Lbase2_64_avx2$suffix: 1683.cfi_startproc 1684 push %rbp 1685.cfi_push %rbp 1686 mov %rsp,%rbp 1687 push %rbx 1688.cfi_push %rbx 1689 push %r12 1690.cfi_push %r12 1691 push %r13 1692.cfi_push %r13 1693 push %r14 1694.cfi_push %r14 1695 push %r15 1696.cfi_push %r15 1697.Lbase2_64_avx2_body$suffix: 1698 1699 mov $len,%r15 # reassign $len 1700 1701 mov 24($ctx),$r0 # load r 1702 mov 32($ctx),$s1 1703 1704 mov 0($ctx),$h0 # load hash value 1705 mov 8($ctx),$h1 1706 mov 16($ctx),$h2#d 1707 1708 mov $s1,$r1 1709 mov $s1,%rax 1710 shr \$2,$s1 1711 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1712 1713 test \$63,$len 1714 jz .Linit_avx2$suffix 1715 1716.Lbase2_64_pre_avx2$suffix: 1717 add 0($inp),$h0 # accumulate input 1718 adc 8($inp),$h1 1719 lea 16($inp),$inp 1720 adc $padbit,$h2 1721 sub \$16,%r15 1722 1723 call __poly1305_block 1724 mov $r1,%rax 1725 1726 test \$63,%r15 1727 jnz .Lbase2_64_pre_avx2$suffix 1728 1729.Linit_avx2$suffix: 1730 ################################# base 2^64 -> base 2^26 1731 mov $h0,%rax 1732 mov $h0,%rdx 1733 shr \$52,$h0 1734 mov $h1,$d1 1735 mov $h1,$d2 1736 shr \$26,%rdx 1737 and \$0x3ffffff,%rax # h[0] 1738 shl \$12,$d1 1739 and \$0x3ffffff,%rdx # h[1] 1740 shr \$14,$h1 1741 or $d1,$h0 1742 shl \$24,$h2 1743 and \$0x3ffffff,$h0 # h[2] 1744 shr \$40,$d2 1745 and \$0x3ffffff,$h1 # h[3] 1746 or $d2,$h2 # h[4] 1747 1748 vmovd %rax#d,%x#$H0 1749 vmovd %rdx#d,%x#$H1 1750 vmovd $h0#d,%x#$H2 1751 vmovd $h1#d,%x#$H3 1752 vmovd $h2#d,%x#$H4 1753 movl \$1,20($ctx) # set is_base2_26 1754 1755 call __poly1305_init_avx 1756 1757.Lproceed_avx2$suffix: 1758 mov %r15,$len # restore $len 1759___ 1760$code.=<<___ if (!$kernel); 1761 mov OPENSSL_ia32cap_P+8(%rip),%r9d 1762 mov \$`(1<<31|1<<30|1<<16)`,%r11d 1763___ 1764$code.=<<___; 1765 pop %r15 1766.cfi_restore %r15 1767 pop %r14 1768.cfi_restore %r14 1769 pop %r13 1770.cfi_restore %r13 1771 pop %r12 1772.cfi_restore %r12 1773 pop %rbx 1774.cfi_restore %rbx 1775 pop %rbp 1776.cfi_restore %rbp 
.Lbase2_64_avx2_epilogue$suffix:
	jmp	.Ldo_avx2$suffix
.cfi_endproc

.align	32
.Leven_avx2$suffix:
.cfi_startproc
___
# Non-kernel builds load the capability word that the runtime AVX512
# check below tests.
$code.=<<___		if (!$kernel);
	mov		OPENSSL_ia32cap_P+8(%rip),%r9d
___
$code.=<<___;
	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
	vmovd		4*1($ctx),%x#$H1
	vmovd		4*2($ctx),%x#$H2
	vmovd		4*3($ctx),%x#$H3
	vmovd		4*4($ctx),%x#$H4

.Ldo_avx2$suffix:
___
# Runtime AVX512 dispatch (non-kernel builds with $avx>2): inputs shorter
# than 512 bytes skip the check; otherwise .Lblocks_avx512 is taken when
# bit 16 (AVX512F) of %r9d survives the mask in %r11d.
#
# The short-input jump must carry $suffix so that it lands on the
# .Lskip_avx512$suffix label emitted by this same expansion.  Without it
# the "_avx512" expansion would branch to the unsuffixed label, i.e. out
# of this function and into the other expansion's body.
#
# NOTE(review): in the code visible here %r11d is loaded only on the
# .Lproceed_avx2 path, not on the .Leven_avx2 path -- confirm that the
# mask is meant to be live on both.
$code.=<<___		if (!$kernel && $avx>2);
	cmp		\$512,$len
	jb		.Lskip_avx512$suffix
	and		%r11d,%r9d
	test		\$`1<<16`,%r9d		# check for AVX512F
	jnz		.Lblocks_avx512
.Lskip_avx512$suffix:
___
# Kernel builds of the AVX512 variant dispatch on length alone.
$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
	cmp		\$512,$len
	jae		.Lblocks_avx512
___
# Stack frame: %r10 remembers the entry stack pointer (+8) so the
# epilogue can restore %rsp after the alignment done further down.
$code.=<<___	if (!$win64);
	lea		8(%rsp),%r10
.cfi_def_cfa_register	%r10
	sub		\$0x128,%rsp
___
# Win64 variant: larger frame, and %xmm6-%xmm15 are saved below %r10.
$code.=<<___	if ($win64);
	lea		8(%rsp),%r10
	sub		\$0x1c8,%rsp
	vmovdqa		%xmm6,-0xb0(%r10)
	vmovdqa		%xmm7,-0xa0(%r10)
	vmovdqa		%xmm8,-0x90(%r10)
	vmovdqa		%xmm9,-0x80(%r10)
	vmovdqa		%xmm10,-0x70(%r10)
	vmovdqa		%xmm11,-0x60(%r10)
	vmovdqa		%xmm12,-0x50(%r10)
	vmovdqa		%xmm13,-0x40(%r10)
	vmovdqa		%xmm14,-0x30(%r10)
	vmovdqa		%xmm15,-0x20(%r10)
.Ldo_avx2_body$suffix:
___
$code.=<<___;
	lea		.Lconst(%rip),%rcx
	lea		48+64($ctx),$ctx	# size optimization
	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2

	# expand and copy pre-calculated table to stack
	vmovdqu		`16*0-64`($ctx),%x#$T2
	and		\$-512,%rsp
	vmovdqu		`16*1-64`($ctx),%x#$T3
	vmovdqu		`16*2-64`($ctx),%x#$T4
	vmovdqu		`16*3-64`($ctx),%x#$D0
	vmovdqu		`16*4-64`($ctx),%x#$D1
	vmovdqu		`16*5-64`($ctx),%x#$D2
	lea		0x90(%rsp),%rax		# size optimization
	vmovdqu		`16*6-64`($ctx),%x#$D3
	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
	vmovdqu		`16*7-64`($ctx),%x#$D4
	vpermd		$T3,$T0,$T3
	vmovdqu		`16*8-64`($ctx),%x#$MASK
	vpermd		$T4,$T0,$T4
	vmovdqa
$T2,0x00(%rsp) 1850 vpermd $D0,$T0,$D0 1851 vmovdqa $T3,0x20-0x90(%rax) 1852 vpermd $D1,$T0,$D1 1853 vmovdqa $T4,0x40-0x90(%rax) 1854 vpermd $D2,$T0,$D2 1855 vmovdqa $D0,0x60-0x90(%rax) 1856 vpermd $D3,$T0,$D3 1857 vmovdqa $D1,0x80-0x90(%rax) 1858 vpermd $D4,$T0,$D4 1859 vmovdqa $D2,0xa0-0x90(%rax) 1860 vpermd $MASK,$T0,$MASK 1861 vmovdqa $D3,0xc0-0x90(%rax) 1862 vmovdqa $D4,0xe0-0x90(%rax) 1863 vmovdqa $MASK,0x100-0x90(%rax) 1864 vmovdqa 64(%rcx),$MASK # .Lmask26 1865 1866 ################################################################ 1867 # load input 1868 vmovdqu 16*0($inp),%x#$T0 1869 vmovdqu 16*1($inp),%x#$T1 1870 vinserti128 \$1,16*2($inp),$T0,$T0 1871 vinserti128 \$1,16*3($inp),$T1,$T1 1872 lea 16*4($inp),$inp 1873 1874 vpsrldq \$6,$T0,$T2 # splat input 1875 vpsrldq \$6,$T1,$T3 1876 vpunpckhqdq $T1,$T0,$T4 # 4 1877 vpunpcklqdq $T3,$T2,$T2 # 2:3 1878 vpunpcklqdq $T1,$T0,$T0 # 0:1 1879 1880 vpsrlq \$30,$T2,$T3 1881 vpsrlq \$4,$T2,$T2 1882 vpsrlq \$26,$T0,$T1 1883 vpsrlq \$40,$T4,$T4 # 4 1884 vpand $MASK,$T2,$T2 # 2 1885 vpand $MASK,$T0,$T0 # 0 1886 vpand $MASK,$T1,$T1 # 1 1887 vpand $MASK,$T3,$T3 # 3 1888 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1889 1890 vpaddq $H2,$T2,$H2 # accumulate input 1891 sub \$64,$len 1892 jz .Ltail_avx2$suffix 1893 jmp .Loop_avx2$suffix 1894 1895.align 32 1896.Loop_avx2$suffix: 1897 ################################################################ 1898 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 1899 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 1900 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 1901 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 1902 # \________/\__________/ 1903 ################################################################ 1904 #vpaddq $H2,$T2,$H2 # accumulate input 1905 vpaddq $H0,$T0,$H0 1906 vmovdqa `32*0`(%rsp),$T0 # r0^4 1907 vpaddq $H1,$T1,$H1 1908 vmovdqa `32*1`(%rsp),$T1 # r1^4 1909 vpaddq $H3,$T3,$H3 1910 vmovdqa `32*3`(%rsp),$T2 # r2^4 1911 vpaddq $H4,$T4,$H4 1912 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 1913 vmovdqa 
`32*8-0x90`(%rax),$S4 # s4^4 1914 1915 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1916 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1917 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1918 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1919 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1920 # 1921 # however, as h2 is "chronologically" first one available pull 1922 # corresponding operations up, so it's 1923 # 1924 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 1925 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 1926 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1927 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 1928 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 1929 1930 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1931 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1932 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1933 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1934 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1935 1936 vpmuludq $H0,$T1,$T4 # h0*r1 1937 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 1938 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1939 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1940 vpmuludq $H3,$T1,$T4 # h3*r1 1941 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 1942 vpaddq $T4,$D4,$D4 # d4 += h3*r1 1943 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1944 vmovdqa `32*4-0x90`(%rax),$T1 # s2 1945 1946 vpmuludq $H0,$T0,$T4 # h0*r0 1947 vpmuludq $H1,$T0,$H2 # h1*r0 1948 vpaddq $T4,$D0,$D0 # d0 += h0*r0 1949 vpaddq $H2,$D1,$D1 # d1 += h1*r0 1950 vpmuludq $H3,$T0,$T4 # h3*r0 1951 vpmuludq $H4,$T0,$H2 # h4*r0 1952 vmovdqu 16*0($inp),%x#$T0 # load input 1953 vpaddq $T4,$D3,$D3 # d3 += h3*r0 1954 vpaddq $H2,$D4,$D4 # d4 += h4*r0 1955 vinserti128 \$1,16*2($inp),$T0,$T0 1956 1957 vpmuludq $H3,$T1,$T4 # h3*s2 1958 vpmuludq $H4,$T1,$H2 # h4*s2 1959 vmovdqu 16*1($inp),%x#$T1 1960 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1961 vpaddq $H2,$D1,$D1 # d1 += h4*s2 1962 vmovdqa `32*5-0x90`(%rax),$H2 # r3 1963 vpmuludq $H1,$T2,$T4 # h1*r2 1964 vpmuludq $H0,$T2,$T2 # h0*r2 1965 vpaddq $T4,$D3,$D3 # d3 += h1*r2 1966 vpaddq $T2,$D2,$D2 # d2 += 
h0*r2
	vinserti128	\$1,16*3($inp),$T1,$T1
	lea		16*4($inp),$inp

	vpmuludq	$H1,$H2,$T4		# h1*r3
	vpmuludq	$H0,$H2,$H2		# h0*r3
	vpsrldq		\$6,$T0,$T2		# splat input
	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
	vpmuludq	$H3,$T3,$T4		# h3*s3
	vpmuludq	$H4,$T3,$H2		# h4*s3
	vpsrldq		\$6,$T1,$T3
	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
	vpunpckhqdq	$T1,$T0,$T4		# 4

	vpmuludq	$H3,$S4,$H3		# h3*s4
	vpmuludq	$H4,$S4,$H4		# h4*s4
	vpunpcklqdq	$T1,$T0,$T0		# 0:1
	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
	vpunpcklqdq	$T3,$T2,$T3		# 2:3
	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
	vpmuludq	$H1,$S4,$H0		# h1*s4
	vmovdqa		64(%rcx),$MASK		# .Lmask26
	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4

	################################################################
	# lazy reduction (interleaved with tail of input splat)

	vpsrlq		\$26,$H3,$D3
	vpand		$MASK,$H3,$H3
	vpaddq		$D3,$H4,$H4		# h3 -> h4

	vpsrlq		\$26,$H0,$D0
	vpand		$MASK,$H0,$H0
	vpaddq		$D0,$D1,$H1		# h0 -> h1

	vpsrlq		\$26,$H4,$D4
	vpand		$MASK,$H4,$H4

	vpsrlq		\$4,$T3,$T2

	vpsrlq		\$26,$H1,$D1
	vpand		$MASK,$H1,$H1
	vpaddq		$D1,$H2,$H2		# h1 -> h2

	vpaddq		$D4,$H0,$H0
	vpsllq		\$2,$D4,$D4
	vpaddq		$D4,$H0,$H0		# h4 -> h0

	vpand		$MASK,$T2,$T2		# 2
	vpsrlq		\$26,$T0,$T1

	vpsrlq		\$26,$H2,$D2
	vpand		$MASK,$H2,$H2
	vpaddq		$D2,$H3,$H3		# h2 -> h3

	vpaddq		$T2,$H2,$H2		# modulo-scheduled
	vpsrlq		\$30,$T3,$T3

	vpsrlq		\$26,$H0,$D0
	vpand		$MASK,$H0,$H0
	vpaddq		$D0,$H1,$H1		# h0 -> h1

	vpsrlq		\$40,$T4,$T4		# 4

	vpsrlq		\$26,$H3,$D3
	vpand		$MASK,$H3,$H3
	vpaddq		$D3,$H4,$H4		# h3 -> h4

	vpand		$MASK,$T0,$T0		# 0
	vpand		$MASK,$T1,$T1		# 1
	vpand		$MASK,$T3,$T3		# 3
2041 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2042 2043 sub \$64,$len 2044 jnz .Loop_avx2$suffix 2045 2046 .byte 0x66,0x90 2047.Ltail_avx2$suffix: 2048 ################################################################ 2049 # while above multiplications were by r^4 in all lanes, in last 2050 # iteration we multiply least significant lane by r^4 and most 2051 # significant one by r, so copy of above except that references 2052 # to the precomputed table are displaced by 4... 2053 2054 #vpaddq $H2,$T2,$H2 # accumulate input 2055 vpaddq $H0,$T0,$H0 2056 vmovdqu `32*0+4`(%rsp),$T0 # r0^4 2057 vpaddq $H1,$T1,$H1 2058 vmovdqu `32*1+4`(%rsp),$T1 # r1^4 2059 vpaddq $H3,$T3,$H3 2060 vmovdqu `32*3+4`(%rsp),$T2 # r2^4 2061 vpaddq $H4,$T4,$H4 2062 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 2063 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 2064 2065 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 2066 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 2067 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 2068 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 2069 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2070 2071 vpmuludq $H0,$T1,$T4 # h0*r1 2072 vpmuludq $H1,$T1,$H2 # h1*r1 2073 vpaddq $T4,$D1,$D1 # d1 += h0*r1 2074 vpaddq $H2,$D2,$D2 # d2 += h1*r1 2075 vpmuludq $H3,$T1,$T4 # h3*r1 2076 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 2077 vpaddq $T4,$D4,$D4 # d4 += h3*r1 2078 vpaddq $H2,$D0,$D0 # d0 += h4*s1 2079 2080 vpmuludq $H0,$T0,$T4 # h0*r0 2081 vpmuludq $H1,$T0,$H2 # h1*r0 2082 vpaddq $T4,$D0,$D0 # d0 += h0*r0 2083 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 2084 vpaddq $H2,$D1,$D1 # d1 += h1*r0 2085 vpmuludq $H3,$T0,$T4 # h3*r0 2086 vpmuludq $H4,$T0,$H2 # h4*r0 2087 vpaddq $T4,$D3,$D3 # d3 += h3*r0 2088 vpaddq $H2,$D4,$D4 # d4 += h4*r0 2089 2090 vpmuludq $H3,$T1,$T4 # h3*s2 2091 vpmuludq $H4,$T1,$H2 # h4*s2 2092 vpaddq $T4,$D0,$D0 # d0 += h3*s2 2093 vpaddq $H2,$D1,$D1 # d1 += h4*s2 2094 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 2095 vpmuludq $H1,$T2,$T4 # h1*r2 2096 vpmuludq $H0,$T2,$T2 # h0*r2 2097 vpaddq $T4,$D3,$D3 # d3 += h1*r2 2098 vpaddq $T2,$D2,$D2 # d2 += 
h0*r2

	vpmuludq	$H1,$H2,$T4		# h1*r3
	vpmuludq	$H0,$H2,$H2		# h0*r3
	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
	vpmuludq	$H3,$T3,$T4		# h3*s3
	vpmuludq	$H4,$T3,$H2		# h4*s3
	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
	vpaddq		$H2,$D2,$D2		# d2 += h4*s3

	vpmuludq	$H3,$S4,$H3		# h3*s4
	vpmuludq	$H4,$S4,$H4		# h4*s4
	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4		# h0*r4
	vpmuludq	$H1,$S4,$H0		# h1*s4
	vmovdqa		64(%rcx),$MASK		# .Lmask26
	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4

	################################################################
	# horizontal addition

	vpsrldq		\$8,$D1,$T1
	vpsrldq		\$8,$H2,$T2
	vpsrldq		\$8,$H3,$T3
	vpsrldq		\$8,$H4,$T4
	vpsrldq		\$8,$H0,$T0
	vpaddq		$T1,$D1,$D1
	vpaddq		$T2,$H2,$H2
	vpaddq		$T3,$H3,$H3
	vpaddq		$T4,$H4,$H4
	vpaddq		$T0,$H0,$H0

	vpermq		\$0x2,$H3,$T3
	vpermq		\$0x2,$H4,$T4
	vpermq		\$0x2,$H0,$T0
	vpermq		\$0x2,$D1,$T1
	vpermq		\$0x2,$H2,$T2
	vpaddq		$T3,$H3,$H3
	vpaddq		$T4,$H4,$H4
	vpaddq		$T0,$H0,$H0
	vpaddq		$T1,$D1,$D1
	vpaddq		$T2,$H2,$H2

	################################################################
	# lazy reduction

	vpsrlq		\$26,$H3,$D3
	vpand		$MASK,$H3,$H3
	vpaddq		$D3,$H4,$H4		# h3 -> h4

	vpsrlq		\$26,$H0,$D0
	vpand		$MASK,$H0,$H0
	vpaddq		$D0,$D1,$H1		# h0 -> h1

	vpsrlq		\$26,$H4,$D4
	vpand		$MASK,$H4,$H4

	vpsrlq		\$26,$H1,$D1
	vpand		$MASK,$H1,$H1
	vpaddq		$D1,$H2,$H2		# h1 -> h2

	vpaddq		$D4,$H0,$H0
	vpsllq		\$2,$D4,$D4
	vpaddq		$D4,$H0,$H0		# h4 -> h0

	vpsrlq		\$26,$H2,$D2
	vpand		$MASK,$H2,$H2
	vpaddq		$D2,$H3,$H3		# h2 -> h3

	vpsrlq		\$26,$H0,$D0
	vpand		$MASK,$H0,$H0
	vpaddq		$D0,$H1,$H1		# h0 -> h1

	vpsrlq		\$26,$H3,$D3
vpand $MASK,$H3,$H3 2176 vpaddq $D3,$H4,$H4 # h3 -> h4 2177 2178 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2179 vmovd %x#$H1,`4*1-48-64`($ctx) 2180 vmovd %x#$H2,`4*2-48-64`($ctx) 2181 vmovd %x#$H3,`4*3-48-64`($ctx) 2182 vmovd %x#$H4,`4*4-48-64`($ctx) 2183___ 2184$code.=<<___ if ($win64); 2185 vmovdqa -0xb0(%r10),%xmm6 2186 vmovdqa -0xa0(%r10),%xmm7 2187 vmovdqa -0x90(%r10),%xmm8 2188 vmovdqa -0x80(%r10),%xmm9 2189 vmovdqa -0x70(%r10),%xmm10 2190 vmovdqa -0x60(%r10),%xmm11 2191 vmovdqa -0x50(%r10),%xmm12 2192 vmovdqa -0x40(%r10),%xmm13 2193 vmovdqa -0x30(%r10),%xmm14 2194 vmovdqa -0x20(%r10),%xmm15 2195 lea -8(%r10),%rsp 2196.Ldo_avx2_epilogue$suffix: 2197___ 2198$code.=<<___ if (!$win64); 2199 lea -8(%r10),%rsp 2200.cfi_def_cfa_register %rsp 2201___ 2202$code.=<<___; 2203 vzeroupper 2204 ret 2205.cfi_endproc 2206___ 2207if($avx > 2 && $avx512) { 2208my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 2209my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 2210my $PADBIT="%zmm30"; 2211 2212map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 2213map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 2214map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 2215map(s/%y/%z/,($MASK)); 2216 2217$code.=<<___; 2218.cfi_startproc 2219.Lblocks_avx512: 2220 mov \$15,%eax 2221 kmovw %eax,%k2 2222___ 2223$code.=<<___ if (!$win64); 2224 lea 8(%rsp),%r10 2225.cfi_def_cfa_register %r10 2226 sub \$0x128,%rsp 2227___ 2228$code.=<<___ if ($win64); 2229 lea 8(%rsp),%r10 2230 sub \$0x1c8,%rsp 2231 vmovdqa %xmm6,-0xb0(%r10) 2232 vmovdqa %xmm7,-0xa0(%r10) 2233 vmovdqa %xmm8,-0x90(%r10) 2234 vmovdqa %xmm9,-0x80(%r10) 2235 vmovdqa %xmm10,-0x70(%r10) 2236 vmovdqa %xmm11,-0x60(%r10) 2237 vmovdqa %xmm12,-0x50(%r10) 2238 vmovdqa %xmm13,-0x40(%r10) 2239 vmovdqa %xmm14,-0x30(%r10) 2240 vmovdqa %xmm15,-0x20(%r10) 2241.Ldo_avx512_body: 2242___ 2243$code.=<<___; 2244 lea .Lconst(%rip),%rcx 2245 lea 48+64($ctx),$ctx # size optimization 2246 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 2247 2248 # 
expand pre-calculated table 2249 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 2250 and \$-512,%rsp 2251 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 2252 mov \$0x20,%rax 2253 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 2254 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 2255 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 2256 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 2257 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 2258 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 2259 vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4} 2260 vpermd $D0,$T2,$R0 # 00003412 -> 14243444 2261 vpbroadcastq 64(%rcx),$MASK # .Lmask26 2262 vpermd $D1,$T2,$R1 2263 vpermd $T0,$T2,$S1 2264 vpermd $D2,$T2,$R2 2265 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 2266 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 2267 vpermd $T1,$T2,$S2 2268 vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 2269 vpsrlq \$32,$R1,$T1 2270 vpermd $D3,$T2,$R3 2271 vmovdqa64 $S1,0x40(%rsp){%k2} 2272 vpermd $T3,$T2,$S3 2273 vpermd $D4,$T2,$R4 2274 vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 2275 vpermd $T4,$T2,$S4 2276 vmovdqa64 $S2,0x80(%rsp){%k2} 2277 vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 2278 vmovdqa64 $S3,0xc0(%rsp){%k2} 2279 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 2280 vmovdqa64 $S4,0x100(%rsp){%k2} 2281 2282 ################################################################ 2283 # calculate 5th through 8th powers of the key 2284 # 2285 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 2286 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 2287 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 2288 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 2289 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 2290 2291 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 2292 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 2293 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 2294 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 2295 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 2296 vpsrlq \$32,$R2,$T2 2297 2298 vpmuludq $T1,$S4,$M0 2299 vpmuludq $T1,$R0,$M1 2300 vpmuludq $T1,$R1,$M2 2301 
	vpmuludq	$T1,$R2,$M3
	vpmuludq	$T1,$R3,$M4
	vpsrlq		\$32,$R3,$T3
	vpaddq		$M0,$D0,$D0		# d0 += r1'*5*r4
	vpaddq		$M1,$D1,$D1		# d1 += r1'*r0
	vpaddq		$M2,$D2,$D2		# d2 += r1'*r1
	vpaddq		$M3,$D3,$D3		# d3 += r1'*r2
	vpaddq		$M4,$D4,$D4		# d4 += r1'*r3

	vpmuludq	$T2,$S3,$M0
	vpmuludq	$T2,$S4,$M1
	vpmuludq	$T2,$R1,$M3
	vpmuludq	$T2,$R2,$M4
	vpmuludq	$T2,$R0,$M2
	vpsrlq		\$32,$R4,$T4
	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r3
	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r4
	vpaddq		$M3,$D3,$D3		# d3 += r2'*r1
	vpaddq		$M4,$D4,$D4		# d4 += r2'*r2
	vpaddq		$M2,$D2,$D2		# d2 += r2'*r0

	vpmuludq	$T3,$S2,$M0
	vpmuludq	$T3,$R0,$M3
	vpmuludq	$T3,$R1,$M4
	vpmuludq	$T3,$S3,$M1
	vpmuludq	$T3,$S4,$M2
	vpaddq		$M0,$D0,$D0		# d0 += r3'*5*r2
	vpaddq		$M3,$D3,$D3		# d3 += r3'*r0
	vpaddq		$M4,$D4,$D4		# d4 += r3'*r1
	vpaddq		$M1,$D1,$D1		# d1 += r3'*5*r3
	vpaddq		$M2,$D2,$D2		# d2 += r3'*5*r4

	vpmuludq	$T4,$S4,$M3
	vpmuludq	$T4,$R0,$M4
	vpmuludq	$T4,$S1,$M0
	vpmuludq	$T4,$S2,$M1
	vpmuludq	$T4,$S3,$M2
	vpaddq		$M3,$D3,$D3		# d3 += r4'*5*r4
	vpaddq		$M4,$D4,$D4		# d4 += r4'*r0
	vpaddq		$M0,$D0,$D0		# d0 += r4'*5*r1
	vpaddq		$M1,$D1,$D1		# d1 += r4'*5*r2
	vpaddq		$M2,$D2,$D2		# d2 += r4'*5*r3

	################################################################
	# load input
	vmovdqu64	16*0($inp),%z#$T3
	vmovdqu64	16*4($inp),%z#$T4
	lea		16*8($inp),$inp

	################################################################
	# lazy reduction

	vpsrlq		\$26,$D3,$M3
	vpandq		$MASK,$D3,$D3
	vpaddq		$M3,$D4,$D4		# d3 -> d4

	vpsrlq		\$26,$D0,$M0
	vpandq		$MASK,$D0,$D0
	vpaddq		$M0,$D1,$D1		# d0 -> d1

	vpsrlq		\$26,$D4,$M4
	vpandq		$MASK,$D4,$D4

	vpsrlq		\$26,$D1,$M1
	vpandq		$MASK,$D1,$D1
	vpaddq		$M1,$D2,$D2		# d1 -> d2

	vpaddq		$M4,$D0,$D0
	vpsllq		\$2,$M4,$M4
	vpaddq		$M4,$D0,$D0		# d4 -> d0
2371 2372 vpsrlq \$26,$D2,$M2 2373 vpandq $MASK,$D2,$D2 2374 vpaddq $M2,$D3,$D3 # d2 -> d3 2375 2376 vpsrlq \$26,$D0,$M0 2377 vpandq $MASK,$D0,$D0 2378 vpaddq $M0,$D1,$D1 # d0 -> d1 2379 2380 vpsrlq \$26,$D3,$M3 2381 vpandq $MASK,$D3,$D3 2382 vpaddq $M3,$D4,$D4 # d3 -> d4 2383 2384 ################################################################ 2385 # at this point we have 14243444 in $R0-$S4 and 05060708 in 2386 # $D0-$D4, ... 2387 2388 vpunpcklqdq $T4,$T3,$T0 # transpose input 2389 vpunpckhqdq $T4,$T3,$T4 2390 2391 # ... since input 64-bit lanes are ordered as 73625140, we could 2392 # "vperm" it to 76543210 (here and in each loop iteration), *or* 2393 # we could just flow along, hence the goal for $R0-$S4 is 2394 # 1858286838784888 ... 2395 2396 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 2397 mov \$0x7777,%eax 2398 kmovw %eax,%k1 2399 2400 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 2401 vpermd $R1,$M0,$R1 2402 vpermd $R2,$M0,$R2 2403 vpermd $R3,$M0,$R3 2404 vpermd $R4,$M0,$R4 2405 2406 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 2407 vpermd $D1,$M0,${R1}{%k1} 2408 vpermd $D2,$M0,${R2}{%k1} 2409 vpermd $D3,$M0,${R3}{%k1} 2410 vpermd $D4,$M0,${R4}{%k1} 2411 2412 vpslld \$2,$R1,$S1 # *5 2413 vpslld \$2,$R2,$S2 2414 vpslld \$2,$R3,$S3 2415 vpslld \$2,$R4,$S4 2416 vpaddd $R1,$S1,$S1 2417 vpaddd $R2,$S2,$S2 2418 vpaddd $R3,$S3,$S3 2419 vpaddd $R4,$S4,$S4 2420 2421 vpbroadcastq 32(%rcx),$PADBIT # .L129 2422 2423 vpsrlq \$52,$T0,$T2 # splat input 2424 vpsllq \$12,$T4,$T3 2425 vporq $T3,$T2,$T2 2426 vpsrlq \$26,$T0,$T1 2427 vpsrlq \$14,$T4,$T3 2428 vpsrlq \$40,$T4,$T4 # 4 2429 vpandq $MASK,$T2,$T2 # 2 2430 vpandq $MASK,$T0,$T0 # 0 2431 #vpandq $MASK,$T1,$T1 # 1 2432 #vpandq $MASK,$T3,$T3 # 3 2433 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2434 2435 vpaddq $H2,$T2,$H2 # accumulate input 2436 sub \$192,$len 2437 jbe .Ltail_avx512 2438 jmp .Loop_avx512 2439 2440.align 32 2441.Loop_avx512: 2442 
################################################################ 2443 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 2444 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 2445 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 2446 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 2447 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 2448 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 2449 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 2450 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 2451 # \________/\___________/ 2452 ################################################################ 2453 #vpaddq $H2,$T2,$H2 # accumulate input 2454 2455 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 2456 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 2457 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 2458 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 2459 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 2460 # 2461 # however, as h2 is "chronologically" first one available pull 2462 # corresponding operations up, so it's 2463 # 2464 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 2465 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 2466 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 2467 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 2468 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 2469 2470 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2471 vpaddq $H0,$T0,$H0 2472 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2473 vpandq $MASK,$T1,$T1 # 1 2474 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2475 vpandq $MASK,$T3,$T3 # 3 2476 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2477 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2478 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2479 vpaddq $H1,$T1,$H1 # accumulate input 2480 vpaddq $H3,$T3,$H3 2481 vpaddq $H4,$T4,$H4 2482 2483 vmovdqu64 16*0($inp),$T3 # load input 2484 vmovdqu64 16*4($inp),$T4 2485 lea 16*8($inp),$inp 2486 vpmuludq $H0,$R3,$M3 2487 vpmuludq $H0,$R4,$M4 2488 vpmuludq $H0,$R0,$M0 2489 vpmuludq $H0,$R1,$M1 2490 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2491 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2492 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2493 
vpaddq $M1,$D1,$D1 # d1 += h0*r1 2494 2495 vpmuludq $H1,$R2,$M3 2496 vpmuludq $H1,$R3,$M4 2497 vpmuludq $H1,$S4,$M0 2498 vpmuludq $H0,$R2,$M2 2499 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2500 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2501 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2502 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2503 2504 vpunpcklqdq $T4,$T3,$T0 # transpose input 2505 vpunpckhqdq $T4,$T3,$T4 2506 2507 vpmuludq $H3,$R0,$M3 2508 vpmuludq $H3,$R1,$M4 2509 vpmuludq $H1,$R0,$M1 2510 vpmuludq $H1,$R1,$M2 2511 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2512 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2513 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2514 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2515 2516 vpmuludq $H4,$S4,$M3 2517 vpmuludq $H4,$R0,$M4 2518 vpmuludq $H3,$S2,$M0 2519 vpmuludq $H3,$S3,$M1 2520 vpaddq $M3,$D3,$D3 # d3 += h4*s4 2521 vpmuludq $H3,$S4,$M2 2522 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2523 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2524 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2525 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2526 2527 vpmuludq $H4,$S1,$M0 2528 vpmuludq $H4,$S2,$M1 2529 vpmuludq $H4,$S3,$M2 2530 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2531 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2 2532 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3 2533 2534 ################################################################ 2535 # lazy reduction (interleaved with input splat) 2536 2537 vpsrlq \$52,$T0,$T2 # splat input 2538 vpsllq \$12,$T4,$T3 2539 2540 vpsrlq \$26,$D3,$H3 2541 vpandq $MASK,$D3,$D3 2542 vpaddq $H3,$D4,$H4 # h3 -> h4 2543 2544 vporq $T3,$T2,$T2 2545 2546 vpsrlq \$26,$H0,$D0 2547 vpandq $MASK,$H0,$H0 2548 vpaddq $D0,$H1,$H1 # h0 -> h1 2549 2550 vpandq $MASK,$T2,$T2 # 2 2551 2552 vpsrlq \$26,$H4,$D4 2553 vpandq $MASK,$H4,$H4 2554 2555 vpsrlq \$26,$H1,$D1 2556 vpandq $MASK,$H1,$H1 2557 vpaddq $D1,$H2,$H2 # h1 -> h2 2558 2559 vpaddq $D4,$H0,$H0 2560 vpsllq \$2,$D4,$D4 2561 vpaddq $D4,$H0,$H0 # h4 -> h0 2562 2563 vpaddq $T2,$H2,$H2 # modulo-scheduled 2564 vpsrlq \$26,$T0,$T1 2565 2566 vpsrlq \$26,$H2,$D2 2567 vpandq $MASK,$H2,$H2 2568 vpaddq $D2,$D3,$H3
# h2 -> h3 2569 2570 vpsrlq \$14,$T4,$T3 2571 2572 vpsrlq \$26,$H0,$D0 2573 vpandq $MASK,$H0,$H0 2574 vpaddq $D0,$H1,$H1 # h0 -> h1 2575 2576 vpsrlq \$40,$T4,$T4 # 4 2577 2578 vpsrlq \$26,$H3,$D3 2579 vpandq $MASK,$H3,$H3 2580 vpaddq $D3,$H4,$H4 # h3 -> h4 2581 2582 vpandq $MASK,$T0,$T0 # 0 2583 #vpandq $MASK,$T1,$T1 # 1 2584 #vpandq $MASK,$T3,$T3 # 3 2585 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2586 2587 sub \$128,$len 2588 ja .Loop_avx512 2589 2590.Ltail_avx512: 2591 ################################################################ 2592 # while above multiplications were by r^8 in all lanes, in last 2593 # iteration we multiply least significant lane by r^8 and most 2594 # significant one by r, that's why table gets shifted... 2595 2596 vpsrlq \$32,$R0,$R0 # 0105020603070408 2597 vpsrlq \$32,$R1,$R1 2598 vpsrlq \$32,$R2,$R2 2599 vpsrlq \$32,$S3,$S3 2600 vpsrlq \$32,$S4,$S4 2601 vpsrlq \$32,$R3,$R3 2602 vpsrlq \$32,$R4,$R4 2603 vpsrlq \$32,$S1,$S1 2604 vpsrlq \$32,$S2,$S2 2605 2606 ################################################################ 2607 # load either next or last 64 byte of input 2608 lea ($inp,$len),$inp 2609 2610 #vpaddq $H2,$T2,$H2 # accumulate input 2611 vpaddq $H0,$T0,$H0 2612 2613 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2614 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2615 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2616 vpandq $MASK,$T1,$T1 # 1 2617 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2618 vpandq $MASK,$T3,$T3 # 3 2619 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2620 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2621 vpaddq $H1,$T1,$H1 # accumulate input 2622 vpaddq $H3,$T3,$H3 2623 vpaddq $H4,$T4,$H4 2624 2625 vmovdqu 16*0($inp),%x#$T0 2626 vpmuludq $H0,$R3,$M3 2627 vpmuludq $H0,$R4,$M4 2628 vpmuludq $H0,$R0,$M0 2629 vpmuludq $H0,$R1,$M1 2630 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2631 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2632 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2633 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2634 2635 vmovdqu 16*1($inp),%x#$T1 2636 vpmuludq $H1,$R2,$M3 2637 vpmuludq $H1,$R3,$M4 2638 
vpmuludq $H1,$S4,$M0 2639 vpmuludq $H0,$R2,$M2 2640 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2641 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2642 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2643 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2644 2645 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 2646 vpmuludq $H3,$R0,$M3 2647 vpmuludq $H3,$R1,$M4 2648 vpmuludq $H1,$R0,$M1 2649 vpmuludq $H1,$R1,$M2 2650 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2651 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2652 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2653 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2654 2655 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 2656 vpmuludq $H4,$S4,$M3 2657 vpmuludq $H4,$R0,$M4 2658 vpmuludq $H3,$S2,$M0 2659 vpmuludq $H3,$S3,$M1 2660 vpmuludq $H3,$S4,$M2 2661 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 2662 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2663 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2664 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2665 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2666 2667 vpmuludq $H4,$S1,$M0 2668 vpmuludq $H4,$S2,$M1 2669 vpmuludq $H4,$S3,$M2 2670 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2671 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2 2672 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3 2673 2674 ################################################################ 2675 # horizontal addition 2676 2677 mov \$1,%eax 2678 vpermq \$0xb1,$H3,$D3 2679 vpermq \$0xb1,$D4,$H4 2680 vpermq \$0xb1,$H0,$D0 2681 vpermq \$0xb1,$H1,$D1 2682 vpermq \$0xb1,$H2,$D2 2683 vpaddq $D3,$H3,$H3 2684 vpaddq $D4,$H4,$H4 2685 vpaddq $D0,$H0,$H0 2686 vpaddq $D1,$H1,$H1 2687 vpaddq $D2,$H2,$H2 2688 2689 kmovw %eax,%k3 2690 vpermq \$0x2,$H3,$D3 2691 vpermq \$0x2,$H4,$D4 2692 vpermq \$0x2,$H0,$D0 2693 vpermq \$0x2,$H1,$D1 2694 vpermq \$0x2,$H2,$D2 2695 vpaddq $D3,$H3,$H3 2696 vpaddq $D4,$H4,$H4 2697 vpaddq $D0,$H0,$H0 2698 vpaddq $D1,$H1,$H1 2699 vpaddq $D2,$H2,$H2 2700 2701 vextracti64x4 \$0x1,$H3,%y#$D3 2702 vextracti64x4 \$0x1,$H4,%y#$D4 2703 vextracti64x4 \$0x1,$H0,%y#$D0 2704 vextracti64x4 \$0x1,$H1,%y#$D1 2705 vextracti64x4 \$0x1,$H2,%y#$D2 2706 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 2707
vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 2708 vpaddq $D0,$H0,${H0}{%k3}{z} 2709 vpaddq $D1,$H1,${H1}{%k3}{z} 2710 vpaddq $D2,$H2,${H2}{%k3}{z} 2711___ 2712map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 2713map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 2714$code.=<<___; 2715 ################################################################ 2716 # lazy reduction (interleaved with input splat) 2717 2718 vpsrlq \$26,$H3,$D3 2719 vpand $MASK,$H3,$H3 2720 vpsrldq \$6,$T0,$T2 # splat input 2721 vpsrldq \$6,$T1,$T3 2722 vpunpckhqdq $T1,$T0,$T4 # 4 2723 vpaddq $D3,$H4,$H4 # h3 -> h4 2724 2725 vpsrlq \$26,$H0,$D0 2726 vpand $MASK,$H0,$H0 2727 vpunpcklqdq $T3,$T2,$T2 # 2:3 2728 vpunpcklqdq $T1,$T0,$T0 # 0:1 2729 vpaddq $D0,$H1,$H1 # h0 -> h1 2730 2731 vpsrlq \$26,$H4,$D4 2732 vpand $MASK,$H4,$H4 2733 2734 vpsrlq \$26,$H1,$D1 2735 vpand $MASK,$H1,$H1 2736 vpsrlq \$30,$T2,$T3 2737 vpsrlq \$4,$T2,$T2 2738 vpaddq $D1,$H2,$H2 # h1 -> h2 2739 2740 vpaddq $D4,$H0,$H0 2741 vpsllq \$2,$D4,$D4 2742 vpsrlq \$26,$T0,$T1 2743 vpsrlq \$40,$T4,$T4 # 4 2744 vpaddq $D4,$H0,$H0 # h4 -> h0 2745 2746 vpsrlq \$26,$H2,$D2 2747 vpand $MASK,$H2,$H2 2748 vpand $MASK,$T2,$T2 # 2 2749 vpand $MASK,$T0,$T0 # 0 2750 vpaddq $D2,$H3,$H3 # h2 -> h3 2751 2752 vpsrlq \$26,$H0,$D0 2753 vpand $MASK,$H0,$H0 2754 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 2755 vpand $MASK,$T1,$T1 # 1 2756 vpaddq $D0,$H1,$H1 # h0 -> h1 2757 2758 vpsrlq \$26,$H3,$D3 2759 vpand $MASK,$H3,$H3 2760 vpand $MASK,$T3,$T3 # 3 2761 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2762 vpaddq $D3,$H4,$H4 # h3 -> h4 2763 2764 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 2765 add \$64,$len 2766 jnz .Ltail_avx2$suffix 2767 2768 vpsubq $T2,$H2,$H2 # undo input accumulation 2769 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2770 vmovd %x#$H1,`4*1-48-64`($ctx) 2771 vmovd %x#$H2,`4*2-48-64`($ctx) 2772 vmovd %x#$H3,`4*3-48-64`($ctx) 2773 vmovd %x#$H4,`4*4-48-64`($ctx) 2774 vzeroall 2775___ 
# Win64 flavour of the epilogue: reload %xmm6-%xmm15 from the slots
# addressed through %r10, then take %rsp back from %r10.
# NOTE(review): the matching saves are presumably done in the prologue,
# which is outside this part of the file - confirm there.
$code.=<<___	if ($win64);
	movdqa		-0xb0(%r10),%xmm6
	movdqa		-0xa0(%r10),%xmm7
	movdqa		-0x90(%r10),%xmm8
	movdqa		-0x80(%r10),%xmm9
	movdqa		-0x70(%r10),%xmm10
	movdqa		-0x60(%r10),%xmm11
	movdqa		-0x50(%r10),%xmm12
	movdqa		-0x40(%r10),%xmm13
	movdqa		-0x30(%r10),%xmm14
	movdqa		-0x20(%r10),%xmm15
	lea		-8(%r10),%rsp
.Ldo_avx512_epilogue:
___
# Every other flavour only restores %rsp and emits the CFI annotation.
$code.=<<___	if (!$win64);
	lea		-8(%r10),%rsp
.cfi_def_cfa_register	%rsp
___
# Return sequence shared by both flavours.
$code.=<<___;
	ret
.cfi_endproc
___

}

}

# Emit poly1305_blocks_avx2: the shared generator is called between
# declare_function and end_function, with argument 0 here and 1 for the
# AVX-512 variant further down.
# NOTE(review): 32 and 4 are presumably alignment and argument count -
# confirm against declare_function, which is defined outside this view.
&declare_function("poly1305_blocks_avx2", 32, 4);
poly1305_blocks_avxN(0);
&end_function("poly1305_blocks_avx2");

#######################################################################
if ($avx>2) {
# On entry we have input length divisible by 64. But since inner loop
# processes 128 bytes per iteration, cases when length is not divisible
# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
# reason stack layout is kept identical to poly1305_blocks_avx2. If not
# for this tail, we wouldn't have to even allocate stack frame...

# When $kernel is set the AVX-512 function is bracketed by
# "#ifdef CONFIG_AS_AVX512" / "#endif" lines in the generated text.
if($kernel) {
	$code .= "#ifdef CONFIG_AS_AVX512\n";
}

&declare_function("poly1305_blocks_avx512", 32, 4);
poly1305_blocks_avxN(1);
&end_function("poly1305_blocks_avx512");

if ($kernel) {
	$code .= "#endif\n";
}

# The VPMADD52 code that follows is generated only when $kernel is not
# set and $avx is greater than 3.
if (!$kernel && $avx>3) {
########################################################################
# VPMADD52 version using 2^44 radix.
#
# One can argue that base 2^52 would be more natural. Well, even though
# some operations would be more natural, one has to recognize couple of
# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
# at amount of multiply-n-accumulate operations.
# Secondly, it makes it
# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
# reference implementations], which means that more such operations
# would have to be performed in inner loop, which in turn makes critical
# path longer. In other words, even though base 2^44 reduction might
# look less elegant, overall critical path is actually shorter...

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^44
#	unsigned __int64 s[2];		# key value*20 base 2^44
#	unsigned __int64 r[3];		# key value base 2^44
#	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
#					# r^n positions reflect
#					# placement in register, not
#					# memory, R[3] is R[1]*20

# poly1305_init_base2_44
#
# Emitted code zeroes h[0..2] at 0/8/16($ctx), masks the two key qwords
# read from $inp with the 0x0ffffffc... constants and splits the result
# into r0 and r1 (each masked to 44 bits) and r2 (%rcx shifted right by
# 24), stored at 40/48/56($ctx).  s1 and s2 are r1 and r2 multiplied by 5
# and shifted left by 2, stored at 24/32($ctx).  -1 is written to
# 64($ctx) as the "impossible" value that the block functions test before
# using the key powers kept from that offset on.  The addresses of
# poly1305_blocks_vpmadd52 and poly1305_emit_base2_44 are stored through
# %rdx (8-byte slots, or 4-byte slots when $flavour matches elf32) and
# %eax is 1 on return.
$code.=<<___;
.type	poly1305_init_base2_44,\@function,3
.align	32
poly1305_init_base2_44:
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

.Linit_base2_44:
	lea	poly1305_blocks_vpmadd52(%rip),%r10
	lea	poly1305_emit_base2_44(%rip),%r11

	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	mov	\$0x00000fffffffffff,%r8
	and	8($inp),%rcx
	mov	\$0x00000fffffffffff,%r9
	and	%rax,%r8
	shrd	\$44,%rcx,%rax
	mov	%r8,40($ctx)		# r0
	and	%r9,%rax
	shr	\$24,%rcx
	mov	%rax,48($ctx)		# r1
	lea	(%rax,%rax,4),%rax	# *5
	mov	%rcx,56($ctx)		# r2
	shl	\$2,%rax		# magic <<2
	lea	(%rcx,%rcx,4),%rcx	# *5
	shl	\$2,%rcx		# magic <<2
	mov	%rax,24($ctx)		# s1
	mov	%rcx,32($ctx)		# s2
	movq	\$-1,64($ctx)		# write impossible value
___
$code.=<<___	if ($flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___	if ($flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
	ret
.size	poly1305_init_base2_44,.-poly1305_init_base2_44
___
{
# poly1305_blocks_vpmadd52
#
# Single-block loop.  $len is turned into a count of 16-byte blocks;
# %rax becomes 3 only when fewer than 4 blocks are pending and the value
# at 64($ctx) is still negative, otherwise 1, and is then ANDed with the
# block count.  A zero result sends everything to .Lblocks_vpmadd52_4x;
# otherwise that many blocks are processed here and whatever is left is
# handed to .Lblocks_vpmadd52_4x afterwards.
#
# Register map (%ymm): H0..H2 receive the hash limbs broadcast from Dlo;
# r2r1r0, r1r0s2 and r0s2s1 are three-qword windows loaded from
# 40($ctx), 32($ctx) and 24($ctx); Dlo/Dhi collect the low and high
# halves of the products.
my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
# T0 stages the input block; the permute/shift tables and the constants
# below are loaded relative to .L2_44_inp_permd.
my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
# Exactly one register per name: %ymm22..%ymm24.  A longer range would
# only create a surplus element that the list assignment discards.
my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..24));

$code.=<<___;
.type	poly1305_blocks_vpmadd52,\@function,4
.align	32
poly1305_blocks_vpmadd52:
	shr	\$4,$len
	jz	.Lno_data_vpmadd52		# too short

	shl	\$40,$padbit
	mov	64($ctx),%r8			# peek on power of the key

	# if powers of the key are not calculated yet, process up to 3
	# blocks with this single-block subroutine, otherwise ensure that
	# length is divisible by 2 blocks and pass the rest down to next
	# subroutine...

	mov	\$3,%rax
	mov	\$1,%r10
	cmp	\$4,$len			# is input long
	cmovae	%r10,%rax
	test	%r8,%r8				# is power value impossible?
	cmovns	%r10,%rax

	and	$len,%rax			# is input of favourable length?
	jz	.Lblocks_vpmadd52_4x

	sub		%rax,$len
	mov		\$7,%r10d
	mov		\$1,%r11d
	kmovw		%r10d,%k7
	lea		.L2_44_inp_permd(%rip),%r10
	kmovw		%r11d,%k1

	vmovq		$padbit,%x#$PAD
	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
	vpermq		\$0xcf,$PAD,$PAD
	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask

	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}

	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft

	jmp		.Loop_vpmadd52

.align	32
.Loop_vpmadd52:
	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
	lea		16($inp),$inp

	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
	vpsrlvq		$inp_shift,$T0,$T0
	vpandq		$reduc_mask,$T0,$T0
	vporq		$PAD,$T0,$T0

	vpaddq		$T0,$Dlo,$Dlo		# accumulate input

	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}

	vpxord		$Dlo,$Dlo,$Dlo
	vpxord		$Dhi,$Dhi,$Dhi

	vpmadd52luq	$r2r1r0,$H0,$Dlo
	vpmadd52huq	$r2r1r0,$H0,$Dhi

	vpmadd52luq	$r1r0s2,$H1,$Dlo
	vpmadd52huq	$r1r0s2,$H1,$Dhi

	vpmadd52luq	$r0s2s1,$H2,$Dlo
	vpmadd52huq	$r0s2s1,$H2,$Dhi

	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
	vpandq		$reduc_mask,$Dlo,$Dlo

	vpaddq		$T0,$Dhi,$Dhi

	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword

	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)

	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost word
	vpandq		$reduc_mask,$Dlo,$Dlo

	vpermq		\$0b10010011,$T0,$T0

	vpaddq		$T0,$Dlo,$Dlo

	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}

	vpaddq		$T0,$Dlo,$Dlo
	vpsllq		\$2,$T0,$T0

	vpaddq		$T0,$Dlo,$Dlo

	dec		%rax			# len-=16
	jnz		.Loop_vpmadd52

	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value

	test		$len,$len
	jnz		.Lblocks_vpmadd52_4x

.Lno_data_vpmadd52:
	ret
.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
___
}
{
########################################################################
# As implied by its name 4x subroutine processes 4 blocks in parallel
# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
# and is handled in 256-bit %ymm registers.
#
# Register map while the 4x code is emitted (all %ymm):
#	H0..H2		hash limbs
#	R0..R2		key-power limbs; S1/S2 are R1/R2 times 20
#			(the "S = R*5*4" sequences in the code)
#	D0lo..D2hi	low/high halves of the three product accumulators
#	T0..T3		input staging; tmp is scratch for the reduction
#	mask44/mask42	loaded from .Lx_mask44/.Lx_mask42
#	PAD		$padbit broadcast to every lane
#
# Flow of the emitted code:
#   - .Lblocks_vpmadd52_4x is also a jump target of
#     poly1305_blocks_vpmadd52, hence the label right after the prologue.
#   - A negative value at 64($ctx) sends control to .Linit_vpmadd52,
#     which runs the multiply/reduce body twice (%eax counts down from 2)
#     and stores the resulting table at 64/96/128/160($ctx).
#   - Block counts that are not a multiple of 4 take the 2x path,
#     multiples of 8 branch to .Lblocks_vpmadd52_8x, everything else goes
#     through .Loop_vpmadd52_4x and the shared tail.
#   - After the tail, a positive remainder loops back to
#     .Lblocks_vpmadd52_4x_do; otherwise h[0..2] is written to
#     0/8/16($ctx).
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));

$code.=<<___;
.type	poly1305_blocks_vpmadd52_4x,\@function,4
.align	32
poly1305_blocks_vpmadd52_4x:
	shr	\$4,$len
	jz	.Lno_data_vpmadd52_4x		# too short

	shl	\$40,$padbit
	mov	64($ctx),%r8			# peek on power of the key

.Lblocks_vpmadd52_4x:
	vpbroadcastq	$padbit,$PAD

	vmovdqa64	.Lx_mask44(%rip),$mask44
	mov		\$5,%eax
	vmovdqa64	.Lx_mask42(%rip),$mask42
	kmovw		%eax,%k1		# used in 2x path

	test		%r8,%r8			# is power value impossible?
	js		.Linit_vpmadd52		# if it is, then init R[4]

	vmovq		0($ctx),%x#$H0		# load current hash value
	vmovq		8($ctx),%x#$H1
	vmovq		16($ctx),%x#$H2

	test		\$3,$len		# is length 4*n+2?
	jnz		.Lblocks_vpmadd52_2x_do

.Lblocks_vpmadd52_4x_do:
	vpbroadcastq	64($ctx),$R0		# load 4th power of the key
	vpbroadcastq	96($ctx),$R1
	vpbroadcastq	128($ctx),$R2
	vpbroadcastq	160($ctx),$S1

.Lblocks_vpmadd52_4x_key_loaded:
	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
	vpaddq		$R2,$S2,$S2
	vpsllq		\$2,$S2,$S2

	test		\$7,$len		# is len 8*n?
	jz		.Lblocks_vpmadd52_8x

	vmovdqu64	16*0($inp),$T2		# load data
	vmovdqu64	16*2($inp),$T3
	lea		16*4($inp),$inp

	vpunpcklqdq	$T3,$T2,$T1		# transpose data
	vpunpckhqdq	$T3,$T2,$T3

	# at this point 64-bit lanes are ordered as 3-1-2-0

	vpsrlq		\$24,$T3,$T2		# splat the data
	vporq		$PAD,$T2,$T2
	vpaddq		$T2,$H2,$H2		# accumulate input
	vpandq		$mask44,$T1,$T0
	vpsrlq		\$44,$T1,$T1
	vpsllq		\$20,$T3,$T3
	vporq		$T3,$T1,$T1
	vpandq		$mask44,$T1,$T1

	sub		\$4,$len
	jz		.Ltail_vpmadd52_4x
	jmp		.Loop_vpmadd52_4x
	ud2

.align	32
.Linit_vpmadd52:
	vmovq		24($ctx),%x#$S1		# load key
	vmovq		56($ctx),%x#$H2
	vmovq		32($ctx),%x#$S2
	vmovq		40($ctx),%x#$R0
	vmovq		48($ctx),%x#$R1

	vmovdqa		$R0,$H0
	vmovdqa		$R1,$H1
	vmovdqa		$H2,$R2

	mov		\$2,%eax

.Lmul_init_vpmadd52:
	vpxorq		$D0lo,$D0lo,$D0lo
	vpmadd52luq	$H2,$S1,$D0lo
	vpxorq		$D0hi,$D0hi,$D0hi
	vpmadd52huq	$H2,$S1,$D0hi
	vpxorq		$D1lo,$D1lo,$D1lo
	vpmadd52luq	$H2,$S2,$D1lo
	vpxorq		$D1hi,$D1hi,$D1hi
	vpmadd52huq	$H2,$S2,$D1hi
	vpxorq		$D2lo,$D2lo,$D2lo
	vpmadd52luq	$H2,$R0,$D2lo
	vpxorq		$D2hi,$D2hi,$D2hi
	vpmadd52huq	$H2,$R0,$D2hi

	vpmadd52luq	$H0,$R0,$D0lo
	vpmadd52huq	$H0,$R0,$D0hi
	vpmadd52luq	$H0,$R1,$D1lo
	vpmadd52huq	$H0,$R1,$D1hi
	vpmadd52luq	$H0,$R2,$D2lo
	vpmadd52huq	$H0,$R2,$D2hi

	vpmadd52luq	$H1,$S2,$D0lo
	vpmadd52huq	$H1,$S2,$D0hi
	vpmadd52luq	$H1,$R0,$D1lo
	vpmadd52huq	$H1,$R0,$D1hi
	vpmadd52luq	$H1,$R1,$D2lo
	vpmadd52huq	$H1,$R1,$D2hi

	################################################################
	# partial reduction
	vpsrlq		\$44,$D0lo,$tmp
	vpsllq		\$8,$D0hi,$D0hi
	vpandq		$mask44,$D0lo,$H0
	vpaddq		$tmp,$D0hi,$D0hi

	vpaddq		$D0hi,$D1lo,$D1lo

	vpsrlq		\$44,$D1lo,$tmp
	vpsllq		\$8,$D1hi,$D1hi
	vpandq		$mask44,$D1lo,$H1
	vpaddq		$tmp,$D1hi,$D1hi

	vpaddq		$D1hi,$D2lo,$D2lo

	vpsrlq		\$42,$D2lo,$tmp
	vpsllq		\$10,$D2hi,$D2hi
	vpandq		$mask42,$D2lo,$H2
	vpaddq		$tmp,$D2hi,$D2hi

	vpaddq		$D2hi,$H0,$H0
	vpsllq		\$2,$D2hi,$D2hi

	vpaddq		$D2hi,$H0,$H0

	vpsrlq		\$44,$H0,$tmp		# additional step
	vpandq		$mask44,$H0,$H0

	vpaddq		$tmp,$H1,$H1

	dec		%eax
	jz		.Ldone_init_vpmadd52

	vpunpcklqdq	$R1,$H1,$R1		# 1,2
	vpbroadcastq	%x#$H1,%x#$H1		# 2,2
	vpunpcklqdq	$R2,$H2,$R2
	vpbroadcastq	%x#$H2,%x#$H2
	vpunpcklqdq	$R0,$H0,$R0
	vpbroadcastq	%x#$H0,%x#$H0

	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
	vpaddq		$R1,$S1,$S1
	vpaddq		$R2,$S2,$S2
	vpsllq		\$2,$S1,$S1
	vpsllq		\$2,$S2,$S2

	jmp		.Lmul_init_vpmadd52
	ud2

.align	32
.Ldone_init_vpmadd52:
	vinserti128	\$1,%x#$R1,$H1,$R1	# 1,2,3,4
	vinserti128	\$1,%x#$R2,$H2,$R2
	vinserti128	\$1,%x#$R0,$H0,$R0

	vpermq		\$0b11011000,$R1,$R1	# 1,3,2,4
	vpermq		\$0b11011000,$R2,$R2
	vpermq		\$0b11011000,$R0,$R0

	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
	vpaddq		$R1,$S1,$S1
	vpsllq		\$2,$S1,$S1

	vmovq		0($ctx),%x#$H0		# load current hash value
	vmovq		8($ctx),%x#$H1
	vmovq		16($ctx),%x#$H2

	test		\$3,$len		# is length 4*n+2?
	jnz		.Ldone_init_vpmadd52_2x

	vmovdqu64	$R0,64($ctx)		# save key powers
	vpbroadcastq	%x#$R0,$R0		# broadcast 4th power
	vmovdqu64	$R1,96($ctx)
	vpbroadcastq	%x#$R1,$R1
	vmovdqu64	$R2,128($ctx)
	vpbroadcastq	%x#$R2,$R2
	vmovdqu64	$S1,160($ctx)
	vpbroadcastq	%x#$S1,$S1

	jmp		.Lblocks_vpmadd52_4x_key_loaded
	ud2

.align	32
.Ldone_init_vpmadd52_2x:
	vmovdqu64	$R0,64($ctx)		# save key powers
	vpsrldq		\$8,$R0,$R0		# 0-1-0-2
	vmovdqu64	$R1,96($ctx)
	vpsrldq		\$8,$R1,$R1
	vmovdqu64	$R2,128($ctx)
	vpsrldq		\$8,$R2,$R2
	vmovdqu64	$S1,160($ctx)
	vpsrldq		\$8,$S1,$S1
	jmp		.Lblocks_vpmadd52_2x_key_loaded
	ud2

.align	32
.Lblocks_vpmadd52_2x_do:
	vmovdqu64	128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
	vmovdqu64	160+8($ctx),${S1}{%k1}{z}
	vmovdqu64	64+8($ctx),${R0}{%k1}{z}
	vmovdqu64	96+8($ctx),${R1}{%k1}{z}

.Lblocks_vpmadd52_2x_key_loaded:
	vmovdqu64	16*0($inp),$T2		# load data
	vpxorq		$T3,$T3,$T3
	lea		16*2($inp),$inp

	vpunpcklqdq	$T3,$T2,$T1		# transpose data
	vpunpckhqdq	$T3,$T2,$T3

	# at this point 64-bit lanes are ordered as x-1-x-0

	vpsrlq		\$24,$T3,$T2		# splat the data
	vporq		$PAD,$T2,$T2
	vpaddq		$T2,$H2,$H2		# accumulate input
	vpandq		$mask44,$T1,$T0
	vpsrlq		\$44,$T1,$T1
	vpsllq		\$20,$T3,$T3
	vporq		$T3,$T1,$T1
	vpandq		$mask44,$T1,$T1

	jmp		.Ltail_vpmadd52_2x
	ud2

.align	32
.Loop_vpmadd52_4x:
	#vpaddq		$T2,$H2,$H2		# accumulate input
	vpaddq		$T0,$H0,$H0
	vpaddq		$T1,$H1,$H1

	vpxorq		$D0lo,$D0lo,$D0lo
	vpmadd52luq	$H2,$S1,$D0lo
	vpxorq		$D0hi,$D0hi,$D0hi
	vpmadd52huq	$H2,$S1,$D0hi
	vpxorq		$D1lo,$D1lo,$D1lo
	vpmadd52luq	$H2,$S2,$D1lo
	vpxorq		$D1hi,$D1hi,$D1hi
	vpmadd52huq	$H2,$S2,$D1hi
	vpxorq		$D2lo,$D2lo,$D2lo
	vpmadd52luq	$H2,$R0,$D2lo
	vpxorq		$D2hi,$D2hi,$D2hi
	vpmadd52huq	$H2,$R0,$D2hi

	vmovdqu64	16*0($inp),$T2		# load data
	vmovdqu64	16*2($inp),$T3
	lea		16*4($inp),$inp
	vpmadd52luq	$H0,$R0,$D0lo
	vpmadd52huq	$H0,$R0,$D0hi
	vpmadd52luq	$H0,$R1,$D1lo
	vpmadd52huq	$H0,$R1,$D1hi
	vpmadd52luq	$H0,$R2,$D2lo
	vpmadd52huq	$H0,$R2,$D2hi

	vpunpcklqdq	$T3,$T2,$T1		# transpose data
	vpunpckhqdq	$T3,$T2,$T3
	vpmadd52luq	$H1,$S2,$D0lo
	vpmadd52huq	$H1,$S2,$D0hi
	vpmadd52luq	$H1,$R0,$D1lo
	vpmadd52huq	$H1,$R0,$D1hi
	vpmadd52luq	$H1,$R1,$D2lo
	vpmadd52huq	$H1,$R1,$D2hi

	################################################################
	# partial reduction (interleaved with data splat)
	vpsrlq		\$44,$D0lo,$tmp
	vpsllq		\$8,$D0hi,$D0hi
	vpandq		$mask44,$D0lo,$H0
	vpaddq		$tmp,$D0hi,$D0hi

	vpsrlq		\$24,$T3,$T2
	vporq		$PAD,$T2,$T2
	vpaddq		$D0hi,$D1lo,$D1lo

	vpsrlq		\$44,$D1lo,$tmp
	vpsllq		\$8,$D1hi,$D1hi
	vpandq		$mask44,$D1lo,$H1
	vpaddq		$tmp,$D1hi,$D1hi

	vpandq		$mask44,$T1,$T0
	vpsrlq		\$44,$T1,$T1
	vpsllq		\$20,$T3,$T3
	vpaddq		$D1hi,$D2lo,$D2lo

	vpsrlq		\$42,$D2lo,$tmp
	vpsllq		\$10,$D2hi,$D2hi
	vpandq		$mask42,$D2lo,$H2
	vpaddq		$tmp,$D2hi,$D2hi

	vpaddq		$T2,$H2,$H2		# accumulate input
	vpaddq		$D2hi,$H0,$H0
	vpsllq		\$2,$D2hi,$D2hi

	vpaddq		$D2hi,$H0,$H0
	vporq		$T3,$T1,$T1
	vpandq		$mask44,$T1,$T1

	vpsrlq		\$44,$H0,$tmp		# additional step
	vpandq		$mask44,$H0,$H0

	vpaddq		$tmp,$H1,$H1

	sub		\$4,$len		# len-=64
	jnz		.Loop_vpmadd52_4x

.Ltail_vpmadd52_4x:
	vmovdqu64	128($ctx),$R2		# load all key powers
	vmovdqu64	160($ctx),$S1
	vmovdqu64	64($ctx),$R0
	vmovdqu64	96($ctx),$R1

.Ltail_vpmadd52_2x:
	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
	vpaddq		$R2,$S2,$S2
	vpsllq		\$2,$S2,$S2

	#vpaddq		$T2,$H2,$H2		# accumulate input
	vpaddq		$T0,$H0,$H0
	vpaddq		$T1,$H1,$H1

	vpxorq		$D0lo,$D0lo,$D0lo
	vpmadd52luq	$H2,$S1,$D0lo
	vpxorq		$D0hi,$D0hi,$D0hi
	vpmadd52huq	$H2,$S1,$D0hi
	vpxorq		$D1lo,$D1lo,$D1lo
	vpmadd52luq	$H2,$S2,$D1lo
	vpxorq		$D1hi,$D1hi,$D1hi
	vpmadd52huq	$H2,$S2,$D1hi
	vpxorq		$D2lo,$D2lo,$D2lo
	vpmadd52luq	$H2,$R0,$D2lo
	vpxorq		$D2hi,$D2hi,$D2hi
	vpmadd52huq	$H2,$R0,$D2hi

	vpmadd52luq	$H0,$R0,$D0lo
	vpmadd52huq	$H0,$R0,$D0hi
	vpmadd52luq	$H0,$R1,$D1lo
	vpmadd52huq	$H0,$R1,$D1hi
	vpmadd52luq	$H0,$R2,$D2lo
	vpmadd52huq	$H0,$R2,$D2hi

	vpmadd52luq	$H1,$S2,$D0lo
	vpmadd52huq	$H1,$S2,$D0hi
	vpmadd52luq	$H1,$R0,$D1lo
	vpmadd52huq	$H1,$R0,$D1hi
	vpmadd52luq	$H1,$R1,$D2lo
	vpmadd52huq	$H1,$R1,$D2hi

	################################################################
	# horizontal addition

	mov		\$1,%eax
	kmovw		%eax,%k1
	vpsrldq		\$8,$D0lo,$T0
	vpsrldq		\$8,$D0hi,$H0
	vpsrldq		\$8,$D1lo,$T1
	vpsrldq		\$8,$D1hi,$H1
	vpaddq		$T0,$D0lo,$D0lo
	vpaddq		$H0,$D0hi,$D0hi
	vpsrldq		\$8,$D2lo,$T2
	vpsrldq		\$8,$D2hi,$H2
	vpaddq		$T1,$D1lo,$D1lo
	vpaddq		$H1,$D1hi,$D1hi
	vpermq		\$0x2,$D0lo,$T0
	vpermq		\$0x2,$D0hi,$H0
	vpaddq		$T2,$D2lo,$D2lo
	vpaddq		$H2,$D2hi,$D2hi

	vpermq		\$0x2,$D1lo,$T1
	vpermq		\$0x2,$D1hi,$H1
	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
	vpermq		\$0x2,$D2lo,$T2
	vpermq		\$0x2,$D2hi,$H2
	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}

	################################################################
	# partial reduction
	vpsrlq		\$44,$D0lo,$tmp
	vpsllq		\$8,$D0hi,$D0hi
	vpandq		$mask44,$D0lo,$H0
	vpaddq		$tmp,$D0hi,$D0hi

	vpaddq		$D0hi,$D1lo,$D1lo

	vpsrlq		\$44,$D1lo,$tmp
	vpsllq		\$8,$D1hi,$D1hi
	vpandq		$mask44,$D1lo,$H1
	vpaddq		$tmp,$D1hi,$D1hi

	vpaddq		$D1hi,$D2lo,$D2lo

	vpsrlq		\$42,$D2lo,$tmp
	vpsllq		\$10,$D2hi,$D2hi
	vpandq		$mask42,$D2lo,$H2
	vpaddq		$tmp,$D2hi,$D2hi

	vpaddq		$D2hi,$H0,$H0
	vpsllq		\$2,$D2hi,$D2hi

	vpaddq		$D2hi,$H0,$H0

	vpsrlq		\$44,$H0,$tmp		# additional step
	vpandq		$mask44,$H0,$H0

	vpaddq		$tmp,$H1,$H1
						# at this point $len is
						# either 4*n+2 or 0...
	sub		\$2,$len		# len-=32
	ja		.Lblocks_vpmadd52_4x_do

	vmovq		%x#$H0,0($ctx)
	vmovq		%x#$H1,8($ctx)
	vmovq		%x#$H2,16($ctx)
	vzeroall

.Lno_data_vpmadd52_4x:
	ret
.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
___
}
{
########################################################################
# As implied by its name 8x subroutine processes 8 blocks in parallel...
# This is intermediate version, as it's used only in cases when input
# length is either 8*n, 8*n+1 or 8*n+2...

my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));

$code.=<<___;
.type	poly1305_blocks_vpmadd52_8x,\@function,4
.align	32
poly1305_blocks_vpmadd52_8x:
	shr	\$4,$len
	jz	.Lno_data_vpmadd52_8x		# too short

	shl	\$40,$padbit
	mov	64($ctx),%r8			# peek on power of the key

	vmovdqa64	.Lx_mask44(%rip),$mask44
	vmovdqa64	.Lx_mask42(%rip),$mask42

	test	%r8,%r8				# is power value impossible?
3483 js .Linit_vpmadd52 # if it is, then init R[4] 3484 3485 vmovq 0($ctx),%x#$H0 # load current hash value 3486 vmovq 8($ctx),%x#$H1 3487 vmovq 16($ctx),%x#$H2 3488 3489.Lblocks_vpmadd52_8x: 3490 ################################################################ 3491 # first we calculate more key powers 3492 3493 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers 3494 vmovdqu64 160($ctx),$S1 3495 vmovdqu64 64($ctx),$R0 3496 vmovdqu64 96($ctx),$R1 3497 3498 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3499 vpaddq $R2,$S2,$S2 3500 vpsllq \$2,$S2,$S2 3501 3502 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power 3503 vpbroadcastq %x#$R0,$RR0 3504 vpbroadcastq %x#$R1,$RR1 3505 3506 vpxorq $D0lo,$D0lo,$D0lo 3507 vpmadd52luq $RR2,$S1,$D0lo 3508 vpxorq $D0hi,$D0hi,$D0hi 3509 vpmadd52huq $RR2,$S1,$D0hi 3510 vpxorq $D1lo,$D1lo,$D1lo 3511 vpmadd52luq $RR2,$S2,$D1lo 3512 vpxorq $D1hi,$D1hi,$D1hi 3513 vpmadd52huq $RR2,$S2,$D1hi 3514 vpxorq $D2lo,$D2lo,$D2lo 3515 vpmadd52luq $RR2,$R0,$D2lo 3516 vpxorq $D2hi,$D2hi,$D2hi 3517 vpmadd52huq $RR2,$R0,$D2hi 3518 3519 vpmadd52luq $RR0,$R0,$D0lo 3520 vpmadd52huq $RR0,$R0,$D0hi 3521 vpmadd52luq $RR0,$R1,$D1lo 3522 vpmadd52huq $RR0,$R1,$D1hi 3523 vpmadd52luq $RR0,$R2,$D2lo 3524 vpmadd52huq $RR0,$R2,$D2hi 3525 3526 vpmadd52luq $RR1,$S2,$D0lo 3527 vpmadd52huq $RR1,$S2,$D0hi 3528 vpmadd52luq $RR1,$R0,$D1lo 3529 vpmadd52huq $RR1,$R0,$D1hi 3530 vpmadd52luq $RR1,$R1,$D2lo 3531 vpmadd52huq $RR1,$R1,$D2hi 3532 3533 ################################################################ 3534 # partial reduction 3535 vpsrlq \$44,$D0lo,$tmp 3536 vpsllq \$8,$D0hi,$D0hi 3537 vpandq $mask44,$D0lo,$RR0 3538 vpaddq $tmp,$D0hi,$D0hi 3539 3540 vpaddq $D0hi,$D1lo,$D1lo 3541 3542 vpsrlq \$44,$D1lo,$tmp 3543 vpsllq \$8,$D1hi,$D1hi 3544 vpandq $mask44,$D1lo,$RR1 3545 vpaddq $tmp,$D1hi,$D1hi 3546 3547 vpaddq $D1hi,$D2lo,$D2lo 3548 3549 vpsrlq \$42,$D2lo,$tmp 3550 vpsllq \$10,$D2hi,$D2hi 3551 vpandq $mask42,$D2lo,$RR2 3552 vpaddq $tmp,$D2hi,$D2hi 3553 3554 vpaddq $D2hi,$RR0,$RR0 3555 vpsllq
\$2,$D2hi,$D2hi 3556 3557 vpaddq $D2hi,$RR0,$RR0 3558 3559 vpsrlq \$44,$RR0,$tmp # additional step 3560 vpandq $mask44,$RR0,$RR0 3561 3562 vpaddq $tmp,$RR1,$RR1 3563 3564 ################################################################ 3565 # At this point Rx holds 1324 powers, RRx - 5768, and the goal 3566 # is 15263748, which reflects how data is loaded... 3567 3568 vpunpcklqdq $R2,$RR2,$T2 # 3748 3569 vpunpckhqdq $R2,$RR2,$R2 # 1526 3570 vpunpcklqdq $R0,$RR0,$T0 3571 vpunpckhqdq $R0,$RR0,$R0 3572 vpunpcklqdq $R1,$RR1,$T1 3573 vpunpckhqdq $R1,$RR1,$R1 3574___ 3575######## switch to %zmm 3576map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3577map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3578map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3579map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); 3580 3581$code.=<<___; 3582 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 3583 vshufi64x2 \$0x44,$R0,$T0,$RR0 3584 vshufi64x2 \$0x44,$R1,$T1,$RR1 3585 3586 vmovdqu64 16*0($inp),$T2 # load data 3587 vmovdqu64 16*4($inp),$T3 3588 lea 16*8($inp),$inp 3589 3590 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 3591 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 3592 vpaddq $RR2,$SS2,$SS2 3593 vpaddq $RR1,$SS1,$SS1 3594 vpsllq \$2,$SS2,$SS2 3595 vpsllq \$2,$SS1,$SS1 3596 3597 vpbroadcastq $padbit,$PAD 3598 vpbroadcastq %x#$mask44,$mask44 3599 vpbroadcastq %x#$mask42,$mask42 3600 3601 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power 3602 vpbroadcastq %x#$SS2,$S2 3603 vpbroadcastq %x#$RR0,$R0 3604 vpbroadcastq %x#$RR1,$R1 3605 vpbroadcastq %x#$RR2,$R2 3606 3607 vpunpcklqdq $T3,$T2,$T1 # transpose data 3608 vpunpckhqdq $T3,$T2,$T3 3609 3610 # at this point 64-bit lanes are ordered as 73625140 3611 3612 vpsrlq \$24,$T3,$T2 # splat the data 3613 vporq $PAD,$T2,$T2 3614 vpaddq $T2,$H2,$H2 # accumulate input 3615 vpandq $mask44,$T1,$T0 3616 vpsrlq \$44,$T1,$T1 3617 vpsllq \$20,$T3,$T3 3618 vporq $T3,$T1,$T1 3619 vpandq $mask44,$T1,$T1 3620 3621 sub \$8,$len 3622 jz .Ltail_vpmadd52_8x 3623 jmp .Loop_vpmadd52_8x 
3624 3625.align 32 3626.Loop_vpmadd52_8x: 3627 #vpaddq $T2,$H2,$H2 # accumulate input 3628 vpaddq $T0,$H0,$H0 3629 vpaddq $T1,$H1,$H1 3630 3631 vpxorq $D0lo,$D0lo,$D0lo 3632 vpmadd52luq $H2,$S1,$D0lo 3633 vpxorq $D0hi,$D0hi,$D0hi 3634 vpmadd52huq $H2,$S1,$D0hi 3635 vpxorq $D1lo,$D1lo,$D1lo 3636 vpmadd52luq $H2,$S2,$D1lo 3637 vpxorq $D1hi,$D1hi,$D1hi 3638 vpmadd52huq $H2,$S2,$D1hi 3639 vpxorq $D2lo,$D2lo,$D2lo 3640 vpmadd52luq $H2,$R0,$D2lo 3641 vpxorq $D2hi,$D2hi,$D2hi 3642 vpmadd52huq $H2,$R0,$D2hi 3643 3644 vmovdqu64 16*0($inp),$T2 # load data 3645 vmovdqu64 16*4($inp),$T3 3646 lea 16*8($inp),$inp 3647 vpmadd52luq $H0,$R0,$D0lo 3648 vpmadd52huq $H0,$R0,$D0hi 3649 vpmadd52luq $H0,$R1,$D1lo 3650 vpmadd52huq $H0,$R1,$D1hi 3651 vpmadd52luq $H0,$R2,$D2lo 3652 vpmadd52huq $H0,$R2,$D2hi 3653 3654 vpunpcklqdq $T3,$T2,$T1 # transpose data 3655 vpunpckhqdq $T3,$T2,$T3 3656 vpmadd52luq $H1,$S2,$D0lo 3657 vpmadd52huq $H1,$S2,$D0hi 3658 vpmadd52luq $H1,$R0,$D1lo 3659 vpmadd52huq $H1,$R0,$D1hi 3660 vpmadd52luq $H1,$R1,$D2lo 3661 vpmadd52huq $H1,$R1,$D2hi 3662 3663 ################################################################ 3664 # partial reduction (interleaved with data splat) 3665 vpsrlq \$44,$D0lo,$tmp 3666 vpsllq \$8,$D0hi,$D0hi 3667 vpandq $mask44,$D0lo,$H0 3668 vpaddq $tmp,$D0hi,$D0hi 3669 3670 vpsrlq \$24,$T3,$T2 3671 vporq $PAD,$T2,$T2 3672 vpaddq $D0hi,$D1lo,$D1lo 3673 3674 vpsrlq \$44,$D1lo,$tmp 3675 vpsllq \$8,$D1hi,$D1hi 3676 vpandq $mask44,$D1lo,$H1 3677 vpaddq $tmp,$D1hi,$D1hi 3678 3679 vpandq $mask44,$T1,$T0 3680 vpsrlq \$44,$T1,$T1 3681 vpsllq \$20,$T3,$T3 3682 vpaddq $D1hi,$D2lo,$D2lo 3683 3684 vpsrlq \$42,$D2lo,$tmp 3685 vpsllq \$10,$D2hi,$D2hi 3686 vpandq $mask42,$D2lo,$H2 3687 vpaddq $tmp,$D2hi,$D2hi 3688 3689 vpaddq $T2,$H2,$H2 # accumulate input 3690 vpaddq $D2hi,$H0,$H0 3691 vpsllq \$2,$D2hi,$D2hi 3692 3693 vpaddq $D2hi,$H0,$H0 3694 vporq $T3,$T1,$T1 3695 vpandq $mask44,$T1,$T1 3696 3697 vpsrlq \$44,$H0,$tmp # additional step 3698 vpandq 
$mask44,$H0,$H0 3699 3700 vpaddq $tmp,$H1,$H1 3701 3702 sub \$8,$len # len-=128 3703 jnz .Loop_vpmadd52_8x 3704 3705.Ltail_vpmadd52_8x: 3706 #vpaddq $T2,$H2,$H2 # accumulate input 3707 vpaddq $T0,$H0,$H0 3708 vpaddq $T1,$H1,$H1 3709 3710 vpxorq $D0lo,$D0lo,$D0lo 3711 vpmadd52luq $H2,$SS1,$D0lo 3712 vpxorq $D0hi,$D0hi,$D0hi 3713 vpmadd52huq $H2,$SS1,$D0hi 3714 vpxorq $D1lo,$D1lo,$D1lo 3715 vpmadd52luq $H2,$SS2,$D1lo 3716 vpxorq $D1hi,$D1hi,$D1hi 3717 vpmadd52huq $H2,$SS2,$D1hi 3718 vpxorq $D2lo,$D2lo,$D2lo 3719 vpmadd52luq $H2,$RR0,$D2lo 3720 vpxorq $D2hi,$D2hi,$D2hi 3721 vpmadd52huq $H2,$RR0,$D2hi 3722 3723 vpmadd52luq $H0,$RR0,$D0lo 3724 vpmadd52huq $H0,$RR0,$D0hi 3725 vpmadd52luq $H0,$RR1,$D1lo 3726 vpmadd52huq $H0,$RR1,$D1hi 3727 vpmadd52luq $H0,$RR2,$D2lo 3728 vpmadd52huq $H0,$RR2,$D2hi 3729 3730 vpmadd52luq $H1,$SS2,$D0lo 3731 vpmadd52huq $H1,$SS2,$D0hi 3732 vpmadd52luq $H1,$RR0,$D1lo 3733 vpmadd52huq $H1,$RR0,$D1hi 3734 vpmadd52luq $H1,$RR1,$D2lo 3735 vpmadd52huq $H1,$RR1,$D2hi 3736 3737 ################################################################ 3738 # horizontal addition 3739 3740 mov \$1,%eax 3741 kmovw %eax,%k1 3742 vpsrldq \$8,$D0lo,$T0 3743 vpsrldq \$8,$D0hi,$H0 3744 vpsrldq \$8,$D1lo,$T1 3745 vpsrldq \$8,$D1hi,$H1 3746 vpaddq $T0,$D0lo,$D0lo 3747 vpaddq $H0,$D0hi,$D0hi 3748 vpsrldq \$8,$D2lo,$T2 3749 vpsrldq \$8,$D2hi,$H2 3750 vpaddq $T1,$D1lo,$D1lo 3751 vpaddq $H1,$D1hi,$D1hi 3752 vpermq \$0x2,$D0lo,$T0 3753 vpermq \$0x2,$D0hi,$H0 3754 vpaddq $T2,$D2lo,$D2lo 3755 vpaddq $H2,$D2hi,$D2hi 3756 3757 vpermq \$0x2,$D1lo,$T1 3758 vpermq \$0x2,$D1hi,$H1 3759 vpaddq $T0,$D0lo,$D0lo 3760 vpaddq $H0,$D0hi,$D0hi 3761 vpermq \$0x2,$D2lo,$T2 3762 vpermq \$0x2,$D2hi,$H2 3763 vpaddq $T1,$D1lo,$D1lo 3764 vpaddq $H1,$D1hi,$D1hi 3765 vextracti64x4 \$1,$D0lo,%y#$T0 3766 vextracti64x4 \$1,$D0hi,%y#$H0 3767 vpaddq $T2,$D2lo,$D2lo 3768 vpaddq $H2,$D2hi,$D2hi 3769 3770 vextracti64x4 \$1,$D1lo,%y#$T1 3771 vextracti64x4 \$1,$D1hi,%y#$H1 3772 vextracti64x4 \$1,$D2lo,%y#$T2 
3773 vextracti64x4 \$1,$D2hi,%y#$H2 3774___ 3775######## switch back to %ymm 3776map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3777map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3778map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3779 3780$code.=<<___; 3781 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3782 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3783 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3784 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3785 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3786 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3787 3788 ################################################################ 3789 # partial reduction 3790 vpsrlq \$44,$D0lo,$tmp 3791 vpsllq \$8,$D0hi,$D0hi 3792 vpandq $mask44,$D0lo,$H0 3793 vpaddq $tmp,$D0hi,$D0hi 3794 3795 vpaddq $D0hi,$D1lo,$D1lo 3796 3797 vpsrlq \$44,$D1lo,$tmp 3798 vpsllq \$8,$D1hi,$D1hi 3799 vpandq $mask44,$D1lo,$H1 3800 vpaddq $tmp,$D1hi,$D1hi 3801 3802 vpaddq $D1hi,$D2lo,$D2lo 3803 3804 vpsrlq \$42,$D2lo,$tmp 3805 vpsllq \$10,$D2hi,$D2hi 3806 vpandq $mask42,$D2lo,$H2 3807 vpaddq $tmp,$D2hi,$D2hi 3808 3809 vpaddq $D2hi,$H0,$H0 3810 vpsllq \$2,$D2hi,$D2hi 3811 3812 vpaddq $D2hi,$H0,$H0 3813 3814 vpsrlq \$44,$H0,$tmp # additional step 3815 vpandq $mask44,$H0,$H0 3816 3817 vpaddq $tmp,$H1,$H1 3818 3819 ################################################################ 3820 3821 vmovq %x#$H0,0($ctx) 3822 vmovq %x#$H1,8($ctx) 3823 vmovq %x#$H2,16($ctx) 3824 vzeroall 3825 3826.Lno_data_vpmadd52_8x: 3827 ret 3828.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x 3829___ 3830} 3831$code.=<<___; 3832.type poly1305_emit_base2_44,\@function,3 3833.align 32 3834poly1305_emit_base2_44: 3835 mov 0($ctx),%r8 # load hash value 3836 mov 8($ctx),%r9 3837 mov 16($ctx),%r10 3838 3839 mov %r9,%rax 3840 shr \$20,%r9 3841 shl \$44,%rax 3842 mov %r10,%rcx 3843 shr \$40,%r10 3844 shl \$24,%rcx 3845 3846 add %rax,%r8 3847 adc %rcx,%r9 3848 adc \$0,%r10 3849 3850 mov %r8,%rax 3851 add \$5,%r8 # compare to modulus 3852 mov %r9,%rcx 3853 adc \$0,%r9 3854 adc \$0,%r10 
3855 shr \$2,%r10 # did 130-bit value overflow? 3856 cmovnz %r8,%rax 3857 cmovnz %r9,%rcx 3858 3859 add 0($nonce),%rax # accumulate nonce 3860 adc 8($nonce),%rcx 3861 mov %rax,0($mac) # write result 3862 mov %rcx,8($mac) 3863 3864 ret 3865.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 3866___ 3867} } } 3868} 3869 3870if (!$kernel) 3871{ # chacha20-poly1305 helpers 3872my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 3873 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 3874$code.=<<___; 3875.globl xor128_encrypt_n_pad 3876.type xor128_encrypt_n_pad,\@abi-omnipotent 3877.align 16 3878xor128_encrypt_n_pad: 3879 sub $otp,$inp 3880 sub $otp,$out 3881 mov $len,%r10 # put len aside 3882 shr \$4,$len # len / 16 3883 jz .Ltail_enc 3884 nop 3885.Loop_enc_xmm: 3886 movdqu ($inp,$otp),%xmm0 3887 pxor ($otp),%xmm0 3888 movdqu %xmm0,($out,$otp) 3889 movdqa %xmm0,($otp) 3890 lea 16($otp),$otp 3891 dec $len 3892 jnz .Loop_enc_xmm 3893 3894 and \$15,%r10 # len % 16 3895 jz .Ldone_enc 3896 3897.Ltail_enc: 3898 mov \$16,$len 3899 sub %r10,$len 3900 xor %eax,%eax 3901.Loop_enc_byte: 3902 mov ($inp,$otp),%al 3903 xor ($otp),%al 3904 mov %al,($out,$otp) 3905 mov %al,($otp) 3906 lea 1($otp),$otp 3907 dec %r10 3908 jnz .Loop_enc_byte 3909 3910 xor %eax,%eax 3911.Loop_enc_pad: 3912 mov %al,($otp) 3913 lea 1($otp),$otp 3914 dec $len 3915 jnz .Loop_enc_pad 3916 3917.Ldone_enc: 3918 mov $otp,%rax 3919 ret 3920.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad 3921 3922.globl xor128_decrypt_n_pad 3923.type xor128_decrypt_n_pad,\@abi-omnipotent 3924.align 16 3925xor128_decrypt_n_pad: 3926 sub $otp,$inp 3927 sub $otp,$out 3928 mov $len,%r10 # put len aside 3929 shr \$4,$len # len / 16 3930 jz .Ltail_dec 3931 nop 3932.Loop_dec_xmm: 3933 movdqu ($inp,$otp),%xmm0 3934 movdqa ($otp),%xmm1 3935 pxor %xmm0,%xmm1 3936 movdqu %xmm1,($out,$otp) 3937 movdqa %xmm0,($otp) 3938 lea 16($otp),$otp 3939 dec $len 3940 jnz .Loop_dec_xmm 3941 3942 pxor %xmm1,%xmm1 3943 and \$15,%r10 
# len % 16 3944 jz .Ldone_dec 3945 3946.Ltail_dec: 3947 mov \$16,$len 3948 sub %r10,$len 3949 xor %eax,%eax 3950 xor %r11d,%r11d 3951.Loop_dec_byte: 3952 mov ($inp,$otp),%r11b 3953 mov ($otp),%al 3954 xor %r11b,%al 3955 mov %al,($out,$otp) 3956 mov %r11b,($otp) 3957 lea 1($otp),$otp 3958 dec %r10 3959 jnz .Loop_dec_byte 3960 3961 xor %eax,%eax 3962.Loop_dec_pad: 3963 mov %al,($otp) 3964 lea 1($otp),$otp 3965 dec $len 3966 jnz .Loop_dec_pad 3967 3968.Ldone_dec: 3969 mov $otp,%rax 3970 ret 3971.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad 3972___ 3973} 3974 3975# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3976# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3977if ($win64) { 3978$rec="%rcx"; 3979$frame="%rdx"; 3980$context="%r8"; 3981$disp="%r9"; 3982 3983$code.=<<___; 3984.extern __imp_RtlVirtualUnwind 3985.type se_handler,\@abi-omnipotent 3986.align 16 3987se_handler: 3988 push %rsi 3989 push %rdi 3990 push %rbx 3991 push %rbp 3992 push %r12 3993 push %r13 3994 push %r14 3995 push %r15 3996 pushfq 3997 sub \$64,%rsp 3998 3999 mov 120($context),%rax # pull context->Rax 4000 mov 248($context),%rbx # pull context->Rip 4001 4002 mov 8($disp),%rsi # disp->ImageBase 4003 mov 56($disp),%r11 # disp->HandlerData 4004 4005 mov 0(%r11),%r10d # HandlerData[0] 4006 lea (%rsi,%r10),%r10 # prologue label 4007 cmp %r10,%rbx # context->Rip<.Lprologue 4008 jb .Lcommon_seh_tail 4009 4010 mov 152($context),%rax # pull context->Rsp 4011 4012 mov 4(%r11),%r10d # HandlerData[1] 4013 lea (%rsi,%r10),%r10 # epilogue label 4014 cmp %r10,%rbx # context->Rip>=.Lepilogue 4015 jae .Lcommon_seh_tail 4016 4017 lea 48(%rax),%rax 4018 4019 mov -8(%rax),%rbx 4020 mov -16(%rax),%rbp 4021 mov -24(%rax),%r12 4022 mov -32(%rax),%r13 4023 mov -40(%rax),%r14 4024 mov -48(%rax),%r15 4025 mov %rbx,144($context) # restore context->Rbx 4026 mov %rbp,160($context) # restore context->Rbp 4027 mov %r12,216($context) # restore context->R12 4028 mov %r13,224($context) # restore 
context->R13 4029 mov %r14,232($context) # restore context->R14 4030 mov %r15,240($context) # restore context->R14 4031 4032 jmp .Lcommon_seh_tail 4033.size se_handler,.-se_handler 4034 4035.type avx_handler,\@abi-omnipotent 4036.align 16 4037avx_handler: 4038 push %rsi 4039 push %rdi 4040 push %rbx 4041 push %rbp 4042 push %r12 4043 push %r13 4044 push %r14 4045 push %r15 4046 pushfq 4047 sub \$64,%rsp 4048 4049 mov 120($context),%rax # pull context->Rax 4050 mov 248($context),%rbx # pull context->Rip 4051 4052 mov 8($disp),%rsi # disp->ImageBase 4053 mov 56($disp),%r11 # disp->HandlerData 4054 4055 mov 0(%r11),%r10d # HandlerData[0] 4056 lea (%rsi,%r10),%r10 # prologue label 4057 cmp %r10,%rbx # context->Rip<prologue label 4058 jb .Lcommon_seh_tail 4059 4060 mov 152($context),%rax # pull context->Rsp 4061 4062 mov 4(%r11),%r10d # HandlerData[1] 4063 lea (%rsi,%r10),%r10 # epilogue label 4064 cmp %r10,%rbx # context->Rip>=epilogue label 4065 jae .Lcommon_seh_tail 4066 4067 mov 208($context),%rax # pull context->R11 4068 4069 lea 0x50(%rax),%rsi 4070 lea 0xf8(%rax),%rax 4071 lea 512($context),%rdi # &context.Xmm6 4072 mov \$20,%ecx 4073 .long 0xa548f3fc # cld; rep movsq 4074 4075.Lcommon_seh_tail: 4076 mov 8(%rax),%rdi 4077 mov 16(%rax),%rsi 4078 mov %rax,152($context) # restore context->Rsp 4079 mov %rsi,168($context) # restore context->Rsi 4080 mov %rdi,176($context) # restore context->Rdi 4081 4082 mov 40($disp),%rdi # disp->ContextRecord 4083 mov $context,%rsi # context 4084 mov \$154,%ecx # sizeof(CONTEXT) 4085 .long 0xa548f3fc # cld; rep movsq 4086 4087 mov $disp,%rsi 4088 xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER 4089 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4090 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4091 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4092 mov 40(%rsi),%r10 # disp->ContextRecord 4093 lea 56(%rsi),%r11 # &disp->HandlerData 4094 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4095 mov %r10,32(%rsp) # arg5 4096 mov %r11,40(%rsp) # arg6 4097 mov 
%r12,48(%rsp) # arg7 4098 mov %rcx,56(%rsp) # arg8, (NULL) 4099 call *__imp_RtlVirtualUnwind(%rip) 4100 4101 mov \$1,%eax # ExceptionContinueSearch 4102 add \$64,%rsp 4103 popfq 4104 pop %r15 4105 pop %r14 4106 pop %r13 4107 pop %r12 4108 pop %rbp 4109 pop %rbx 4110 pop %rdi 4111 pop %rsi 4112 ret 4113.size avx_handler,.-avx_handler 4114 4115.section .pdata 4116.align 4 4117 .rva .LSEH_begin_poly1305_init_x86_64 4118 .rva .LSEH_end_poly1305_init_x86_64 4119 .rva .LSEH_info_poly1305_init_x86_64 4120 4121 .rva .LSEH_begin_poly1305_blocks_x86_64 4122 .rva .LSEH_end_poly1305_blocks_x86_64 4123 .rva .LSEH_info_poly1305_blocks_x86_64 4124 4125 .rva .LSEH_begin_poly1305_emit_x86_64 4126 .rva .LSEH_end_poly1305_emit_x86_64 4127 .rva .LSEH_info_poly1305_emit_x86_64 4128___ 4129$code.=<<___ if ($avx); 4130 .rva .LSEH_begin_poly1305_blocks_avx 4131 .rva .Lbase2_64_avx 4132 .rva .LSEH_info_poly1305_blocks_avx_1 4133 4134 .rva .Lbase2_64_avx 4135 .rva .Leven_avx 4136 .rva .LSEH_info_poly1305_blocks_avx_2 4137 4138 .rva .Leven_avx 4139 .rva .LSEH_end_poly1305_blocks_avx 4140 .rva .LSEH_info_poly1305_blocks_avx_3 4141 4142 .rva .LSEH_begin_poly1305_emit_avx 4143 .rva .LSEH_end_poly1305_emit_avx 4144 .rva .LSEH_info_poly1305_emit_avx 4145___ 4146$code.=<<___ if ($avx>1); 4147 .rva .LSEH_begin_poly1305_blocks_avx2 4148 .rva .Lbase2_64_avx2 4149 .rva .LSEH_info_poly1305_blocks_avx2_1 4150 4151 .rva .Lbase2_64_avx2 4152 .rva .Leven_avx2 4153 .rva .LSEH_info_poly1305_blocks_avx2_2 4154 4155 .rva .Leven_avx2 4156 .rva .LSEH_end_poly1305_blocks_avx2 4157 .rva .LSEH_info_poly1305_blocks_avx2_3 4158___ 4159$code.=<<___ if ($avx>2); 4160 .rva .LSEH_begin_poly1305_blocks_avx512 4161 .rva .LSEH_end_poly1305_blocks_avx512 4162 .rva .LSEH_info_poly1305_blocks_avx512 4163___ 4164$code.=<<___; 4165.section .xdata 4166.align 8 4167.LSEH_info_poly1305_init_x86_64: 4168 .byte 9,0,0,0 4169 .rva se_handler 4170 .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 4171 
.LSEH_info_poly1305_blocks_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_body,.Lblocks_epilogue

.LSEH_info_poly1305_emit_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
___
# Info records for the optional code paths, gated on the same $avx levels as
# their .pdata entries. Records naming avx_handler belong to the ranges whose
# body/epilogue labels start with .Ldo_avx*; all others use se_handler.
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]

.LSEH_info_poly1305_blocks_avx_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]

.LSEH_info_poly1305_emit_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
___
}

# Copy this script's leading comment block to the output, turning each
# leading '#' into '//'. The shebang line is skipped and copying stops at the
# first line that is neither a comment nor blank. The header is informational
# only, so failing to re-open the script is reported but not fatal.
if (open(my $self_fh, '<', $0)) {
	while (<$self_fh>) {
		next if (/^#!/);
		last if (!s{^#}{//} and !/^$/);
		print;
	}
	close $self_fh;
} else {
	warn "can't open $0 to copy header comments: $!\n";
}

# Post-process the accumulated assembly one line at a time and print it.
foreach (split(/\n/,$code)) {
	# Expand `...` spans by evaluating them as Perl. This is a string eval,
	# but only over text generated by this script, never external input.
	s/\`([^\`]*)\`/eval($1)/ge;
	s/%r([a-z]+)#d/%e$1/g;		# %rax#d  -> %eax
	s/%r([0-9]+)#d/%r$1d/g;		# %r10#d  -> %r10d
	# Register-width overrides: %x#%ymmN -> %xmmN, and likewise for %y / %z.
	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;

	if ($kernel) {
		# No translator runs in this mode, so its .type extensions are
		# rewritten here: drop the trailing argument count, map
		# \@abi-omnipotent to \@function, and discard .cfi directives.
		s/(^\.type.*),[0-9]+$/$1/;
		s/(^\.type.*),\@abi-omnipotent+$/$1,\@function/;
		next if /^\.cfi.*/;
	}

	print $_,"\n";
}
# When STDOUT is the pipe to the translator, close() also reaps the child and
# returns false if it failed; without the check a broken translation would
# still leave this script exiting successfully.
close STDOUT or die "error closing STDOUT: $!";