#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================

# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc
# R1x000	~5.5/+130%	(big-endian)
# Octeon II	2.50/+70%	(little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling the reduction makes it possible to omit the
# dependency chain at the end of the inner loop and improves
# performance. Also optimize MIPS32R2 code path for MIPS 1004K core.
# Per René von Dorst's suggestions.
#
#		IALU/gcc
# R1x000	~9.8/?		(big-endian)
# Octeon II	3.65/+140%	(little-endian)
# MT7621/1004K	4.75/?		(little-endian)
#
# Usage: poly1305-mips.pl [flavour [output-file]]
#
# The first argument selects the ABI flavour (default "64"); the last
# remaining argument, if any, names the output file. Without it the
# generated assembly is written to standard output.

use strict;
use warnings;

######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
my ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
my ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################

my $flavour = shift || "64";	# supported flavours are o32,n32,64,nubi32,nubi64

# Register that receives the return value: $a0 for NUBI, $t0 (i.e. the
# register known as $v0 in the traditional ABIs) otherwise.
my $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;

# Accumulates the generated assembly source; printed at the very end.
my $code = "";

if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#
# Context layout used by the generated code (byte offsets from $ctx):
#    0, 8, 16	hash value h0, h1, h2
#   24, 32	clamped key r0, r1
#   40		s1 = r1 + (r1 >> 2)
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

# Preprocessor prologue plus poly1305_init: zeroes the hash value and,
# unless the key pointer is NULL, loads the 16-byte key (handling
# misalignment and big-endian byte order), clamps it and stores r0, r1
# and s1. Always returns 0.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)		dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit   poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7		# $inp % 8
	dsubu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	 dsbh	$in1,$in1
	dshd	$in0,$in0
	 dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	 and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	 dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	 dsll	$tmp3,24
	and	$tmp2,$tmp0
	 and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	 and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	 dsrl	$in1,8
	dsll	$tmp2,8
	 dsll	$tmp4,8
	and	$in0,$tmp0
	 and	$in1,$tmp0
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	or	$in0,$tmp1
	 or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	 dsrl	$tmp3,$in1,32
	dsll	$in0,32
	 dsll	$in1,32
	or	$in0,$tmp1
	 or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32		# 0x0000000100000000
	daddiu	$tmp0,-63		# 0x00000000ffffffc1
	dsll	$tmp0,28		# 0x0ffffffc10000000
	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
# Bit mask of callee-saved registers stored by the prologue below;
# the NUBI flavours additionally save $s0-$s3.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
my ($shr,$shl) = ($s6,$s7);		# used on R6

# poly1305_blocks: returns immediately when fewer than 16 bytes are
# supplied, otherwise falls into poly1305_blocks_internal, which
# processes all complete 16-byte blocks and stores the updated hash
# value back into the context.
$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
$code.=<<___;
	.set	reorder

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$shr,$inp,7
	dsubu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
	subu	$shl,$zero,$shr
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	dsll	$len,4
	daddu	$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
# ifdef	MIPSEB
	dsllv	$in0,$in0,$shr
	dsrlv	$tmp3,$in1,$shl
	dsllv	$in1,$in1,$shr
	dsrlv	$tmp2,$tmp2,$shl
# else
	dsrlv	$in0,$in0,$shr
	dsllv	$tmp3,$in1,$shl
	dsrlv	$in1,$in1,$shr
	dsllv	$tmp2,$tmp2,$shl
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_inp:
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	 dsbh	$in1,$in1
	dshd	$in0,$in0
	 dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	 and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	 dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	 dsll	$tmp3,24
	and	$tmp2,$tmp0
	 and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	 and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	 dsrl	$in1,8
	dsll	$tmp2,8
	 dsll	$tmp4,8
	and	$in0,$tmp0
	 and	$in1,$tmp0
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	or	$in0,$tmp1
	 or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	 dsrl	$tmp3,$in1,32
	dsll	$in0,32
	 dsll	$in1,32
	or	$in0,$tmp1
	 or	$in1,$tmp3
# endif
#endif
	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
	andi	$h2,$h2,3
	dsll	$tmp0,$tmp1,2

	daddu	$d0,$h0,$in0		# accumulate input
	 daddu	$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	daddu	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	daddu	$d1,$h1,$in1
	daddu	$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	daddu	$d1,$tmp0

	dmultu	($r0,$d0)		# h0*r0
	 daddu	$d2,$h2,$padbit
	 sltu	$tmp0,$d1,$tmp0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	dmultu	($rs1,$d1)		# h1*5*r1
	 daddu	$d2,$tmp1
	 daddu	$d2,$tmp0
	mflo	($tmp0,$rs1,$d1)
	mfhi	($tmp1,$rs1,$d1)

	dmultu	($r1,$d0)		# h0*r1
	mflo	($tmp2,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	 daddu	$h0,$tmp0
	 daddu	$h1,$tmp1
	 sltu	$tmp0,$h0,$tmp0

	dmultu	($r0,$d1)		# h1*r0
	 daddu	$h1,$tmp0
	 daddu	$h1,$tmp2
	mflo	($tmp0,$r0,$d1)
	mfhi	($tmp1,$r0,$d1)

	dmultu	($rs1,$d2)		# h2*5*r1
	 sltu	$tmp2,$h1,$tmp2
	 daddu	$h2,$tmp2
	mflo	($tmp2,$rs1,$d2)

	dmultu	($r0,$d2)		# h2*r0
	 daddu	$h1,$tmp0
	 daddu	$h2,$tmp1
	mflo	($tmp3,$r0,$d2)
	 sltu	$tmp0,$h1,$tmp0
	 daddu	$h2,$tmp0

	daddu	$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	daddu	$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$s7,56($sp)
	ld	$s6,48($sp)
#endif
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
}
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

# poly1305_emit: performs the final reduction of the hash value,
# adds the 128-bit nonce (read as four 32-bit words) and writes the
# 16-byte result to $mac one byte at a time, least significant first.
$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	li	$in0,-4			# final reduction
	dsrl	$in1,$tmp2,2
	and	$in0,$tmp2
	andi	$tmp2,$tmp2,3
	daddu	$in0,$in1

	daddu	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	 daddiu	$in0,$tmp0,5		# compare to modulus
	daddu	$tmp1,$tmp1,$in1
	 sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	 daddu	$in1,$tmp1,$tmp3
	daddu	$tmp2,$tmp2,$tmp4
	 sltu	$tmp3,$in1,$tmp3
	 daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	and	$in0,$tmp2
	and	$in1,$tmp2
	xor	$in0,$tmp0
	xor	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}} else {{{
######################################################################
# 32-bit code path
#
# Context layout used by the generated code (byte offsets from $ctx):
#    0 .. 16	hash value h0 .. h4
#   20 .. 32	clamped key r0 .. r3
#   36 .. 44	s1 .. s3, where sN = rN + (rN >> 2)
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

# Preprocessor prologue plus poly1305_init: zeroes the hash value and,
# unless the key pointer is NULL, loads the 16-byte key (handling
# misalignment and big-endian byte order), clamps it and stores
# r0..r3 and s1..s3. Always returns 0.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt)	mulu	rd,rs,rt
# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
#else
# define multu(rs,rt)	multu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit   poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$tmp0,$inp,3		# $inp % 4
	subu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	sllv	$in0,$in0,$tmp0
	srlv	$tmp3,$in1,$tmp1
	sllv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	srlv	$tmp3,$in2,$tmp1
	sllv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	srlv	$tmp3,$in3,$tmp1
	sllv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	srlv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# else
	srlv	$in0,$in0,$tmp0
	sllv	$tmp3,$in1,$tmp1
	srlv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllv	$tmp3,$in2,$tmp1
	srlv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllv	$tmp3,$in3,$tmp1
	srlv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl	$in0,0+MSB($inp)
	lwl	$in1,4+MSB($inp)
	lwl	$in2,8+MSB($inp)
	lwl	$in3,12+MSB($inp)
	lwr	$in0,0+LSB($inp)
	lwr	$in1,4+LSB($inp)
	lwr	$in2,8+LSB($inp)
	lwr	$in3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$in0,$in0		# byte swap
	wsbh	$in1,$in1
	wsbh	$in2,$in2
	wsbh	$in3,$in3
	rotr	$in0,$in0,16
	rotr	$in1,$in1,16
	rotr	$in2,$in2,16
	rotr	$in3,$in3,16
# else
	srl	$tmp0,$in0,24		# byte swap
	srl	$tmp1,$in0,8
	andi	$tmp2,$in0,0xFF00
	sll	$in0,$in0,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in0,$tmp0
	 srl	$tmp0,$in1,24
	or	$tmp1,$tmp2
	 srl	$tmp2,$in1,8
	or	$in0,$tmp1
	 andi	$tmp1,$in1,0xFF00
	 sll	$in1,$in1,24
	 andi	$tmp2,0xFF00
	 sll	$tmp1,$tmp1,8
	 or	$in1,$tmp0
	srl	$tmp0,$in2,24
	 or	$tmp2,$tmp1
	srl	$tmp1,$in2,8
	 or	$in1,$tmp2
	andi	$tmp2,$in2,0xFF00
	sll	$in2,$in2,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in2,$tmp0
	 srl	$tmp0,$in3,24
	or	$tmp1,$tmp2
	 srl	$tmp2,$in3,8
	or	$in2,$tmp1
	 andi	$tmp1,$in3,0xFF00
	 sll	$in3,$in3,24
	 andi	$tmp2,0xFF00
	 sll	$tmp1,$tmp1,8
	 or	$in3,$tmp0
	 or	$tmp2,$tmp1
	 or	$in3,$tmp2
# endif
#endif
	lui	$tmp0,0x0fff
	ori	$tmp0,0xffff		# 0x0fffffff
	and	$in0,$in0,$tmp0
	subu	$tmp0,3			# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srl	$tmp1,$in1,2
	srl	$tmp2,$in2,2
	srl	$tmp3,$in3,2
	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addu	$in2,$in2,$tmp2
	addu	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$v0,0
	jr	$ra
.end	poly1305_init
___
{
# Bit mask of callee-saved registers stored by the prologue below;
# the NUBI flavours additionally save $s0-$s3.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $t2;		# used on R6
my $one = $t2;		# used on R2

# poly1305_blocks: processes all complete 16-byte blocks of the input
# and stores the updated hash value back into the context. The stack
# frame is 12 words (4*12 bytes), which is what .frame declares so
# that it agrees with the subu/addu on $sp and with the .mask offset.
$code.=<<___;
.globl	poly1305_blocks
.align	5
.ent	poly1305_blocks
poly1305_blocks:
	.frame	$sp,12*4,$ra
	.mask	$SAVED_REGS_MASK,-4
	.set	noreorder
	subu	$sp, $sp,4*12
	sw	$s11,4*11($sp)
	sw	$s10,4*10($sp)
	sw	$s9, 4*9($sp)
	sw	$s8, 4*8($sp)
	sw	$s7, 4*7($sp)
	sw	$s6, 4*6($sp)
	sw	$s5, 4*5($sp)
	sw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sw	$s3, 4*3($sp)
	sw	$s2, 4*2($sp)
	sw	$s1, 4*1($sp)
	sw	$s0, 4*0($sp)
___
$code.=<<___;
	.set	reorder

	srl	$len,4			# number of complete blocks
	li	$one,1
	beqz	$len,.Labort

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$shr,$inp,3
	subu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
#endif

	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	sll	$len,4
	addu	$len,$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6)
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
	beqz	$shr,.Laligned_inp

	lw	$t0,16($inp)
	subu	$t1,$zero,$shr
# ifdef	MIPSEB
	sllv	$d0,$d0,$shr
	srlv	$at,$d1,$t1
	sllv	$d1,$d1,$shr
	or	$d0,$d0,$at
	srlv	$at,$d2,$t1
	sllv	$d2,$d2,$shr
	or	$d1,$d1,$at
	srlv	$at,$d3,$t1
	sllv	$d3,$d3,$shr
	or	$d2,$d2,$at
	srlv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# else
	srlv	$d0,$d0,$shr
	sllv	$at,$d1,$t1
	srlv	$d1,$d1,$shr
	or	$d0,$d0,$at
	sllv	$at,$d2,$t1
	srlv	$d2,$d2,$shr
	or	$d1,$d1,$at
	sllv	$at,$d3,$t1
	srlv	$d3,$d3,$shr
	or	$d2,$d2,$at
	sllv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# endif
.Laligned_inp:
#else
	lwl	$d0,0+MSB($inp)		# load input
	lwl	$d1,4+MSB($inp)
	lwl	$d2,8+MSB($inp)
	lwl	$d3,12+MSB($inp)
	lwr	$d0,0+LSB($inp)
	lwr	$d1,4+LSB($inp)
	lwr	$d2,8+LSB($inp)
	lwr	$d3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$d0,$d0			# byte swap
	wsbh	$d1,$d1
	wsbh	$d2,$d2
	wsbh	$d3,$d3
	rotr	$d0,$d0,16
	rotr	$d1,$d1,16
	rotr	$d2,$d2,16
	rotr	$d3,$d3,16
# else
	srl	$at,$d0,24		# byte swap
	srl	$t0,$d0,8
	andi	$t1,$d0,0xFF00
	sll	$d0,$d0,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d0,$at
	 srl	$at,$d1,24
	or	$t0,$t1
	 srl	$t1,$d1,8
	or	$d0,$t0
	 andi	$t0,$d1,0xFF00
	 sll	$d1,$d1,24
	 andi	$t1,0xFF00
	 sll	$t0,$t0,8
	 or	$d1,$at
	srl	$at,$d2,24
	 or	$t1,$t0
	srl	$t0,$d2,8
	 or	$d1,$t1
	andi	$t1,$d2,0xFF00
	sll	$d2,$d2,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d2,$at
	 srl	$at,$d3,24
	or	$t0,$t1
	 srl	$t1,$d3,8
	or	$d2,$t0
	 andi	$t0,$d3,0xFF00
	 sll	$d3,$d3,24
	 andi	$t1,0xFF00
	 sll	$t0,$t0,8
	 or	$d3,$at
	 or	$t1,$t0
	 or	$d3,$t1
# endif
#endif
	srl	$t0,$h4,2		# modulo-scheduled reduction
	andi	$h4,$h4,3
	sll	$at,$t0,2

	addu	$d0,$d0,$h0		# accumulate input
	 addu	$t0,$t0,$at
	sltu	$h0,$d0,$h0
	addu	$d0,$d0,$t0		# ... and residue
	sltu	$at,$d0,$t0

	addu	$d1,$d1,$h1
	 addu	$h0,$h0,$at		# carry
	sltu	$h1,$d1,$h1
	addu	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addu	$d2,$d2,$h2
	 addu	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addu	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addu	$d3,$d3,$h3
	 addu	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addu	$d3,$d3,$h2

#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
	multu	$r0,$d0			# d0*r0
	 sltu	$h2,$d3,$h2
	maddu	$rs3,$d1		# d1*s3
	 addu	$h3,$h3,$h2		# carry
	maddu	$rs2,$d2		# d2*s2
	 addu	$h4,$h4,$padbit
	maddu	$rs1,$d3		# d3*s1
	 addu	$h4,$h4,$h3
	mfhi	$at
	mflo	$h0

	multu	$r1,$d0			# d0*r1
	maddu	$r0,$d1			# d1*r0
	maddu	$rs3,$d2		# d2*s3
	maddu	$rs2,$d3		# d3*s2
	maddu	$rs1,$h4		# h4*s1
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h1

	multu	$r2,$d0			# d0*r2
	maddu	$r1,$d1			# d1*r1
	maddu	$r0,$d2			# d2*r0
	maddu	$rs3,$d3		# d3*s3
	maddu	$rs2,$h4		# h4*s2
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h2

	mul	$t0,$r0,$h4		# h4*r0

	multu	$r3,$d0			# d0*r3
	maddu	$r2,$d1			# d1*r2
	maddu	$r1,$d2			# d2*r1
	maddu	$r0,$d3			# d3*r0
	maddu	$rs3,$h4		# h4*s3
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h3

	 addiu	$inp,$inp,16

	addu	$h4,$t0,$at
#else
	multu	($r0,$d0)		# d0*r0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	 sltu	$h2,$d3,$h2
	 addu	$h3,$h3,$h2		# carry

	multu	($rs3,$d1)		# d1*s3
	mflo	($at,$rs3,$d1)
	mfhi	($t0,$rs3,$d1)

	 addu	$h4,$h4,$padbit
	 addiu	$inp,$inp,16
	 addu	$h4,$h4,$h3

	multu	($rs2,$d2)		# d2*s2
	mflo	($a3,$rs2,$d2)
	mfhi	($t1,$rs2,$d2)
	 addu	$h0,$h0,$at
	 addu	$h1,$h1,$t0
	multu	($rs1,$d3)		# d3*s1
	 sltu	$at,$h0,$at
	 addu	$h1,$h1,$at

	mflo	($at,$rs1,$d3)
	mfhi	($t0,$rs1,$d3)
	 addu	$h0,$h0,$a3
	 addu	$h1,$h1,$t1
	multu	($r1,$d0)		# d0*r1
	 sltu	$a3,$h0,$a3
	 addu	$h1,$h1,$a3


	mflo	($a3,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	 addu	$h0,$h0,$at
	 addu	$h1,$h1,$t0
	multu	($r0,$d1)		# d1*r0
	 sltu	$at,$h0,$at
	 addu	$h1,$h1,$at

	mflo	($at,$r0,$d1)
	mfhi	($t0,$r0,$d1)
	 addu	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	multu	($rs3,$d2)		# d2*s3
	 addu	$h2,$h2,$a3

	mflo	($a3,$rs3,$d2)
	mfhi	($t1,$rs3,$d2)
	 addu	$h1,$h1,$at
	 addu	$h2,$h2,$t0
	multu	($rs2,$d3)		# d3*s2
	 sltu	$at,$h1,$at
	 addu	$h2,$h2,$at

	mflo	($at,$rs2,$d3)
	mfhi	($t0,$rs2,$d3)
	 addu	$h1,$h1,$a3
	 addu	$h2,$h2,$t1
	multu	($rs1,$h4)		# h4*s1
	 sltu	$a3,$h1,$a3
	 addu	$h2,$h2,$a3

	mflo	($a3,$rs1,$h4)
	 addu	$h1,$h1,$at
	 addu	$h2,$h2,$t0
	multu	($r2,$d0)		# d0*r2
	 sltu	$at,$h1,$at
	 addu	$h2,$h2,$at


	mflo	($at,$r2,$d0)
	mfhi	($h3,$r2,$d0)
	 addu	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	multu	($r1,$d1)		# d1*r1
	 addu	$h2,$h2,$a3

	mflo	($a3,$r1,$d1)
	mfhi	($t1,$r1,$d1)
	 addu	$h2,$h2,$at
	 sltu	$at,$h2,$at
	multu	($r0,$d2)		# d2*r0
	 addu	$h3,$h3,$at

	mflo	($at,$r0,$d2)
	mfhi	($t0,$r0,$d2)
	 addu	$h2,$h2,$a3
	 addu	$h3,$h3,$t1
	multu	($rs3,$d3)		# d3*s3
	 sltu	$a3,$h2,$a3
	 addu	$h3,$h3,$a3

	mflo	($a3,$rs3,$d3)
	mfhi	($t1,$rs3,$d3)
	 addu	$h2,$h2,$at
	 addu	$h3,$h3,$t0
	multu	($rs2,$h4)		# h4*s2
	 sltu	$at,$h2,$at
	 addu	$h3,$h3,$at

	mflo	($at,$rs2,$h4)
	 addu	$h2,$h2,$a3
	 addu	$h3,$h3,$t1
	multu	($r3,$d0)		# d0*r3
	 sltu	$a3,$h2,$a3
	 addu	$h3,$h3,$a3


	mflo	($a3,$r3,$d0)
	mfhi	($t1,$r3,$d0)
	 addu	$h2,$h2,$at
	 sltu	$at,$h2,$at
	multu	($r2,$d1)		# d1*r2
	 addu	$h3,$h3,$at

	mflo	($at,$r2,$d1)
	mfhi	($t0,$r2,$d1)
	 addu	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	multu	($r0,$d3)		# d3*r0
	 addu	$t1,$t1,$a3

	mflo	($a3,$r0,$d3)
	mfhi	($d3,$r0,$d3)
	 addu	$h3,$h3,$at
	 addu	$t1,$t1,$t0
	multu	($r1,$d2)		# d2*r1
	 sltu	$at,$h3,$at
	 addu	$t1,$t1,$at

	mflo	($at,$r1,$d2)
	mfhi	($t0,$r1,$d2)
	 addu	$h3,$h3,$a3
	 addu	$t1,$t1,$d3
	multu	($rs3,$h4)		# h4*s3
	 sltu	$a3,$h3,$a3
	 addu	$t1,$t1,$a3

	mflo	($a3,$rs3,$h4)
	 addu	$h3,$h3,$at
	 addu	$t1,$t1,$t0
	multu	($r0,$h4)		# h4*r0
	 sltu	$at,$h3,$at
	 addu	$t1,$t1,$at


	mflo	($h4,$r0,$h4)
	 addu	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	 addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1		# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

# poly1305_emit: performs the final reduction of the hash value,
# adds the 128-bit nonce (four 32-bit words, with carry propagation)
# and writes the 16-byte result to $mac one byte at a time, least
# significant first. Note that $ctx is reused as a scratch register
# once the hash value has been loaded.
$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	li	$in0,-4			# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	 addiu	$in0,$tmp0,5		# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	 sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	 addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	 sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	 addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	 sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	 addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	 sltu	$ctx,$in3,$tmp3
	 addu	$ctx,$tmp4

	srl	$ctx,2			# see if it carried/borrowed
	subu	$ctx,$zero,$ctx

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}}

# The last remaining command-line argument, if any, names the output
# file. Use 3-arg open so the name is never parsed for mode characters,
# and fail loudly instead of silently writing to the inherited STDOUT.
my $output = pop;
if (defined $output && length $output) {
	open STDOUT, '>', $output
		or die "can't open $output: $!";
}
print $code;
# Buffered write errors (e.g. a full disk) only surface at close time.
close STDOUT or die "error writing output: $!";