#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================

# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc
# R1x000	~5.5/+130%	(big-endian)
# Octeon II	2.50/+70%	(little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling reduction allows omitting the dependency chain at
# the end of the inner loop and improves performance. Also optimize
# MIPS32R2 code path for MIPS 1004K core. Per René von Dorst's
# suggestions.
#
#		IALU/gcc
# R1x000	~9.8/?		(big-endian)
# Octeon II	3.65/+140%	(little-endian)
# MT7621/1004K	4.75/?		(little-endian)
#
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI.
# It appears that if one picks the latter, it's possible to arrange
# code in an ABI-neutral manner. Therefore let's stick to the NUBI
# register layout. Every variable below holds the textual register
# name "\$N" (a dollar sign followed by the register number), ready to
# be interpolated into the assembly templates:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################

# First command-line argument selects the ABI flavour; defaults to "64".
$flavour = shift || "64";	# supported flavours are
# o32,n32,64,nubi32,nubi64

# Register that carries the return value: $a0 for the NUBI flavours,
# $t0 for every other flavour.
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;

if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

# Argument registers used by poly1305_init and poly1305_blocks.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
# Registers for the loaded key/input words and for temporaries.
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

# Preprocessor prelude (ISA level, multiply macros, kernel symbol
# names, endianness) followed by poly1305_init:
#  - zeroes the three 64-bit words at 0/8/16($ctx);
#  - if $inp is non-zero, loads 16 key bytes without requiring
#    alignment (shift/merge on R6, ldl/ldr otherwise), byte-swaps them
#    on big-endian, masks them with 0x0ffffffc0fffffff and
#    0x0ffffffc0ffffffc, and stores r0, r1 and s1 = r1 + (r1 >> 2) at
#    24/32/40($ctx);
#  - returns 0 in $v0.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)		dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit   poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7		# $inp % 8
	dsubu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32		# 0x0000000100000000
	daddiu	$tmp0,-63		# 0x00000000ffffffc1
	dsll	$tmp0,28		# 0x0ffffffc10000000
	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
# Register bitmap for the .mask directive of poly1305_blocks_internal:
# bits 12-17 ($s0-$s5) for the NUBI flavours, bits 16-17 ($s4,$s5)
# otherwise. This matches the prologue below, which always saves
# $s4/$s5 and saves $s0-$s3 only for NUBI.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

# Hash words, key words, s1, and the three input-plus-hash limbs.
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
my ($shr,$shl) = ($s6,$s7);		# used on R6

# poly1305_blocks turns the byte length into a count of 16-byte blocks
# and returns at once when that count is zero; otherwise it branches to
# poly1305_blocks_internal, whose prologue allocates the frame and
# saves the $s registers it uses (two more, $s6/$s7, on R6).
$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
# Body of poly1305_blocks_internal (64-bit path):
#  - loads the hash words from 0/8/16($ctx) and r0, r1, s1 from
#    24/32/40($ctx);
#  - turns $len (a block count on entry) into the end-of-buffer address;
#  - each .Loop iteration loads 16 input bytes (alignment-safe,
#    byte-swapped on big-endian), folds the bits of h2 above the low
#    two back into the low limb, adds the input and $padbit to the
#    hash, multiplies by the key, and advances $inp by 16 until it
#    reaches the end of the buffer;
#  - stores the hash back, restores the saved registers and pops the
#    frame in the delay slot of the return.
$code.=<<___;
	.set	reorder

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$shr,$inp,7
	dsubu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
	subu	$shl,$zero,$shr
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	dsll	$len,4
	daddu	$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
# ifdef	MIPSEB
	dsllv	$in0,$in0,$shr
	dsrlv	$tmp3,$in1,$shl
	dsllv	$in1,$in1,$shr
	dsrlv	$tmp2,$tmp2,$shl
# else
	dsrlv	$in0,$in0,$shr
	dsllv	$tmp3,$in1,$shl
	dsrlv	$in1,$in1,$shr
	dsllv	$tmp2,$tmp2,$shl
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_inp:
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
	andi	$h2,$h2,3
	dsll	$tmp0,$tmp1,2

	daddu	$d0,$h0,$in0		# accumulate input
	daddu	$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	daddu	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	daddu	$d1,$h1,$in1
	daddu	$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	daddu	$d1,$tmp0

	dmultu	($r0,$d0)		# h0*r0
	daddu	$d2,$h2,$padbit
	sltu	$tmp0,$d1,$tmp0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	dmultu	($rs1,$d1)		# h1*5*r1
	daddu	$d2,$tmp1
	daddu	$d2,$tmp0
	mflo	($tmp0,$rs1,$d1)
	mfhi	($tmp1,$rs1,$d1)

	dmultu	($r1,$d0)		# h0*r1
	mflo	($tmp2,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	daddu	$h0,$tmp0
	daddu	$h1,$tmp1
	sltu	$tmp0,$h0,$tmp0

	dmultu	($r0,$d1)		# h1*r0
	daddu	$h1,$tmp0
	daddu	$h1,$tmp2
	mflo	($tmp0,$r0,$d1)
	mfhi	($tmp1,$r0,$d1)

	dmultu	($rs1,$d2)		# h2*5*r1
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	mflo	($tmp2,$rs1,$d2)

	dmultu	($r0,$d2)		# h2*r0
	daddu	$h1,$tmp0
	daddu	$h2,$tmp1
	mflo	($tmp3,$r0,$d2)
	sltu	$tmp0,$h1,$tmp0
	daddu	$h2,$tmp0

	daddu	$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	daddu	$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$s7,56($sp)
	ld	$s6,48($sp)
#endif
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
}
{
# Argument registers of poly1305_emit.
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

# poly1305_emit (64-bit path):
#  - loads the three hash words from 0/8/16($ctx) and folds the bits of
#    the top word above its low two back into the low word;
#  - adds 5 and uses the resulting carry out of bit 130 to select,
#    without branching, between the hash and the hash plus 5;
#  - adds the nonce, read as four 32-bit words from $nonce;
#  - writes the 16 result bytes to $mac one byte at a time, least
#    significant byte first, so $mac needs no alignment.
$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	li	$in0,-4			# final reduction
	dsrl	$in1,$tmp2,2
	and	$in0,$tmp2
	andi	$tmp2,$tmp2,3
	daddu	$in0,$in1

	daddu	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	daddiu	$in0,$tmp0,5		# compare to modulus
	daddu	$tmp1,$tmp1,$in1
	sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	daddu	$in1,$tmp1,$tmp3
	daddu	$tmp2,$tmp2,$tmp4
	sltu	$tmp3,$in1,$tmp3
	daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	and	$in0,$tmp2
	and	$in1,$tmp2
	xor	$in0,$tmp0
	xor	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}} else {{{
######################################################################
# 32-bit code path
#

# Argument registers used by poly1305_init and poly1305_blocks.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
# Registers for the four loaded 32-bit key/input words and temporaries.
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

# Preprocessor prelude (ISA level, multiply macros, kernel symbol
# names, endianness) followed by the 32-bit poly1305_init.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt)	mulu	rd,rs,rt
# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
#else
# define multu(rs,rt)	multu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit   poly1305_emit_mips
#endif
#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sw	$zero,0($ctx)		# clear the first five context words
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$tmp0,$inp,3		# $inp % 4
	subu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	sllv	$in0,$in0,$tmp0
	srlv	$tmp3,$in1,$tmp1
	sllv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	srlv	$tmp3,$in2,$tmp1
	sllv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	srlv	$tmp3,$in3,$tmp1
	sllv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	srlv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# else
	srlv	$in0,$in0,$tmp0
	sllv	$tmp3,$in1,$tmp1
	srlv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllv	$tmp3,$in2,$tmp1
	srlv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllv	$tmp3,$in3,$tmp1
	srlv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl	$in0,0+MSB($inp)
	lwl	$in1,4+MSB($inp)
	lwl	$in2,8+MSB($inp)
	lwl	$in3,12+MSB($inp)
	lwr	$in0,0+LSB($inp)
	lwr	$in1,4+LSB($inp)
	lwr	$in2,8+LSB($inp)
	lwr	$in3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$in0,$in0		# byte swap
	wsbh	$in1,$in1
	wsbh	$in2,$in2
	wsbh	$in3,$in3
	rotr	$in0,$in0,16
	rotr	$in1,$in1,16
	rotr	$in2,$in2,16
	rotr	$in3,$in3,16
# else
	srl	$tmp0,$in0,24		# byte swap
	srl	$tmp1,$in0,8
	andi	$tmp2,$in0,0xFF00
	sll	$in0,$in0,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in0,$tmp0
	srl	$tmp0,$in1,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in1,8
	or	$in0,$tmp1
	andi	$tmp1,$in1,0xFF00
	sll	$in1,$in1,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in1,$tmp0
	srl	$tmp0,$in2,24
	or	$tmp2,$tmp1
	srl	$tmp1,$in2,8
	or	$in1,$tmp2
	andi	$tmp2,$in2,0xFF00
	sll	$in2,$in2,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in2,$tmp0
	srl	$tmp0,$in3,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in3,8
	or	$in2,$tmp1
	andi	$tmp1,$in3,0xFF00
	sll	$in3,$in3,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in3,$tmp0
	or	$tmp2,$tmp1
	or	$in3,$tmp2
# endif
#endif
	lui	$tmp0,0x0fff
	ori	$tmp0,0xffff		# 0x0fffffff
	and	$in0,$in0,$tmp0
	subu	$tmp0,3			# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)		# store the four masked key words
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srl	$tmp1,$in1,2
	srl	$tmp2,$in2,2
	srl	$tmp3,$in3,2
	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addu	$in2,$in2,$tmp2
	addu	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
# Register bitmap handed to the .mask directive of the 32-bit
# poly1305_blocks; the value depends on whether the flavour is NUBI.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ?
"0x00fff000" : "0x00ff0000"; 732*a11d055eSArd Biesheuvel 733*a11d055eSArd Biesheuvelmy ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = 734*a11d055eSArd Biesheuvel ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); 735*a11d055eSArd Biesheuvelmy ($d0,$d1,$d2,$d3) = 736*a11d055eSArd Biesheuvel ($a4,$a5,$a6,$a7); 737*a11d055eSArd Biesheuvelmy $shr = $t2; # used on R6 738*a11d055eSArd Biesheuvelmy $one = $t2; # used on R2 739*a11d055eSArd Biesheuvel 740*a11d055eSArd Biesheuvel$code.=<<___; 741*a11d055eSArd Biesheuvel.globl poly1305_blocks 742*a11d055eSArd Biesheuvel.align 5 743*a11d055eSArd Biesheuvel.ent poly1305_blocks 744*a11d055eSArd Biesheuvelpoly1305_blocks: 745*a11d055eSArd Biesheuvel .frame $sp,16*4,$ra 746*a11d055eSArd Biesheuvel .mask $SAVED_REGS_MASK,-4 747*a11d055eSArd Biesheuvel .set noreorder 748*a11d055eSArd Biesheuvel subu $sp, $sp,4*12 749*a11d055eSArd Biesheuvel sw $s11,4*11($sp) 750*a11d055eSArd Biesheuvel sw $s10,4*10($sp) 751*a11d055eSArd Biesheuvel sw $s9, 4*9($sp) 752*a11d055eSArd Biesheuvel sw $s8, 4*8($sp) 753*a11d055eSArd Biesheuvel sw $s7, 4*7($sp) 754*a11d055eSArd Biesheuvel sw $s6, 4*6($sp) 755*a11d055eSArd Biesheuvel sw $s5, 4*5($sp) 756*a11d055eSArd Biesheuvel sw $s4, 4*4($sp) 757*a11d055eSArd Biesheuvel___ 758*a11d055eSArd Biesheuvel$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue 759*a11d055eSArd Biesheuvel sw $s3, 4*3($sp) 760*a11d055eSArd Biesheuvel sw $s2, 4*2($sp) 761*a11d055eSArd Biesheuvel sw $s1, 4*1($sp) 762*a11d055eSArd Biesheuvel sw $s0, 4*0($sp) 763*a11d055eSArd Biesheuvel___ 764*a11d055eSArd Biesheuvel$code.=<<___; 765*a11d055eSArd Biesheuvel .set reorder 766*a11d055eSArd Biesheuvel 767*a11d055eSArd Biesheuvel srl $len,4 # number of complete blocks 768*a11d055eSArd Biesheuvel li $one,1 769*a11d055eSArd Biesheuvel beqz $len,.Labort 770*a11d055eSArd Biesheuvel 771*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS32R6) 772*a11d055eSArd Biesheuvel andi $shr,$inp,3 773*a11d055eSArd Biesheuvel subu 
$inp,$inp,$shr # align $inp 774*a11d055eSArd Biesheuvel sll $shr,$shr,3 # byte to bit offset 775*a11d055eSArd Biesheuvel#endif 776*a11d055eSArd Biesheuvel 777*a11d055eSArd Biesheuvel lw $h0,0($ctx) # load hash value 778*a11d055eSArd Biesheuvel lw $h1,4($ctx) 779*a11d055eSArd Biesheuvel lw $h2,8($ctx) 780*a11d055eSArd Biesheuvel lw $h3,12($ctx) 781*a11d055eSArd Biesheuvel lw $h4,16($ctx) 782*a11d055eSArd Biesheuvel 783*a11d055eSArd Biesheuvel lw $r0,20($ctx) # load key 784*a11d055eSArd Biesheuvel lw $r1,24($ctx) 785*a11d055eSArd Biesheuvel lw $r2,28($ctx) 786*a11d055eSArd Biesheuvel lw $r3,32($ctx) 787*a11d055eSArd Biesheuvel lw $rs1,36($ctx) 788*a11d055eSArd Biesheuvel lw $rs2,40($ctx) 789*a11d055eSArd Biesheuvel lw $rs3,44($ctx) 790*a11d055eSArd Biesheuvel 791*a11d055eSArd Biesheuvel sll $len,4 792*a11d055eSArd Biesheuvel addu $len,$len,$inp # end of buffer 793*a11d055eSArd Biesheuvel b .Loop 794*a11d055eSArd Biesheuvel 795*a11d055eSArd Biesheuvel.align 4 796*a11d055eSArd Biesheuvel.Loop: 797*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS32R6) 798*a11d055eSArd Biesheuvel lw $d0,0($inp) # load input 799*a11d055eSArd Biesheuvel lw $d1,4($inp) 800*a11d055eSArd Biesheuvel lw $d2,8($inp) 801*a11d055eSArd Biesheuvel lw $d3,12($inp) 802*a11d055eSArd Biesheuvel beqz $shr,.Laligned_inp 803*a11d055eSArd Biesheuvel 804*a11d055eSArd Biesheuvel lw $t0,16($inp) 805*a11d055eSArd Biesheuvel subu $t1,$zero,$shr 806*a11d055eSArd Biesheuvel# ifdef MIPSEB 807*a11d055eSArd Biesheuvel sllv $d0,$d0,$shr 808*a11d055eSArd Biesheuvel srlv $at,$d1,$t1 809*a11d055eSArd Biesheuvel sllv $d1,$d1,$shr 810*a11d055eSArd Biesheuvel or $d0,$d0,$at 811*a11d055eSArd Biesheuvel srlv $at,$d2,$t1 812*a11d055eSArd Biesheuvel sllv $d2,$d2,$shr 813*a11d055eSArd Biesheuvel or $d1,$d1,$at 814*a11d055eSArd Biesheuvel srlv $at,$d3,$t1 815*a11d055eSArd Biesheuvel sllv $d3,$d3,$shr 816*a11d055eSArd Biesheuvel or $d2,$d2,$at 817*a11d055eSArd Biesheuvel srlv $t0,$t0,$t1 818*a11d055eSArd Biesheuvel or 
$d3,$d3,$t0 819*a11d055eSArd Biesheuvel# else 820*a11d055eSArd Biesheuvel srlv $d0,$d0,$shr 821*a11d055eSArd Biesheuvel sllv $at,$d1,$t1 822*a11d055eSArd Biesheuvel srlv $d1,$d1,$shr 823*a11d055eSArd Biesheuvel or $d0,$d0,$at 824*a11d055eSArd Biesheuvel sllv $at,$d2,$t1 825*a11d055eSArd Biesheuvel srlv $d2,$d2,$shr 826*a11d055eSArd Biesheuvel or $d1,$d1,$at 827*a11d055eSArd Biesheuvel sllv $at,$d3,$t1 828*a11d055eSArd Biesheuvel srlv $d3,$d3,$shr 829*a11d055eSArd Biesheuvel or $d2,$d2,$at 830*a11d055eSArd Biesheuvel sllv $t0,$t0,$t1 831*a11d055eSArd Biesheuvel or $d3,$d3,$t0 832*a11d055eSArd Biesheuvel# endif 833*a11d055eSArd Biesheuvel.Laligned_inp: 834*a11d055eSArd Biesheuvel#else 835*a11d055eSArd Biesheuvel lwl $d0,0+MSB($inp) # load input 836*a11d055eSArd Biesheuvel lwl $d1,4+MSB($inp) 837*a11d055eSArd Biesheuvel lwl $d2,8+MSB($inp) 838*a11d055eSArd Biesheuvel lwl $d3,12+MSB($inp) 839*a11d055eSArd Biesheuvel lwr $d0,0+LSB($inp) 840*a11d055eSArd Biesheuvel lwr $d1,4+LSB($inp) 841*a11d055eSArd Biesheuvel lwr $d2,8+LSB($inp) 842*a11d055eSArd Biesheuvel lwr $d3,12+LSB($inp) 843*a11d055eSArd Biesheuvel#endif 844*a11d055eSArd Biesheuvel#ifdef MIPSEB 845*a11d055eSArd Biesheuvel# if defined(_MIPS_ARCH_MIPS32R2) 846*a11d055eSArd Biesheuvel wsbh $d0,$d0 # byte swap 847*a11d055eSArd Biesheuvel wsbh $d1,$d1 848*a11d055eSArd Biesheuvel wsbh $d2,$d2 849*a11d055eSArd Biesheuvel wsbh $d3,$d3 850*a11d055eSArd Biesheuvel rotr $d0,$d0,16 851*a11d055eSArd Biesheuvel rotr $d1,$d1,16 852*a11d055eSArd Biesheuvel rotr $d2,$d2,16 853*a11d055eSArd Biesheuvel rotr $d3,$d3,16 854*a11d055eSArd Biesheuvel# else 855*a11d055eSArd Biesheuvel srl $at,$d0,24 # byte swap 856*a11d055eSArd Biesheuvel srl $t0,$d0,8 857*a11d055eSArd Biesheuvel andi $t1,$d0,0xFF00 858*a11d055eSArd Biesheuvel sll $d0,$d0,24 859*a11d055eSArd Biesheuvel andi $t0,0xFF00 860*a11d055eSArd Biesheuvel sll $t1,$t1,8 861*a11d055eSArd Biesheuvel or $d0,$at 862*a11d055eSArd Biesheuvel srl $at,$d1,24 863*a11d055eSArd Biesheuvel or 
$t0,$t1 864*a11d055eSArd Biesheuvel srl $t1,$d1,8 865*a11d055eSArd Biesheuvel or $d0,$t0 866*a11d055eSArd Biesheuvel andi $t0,$d1,0xFF00 867*a11d055eSArd Biesheuvel sll $d1,$d1,24 868*a11d055eSArd Biesheuvel andi $t1,0xFF00 869*a11d055eSArd Biesheuvel sll $t0,$t0,8 870*a11d055eSArd Biesheuvel or $d1,$at 871*a11d055eSArd Biesheuvel srl $at,$d2,24 872*a11d055eSArd Biesheuvel or $t1,$t0 873*a11d055eSArd Biesheuvel srl $t0,$d2,8 874*a11d055eSArd Biesheuvel or $d1,$t1 875*a11d055eSArd Biesheuvel andi $t1,$d2,0xFF00 876*a11d055eSArd Biesheuvel sll $d2,$d2,24 877*a11d055eSArd Biesheuvel andi $t0,0xFF00 878*a11d055eSArd Biesheuvel sll $t1,$t1,8 879*a11d055eSArd Biesheuvel or $d2,$at 880*a11d055eSArd Biesheuvel srl $at,$d3,24 881*a11d055eSArd Biesheuvel or $t0,$t1 882*a11d055eSArd Biesheuvel srl $t1,$d3,8 883*a11d055eSArd Biesheuvel or $d2,$t0 884*a11d055eSArd Biesheuvel andi $t0,$d3,0xFF00 885*a11d055eSArd Biesheuvel sll $d3,$d3,24 886*a11d055eSArd Biesheuvel andi $t1,0xFF00 887*a11d055eSArd Biesheuvel sll $t0,$t0,8 888*a11d055eSArd Biesheuvel or $d3,$at 889*a11d055eSArd Biesheuvel or $t1,$t0 890*a11d055eSArd Biesheuvel or $d3,$t1 891*a11d055eSArd Biesheuvel# endif 892*a11d055eSArd Biesheuvel#endif 893*a11d055eSArd Biesheuvel srl $t0,$h4,2 # modulo-scheduled reduction 894*a11d055eSArd Biesheuvel andi $h4,$h4,3 895*a11d055eSArd Biesheuvel sll $at,$t0,2 896*a11d055eSArd Biesheuvel 897*a11d055eSArd Biesheuvel addu $d0,$d0,$h0 # accumulate input 898*a11d055eSArd Biesheuvel addu $t0,$t0,$at 899*a11d055eSArd Biesheuvel sltu $h0,$d0,$h0 900*a11d055eSArd Biesheuvel addu $d0,$d0,$t0 # ... 
and residue 901*a11d055eSArd Biesheuvel sltu $at,$d0,$t0 902*a11d055eSArd Biesheuvel 903*a11d055eSArd Biesheuvel addu $d1,$d1,$h1 904*a11d055eSArd Biesheuvel addu $h0,$h0,$at # carry 905*a11d055eSArd Biesheuvel sltu $h1,$d1,$h1 906*a11d055eSArd Biesheuvel addu $d1,$d1,$h0 907*a11d055eSArd Biesheuvel sltu $h0,$d1,$h0 908*a11d055eSArd Biesheuvel 909*a11d055eSArd Biesheuvel addu $d2,$d2,$h2 910*a11d055eSArd Biesheuvel addu $h1,$h1,$h0 # carry 911*a11d055eSArd Biesheuvel sltu $h2,$d2,$h2 912*a11d055eSArd Biesheuvel addu $d2,$d2,$h1 913*a11d055eSArd Biesheuvel sltu $h1,$d2,$h1 914*a11d055eSArd Biesheuvel 915*a11d055eSArd Biesheuvel addu $d3,$d3,$h3 916*a11d055eSArd Biesheuvel addu $h2,$h2,$h1 # carry 917*a11d055eSArd Biesheuvel sltu $h3,$d3,$h3 918*a11d055eSArd Biesheuvel addu $d3,$d3,$h2 919*a11d055eSArd Biesheuvel 920*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) 921*a11d055eSArd Biesheuvel multu $r0,$d0 # d0*r0 922*a11d055eSArd Biesheuvel sltu $h2,$d3,$h2 923*a11d055eSArd Biesheuvel maddu $rs3,$d1 # d1*s3 924*a11d055eSArd Biesheuvel addu $h3,$h3,$h2 # carry 925*a11d055eSArd Biesheuvel maddu $rs2,$d2 # d2*s2 926*a11d055eSArd Biesheuvel addu $h4,$h4,$padbit 927*a11d055eSArd Biesheuvel maddu $rs1,$d3 # d3*s1 928*a11d055eSArd Biesheuvel addu $h4,$h4,$h3 929*a11d055eSArd Biesheuvel mfhi $at 930*a11d055eSArd Biesheuvel mflo $h0 931*a11d055eSArd Biesheuvel 932*a11d055eSArd Biesheuvel multu $r1,$d0 # d0*r1 933*a11d055eSArd Biesheuvel maddu $r0,$d1 # d1*r0 934*a11d055eSArd Biesheuvel maddu $rs3,$d2 # d2*s3 935*a11d055eSArd Biesheuvel maddu $rs2,$d3 # d3*s2 936*a11d055eSArd Biesheuvel maddu $rs1,$h4 # h4*s1 937*a11d055eSArd Biesheuvel maddu $at,$one # hi*1 938*a11d055eSArd Biesheuvel mfhi $at 939*a11d055eSArd Biesheuvel mflo $h1 940*a11d055eSArd Biesheuvel 941*a11d055eSArd Biesheuvel multu $r2,$d0 # d0*r2 942*a11d055eSArd Biesheuvel maddu $r1,$d1 # d1*r1 943*a11d055eSArd Biesheuvel maddu $r0,$d2 # d2*r0 944*a11d055eSArd Biesheuvel 
maddu $rs3,$d3 # d3*s3 945*a11d055eSArd Biesheuvel maddu $rs2,$h4 # h4*s2 946*a11d055eSArd Biesheuvel maddu $at,$one # hi*1 947*a11d055eSArd Biesheuvel mfhi $at 948*a11d055eSArd Biesheuvel mflo $h2 949*a11d055eSArd Biesheuvel 950*a11d055eSArd Biesheuvel mul $t0,$r0,$h4 # h4*r0 951*a11d055eSArd Biesheuvel 952*a11d055eSArd Biesheuvel multu $r3,$d0 # d0*r3 953*a11d055eSArd Biesheuvel maddu $r2,$d1 # d1*r2 954*a11d055eSArd Biesheuvel maddu $r1,$d2 # d2*r1 955*a11d055eSArd Biesheuvel maddu $r0,$d3 # d3*r0 956*a11d055eSArd Biesheuvel maddu $rs3,$h4 # h4*s3 957*a11d055eSArd Biesheuvel maddu $at,$one # hi*1 958*a11d055eSArd Biesheuvel mfhi $at 959*a11d055eSArd Biesheuvel mflo $h3 960*a11d055eSArd Biesheuvel 961*a11d055eSArd Biesheuvel addiu $inp,$inp,16 962*a11d055eSArd Biesheuvel 963*a11d055eSArd Biesheuvel addu $h4,$t0,$at 964*a11d055eSArd Biesheuvel#else 965*a11d055eSArd Biesheuvel multu ($r0,$d0) # d0*r0 966*a11d055eSArd Biesheuvel mflo ($h0,$r0,$d0) 967*a11d055eSArd Biesheuvel mfhi ($h1,$r0,$d0) 968*a11d055eSArd Biesheuvel 969*a11d055eSArd Biesheuvel sltu $h2,$d3,$h2 970*a11d055eSArd Biesheuvel addu $h3,$h3,$h2 # carry 971*a11d055eSArd Biesheuvel 972*a11d055eSArd Biesheuvel multu ($rs3,$d1) # d1*s3 973*a11d055eSArd Biesheuvel mflo ($at,$rs3,$d1) 974*a11d055eSArd Biesheuvel mfhi ($t0,$rs3,$d1) 975*a11d055eSArd Biesheuvel 976*a11d055eSArd Biesheuvel addu $h4,$h4,$padbit 977*a11d055eSArd Biesheuvel addiu $inp,$inp,16 978*a11d055eSArd Biesheuvel addu $h4,$h4,$h3 979*a11d055eSArd Biesheuvel 980*a11d055eSArd Biesheuvel multu ($rs2,$d2) # d2*s2 981*a11d055eSArd Biesheuvel mflo ($a3,$rs2,$d2) 982*a11d055eSArd Biesheuvel mfhi ($t1,$rs2,$d2) 983*a11d055eSArd Biesheuvel addu $h0,$h0,$at 984*a11d055eSArd Biesheuvel addu $h1,$h1,$t0 985*a11d055eSArd Biesheuvel multu ($rs1,$d3) # d3*s1 986*a11d055eSArd Biesheuvel sltu $at,$h0,$at 987*a11d055eSArd Biesheuvel addu $h1,$h1,$at 988*a11d055eSArd Biesheuvel 989*a11d055eSArd Biesheuvel mflo ($at,$rs1,$d3) 990*a11d055eSArd Biesheuvel mfhi 
($t0,$rs1,$d3) 991*a11d055eSArd Biesheuvel addu $h0,$h0,$a3 992*a11d055eSArd Biesheuvel addu $h1,$h1,$t1 993*a11d055eSArd Biesheuvel multu ($r1,$d0) # d0*r1 994*a11d055eSArd Biesheuvel sltu $a3,$h0,$a3 995*a11d055eSArd Biesheuvel addu $h1,$h1,$a3 996*a11d055eSArd Biesheuvel 997*a11d055eSArd Biesheuvel 998*a11d055eSArd Biesheuvel mflo ($a3,$r1,$d0) 999*a11d055eSArd Biesheuvel mfhi ($h2,$r1,$d0) 1000*a11d055eSArd Biesheuvel addu $h0,$h0,$at 1001*a11d055eSArd Biesheuvel addu $h1,$h1,$t0 1002*a11d055eSArd Biesheuvel multu ($r0,$d1) # d1*r0 1003*a11d055eSArd Biesheuvel sltu $at,$h0,$at 1004*a11d055eSArd Biesheuvel addu $h1,$h1,$at 1005*a11d055eSArd Biesheuvel 1006*a11d055eSArd Biesheuvel mflo ($at,$r0,$d1) 1007*a11d055eSArd Biesheuvel mfhi ($t0,$r0,$d1) 1008*a11d055eSArd Biesheuvel addu $h1,$h1,$a3 1009*a11d055eSArd Biesheuvel sltu $a3,$h1,$a3 1010*a11d055eSArd Biesheuvel multu ($rs3,$d2) # d2*s3 1011*a11d055eSArd Biesheuvel addu $h2,$h2,$a3 1012*a11d055eSArd Biesheuvel 1013*a11d055eSArd Biesheuvel mflo ($a3,$rs3,$d2) 1014*a11d055eSArd Biesheuvel mfhi ($t1,$rs3,$d2) 1015*a11d055eSArd Biesheuvel addu $h1,$h1,$at 1016*a11d055eSArd Biesheuvel addu $h2,$h2,$t0 1017*a11d055eSArd Biesheuvel multu ($rs2,$d3) # d3*s2 1018*a11d055eSArd Biesheuvel sltu $at,$h1,$at 1019*a11d055eSArd Biesheuvel addu $h2,$h2,$at 1020*a11d055eSArd Biesheuvel 1021*a11d055eSArd Biesheuvel mflo ($at,$rs2,$d3) 1022*a11d055eSArd Biesheuvel mfhi ($t0,$rs2,$d3) 1023*a11d055eSArd Biesheuvel addu $h1,$h1,$a3 1024*a11d055eSArd Biesheuvel addu $h2,$h2,$t1 1025*a11d055eSArd Biesheuvel multu ($rs1,$h4) # h4*s1 1026*a11d055eSArd Biesheuvel sltu $a3,$h1,$a3 1027*a11d055eSArd Biesheuvel addu $h2,$h2,$a3 1028*a11d055eSArd Biesheuvel 1029*a11d055eSArd Biesheuvel mflo ($a3,$rs1,$h4) 1030*a11d055eSArd Biesheuvel addu $h1,$h1,$at 1031*a11d055eSArd Biesheuvel addu $h2,$h2,$t0 1032*a11d055eSArd Biesheuvel multu ($r2,$d0) # d0*r2 1033*a11d055eSArd Biesheuvel sltu $at,$h1,$at 1034*a11d055eSArd Biesheuvel addu $h2,$h2,$at 
1035*a11d055eSArd Biesheuvel 1036*a11d055eSArd Biesheuvel 1037*a11d055eSArd Biesheuvel mflo ($at,$r2,$d0) 1038*a11d055eSArd Biesheuvel mfhi ($h3,$r2,$d0) 1039*a11d055eSArd Biesheuvel addu $h1,$h1,$a3 1040*a11d055eSArd Biesheuvel sltu $a3,$h1,$a3 1041*a11d055eSArd Biesheuvel multu ($r1,$d1) # d1*r1 1042*a11d055eSArd Biesheuvel addu $h2,$h2,$a3 1043*a11d055eSArd Biesheuvel 1044*a11d055eSArd Biesheuvel mflo ($a3,$r1,$d1) 1045*a11d055eSArd Biesheuvel mfhi ($t1,$r1,$d1) 1046*a11d055eSArd Biesheuvel addu $h2,$h2,$at 1047*a11d055eSArd Biesheuvel sltu $at,$h2,$at 1048*a11d055eSArd Biesheuvel multu ($r0,$d2) # d2*r0 1049*a11d055eSArd Biesheuvel addu $h3,$h3,$at 1050*a11d055eSArd Biesheuvel 1051*a11d055eSArd Biesheuvel mflo ($at,$r0,$d2) 1052*a11d055eSArd Biesheuvel mfhi ($t0,$r0,$d2) 1053*a11d055eSArd Biesheuvel addu $h2,$h2,$a3 1054*a11d055eSArd Biesheuvel addu $h3,$h3,$t1 1055*a11d055eSArd Biesheuvel multu ($rs3,$d3) # d3*s3 1056*a11d055eSArd Biesheuvel sltu $a3,$h2,$a3 1057*a11d055eSArd Biesheuvel addu $h3,$h3,$a3 1058*a11d055eSArd Biesheuvel 1059*a11d055eSArd Biesheuvel mflo ($a3,$rs3,$d3) 1060*a11d055eSArd Biesheuvel mfhi ($t1,$rs3,$d3) 1061*a11d055eSArd Biesheuvel addu $h2,$h2,$at 1062*a11d055eSArd Biesheuvel addu $h3,$h3,$t0 1063*a11d055eSArd Biesheuvel multu ($rs2,$h4) # h4*s2 1064*a11d055eSArd Biesheuvel sltu $at,$h2,$at 1065*a11d055eSArd Biesheuvel addu $h3,$h3,$at 1066*a11d055eSArd Biesheuvel 1067*a11d055eSArd Biesheuvel mflo ($at,$rs2,$h4) 1068*a11d055eSArd Biesheuvel addu $h2,$h2,$a3 1069*a11d055eSArd Biesheuvel addu $h3,$h3,$t1 1070*a11d055eSArd Biesheuvel multu ($r3,$d0) # d0*r3 1071*a11d055eSArd Biesheuvel sltu $a3,$h2,$a3 1072*a11d055eSArd Biesheuvel addu $h3,$h3,$a3 1073*a11d055eSArd Biesheuvel 1074*a11d055eSArd Biesheuvel 1075*a11d055eSArd Biesheuvel mflo ($a3,$r3,$d0) 1076*a11d055eSArd Biesheuvel mfhi ($t1,$r3,$d0) 1077*a11d055eSArd Biesheuvel addu $h2,$h2,$at 1078*a11d055eSArd Biesheuvel sltu $at,$h2,$at 1079*a11d055eSArd Biesheuvel multu ($r2,$d1) # 
d1*r2 1080*a11d055eSArd Biesheuvel addu $h3,$h3,$at 1081*a11d055eSArd Biesheuvel 1082*a11d055eSArd Biesheuvel mflo ($at,$r2,$d1) 1083*a11d055eSArd Biesheuvel mfhi ($t0,$r2,$d1) 1084*a11d055eSArd Biesheuvel addu $h3,$h3,$a3 1085*a11d055eSArd Biesheuvel sltu $a3,$h3,$a3 1086*a11d055eSArd Biesheuvel multu ($r0,$d3) # d3*r0 1087*a11d055eSArd Biesheuvel addu $t1,$t1,$a3 1088*a11d055eSArd Biesheuvel 1089*a11d055eSArd Biesheuvel mflo ($a3,$r0,$d3) 1090*a11d055eSArd Biesheuvel mfhi ($d3,$r0,$d3) 1091*a11d055eSArd Biesheuvel addu $h3,$h3,$at 1092*a11d055eSArd Biesheuvel addu $t1,$t1,$t0 1093*a11d055eSArd Biesheuvel multu ($r1,$d2) # d2*r1 1094*a11d055eSArd Biesheuvel sltu $at,$h3,$at 1095*a11d055eSArd Biesheuvel addu $t1,$t1,$at 1096*a11d055eSArd Biesheuvel 1097*a11d055eSArd Biesheuvel mflo ($at,$r1,$d2) 1098*a11d055eSArd Biesheuvel mfhi ($t0,$r1,$d2) 1099*a11d055eSArd Biesheuvel addu $h3,$h3,$a3 1100*a11d055eSArd Biesheuvel addu $t1,$t1,$d3 1101*a11d055eSArd Biesheuvel multu ($rs3,$h4) # h4*s3 1102*a11d055eSArd Biesheuvel sltu $a3,$h3,$a3 1103*a11d055eSArd Biesheuvel addu $t1,$t1,$a3 1104*a11d055eSArd Biesheuvel 1105*a11d055eSArd Biesheuvel mflo ($a3,$rs3,$h4) 1106*a11d055eSArd Biesheuvel addu $h3,$h3,$at 1107*a11d055eSArd Biesheuvel addu $t1,$t1,$t0 1108*a11d055eSArd Biesheuvel multu ($r0,$h4) # h4*r0 1109*a11d055eSArd Biesheuvel sltu $at,$h3,$at 1110*a11d055eSArd Biesheuvel addu $t1,$t1,$at 1111*a11d055eSArd Biesheuvel 1112*a11d055eSArd Biesheuvel 1113*a11d055eSArd Biesheuvel mflo ($h4,$r0,$h4) 1114*a11d055eSArd Biesheuvel addu $h3,$h3,$a3 1115*a11d055eSArd Biesheuvel sltu $a3,$h3,$a3 1116*a11d055eSArd Biesheuvel addu $t1,$t1,$a3 1117*a11d055eSArd Biesheuvel addu $h4,$h4,$t1 1118*a11d055eSArd Biesheuvel 1119*a11d055eSArd Biesheuvel li $padbit,1 # if we loop, padbit is 1 1120*a11d055eSArd Biesheuvel#endif 1121*a11d055eSArd Biesheuvel bne $inp,$len,.Loop 1122*a11d055eSArd Biesheuvel 1123*a11d055eSArd Biesheuvel sw $h0,0($ctx) # store hash value 1124*a11d055eSArd Biesheuvel 
sw $h1,4($ctx) 1125*a11d055eSArd Biesheuvel sw $h2,8($ctx) 1126*a11d055eSArd Biesheuvel sw $h3,12($ctx) 1127*a11d055eSArd Biesheuvel sw $h4,16($ctx) 1128*a11d055eSArd Biesheuvel 1129*a11d055eSArd Biesheuvel .set noreorder 1130*a11d055eSArd Biesheuvel.Labort: 1131*a11d055eSArd Biesheuvel lw $s11,4*11($sp) 1132*a11d055eSArd Biesheuvel lw $s10,4*10($sp) 1133*a11d055eSArd Biesheuvel lw $s9, 4*9($sp) 1134*a11d055eSArd Biesheuvel lw $s8, 4*8($sp) 1135*a11d055eSArd Biesheuvel lw $s7, 4*7($sp) 1136*a11d055eSArd Biesheuvel lw $s6, 4*6($sp) 1137*a11d055eSArd Biesheuvel lw $s5, 4*5($sp) 1138*a11d055eSArd Biesheuvel lw $s4, 4*4($sp) 1139*a11d055eSArd Biesheuvel___ 1140*a11d055eSArd Biesheuvel$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue 1141*a11d055eSArd Biesheuvel lw $s3, 4*3($sp) 1142*a11d055eSArd Biesheuvel lw $s2, 4*2($sp) 1143*a11d055eSArd Biesheuvel lw $s1, 4*1($sp) 1144*a11d055eSArd Biesheuvel lw $s0, 4*0($sp) 1145*a11d055eSArd Biesheuvel___ 1146*a11d055eSArd Biesheuvel$code.=<<___; 1147*a11d055eSArd Biesheuvel jr $ra 1148*a11d055eSArd Biesheuvel addu $sp,$sp,4*12 1149*a11d055eSArd Biesheuvel.end poly1305_blocks 1150*a11d055eSArd Biesheuvel___ 1151*a11d055eSArd Biesheuvel} 1152*a11d055eSArd Biesheuvel{ 1153*a11d055eSArd Biesheuvelmy ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); 1154*a11d055eSArd Biesheuvel 1155*a11d055eSArd Biesheuvel$code.=<<___; 1156*a11d055eSArd Biesheuvel.align 5 1157*a11d055eSArd Biesheuvel.globl poly1305_emit 1158*a11d055eSArd Biesheuvel.ent poly1305_emit 1159*a11d055eSArd Biesheuvelpoly1305_emit: 1160*a11d055eSArd Biesheuvel .frame $sp,0,$ra 1161*a11d055eSArd Biesheuvel .set reorder 1162*a11d055eSArd Biesheuvel 1163*a11d055eSArd Biesheuvel lw $tmp4,16($ctx) 1164*a11d055eSArd Biesheuvel lw $tmp0,0($ctx) 1165*a11d055eSArd Biesheuvel lw $tmp1,4($ctx) 1166*a11d055eSArd Biesheuvel lw $tmp2,8($ctx) 1167*a11d055eSArd Biesheuvel lw $tmp3,12($ctx) 1168*a11d055eSArd Biesheuvel 1169*a11d055eSArd Biesheuvel li $in0,-4 # final reduction 
1170*a11d055eSArd Biesheuvel srl $ctx,$tmp4,2 1171*a11d055eSArd Biesheuvel and $in0,$in0,$tmp4 1172*a11d055eSArd Biesheuvel andi $tmp4,$tmp4,3 1173*a11d055eSArd Biesheuvel addu $ctx,$ctx,$in0 1174*a11d055eSArd Biesheuvel 1175*a11d055eSArd Biesheuvel addu $tmp0,$tmp0,$ctx 1176*a11d055eSArd Biesheuvel sltu $ctx,$tmp0,$ctx 1177*a11d055eSArd Biesheuvel addiu $in0,$tmp0,5 # compare to modulus 1178*a11d055eSArd Biesheuvel addu $tmp1,$tmp1,$ctx 1179*a11d055eSArd Biesheuvel sltiu $in1,$in0,5 1180*a11d055eSArd Biesheuvel sltu $ctx,$tmp1,$ctx 1181*a11d055eSArd Biesheuvel addu $in1,$in1,$tmp1 1182*a11d055eSArd Biesheuvel addu $tmp2,$tmp2,$ctx 1183*a11d055eSArd Biesheuvel sltu $in2,$in1,$tmp1 1184*a11d055eSArd Biesheuvel sltu $ctx,$tmp2,$ctx 1185*a11d055eSArd Biesheuvel addu $in2,$in2,$tmp2 1186*a11d055eSArd Biesheuvel addu $tmp3,$tmp3,$ctx 1187*a11d055eSArd Biesheuvel sltu $in3,$in2,$tmp2 1188*a11d055eSArd Biesheuvel sltu $ctx,$tmp3,$ctx 1189*a11d055eSArd Biesheuvel addu $in3,$in3,$tmp3 1190*a11d055eSArd Biesheuvel addu $tmp4,$tmp4,$ctx 1191*a11d055eSArd Biesheuvel sltu $ctx,$in3,$tmp3 1192*a11d055eSArd Biesheuvel addu $ctx,$tmp4 1193*a11d055eSArd Biesheuvel 1194*a11d055eSArd Biesheuvel srl $ctx,2 # see if it carried/borrowed 1195*a11d055eSArd Biesheuvel subu $ctx,$zero,$ctx 1196*a11d055eSArd Biesheuvel 1197*a11d055eSArd Biesheuvel xor $in0,$tmp0 1198*a11d055eSArd Biesheuvel xor $in1,$tmp1 1199*a11d055eSArd Biesheuvel xor $in2,$tmp2 1200*a11d055eSArd Biesheuvel xor $in3,$tmp3 1201*a11d055eSArd Biesheuvel and $in0,$ctx 1202*a11d055eSArd Biesheuvel and $in1,$ctx 1203*a11d055eSArd Biesheuvel and $in2,$ctx 1204*a11d055eSArd Biesheuvel and $in3,$ctx 1205*a11d055eSArd Biesheuvel xor $in0,$tmp0 1206*a11d055eSArd Biesheuvel xor $in1,$tmp1 1207*a11d055eSArd Biesheuvel xor $in2,$tmp2 1208*a11d055eSArd Biesheuvel xor $in3,$tmp3 1209*a11d055eSArd Biesheuvel 1210*a11d055eSArd Biesheuvel lw $tmp0,0($nonce) # load nonce 1211*a11d055eSArd Biesheuvel lw $tmp1,4($nonce) 1212*a11d055eSArd 
Biesheuvel lw $tmp2,8($nonce) 1213*a11d055eSArd Biesheuvel lw $tmp3,12($nonce) 1214*a11d055eSArd Biesheuvel 1215*a11d055eSArd Biesheuvel addu $in0,$tmp0 # accumulate nonce 1216*a11d055eSArd Biesheuvel sltu $ctx,$in0,$tmp0 1217*a11d055eSArd Biesheuvel 1218*a11d055eSArd Biesheuvel addu $in1,$tmp1 1219*a11d055eSArd Biesheuvel sltu $tmp1,$in1,$tmp1 1220*a11d055eSArd Biesheuvel addu $in1,$ctx 1221*a11d055eSArd Biesheuvel sltu $ctx,$in1,$ctx 1222*a11d055eSArd Biesheuvel addu $ctx,$tmp1 1223*a11d055eSArd Biesheuvel 1224*a11d055eSArd Biesheuvel addu $in2,$tmp2 1225*a11d055eSArd Biesheuvel sltu $tmp2,$in2,$tmp2 1226*a11d055eSArd Biesheuvel addu $in2,$ctx 1227*a11d055eSArd Biesheuvel sltu $ctx,$in2,$ctx 1228*a11d055eSArd Biesheuvel addu $ctx,$tmp2 1229*a11d055eSArd Biesheuvel 1230*a11d055eSArd Biesheuvel addu $in3,$tmp3 1231*a11d055eSArd Biesheuvel addu $in3,$ctx 1232*a11d055eSArd Biesheuvel 1233*a11d055eSArd Biesheuvel srl $tmp0,$in0,8 # write mac value 1234*a11d055eSArd Biesheuvel srl $tmp1,$in0,16 1235*a11d055eSArd Biesheuvel srl $tmp2,$in0,24 1236*a11d055eSArd Biesheuvel sb $in0, 0($mac) 1237*a11d055eSArd Biesheuvel sb $tmp0,1($mac) 1238*a11d055eSArd Biesheuvel srl $tmp0,$in1,8 1239*a11d055eSArd Biesheuvel sb $tmp1,2($mac) 1240*a11d055eSArd Biesheuvel srl $tmp1,$in1,16 1241*a11d055eSArd Biesheuvel sb $tmp2,3($mac) 1242*a11d055eSArd Biesheuvel srl $tmp2,$in1,24 1243*a11d055eSArd Biesheuvel sb $in1, 4($mac) 1244*a11d055eSArd Biesheuvel sb $tmp0,5($mac) 1245*a11d055eSArd Biesheuvel srl $tmp0,$in2,8 1246*a11d055eSArd Biesheuvel sb $tmp1,6($mac) 1247*a11d055eSArd Biesheuvel srl $tmp1,$in2,16 1248*a11d055eSArd Biesheuvel sb $tmp2,7($mac) 1249*a11d055eSArd Biesheuvel srl $tmp2,$in2,24 1250*a11d055eSArd Biesheuvel sb $in2, 8($mac) 1251*a11d055eSArd Biesheuvel sb $tmp0,9($mac) 1252*a11d055eSArd Biesheuvel srl $tmp0,$in3,8 1253*a11d055eSArd Biesheuvel sb $tmp1,10($mac) 1254*a11d055eSArd Biesheuvel srl $tmp1,$in3,16 1255*a11d055eSArd Biesheuvel sb $tmp2,11($mac) 1256*a11d055eSArd 
Biesheuvel srl $tmp2,$in3,24 1257*a11d055eSArd Biesheuvel sb $in3, 12($mac) 1258*a11d055eSArd Biesheuvel sb $tmp0,13($mac) 1259*a11d055eSArd Biesheuvel sb $tmp1,14($mac) 1260*a11d055eSArd Biesheuvel sb $tmp2,15($mac) 1261*a11d055eSArd Biesheuvel 1262*a11d055eSArd Biesheuvel jr $ra 1263*a11d055eSArd Biesheuvel.end poly1305_emit 1264*a11d055eSArd Biesheuvel.rdata 1265*a11d055eSArd Biesheuvel.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" 1266*a11d055eSArd Biesheuvel.align 2 1267*a11d055eSArd Biesheuvel___ 1268*a11d055eSArd Biesheuvel} 1269*a11d055eSArd Biesheuvel}}} 1270*a11d055eSArd Biesheuvel 1271*a11d055eSArd Biesheuvel$output=pop and open STDOUT,">$output"; 1272*a11d055eSArd Biesheuvelprint $code; 1273*a11d055eSArd Biesheuvelclose STDOUT; 1274