xref: /openbmc/linux/arch/mips/crypto/poly1305-mips.pl (revision c95baf12f5077419db01313ab61c2aac007d40cd)
1*a11d055eSArd Biesheuvel#!/usr/bin/env perl
2*a11d055eSArd Biesheuvel# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
3*a11d055eSArd Biesheuvel#
4*a11d055eSArd Biesheuvel# ====================================================================
5*a11d055eSArd Biesheuvel# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
6*a11d055eSArd Biesheuvel# project.
7*a11d055eSArd Biesheuvel# ====================================================================
8*a11d055eSArd Biesheuvel
9*a11d055eSArd Biesheuvel# Poly1305 hash for MIPS.
10*a11d055eSArd Biesheuvel#
11*a11d055eSArd Biesheuvel# May 2016
12*a11d055eSArd Biesheuvel#
13*a11d055eSArd Biesheuvel# Numbers are cycles per processed byte with poly1305_blocks alone.
14*a11d055eSArd Biesheuvel#
15*a11d055eSArd Biesheuvel#		IALU/gcc
16*a11d055eSArd Biesheuvel# R1x000	~5.5/+130%	(big-endian)
17*a11d055eSArd Biesheuvel# Octeon II	2.50/+70%	(little-endian)
18*a11d055eSArd Biesheuvel#
19*a11d055eSArd Biesheuvel# March 2019
20*a11d055eSArd Biesheuvel#
21*a11d055eSArd Biesheuvel# Add 32-bit code path.
22*a11d055eSArd Biesheuvel#
23*a11d055eSArd Biesheuvel# October 2019
24*a11d055eSArd Biesheuvel#
25*a11d055eSArd Biesheuvel# Modulo-scheduling the reduction allows omitting the dependency chain
26*a11d055eSArd Biesheuvel# at the end of the inner loop and improves performance. Also optimize the
27*a11d055eSArd Biesheuvel# MIPS32R2 code path for the MIPS 1004K core, per René von Dorst's suggestions.
28*a11d055eSArd Biesheuvel#
29*a11d055eSArd Biesheuvel#		IALU/gcc
30*a11d055eSArd Biesheuvel# R1x000	~9.8/?		(big-endian)
31*a11d055eSArd Biesheuvel# Octeon II	3.65/+140%	(little-endian)
32*a11d055eSArd Biesheuvel# MT7621/1004K	4.75/?		(little-endian)
33*a11d055eSArd Biesheuvel#
34*a11d055eSArd Biesheuvel######################################################################
35*a11d055eSArd Biesheuvel# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
36*a11d055eSArd Biesheuvel# widely used. Then there is a new contender: NUBI. It appears that if
37*a11d055eSArd Biesheuvel# one picks the latter, it's possible to arrange code in an ABI-neutral
38*a11d055eSArd Biesheuvel# manner. Therefore let's stick to the NUBI register layout:
39*a11d055eSArd Biesheuvel#
# Bind symbolic register names to numeric MIPS register operands ("$N").
# Each map() turns a list of register numbers into the strings "$0", "$1", ...
# which are later interpolated into the assembly heredocs.
#
# $zero,$at -> $0,$1;  $t0,$t1,$t2 -> $2,$24,$25
40*a11d055eSArd Biesheuvel($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
# $a0..$a7 -> $4..$11
41*a11d055eSArd Biesheuvel($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# $s0..$s11 -> $12..$23
42*a11d055eSArd Biesheuvel($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
# $gp -> $3;  $tp,$sp,$fp,$ra -> $28..$31
43*a11d055eSArd Biesheuvel($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
44*a11d055eSArd Biesheuvel#
45*a11d055eSArd Biesheuvel# The return value is placed in $a0. The following coding rules facilitate
46*a11d055eSArd Biesheuvel# interoperability:
47*a11d055eSArd Biesheuvel#
48*a11d055eSArd Biesheuvel# - never ever touch $tp, "thread pointer", former $gp [o32 can be
49*a11d055eSArd Biesheuvel#   excluded from the rule, because it's specified volatile];
50*a11d055eSArd Biesheuvel# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
51*a11d055eSArd Biesheuvel#   old code];
52*a11d055eSArd Biesheuvel# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
53*a11d055eSArd Biesheuvel#
54*a11d055eSArd Biesheuvel# For reference here is register layout for N32/64 MIPS ABIs:
55*a11d055eSArd Biesheuvel#
56*a11d055eSArd Biesheuvel# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
57*a11d055eSArd Biesheuvel# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
58*a11d055eSArd Biesheuvel# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
59*a11d055eSArd Biesheuvel# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
60*a11d055eSArd Biesheuvel# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
61*a11d055eSArd Biesheuvel#
62*a11d055eSArd Biesheuvel# <appro@openssl.org>
63*a11d055eSArd Biesheuvel#
64*a11d055eSArd Biesheuvel######################################################################
65*a11d055eSArd Biesheuvel
# Target flavour is taken from the first command-line argument and defaults
# to "64" when no argument is given.
66*a11d055eSArd Biesheuvel$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
67*a11d055eSArd Biesheuvel
# Register used for the return value: $a0 for the nubi flavours, otherwise
# $t0 (which the layout above binds to register $2).
68*a11d055eSArd Biesheuvel$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
69*a11d055eSArd Biesheuvel
70*a11d055eSArd Biesheuvelif ($flavour =~ /64|n32/i) {{{
71*a11d055eSArd Biesheuvel######################################################################
72*a11d055eSArd Biesheuvel# 64-bit code path
73*a11d055eSArd Biesheuvel#
74*a11d055eSArd Biesheuvel
# Register roles for the 64-bit code path, as used by the heredocs below:
#   $ctx    - base address of the context (hash words at 0/8/16, key material
#             at 24/32/40)
#   $inp    - pointer to the key (poly1305_init) or input (poly1305_blocks)
#   $len    - input length; poly1305_blocks shifts it right by 4 to get the
#             number of complete 16-byte blocks
#   $padbit - value added to the top accumulator limb for every block
75*a11d055eSArd Biesheuvelmy ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
# $in0/$in1 receive the two 64-bit words of a key or input block;
# $tmp0..$tmp4 are scratch. Note that $tmp2 is $at; the emitted assembly
# uses ".set noat".
76*a11d055eSArd Biesheuvelmy ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
77*a11d055eSArd Biesheuvel
# Emit the preprocessor prologue and poly1305_init for the 64-bit flavours.
#
# Prologue: treats MIPS64R3/R5/R6 as implying MIPS64R2, wraps the multiply
# instructions in dmultu/mflo/mfhi macros (on R6 these expand to dmulu/dmuhu),
# renames the entry points to *_mips when __KERNEL__ is defined, and defines
# the MSB/LSB byte offsets used by the ldl/ldr unaligned loads.
#
# poly1305_init: zeroes the three 64-bit hash words at 0/8/16($ctx). If $inp
# is zero it skips key setup. Otherwise it loads 16 key bytes (ldl/ldr, or on
# R6 aligned ld plus variable shifts), byte-swaps them on big-endian targets,
# masks them with 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc, and stores the
# results at 24($ctx) and 32($ctx), plus s1 = r1 + (r1 >> 2) at 40($ctx).
# The routine always returns 0 in $v0.
78*a11d055eSArd Biesheuvel$code.=<<___;
79*a11d055eSArd Biesheuvel#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
80*a11d055eSArd Biesheuvel     defined(_MIPS_ARCH_MIPS64R6)) \\
81*a11d055eSArd Biesheuvel     && !defined(_MIPS_ARCH_MIPS64R2)
82*a11d055eSArd Biesheuvel# define _MIPS_ARCH_MIPS64R2
83*a11d055eSArd Biesheuvel#endif
84*a11d055eSArd Biesheuvel
85*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS64R6)
86*a11d055eSArd Biesheuvel# define dmultu(rs,rt)
87*a11d055eSArd Biesheuvel# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
88*a11d055eSArd Biesheuvel# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
89*a11d055eSArd Biesheuvel#else
90*a11d055eSArd Biesheuvel# define dmultu(rs,rt)		dmultu	rs,rt
91*a11d055eSArd Biesheuvel# define mflo(rd,rs,rt)	mflo	rd
92*a11d055eSArd Biesheuvel# define mfhi(rd,rs,rt)	mfhi	rd
93*a11d055eSArd Biesheuvel#endif
94*a11d055eSArd Biesheuvel
95*a11d055eSArd Biesheuvel#ifdef	__KERNEL__
96*a11d055eSArd Biesheuvel# define poly1305_init   poly1305_init_mips
97*a11d055eSArd Biesheuvel# define poly1305_blocks poly1305_blocks_mips
98*a11d055eSArd Biesheuvel# define poly1305_emit   poly1305_emit_mips
99*a11d055eSArd Biesheuvel#endif
100*a11d055eSArd Biesheuvel
101*a11d055eSArd Biesheuvel#if defined(__MIPSEB__) && !defined(MIPSEB)
102*a11d055eSArd Biesheuvel# define MIPSEB
103*a11d055eSArd Biesheuvel#endif
104*a11d055eSArd Biesheuvel
105*a11d055eSArd Biesheuvel#ifdef MIPSEB
106*a11d055eSArd Biesheuvel# define MSB 0
107*a11d055eSArd Biesheuvel# define LSB 7
108*a11d055eSArd Biesheuvel#else
109*a11d055eSArd Biesheuvel# define MSB 7
110*a11d055eSArd Biesheuvel# define LSB 0
111*a11d055eSArd Biesheuvel#endif
112*a11d055eSArd Biesheuvel
113*a11d055eSArd Biesheuvel.text
114*a11d055eSArd Biesheuvel.set	noat
115*a11d055eSArd Biesheuvel.set	noreorder
116*a11d055eSArd Biesheuvel
117*a11d055eSArd Biesheuvel.align	5
118*a11d055eSArd Biesheuvel.globl	poly1305_init
119*a11d055eSArd Biesheuvel.ent	poly1305_init
120*a11d055eSArd Biesheuvelpoly1305_init:
121*a11d055eSArd Biesheuvel	.frame	$sp,0,$ra
122*a11d055eSArd Biesheuvel	.set	reorder
123*a11d055eSArd Biesheuvel
124*a11d055eSArd Biesheuvel	sd	$zero,0($ctx)
125*a11d055eSArd Biesheuvel	sd	$zero,8($ctx)
126*a11d055eSArd Biesheuvel	sd	$zero,16($ctx)
127*a11d055eSArd Biesheuvel
128*a11d055eSArd Biesheuvel	beqz	$inp,.Lno_key
129*a11d055eSArd Biesheuvel
130*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS64R6)
131*a11d055eSArd Biesheuvel	andi	$tmp0,$inp,7		# $inp % 8
132*a11d055eSArd Biesheuvel	dsubu	$inp,$inp,$tmp0		# align $inp
133*a11d055eSArd Biesheuvel	sll	$tmp0,$tmp0,3		# byte to bit offset
134*a11d055eSArd Biesheuvel	ld	$in0,0($inp)
135*a11d055eSArd Biesheuvel	ld	$in1,8($inp)
136*a11d055eSArd Biesheuvel	beqz	$tmp0,.Laligned_key
137*a11d055eSArd Biesheuvel	ld	$tmp2,16($inp)
138*a11d055eSArd Biesheuvel
139*a11d055eSArd Biesheuvel	subu	$tmp1,$zero,$tmp0
140*a11d055eSArd Biesheuvel# ifdef	MIPSEB
141*a11d055eSArd Biesheuvel	dsllv	$in0,$in0,$tmp0
142*a11d055eSArd Biesheuvel	dsrlv	$tmp3,$in1,$tmp1
143*a11d055eSArd Biesheuvel	dsllv	$in1,$in1,$tmp0
144*a11d055eSArd Biesheuvel	dsrlv	$tmp2,$tmp2,$tmp1
145*a11d055eSArd Biesheuvel# else
146*a11d055eSArd Biesheuvel	dsrlv	$in0,$in0,$tmp0
147*a11d055eSArd Biesheuvel	dsllv	$tmp3,$in1,$tmp1
148*a11d055eSArd Biesheuvel	dsrlv	$in1,$in1,$tmp0
149*a11d055eSArd Biesheuvel	dsllv	$tmp2,$tmp2,$tmp1
150*a11d055eSArd Biesheuvel# endif
151*a11d055eSArd Biesheuvel	or	$in0,$in0,$tmp3
152*a11d055eSArd Biesheuvel	or	$in1,$in1,$tmp2
153*a11d055eSArd Biesheuvel.Laligned_key:
154*a11d055eSArd Biesheuvel#else
155*a11d055eSArd Biesheuvel	ldl	$in0,0+MSB($inp)
156*a11d055eSArd Biesheuvel	ldl	$in1,8+MSB($inp)
157*a11d055eSArd Biesheuvel	ldr	$in0,0+LSB($inp)
158*a11d055eSArd Biesheuvel	ldr	$in1,8+LSB($inp)
159*a11d055eSArd Biesheuvel#endif
160*a11d055eSArd Biesheuvel#ifdef	MIPSEB
161*a11d055eSArd Biesheuvel# if defined(_MIPS_ARCH_MIPS64R2)
162*a11d055eSArd Biesheuvel	dsbh	$in0,$in0		# byte swap
163*a11d055eSArd Biesheuvel	 dsbh	$in1,$in1
164*a11d055eSArd Biesheuvel	dshd	$in0,$in0
165*a11d055eSArd Biesheuvel	 dshd	$in1,$in1
166*a11d055eSArd Biesheuvel# else
167*a11d055eSArd Biesheuvel	ori	$tmp0,$zero,0xFF
168*a11d055eSArd Biesheuvel	dsll	$tmp2,$tmp0,32
169*a11d055eSArd Biesheuvel	or	$tmp0,$tmp2		# 0x000000FF000000FF
170*a11d055eSArd Biesheuvel
171*a11d055eSArd Biesheuvel	and	$tmp1,$in0,$tmp0	# byte swap
172*a11d055eSArd Biesheuvel	 and	$tmp3,$in1,$tmp0
173*a11d055eSArd Biesheuvel	dsrl	$tmp2,$in0,24
174*a11d055eSArd Biesheuvel	 dsrl	$tmp4,$in1,24
175*a11d055eSArd Biesheuvel	dsll	$tmp1,24
176*a11d055eSArd Biesheuvel	 dsll	$tmp3,24
177*a11d055eSArd Biesheuvel	and	$tmp2,$tmp0
178*a11d055eSArd Biesheuvel	 and	$tmp4,$tmp0
179*a11d055eSArd Biesheuvel	dsll	$tmp0,8			# 0x0000FF000000FF00
180*a11d055eSArd Biesheuvel	or	$tmp1,$tmp2
181*a11d055eSArd Biesheuvel	 or	$tmp3,$tmp4
182*a11d055eSArd Biesheuvel	and	$tmp2,$in0,$tmp0
183*a11d055eSArd Biesheuvel	 and	$tmp4,$in1,$tmp0
184*a11d055eSArd Biesheuvel	dsrl	$in0,8
185*a11d055eSArd Biesheuvel	 dsrl	$in1,8
186*a11d055eSArd Biesheuvel	dsll	$tmp2,8
187*a11d055eSArd Biesheuvel	 dsll	$tmp4,8
188*a11d055eSArd Biesheuvel	and	$in0,$tmp0
189*a11d055eSArd Biesheuvel	 and	$in1,$tmp0
190*a11d055eSArd Biesheuvel	or	$tmp1,$tmp2
191*a11d055eSArd Biesheuvel	 or	$tmp3,$tmp4
192*a11d055eSArd Biesheuvel	or	$in0,$tmp1
193*a11d055eSArd Biesheuvel	 or	$in1,$tmp3
194*a11d055eSArd Biesheuvel	dsrl	$tmp1,$in0,32
195*a11d055eSArd Biesheuvel	 dsrl	$tmp3,$in1,32
196*a11d055eSArd Biesheuvel	dsll	$in0,32
197*a11d055eSArd Biesheuvel	 dsll	$in1,32
198*a11d055eSArd Biesheuvel	or	$in0,$tmp1
199*a11d055eSArd Biesheuvel	 or	$in1,$tmp3
200*a11d055eSArd Biesheuvel# endif
201*a11d055eSArd Biesheuvel#endif
202*a11d055eSArd Biesheuvel	li	$tmp0,1
203*a11d055eSArd Biesheuvel	dsll	$tmp0,32		# 0x0000000100000000
204*a11d055eSArd Biesheuvel	daddiu	$tmp0,-63		# 0x00000000ffffffc1
205*a11d055eSArd Biesheuvel	dsll	$tmp0,28		# 0x0ffffffc10000000
206*a11d055eSArd Biesheuvel	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff
207*a11d055eSArd Biesheuvel
208*a11d055eSArd Biesheuvel	and	$in0,$tmp0
209*a11d055eSArd Biesheuvel	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
210*a11d055eSArd Biesheuvel	and	$in1,$tmp0
211*a11d055eSArd Biesheuvel
212*a11d055eSArd Biesheuvel	sd	$in0,24($ctx)
213*a11d055eSArd Biesheuvel	dsrl	$tmp0,$in1,2
214*a11d055eSArd Biesheuvel	sd	$in1,32($ctx)
215*a11d055eSArd Biesheuvel	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
216*a11d055eSArd Biesheuvel	sd	$tmp0,40($ctx)
217*a11d055eSArd Biesheuvel
218*a11d055eSArd Biesheuvel.Lno_key:
219*a11d055eSArd Biesheuvel	li	$v0,0			# return 0
220*a11d055eSArd Biesheuvel	jr	$ra
221*a11d055eSArd Biesheuvel.end	poly1305_init
222*a11d055eSArd Biesheuvel___
223*a11d055eSArd Biesheuvel{
# Generator for poly1305_blocks and poly1305_blocks_internal (64-bit path).
#
# Value for the .mask directive describing which registers the prologue
# stores: bits 12..17 ($s0-$s5) for nubi flavours, bits 16..17 ($s4-$s5)
# otherwise. The R6 variant ORs in bits 18..19 for $s6/$s7.
224*a11d055eSArd Biesheuvelmy $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
225*a11d055eSArd Biesheuvel
# $h0-$h2  : hash accumulator words, loaded from / stored to 0/8/16($ctx)
# $r0,$r1  : key words loaded from 24/32($ctx)
# $rs1     : word loaded from 40($ctx), i.e. the r1 + (r1 >> 2) value that
#            poly1305_init stores there
# $d0-$d2  : accumulator plus current input block; $d0/$d1 reuse the
#            $in0/$in1 registers
226*a11d055eSArd Biesheuvelmy ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
227*a11d055eSArd Biesheuvel   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
# $shr is the misalignment of $inp expressed in bits and $shl its negation;
# the R6 path uses them as variable shift counts to assemble unaligned input
# from aligned loads.
228*a11d055eSArd Biesheuvelmy ($shr,$shl) = ($s6,$s7);		# used on R6
229*a11d055eSArd Biesheuvel
# Emit the poly1305_blocks entry stub, which converts $len into a count of
# complete 16-byte blocks and returns immediately when that count is zero,
# followed by the first part of the poly1305_blocks_internal prologue. The
# frame is 8*8 bytes on R6 (which also saves $s6/$s7) and 6*8 bytes otherwise.
230*a11d055eSArd Biesheuvel$code.=<<___;
231*a11d055eSArd Biesheuvel.align	5
232*a11d055eSArd Biesheuvel.globl	poly1305_blocks
233*a11d055eSArd Biesheuvel.ent	poly1305_blocks
234*a11d055eSArd Biesheuvelpoly1305_blocks:
235*a11d055eSArd Biesheuvel	.set	noreorder
236*a11d055eSArd Biesheuvel	dsrl	$len,4			# number of complete blocks
237*a11d055eSArd Biesheuvel	bnez	$len,poly1305_blocks_internal
238*a11d055eSArd Biesheuvel	nop
239*a11d055eSArd Biesheuvel	jr	$ra
240*a11d055eSArd Biesheuvel	nop
241*a11d055eSArd Biesheuvel.end	poly1305_blocks
242*a11d055eSArd Biesheuvel
243*a11d055eSArd Biesheuvel.align	5
244*a11d055eSArd Biesheuvel.ent	poly1305_blocks_internal
245*a11d055eSArd Biesheuvelpoly1305_blocks_internal:
246*a11d055eSArd Biesheuvel	.set	noreorder
247*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS64R6)
248*a11d055eSArd Biesheuvel	.frame	$sp,8*8,$ra
249*a11d055eSArd Biesheuvel	.mask	$SAVED_REGS_MASK|0x000c0000,-8
250*a11d055eSArd Biesheuvel	dsubu	$sp,8*8
251*a11d055eSArd Biesheuvel	sd	$s7,56($sp)
252*a11d055eSArd Biesheuvel	sd	$s6,48($sp)
253*a11d055eSArd Biesheuvel#else
254*a11d055eSArd Biesheuvel	.frame	$sp,6*8,$ra
255*a11d055eSArd Biesheuvel	.mask	$SAVED_REGS_MASK,-8
256*a11d055eSArd Biesheuvel	dsubu	$sp,6*8
257*a11d055eSArd Biesheuvel#endif
258*a11d055eSArd Biesheuvel	sd	$s5,40($sp)
259*a11d055eSArd Biesheuvel	sd	$s4,32($sp)
260*a11d055eSArd Biesheuvel___
# Only the nubi flavours also store $s0-$s3 in the prologue.
261*a11d055eSArd Biesheuvel$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
262*a11d055eSArd Biesheuvel	sd	$s3,24($sp)
263*a11d055eSArd Biesheuvel	sd	$s2,16($sp)
264*a11d055eSArd Biesheuvel	sd	$s1,8($sp)
265*a11d055eSArd Biesheuvel	sd	$s0,0($sp)
266*a11d055eSArd Biesheuvel___
# Emit the body: load hash and key words from the context, turn $len into an
# end-of-buffer pointer, then loop over 16-byte blocks. Each iteration loads
# (and on big-endian byte-swaps) the block, folds the bits of $h2 above bit 1
# back into the low word as 5*(h2 >> 2) (the reduction carried over from the
# previous iteration), adds the input and $padbit, and multiplies by the key.
# After the loop the hash words are stored back and the epilogue begins.
267*a11d055eSArd Biesheuvel$code.=<<___;
268*a11d055eSArd Biesheuvel	.set	reorder
269*a11d055eSArd Biesheuvel
270*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS64R6)
271*a11d055eSArd Biesheuvel	andi	$shr,$inp,7
272*a11d055eSArd Biesheuvel	dsubu	$inp,$inp,$shr		# align $inp
273*a11d055eSArd Biesheuvel	sll	$shr,$shr,3		# byte to bit offset
274*a11d055eSArd Biesheuvel	subu	$shl,$zero,$shr
275*a11d055eSArd Biesheuvel#endif
276*a11d055eSArd Biesheuvel
277*a11d055eSArd Biesheuvel	ld	$h0,0($ctx)		# load hash value
278*a11d055eSArd Biesheuvel	ld	$h1,8($ctx)
279*a11d055eSArd Biesheuvel	ld	$h2,16($ctx)
280*a11d055eSArd Biesheuvel
281*a11d055eSArd Biesheuvel	ld	$r0,24($ctx)		# load key
282*a11d055eSArd Biesheuvel	ld	$r1,32($ctx)
283*a11d055eSArd Biesheuvel	ld	$rs1,40($ctx)
284*a11d055eSArd Biesheuvel
285*a11d055eSArd Biesheuvel	dsll	$len,4
286*a11d055eSArd Biesheuvel	daddu	$len,$inp		# end of buffer
287*a11d055eSArd Biesheuvel	b	.Loop
288*a11d055eSArd Biesheuvel
289*a11d055eSArd Biesheuvel.align	4
290*a11d055eSArd Biesheuvel.Loop:
291*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS64R6)
292*a11d055eSArd Biesheuvel	ld	$in0,0($inp)		# load input
293*a11d055eSArd Biesheuvel	ld	$in1,8($inp)
294*a11d055eSArd Biesheuvel	beqz	$shr,.Laligned_inp
295*a11d055eSArd Biesheuvel
296*a11d055eSArd Biesheuvel	ld	$tmp2,16($inp)
297*a11d055eSArd Biesheuvel# ifdef	MIPSEB
298*a11d055eSArd Biesheuvel	dsllv	$in0,$in0,$shr
299*a11d055eSArd Biesheuvel	dsrlv	$tmp3,$in1,$shl
300*a11d055eSArd Biesheuvel	dsllv	$in1,$in1,$shr
301*a11d055eSArd Biesheuvel	dsrlv	$tmp2,$tmp2,$shl
302*a11d055eSArd Biesheuvel# else
303*a11d055eSArd Biesheuvel	dsrlv	$in0,$in0,$shr
304*a11d055eSArd Biesheuvel	dsllv	$tmp3,$in1,$shl
305*a11d055eSArd Biesheuvel	dsrlv	$in1,$in1,$shr
306*a11d055eSArd Biesheuvel	dsllv	$tmp2,$tmp2,$shl
307*a11d055eSArd Biesheuvel# endif
308*a11d055eSArd Biesheuvel	or	$in0,$in0,$tmp3
309*a11d055eSArd Biesheuvel	or	$in1,$in1,$tmp2
310*a11d055eSArd Biesheuvel.Laligned_inp:
311*a11d055eSArd Biesheuvel#else
312*a11d055eSArd Biesheuvel	ldl	$in0,0+MSB($inp)	# load input
313*a11d055eSArd Biesheuvel	ldl	$in1,8+MSB($inp)
314*a11d055eSArd Biesheuvel	ldr	$in0,0+LSB($inp)
315*a11d055eSArd Biesheuvel	ldr	$in1,8+LSB($inp)
316*a11d055eSArd Biesheuvel#endif
317*a11d055eSArd Biesheuvel	daddiu	$inp,16
318*a11d055eSArd Biesheuvel#ifdef	MIPSEB
319*a11d055eSArd Biesheuvel# if defined(_MIPS_ARCH_MIPS64R2)
320*a11d055eSArd Biesheuvel	dsbh	$in0,$in0		# byte swap
321*a11d055eSArd Biesheuvel	 dsbh	$in1,$in1
322*a11d055eSArd Biesheuvel	dshd	$in0,$in0
323*a11d055eSArd Biesheuvel	 dshd	$in1,$in1
324*a11d055eSArd Biesheuvel# else
325*a11d055eSArd Biesheuvel	ori	$tmp0,$zero,0xFF
326*a11d055eSArd Biesheuvel	dsll	$tmp2,$tmp0,32
327*a11d055eSArd Biesheuvel	or	$tmp0,$tmp2		# 0x000000FF000000FF
328*a11d055eSArd Biesheuvel
329*a11d055eSArd Biesheuvel	and	$tmp1,$in0,$tmp0	# byte swap
330*a11d055eSArd Biesheuvel	 and	$tmp3,$in1,$tmp0
331*a11d055eSArd Biesheuvel	dsrl	$tmp2,$in0,24
332*a11d055eSArd Biesheuvel	 dsrl	$tmp4,$in1,24
333*a11d055eSArd Biesheuvel	dsll	$tmp1,24
334*a11d055eSArd Biesheuvel	 dsll	$tmp3,24
335*a11d055eSArd Biesheuvel	and	$tmp2,$tmp0
336*a11d055eSArd Biesheuvel	 and	$tmp4,$tmp0
337*a11d055eSArd Biesheuvel	dsll	$tmp0,8			# 0x0000FF000000FF00
338*a11d055eSArd Biesheuvel	or	$tmp1,$tmp2
339*a11d055eSArd Biesheuvel	 or	$tmp3,$tmp4
340*a11d055eSArd Biesheuvel	and	$tmp2,$in0,$tmp0
341*a11d055eSArd Biesheuvel	 and	$tmp4,$in1,$tmp0
342*a11d055eSArd Biesheuvel	dsrl	$in0,8
343*a11d055eSArd Biesheuvel	 dsrl	$in1,8
344*a11d055eSArd Biesheuvel	dsll	$tmp2,8
345*a11d055eSArd Biesheuvel	 dsll	$tmp4,8
346*a11d055eSArd Biesheuvel	and	$in0,$tmp0
347*a11d055eSArd Biesheuvel	 and	$in1,$tmp0
348*a11d055eSArd Biesheuvel	or	$tmp1,$tmp2
349*a11d055eSArd Biesheuvel	 or	$tmp3,$tmp4
350*a11d055eSArd Biesheuvel	or	$in0,$tmp1
351*a11d055eSArd Biesheuvel	 or	$in1,$tmp3
352*a11d055eSArd Biesheuvel	dsrl	$tmp1,$in0,32
353*a11d055eSArd Biesheuvel	 dsrl	$tmp3,$in1,32
354*a11d055eSArd Biesheuvel	dsll	$in0,32
355*a11d055eSArd Biesheuvel	 dsll	$in1,32
356*a11d055eSArd Biesheuvel	or	$in0,$tmp1
357*a11d055eSArd Biesheuvel	 or	$in1,$tmp3
358*a11d055eSArd Biesheuvel# endif
359*a11d055eSArd Biesheuvel#endif
360*a11d055eSArd Biesheuvel	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
361*a11d055eSArd Biesheuvel	andi	$h2,$h2,3
362*a11d055eSArd Biesheuvel	dsll	$tmp0,$tmp1,2
363*a11d055eSArd Biesheuvel
364*a11d055eSArd Biesheuvel	daddu	$d0,$h0,$in0		# accumulate input
365*a11d055eSArd Biesheuvel	 daddu	$tmp1,$tmp0
366*a11d055eSArd Biesheuvel	sltu	$tmp0,$d0,$h0
367*a11d055eSArd Biesheuvel	daddu	$d0,$d0,$tmp1		# ... and residue
368*a11d055eSArd Biesheuvel	sltu	$tmp1,$d0,$tmp1
369*a11d055eSArd Biesheuvel	daddu	$d1,$h1,$in1
370*a11d055eSArd Biesheuvel	daddu	$tmp0,$tmp1
371*a11d055eSArd Biesheuvel	sltu	$tmp1,$d1,$h1
372*a11d055eSArd Biesheuvel	daddu	$d1,$tmp0
373*a11d055eSArd Biesheuvel
374*a11d055eSArd Biesheuvel	dmultu	($r0,$d0)		# h0*r0
375*a11d055eSArd Biesheuvel	 daddu	$d2,$h2,$padbit
376*a11d055eSArd Biesheuvel	 sltu	$tmp0,$d1,$tmp0
377*a11d055eSArd Biesheuvel	mflo	($h0,$r0,$d0)
378*a11d055eSArd Biesheuvel	mfhi	($h1,$r0,$d0)
379*a11d055eSArd Biesheuvel
380*a11d055eSArd Biesheuvel	dmultu	($rs1,$d1)		# h1*5*r1
381*a11d055eSArd Biesheuvel	 daddu	$d2,$tmp1
382*a11d055eSArd Biesheuvel	 daddu	$d2,$tmp0
383*a11d055eSArd Biesheuvel	mflo	($tmp0,$rs1,$d1)
384*a11d055eSArd Biesheuvel	mfhi	($tmp1,$rs1,$d1)
385*a11d055eSArd Biesheuvel
386*a11d055eSArd Biesheuvel	dmultu	($r1,$d0)		# h0*r1
387*a11d055eSArd Biesheuvel	mflo	($tmp2,$r1,$d0)
388*a11d055eSArd Biesheuvel	mfhi	($h2,$r1,$d0)
389*a11d055eSArd Biesheuvel	 daddu	$h0,$tmp0
390*a11d055eSArd Biesheuvel	 daddu	$h1,$tmp1
391*a11d055eSArd Biesheuvel	 sltu	$tmp0,$h0,$tmp0
392*a11d055eSArd Biesheuvel
393*a11d055eSArd Biesheuvel	dmultu	($r0,$d1)		# h1*r0
394*a11d055eSArd Biesheuvel	 daddu	$h1,$tmp0
395*a11d055eSArd Biesheuvel	 daddu	$h1,$tmp2
396*a11d055eSArd Biesheuvel	mflo	($tmp0,$r0,$d1)
397*a11d055eSArd Biesheuvel	mfhi	($tmp1,$r0,$d1)
398*a11d055eSArd Biesheuvel
399*a11d055eSArd Biesheuvel	dmultu	($rs1,$d2)		# h2*5*r1
400*a11d055eSArd Biesheuvel	 sltu	$tmp2,$h1,$tmp2
401*a11d055eSArd Biesheuvel	 daddu	$h2,$tmp2
402*a11d055eSArd Biesheuvel	mflo	($tmp2,$rs1,$d2)
403*a11d055eSArd Biesheuvel
404*a11d055eSArd Biesheuvel	dmultu	($r0,$d2)		# h2*r0
405*a11d055eSArd Biesheuvel	 daddu	$h1,$tmp0
406*a11d055eSArd Biesheuvel	 daddu	$h2,$tmp1
407*a11d055eSArd Biesheuvel	mflo	($tmp3,$r0,$d2)
408*a11d055eSArd Biesheuvel	 sltu	$tmp0,$h1,$tmp0
409*a11d055eSArd Biesheuvel	 daddu	$h2,$tmp0
410*a11d055eSArd Biesheuvel
411*a11d055eSArd Biesheuvel	daddu	$h1,$tmp2
412*a11d055eSArd Biesheuvel	sltu	$tmp2,$h1,$tmp2
413*a11d055eSArd Biesheuvel	daddu	$h2,$tmp2
414*a11d055eSArd Biesheuvel	daddu	$h2,$tmp3
415*a11d055eSArd Biesheuvel
416*a11d055eSArd Biesheuvel	bne	$inp,$len,.Loop
417*a11d055eSArd Biesheuvel
418*a11d055eSArd Biesheuvel	sd	$h0,0($ctx)		# store hash value
419*a11d055eSArd Biesheuvel	sd	$h1,8($ctx)
420*a11d055eSArd Biesheuvel	sd	$h2,16($ctx)
421*a11d055eSArd Biesheuvel
422*a11d055eSArd Biesheuvel	.set	noreorder
423*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS64R6)
424*a11d055eSArd Biesheuvel	ld	$s7,56($sp)
425*a11d055eSArd Biesheuvel	ld	$s6,48($sp)
426*a11d055eSArd Biesheuvel#endif
427*a11d055eSArd Biesheuvel	ld	$s5,40($sp)		# epilogue
428*a11d055eSArd Biesheuvel	ld	$s4,32($sp)
429*a11d055eSArd Biesheuvel___
# Only the nubi flavours reload $s0-$s3, mirroring the prologue.
430*a11d055eSArd Biesheuvel$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
431*a11d055eSArd Biesheuvel	ld	$s3,24($sp)
432*a11d055eSArd Biesheuvel	ld	$s2,16($sp)
433*a11d055eSArd Biesheuvel	ld	$s1,8($sp)
434*a11d055eSArd Biesheuvel	ld	$s0,0($sp)
435*a11d055eSArd Biesheuvel___
# Emit the return. The stack-pointer adjustment is placed directly after
# "jr" while ".set noreorder" is in effect, i.e. in the branch delay slot;
# its size matches the frame allocated in the prologue.
436*a11d055eSArd Biesheuvel$code.=<<___;
437*a11d055eSArd Biesheuvel	jr	$ra
438*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS64R6)
439*a11d055eSArd Biesheuvel	daddu	$sp,8*8
440*a11d055eSArd Biesheuvel#else
441*a11d055eSArd Biesheuvel	daddu	$sp,6*8
442*a11d055eSArd Biesheuvel#endif
443*a11d055eSArd Biesheuvel.end	poly1305_blocks_internal
444*a11d055eSArd Biesheuvel___
445*a11d055eSArd Biesheuvel}
446*a11d055eSArd Biesheuvel{
# Generator for poly1305_emit (64-bit path).
#   $ctx   - context holding the hash words at 0/8/16
#   $mac   - destination of the 16 output bytes
#   $nonce - pointer to 16 bytes read as four 32-bit words
447*a11d055eSArd Biesheuvelmy ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
448*a11d055eSArd Biesheuvel
# Emit poly1305_emit: load the three hash words, perform the final reduction
# of the top word, compute hash+5 and use the resulting carry as a mask to
# select between hash and hash+5 without branching, add the nonce with carry
# propagation between the two 64-bit halves, and write the 128-bit result to
# $mac one byte at a time, least significant byte first. An identification
# string is placed in .rdata afterwards.
449*a11d055eSArd Biesheuvel$code.=<<___;
450*a11d055eSArd Biesheuvel.align	5
451*a11d055eSArd Biesheuvel.globl	poly1305_emit
452*a11d055eSArd Biesheuvel.ent	poly1305_emit
453*a11d055eSArd Biesheuvelpoly1305_emit:
454*a11d055eSArd Biesheuvel	.frame	$sp,0,$ra
455*a11d055eSArd Biesheuvel	.set	reorder
456*a11d055eSArd Biesheuvel
457*a11d055eSArd Biesheuvel	ld	$tmp2,16($ctx)
458*a11d055eSArd Biesheuvel	ld	$tmp0,0($ctx)
459*a11d055eSArd Biesheuvel	ld	$tmp1,8($ctx)
460*a11d055eSArd Biesheuvel
461*a11d055eSArd Biesheuvel	li	$in0,-4			# final reduction
462*a11d055eSArd Biesheuvel	dsrl	$in1,$tmp2,2
463*a11d055eSArd Biesheuvel	and	$in0,$tmp2
464*a11d055eSArd Biesheuvel	andi	$tmp2,$tmp2,3
465*a11d055eSArd Biesheuvel	daddu	$in0,$in1
466*a11d055eSArd Biesheuvel
467*a11d055eSArd Biesheuvel	daddu	$tmp0,$tmp0,$in0
468*a11d055eSArd Biesheuvel	sltu	$in1,$tmp0,$in0
469*a11d055eSArd Biesheuvel	 daddiu	$in0,$tmp0,5		# compare to modulus
470*a11d055eSArd Biesheuvel	daddu	$tmp1,$tmp1,$in1
471*a11d055eSArd Biesheuvel	 sltiu	$tmp3,$in0,5
472*a11d055eSArd Biesheuvel	sltu	$tmp4,$tmp1,$in1
473*a11d055eSArd Biesheuvel	 daddu	$in1,$tmp1,$tmp3
474*a11d055eSArd Biesheuvel	daddu	$tmp2,$tmp2,$tmp4
475*a11d055eSArd Biesheuvel	 sltu	$tmp3,$in1,$tmp3
476*a11d055eSArd Biesheuvel	 daddu	$tmp2,$tmp2,$tmp3
477*a11d055eSArd Biesheuvel
478*a11d055eSArd Biesheuvel	dsrl	$tmp2,2			# see if it carried/borrowed
479*a11d055eSArd Biesheuvel	dsubu	$tmp2,$zero,$tmp2
480*a11d055eSArd Biesheuvel
481*a11d055eSArd Biesheuvel	xor	$in0,$tmp0
482*a11d055eSArd Biesheuvel	xor	$in1,$tmp1
483*a11d055eSArd Biesheuvel	and	$in0,$tmp2
484*a11d055eSArd Biesheuvel	and	$in1,$tmp2
485*a11d055eSArd Biesheuvel	xor	$in0,$tmp0
486*a11d055eSArd Biesheuvel	xor	$in1,$tmp1
487*a11d055eSArd Biesheuvel
488*a11d055eSArd Biesheuvel	lwu	$tmp0,0($nonce)		# load nonce
489*a11d055eSArd Biesheuvel	lwu	$tmp1,4($nonce)
490*a11d055eSArd Biesheuvel	lwu	$tmp2,8($nonce)
491*a11d055eSArd Biesheuvel	lwu	$tmp3,12($nonce)
492*a11d055eSArd Biesheuvel	dsll	$tmp1,32
493*a11d055eSArd Biesheuvel	dsll	$tmp3,32
494*a11d055eSArd Biesheuvel	or	$tmp0,$tmp1
495*a11d055eSArd Biesheuvel	or	$tmp2,$tmp3
496*a11d055eSArd Biesheuvel
497*a11d055eSArd Biesheuvel	daddu	$in0,$tmp0		# accumulate nonce
498*a11d055eSArd Biesheuvel	daddu	$in1,$tmp2
499*a11d055eSArd Biesheuvel	sltu	$tmp0,$in0,$tmp0
500*a11d055eSArd Biesheuvel	daddu	$in1,$tmp0
501*a11d055eSArd Biesheuvel
502*a11d055eSArd Biesheuvel	dsrl	$tmp0,$in0,8		# write mac value
503*a11d055eSArd Biesheuvel	dsrl	$tmp1,$in0,16
504*a11d055eSArd Biesheuvel	dsrl	$tmp2,$in0,24
505*a11d055eSArd Biesheuvel	sb	$in0,0($mac)
506*a11d055eSArd Biesheuvel	dsrl	$tmp3,$in0,32
507*a11d055eSArd Biesheuvel	sb	$tmp0,1($mac)
508*a11d055eSArd Biesheuvel	dsrl	$tmp0,$in0,40
509*a11d055eSArd Biesheuvel	sb	$tmp1,2($mac)
510*a11d055eSArd Biesheuvel	dsrl	$tmp1,$in0,48
511*a11d055eSArd Biesheuvel	sb	$tmp2,3($mac)
512*a11d055eSArd Biesheuvel	dsrl	$tmp2,$in0,56
513*a11d055eSArd Biesheuvel	sb	$tmp3,4($mac)
514*a11d055eSArd Biesheuvel	dsrl	$tmp3,$in1,8
515*a11d055eSArd Biesheuvel	sb	$tmp0,5($mac)
516*a11d055eSArd Biesheuvel	dsrl	$tmp0,$in1,16
517*a11d055eSArd Biesheuvel	sb	$tmp1,6($mac)
518*a11d055eSArd Biesheuvel	dsrl	$tmp1,$in1,24
519*a11d055eSArd Biesheuvel	sb	$tmp2,7($mac)
520*a11d055eSArd Biesheuvel
521*a11d055eSArd Biesheuvel	sb	$in1,8($mac)
522*a11d055eSArd Biesheuvel	dsrl	$tmp2,$in1,32
523*a11d055eSArd Biesheuvel	sb	$tmp3,9($mac)
524*a11d055eSArd Biesheuvel	dsrl	$tmp3,$in1,40
525*a11d055eSArd Biesheuvel	sb	$tmp0,10($mac)
526*a11d055eSArd Biesheuvel	dsrl	$tmp0,$in1,48
527*a11d055eSArd Biesheuvel	sb	$tmp1,11($mac)
528*a11d055eSArd Biesheuvel	dsrl	$tmp1,$in1,56
529*a11d055eSArd Biesheuvel	sb	$tmp2,12($mac)
530*a11d055eSArd Biesheuvel	sb	$tmp3,13($mac)
531*a11d055eSArd Biesheuvel	sb	$tmp0,14($mac)
532*a11d055eSArd Biesheuvel	sb	$tmp1,15($mac)
533*a11d055eSArd Biesheuvel
534*a11d055eSArd Biesheuvel	jr	$ra
535*a11d055eSArd Biesheuvel.end	poly1305_emit
536*a11d055eSArd Biesheuvel.rdata
537*a11d055eSArd Biesheuvel.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
538*a11d055eSArd Biesheuvel.align	2
539*a11d055eSArd Biesheuvel___
540*a11d055eSArd Biesheuvel}
541*a11d055eSArd Biesheuvel}}} else {{{
542*a11d055eSArd Biesheuvel######################################################################
543*a11d055eSArd Biesheuvel# 32-bit code path
544*a11d055eSArd Biesheuvel#
545*a11d055eSArd Biesheuvel
# Register roles for the 32-bit code path. The argument registers are the
# same as in the 64-bit path; here four 32-bit registers ($in0-$in3) receive
# the words of a 16-byte key block, and $tmp0-$tmp3 are scratch ($tmp0 is
# $at).
546*a11d055eSArd Biesheuvelmy ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
547*a11d055eSArd Biesheuvelmy ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
548*a11d055eSArd Biesheuvel   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
549*a11d055eSArd Biesheuvel
550*a11d055eSArd Biesheuvel$code.=<<___;
551*a11d055eSArd Biesheuvel#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
552*a11d055eSArd Biesheuvel     defined(_MIPS_ARCH_MIPS32R6)) \\
553*a11d055eSArd Biesheuvel     && !defined(_MIPS_ARCH_MIPS32R2)
554*a11d055eSArd Biesheuvel# define _MIPS_ARCH_MIPS32R2
555*a11d055eSArd Biesheuvel#endif
556*a11d055eSArd Biesheuvel
557*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS32R6)
558*a11d055eSArd Biesheuvel# define multu(rs,rt)
559*a11d055eSArd Biesheuvel# define mflo(rd,rs,rt)	mulu	rd,rs,rt
560*a11d055eSArd Biesheuvel# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
561*a11d055eSArd Biesheuvel#else
562*a11d055eSArd Biesheuvel# define multu(rs,rt)	multu	rs,rt
563*a11d055eSArd Biesheuvel# define mflo(rd,rs,rt)	mflo	rd
564*a11d055eSArd Biesheuvel# define mfhi(rd,rs,rt)	mfhi	rd
565*a11d055eSArd Biesheuvel#endif
566*a11d055eSArd Biesheuvel
567*a11d055eSArd Biesheuvel#ifdef	__KERNEL__
568*a11d055eSArd Biesheuvel# define poly1305_init   poly1305_init_mips
569*a11d055eSArd Biesheuvel# define poly1305_blocks poly1305_blocks_mips
570*a11d055eSArd Biesheuvel# define poly1305_emit   poly1305_emit_mips
571*a11d055eSArd Biesheuvel#endif
572*a11d055eSArd Biesheuvel
573*a11d055eSArd Biesheuvel#if defined(__MIPSEB__) && !defined(MIPSEB)
574*a11d055eSArd Biesheuvel# define MIPSEB
575*a11d055eSArd Biesheuvel#endif
576*a11d055eSArd Biesheuvel
577*a11d055eSArd Biesheuvel#ifdef MIPSEB
578*a11d055eSArd Biesheuvel# define MSB 0
579*a11d055eSArd Biesheuvel# define LSB 3
580*a11d055eSArd Biesheuvel#else
581*a11d055eSArd Biesheuvel# define MSB 3
582*a11d055eSArd Biesheuvel# define LSB 0
583*a11d055eSArd Biesheuvel#endif
584*a11d055eSArd Biesheuvel
585*a11d055eSArd Biesheuvel.text
586*a11d055eSArd Biesheuvel.set	noat
587*a11d055eSArd Biesheuvel.set	noreorder
588*a11d055eSArd Biesheuvel
589*a11d055eSArd Biesheuvel.align	5
590*a11d055eSArd Biesheuvel.globl	poly1305_init
591*a11d055eSArd Biesheuvel.ent	poly1305_init
592*a11d055eSArd Biesheuvelpoly1305_init:
593*a11d055eSArd Biesheuvel	.frame	$sp,0,$ra
594*a11d055eSArd Biesheuvel	.set	reorder
595*a11d055eSArd Biesheuvel
596*a11d055eSArd Biesheuvel	sw	$zero,0($ctx)
597*a11d055eSArd Biesheuvel	sw	$zero,4($ctx)
598*a11d055eSArd Biesheuvel	sw	$zero,8($ctx)
599*a11d055eSArd Biesheuvel	sw	$zero,12($ctx)
600*a11d055eSArd Biesheuvel	sw	$zero,16($ctx)
601*a11d055eSArd Biesheuvel
602*a11d055eSArd Biesheuvel	beqz	$inp,.Lno_key
603*a11d055eSArd Biesheuvel
604*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS32R6)
605*a11d055eSArd Biesheuvel	andi	$tmp0,$inp,3		# $inp % 4
606*a11d055eSArd Biesheuvel	subu	$inp,$inp,$tmp0		# align $inp
607*a11d055eSArd Biesheuvel	sll	$tmp0,$tmp0,3		# byte to bit offset
608*a11d055eSArd Biesheuvel	lw	$in0,0($inp)
609*a11d055eSArd Biesheuvel	lw	$in1,4($inp)
610*a11d055eSArd Biesheuvel	lw	$in2,8($inp)
611*a11d055eSArd Biesheuvel	lw	$in3,12($inp)
612*a11d055eSArd Biesheuvel	beqz	$tmp0,.Laligned_key
613*a11d055eSArd Biesheuvel
614*a11d055eSArd Biesheuvel	lw	$tmp2,16($inp)
615*a11d055eSArd Biesheuvel	subu	$tmp1,$zero,$tmp0
616*a11d055eSArd Biesheuvel# ifdef	MIPSEB
617*a11d055eSArd Biesheuvel	sllv	$in0,$in0,$tmp0
618*a11d055eSArd Biesheuvel	srlv	$tmp3,$in1,$tmp1
619*a11d055eSArd Biesheuvel	sllv	$in1,$in1,$tmp0
620*a11d055eSArd Biesheuvel	or	$in0,$in0,$tmp3
621*a11d055eSArd Biesheuvel	srlv	$tmp3,$in2,$tmp1
622*a11d055eSArd Biesheuvel	sllv	$in2,$in2,$tmp0
623*a11d055eSArd Biesheuvel	or	$in1,$in1,$tmp3
624*a11d055eSArd Biesheuvel	srlv	$tmp3,$in3,$tmp1
625*a11d055eSArd Biesheuvel	sllv	$in3,$in3,$tmp0
626*a11d055eSArd Biesheuvel	or	$in2,$in2,$tmp3
627*a11d055eSArd Biesheuvel	srlv	$tmp2,$tmp2,$tmp1
628*a11d055eSArd Biesheuvel	or	$in3,$in3,$tmp2
629*a11d055eSArd Biesheuvel# else
630*a11d055eSArd Biesheuvel	srlv	$in0,$in0,$tmp0
631*a11d055eSArd Biesheuvel	sllv	$tmp3,$in1,$tmp1
632*a11d055eSArd Biesheuvel	srlv	$in1,$in1,$tmp0
633*a11d055eSArd Biesheuvel	or	$in0,$in0,$tmp3
634*a11d055eSArd Biesheuvel	sllv	$tmp3,$in2,$tmp1
635*a11d055eSArd Biesheuvel	srlv	$in2,$in2,$tmp0
636*a11d055eSArd Biesheuvel	or	$in1,$in1,$tmp3
637*a11d055eSArd Biesheuvel	sllv	$tmp3,$in3,$tmp1
638*a11d055eSArd Biesheuvel	srlv	$in3,$in3,$tmp0
639*a11d055eSArd Biesheuvel	or	$in2,$in2,$tmp3
640*a11d055eSArd Biesheuvel	sllv	$tmp2,$tmp2,$tmp1
641*a11d055eSArd Biesheuvel	or	$in3,$in3,$tmp2
642*a11d055eSArd Biesheuvel# endif
643*a11d055eSArd Biesheuvel.Laligned_key:
644*a11d055eSArd Biesheuvel#else
645*a11d055eSArd Biesheuvel	lwl	$in0,0+MSB($inp)
646*a11d055eSArd Biesheuvel	lwl	$in1,4+MSB($inp)
647*a11d055eSArd Biesheuvel	lwl	$in2,8+MSB($inp)
648*a11d055eSArd Biesheuvel	lwl	$in3,12+MSB($inp)
649*a11d055eSArd Biesheuvel	lwr	$in0,0+LSB($inp)
650*a11d055eSArd Biesheuvel	lwr	$in1,4+LSB($inp)
651*a11d055eSArd Biesheuvel	lwr	$in2,8+LSB($inp)
652*a11d055eSArd Biesheuvel	lwr	$in3,12+LSB($inp)
653*a11d055eSArd Biesheuvel#endif
654*a11d055eSArd Biesheuvel#ifdef	MIPSEB
655*a11d055eSArd Biesheuvel# if defined(_MIPS_ARCH_MIPS32R2)
656*a11d055eSArd Biesheuvel	wsbh	$in0,$in0		# byte swap
657*a11d055eSArd Biesheuvel	wsbh	$in1,$in1
658*a11d055eSArd Biesheuvel	wsbh	$in2,$in2
659*a11d055eSArd Biesheuvel	wsbh	$in3,$in3
660*a11d055eSArd Biesheuvel	rotr	$in0,$in0,16
661*a11d055eSArd Biesheuvel	rotr	$in1,$in1,16
662*a11d055eSArd Biesheuvel	rotr	$in2,$in2,16
663*a11d055eSArd Biesheuvel	rotr	$in3,$in3,16
664*a11d055eSArd Biesheuvel# else
665*a11d055eSArd Biesheuvel	srl	$tmp0,$in0,24		# byte swap
666*a11d055eSArd Biesheuvel	srl	$tmp1,$in0,8
667*a11d055eSArd Biesheuvel	andi	$tmp2,$in0,0xFF00
668*a11d055eSArd Biesheuvel	sll	$in0,$in0,24
669*a11d055eSArd Biesheuvel	andi	$tmp1,0xFF00
670*a11d055eSArd Biesheuvel	sll	$tmp2,$tmp2,8
671*a11d055eSArd Biesheuvel	or	$in0,$tmp0
672*a11d055eSArd Biesheuvel	 srl	$tmp0,$in1,24
673*a11d055eSArd Biesheuvel	or	$tmp1,$tmp2
674*a11d055eSArd Biesheuvel	 srl	$tmp2,$in1,8
675*a11d055eSArd Biesheuvel	or	$in0,$tmp1
676*a11d055eSArd Biesheuvel	 andi	$tmp1,$in1,0xFF00
677*a11d055eSArd Biesheuvel	 sll	$in1,$in1,24
678*a11d055eSArd Biesheuvel	 andi	$tmp2,0xFF00
679*a11d055eSArd Biesheuvel	 sll	$tmp1,$tmp1,8
680*a11d055eSArd Biesheuvel	 or	$in1,$tmp0
681*a11d055eSArd Biesheuvel	srl	$tmp0,$in2,24
682*a11d055eSArd Biesheuvel	 or	$tmp2,$tmp1
683*a11d055eSArd Biesheuvel	srl	$tmp1,$in2,8
684*a11d055eSArd Biesheuvel	 or	$in1,$tmp2
685*a11d055eSArd Biesheuvel	andi	$tmp2,$in2,0xFF00
686*a11d055eSArd Biesheuvel	sll	$in2,$in2,24
687*a11d055eSArd Biesheuvel	andi	$tmp1,0xFF00
688*a11d055eSArd Biesheuvel	sll	$tmp2,$tmp2,8
689*a11d055eSArd Biesheuvel	or	$in2,$tmp0
690*a11d055eSArd Biesheuvel	 srl	$tmp0,$in3,24
691*a11d055eSArd Biesheuvel	or	$tmp1,$tmp2
692*a11d055eSArd Biesheuvel	 srl	$tmp2,$in3,8
693*a11d055eSArd Biesheuvel	or	$in2,$tmp1
694*a11d055eSArd Biesheuvel	 andi	$tmp1,$in3,0xFF00
695*a11d055eSArd Biesheuvel	 sll	$in3,$in3,24
696*a11d055eSArd Biesheuvel	 andi	$tmp2,0xFF00
697*a11d055eSArd Biesheuvel	 sll	$tmp1,$tmp1,8
698*a11d055eSArd Biesheuvel	 or	$in3,$tmp0
699*a11d055eSArd Biesheuvel	 or	$tmp2,$tmp1
700*a11d055eSArd Biesheuvel	 or	$in3,$tmp2
701*a11d055eSArd Biesheuvel# endif
702*a11d055eSArd Biesheuvel#endif
703*a11d055eSArd Biesheuvel	lui	$tmp0,0x0fff
704*a11d055eSArd Biesheuvel	ori	$tmp0,0xffff		# 0x0fffffff
705*a11d055eSArd Biesheuvel	and	$in0,$in0,$tmp0
706*a11d055eSArd Biesheuvel	subu	$tmp0,3			# 0x0ffffffc
707*a11d055eSArd Biesheuvel	and	$in1,$in1,$tmp0
708*a11d055eSArd Biesheuvel	and	$in2,$in2,$tmp0
709*a11d055eSArd Biesheuvel	and	$in3,$in3,$tmp0
710*a11d055eSArd Biesheuvel
711*a11d055eSArd Biesheuvel	sw	$in0,20($ctx)
712*a11d055eSArd Biesheuvel	sw	$in1,24($ctx)
713*a11d055eSArd Biesheuvel	sw	$in2,28($ctx)
714*a11d055eSArd Biesheuvel	sw	$in3,32($ctx)
715*a11d055eSArd Biesheuvel
716*a11d055eSArd Biesheuvel	srl	$tmp1,$in1,2
717*a11d055eSArd Biesheuvel	srl	$tmp2,$in2,2
718*a11d055eSArd Biesheuvel	srl	$tmp3,$in3,2
719*a11d055eSArd Biesheuvel	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
720*a11d055eSArd Biesheuvel	addu	$in2,$in2,$tmp2
721*a11d055eSArd Biesheuvel	addu	$in3,$in3,$tmp3
722*a11d055eSArd Biesheuvel	sw	$in1,36($ctx)
723*a11d055eSArd Biesheuvel	sw	$in2,40($ctx)
724*a11d055eSArd Biesheuvel	sw	$in3,44($ctx)
725*a11d055eSArd Biesheuvel.Lno_key:
726*a11d055eSArd Biesheuvel	li	$v0,0
727*a11d055eSArd Biesheuvel	jr	$ra
728*a11d055eSArd Biesheuvel.end	poly1305_init
729*a11d055eSArd Biesheuvel___
730*a11d055eSArd Biesheuvel{
# poly1305_blocks: emits the 32-bit block-processing routine.  The generated
# code loads the five hash words and the key words from $ctx, consumes the
# input 16 bytes per iteration (accumulate the block, then multiply by the
# key) and stores the hash back.  Reduction of the top hash word is deferred
# to the start of the following iteration.
# $ctx, $inp, $len, $padbit and the $s*/$a*/$t* register names are declared
# earlier in the file, outside this block.

# Register bitmap handed to the .mask directive.  The nubi value covers four
# more bits, matching the extra $s0-$s3 saves emitted for that flavour below.
731*a11d055eSArd Biesheuvelmy $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
732*a11d055eSArd Biesheuvel
# h0-h4: hash accumulator (ctx offsets 0-16); r0-r3: key words (20-32);
# rs1-rs3: the words at ctx offsets 36-44, which poly1305_init stores as
# r + (r >> 2).
733*a11d055eSArd Biesheuvelmy ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
734*a11d055eSArd Biesheuvel   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
# d0-d3: the four 32-bit words of the current input block.
735*a11d055eSArd Biesheuvelmy ($d0,$d1,$d2,$d3) =
736*a11d055eSArd Biesheuvel   ($a4,$a5,$a6,$a7);
# $shr and $one deliberately share $t2: the bit-shift amount is only needed
# on the R6 load path and the constant 1 only on the R2 multiply path.
737*a11d055eSArd Biesheuvelmy $shr = $t2;		# used on R6
738*a11d055eSArd Biesheuvelmy $one = $t2;		# used on R2
739*a11d055eSArd Biesheuvel
# Prologue: reserve 4*12 bytes of stack and save $s4-$s11.
# NOTE(review): .frame advertises a 16*4-byte frame while $sp is adjusted by
# only 4*12 here and in the epilogue -- confirm which size is intended.
740*a11d055eSArd Biesheuvel$code.=<<___;
741*a11d055eSArd Biesheuvel.globl	poly1305_blocks
742*a11d055eSArd Biesheuvel.align	5
743*a11d055eSArd Biesheuvel.ent	poly1305_blocks
744*a11d055eSArd Biesheuvelpoly1305_blocks:
745*a11d055eSArd Biesheuvel	.frame	$sp,16*4,$ra
746*a11d055eSArd Biesheuvel	.mask	$SAVED_REGS_MASK,-4
747*a11d055eSArd Biesheuvel	.set	noreorder
748*a11d055eSArd Biesheuvel	subu	$sp, $sp,4*12
749*a11d055eSArd Biesheuvel	sw	$s11,4*11($sp)
750*a11d055eSArd Biesheuvel	sw	$s10,4*10($sp)
751*a11d055eSArd Biesheuvel	sw	$s9, 4*9($sp)
752*a11d055eSArd Biesheuvel	sw	$s8, 4*8($sp)
753*a11d055eSArd Biesheuvel	sw	$s7, 4*7($sp)
754*a11d055eSArd Biesheuvel	sw	$s6, 4*6($sp)
755*a11d055eSArd Biesheuvel	sw	$s5, 4*5($sp)
756*a11d055eSArd Biesheuvel	sw	$s4, 4*4($sp)
757*a11d055eSArd Biesheuvel___
# Only the nubi flavour additionally saves $s0-$s3.
758*a11d055eSArd Biesheuvel$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
759*a11d055eSArd Biesheuvel	sw	$s3, 4*3($sp)
760*a11d055eSArd Biesheuvel	sw	$s2, 4*2($sp)
761*a11d055eSArd Biesheuvel	sw	$s1, 4*1($sp)
762*a11d055eSArd Biesheuvel	sw	$s0, 4*0($sp)
763*a11d055eSArd Biesheuvel___
# Main body.  $len is first reduced to a count of complete 16-byte blocks
# (branching to .Labort when that is zero) and then rebuilt as the
# end-of-input address that terminates .Loop.  Input words are fetched with
# aligned lw plus a shift/merge sequence on MIPS32R6 and with lwl/lwr
# otherwise, then byte-swapped on big-endian (MIPSEB) builds.  The multiply
# uses multu/maddu accumulation on MIPS32R2 (non-R6); every other target
# takes the multu/mflo/mfhi sequence with explicit carry propagation (the
# parenthesised operand forms are presumably macros defined earlier in the
# file -- TODO confirm).
# NOTE(review): the non-R2 multiply path uses $a3 as scratch and ends each
# iteration with "li $padbit,1"; this presumably relies on $padbit being $a3
# and on padbit=0 calls covering a single block only -- confirm against the
# register assignments earlier in the file and against the callers.
764*a11d055eSArd Biesheuvel$code.=<<___;
765*a11d055eSArd Biesheuvel	.set	reorder
766*a11d055eSArd Biesheuvel
767*a11d055eSArd Biesheuvel	srl	$len,4			# number of complete blocks
768*a11d055eSArd Biesheuvel	li	$one,1
769*a11d055eSArd Biesheuvel	beqz	$len,.Labort
770*a11d055eSArd Biesheuvel
771*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS32R6)
772*a11d055eSArd Biesheuvel	andi	$shr,$inp,3
773*a11d055eSArd Biesheuvel	subu	$inp,$inp,$shr		# align $inp
774*a11d055eSArd Biesheuvel	sll	$shr,$shr,3		# byte to bit offset
775*a11d055eSArd Biesheuvel#endif
776*a11d055eSArd Biesheuvel
777*a11d055eSArd Biesheuvel	lw	$h0,0($ctx)		# load hash value
778*a11d055eSArd Biesheuvel	lw	$h1,4($ctx)
779*a11d055eSArd Biesheuvel	lw	$h2,8($ctx)
780*a11d055eSArd Biesheuvel	lw	$h3,12($ctx)
781*a11d055eSArd Biesheuvel	lw	$h4,16($ctx)
782*a11d055eSArd Biesheuvel
783*a11d055eSArd Biesheuvel	lw	$r0,20($ctx)		# load key
784*a11d055eSArd Biesheuvel	lw	$r1,24($ctx)
785*a11d055eSArd Biesheuvel	lw	$r2,28($ctx)
786*a11d055eSArd Biesheuvel	lw	$r3,32($ctx)
787*a11d055eSArd Biesheuvel	lw	$rs1,36($ctx)
788*a11d055eSArd Biesheuvel	lw	$rs2,40($ctx)
789*a11d055eSArd Biesheuvel	lw	$rs3,44($ctx)
790*a11d055eSArd Biesheuvel
791*a11d055eSArd Biesheuvel	sll	$len,4
792*a11d055eSArd Biesheuvel	addu	$len,$len,$inp		# end of buffer
793*a11d055eSArd Biesheuvel	b	.Loop
794*a11d055eSArd Biesheuvel
795*a11d055eSArd Biesheuvel.align	4
796*a11d055eSArd Biesheuvel.Loop:
797*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS32R6)
798*a11d055eSArd Biesheuvel	lw	$d0,0($inp)		# load input
799*a11d055eSArd Biesheuvel	lw	$d1,4($inp)
800*a11d055eSArd Biesheuvel	lw	$d2,8($inp)
801*a11d055eSArd Biesheuvel	lw	$d3,12($inp)
802*a11d055eSArd Biesheuvel	beqz	$shr,.Laligned_inp
803*a11d055eSArd Biesheuvel
804*a11d055eSArd Biesheuvel	lw	$t0,16($inp)
805*a11d055eSArd Biesheuvel	subu	$t1,$zero,$shr
806*a11d055eSArd Biesheuvel# ifdef	MIPSEB
807*a11d055eSArd Biesheuvel	sllv	$d0,$d0,$shr
808*a11d055eSArd Biesheuvel	srlv	$at,$d1,$t1
809*a11d055eSArd Biesheuvel	sllv	$d1,$d1,$shr
810*a11d055eSArd Biesheuvel	or	$d0,$d0,$at
811*a11d055eSArd Biesheuvel	srlv	$at,$d2,$t1
812*a11d055eSArd Biesheuvel	sllv	$d2,$d2,$shr
813*a11d055eSArd Biesheuvel	or	$d1,$d1,$at
814*a11d055eSArd Biesheuvel	srlv	$at,$d3,$t1
815*a11d055eSArd Biesheuvel	sllv	$d3,$d3,$shr
816*a11d055eSArd Biesheuvel	or	$d2,$d2,$at
817*a11d055eSArd Biesheuvel	srlv	$t0,$t0,$t1
818*a11d055eSArd Biesheuvel	or	$d3,$d3,$t0
819*a11d055eSArd Biesheuvel# else
820*a11d055eSArd Biesheuvel	srlv	$d0,$d0,$shr
821*a11d055eSArd Biesheuvel	sllv	$at,$d1,$t1
822*a11d055eSArd Biesheuvel	srlv	$d1,$d1,$shr
823*a11d055eSArd Biesheuvel	or	$d0,$d0,$at
824*a11d055eSArd Biesheuvel	sllv	$at,$d2,$t1
825*a11d055eSArd Biesheuvel	srlv	$d2,$d2,$shr
826*a11d055eSArd Biesheuvel	or	$d1,$d1,$at
827*a11d055eSArd Biesheuvel	sllv	$at,$d3,$t1
828*a11d055eSArd Biesheuvel	srlv	$d3,$d3,$shr
829*a11d055eSArd Biesheuvel	or	$d2,$d2,$at
830*a11d055eSArd Biesheuvel	sllv	$t0,$t0,$t1
831*a11d055eSArd Biesheuvel	or	$d3,$d3,$t0
832*a11d055eSArd Biesheuvel# endif
833*a11d055eSArd Biesheuvel.Laligned_inp:
834*a11d055eSArd Biesheuvel#else
835*a11d055eSArd Biesheuvel	lwl	$d0,0+MSB($inp)		# load input
836*a11d055eSArd Biesheuvel	lwl	$d1,4+MSB($inp)
837*a11d055eSArd Biesheuvel	lwl	$d2,8+MSB($inp)
838*a11d055eSArd Biesheuvel	lwl	$d3,12+MSB($inp)
839*a11d055eSArd Biesheuvel	lwr	$d0,0+LSB($inp)
840*a11d055eSArd Biesheuvel	lwr	$d1,4+LSB($inp)
841*a11d055eSArd Biesheuvel	lwr	$d2,8+LSB($inp)
842*a11d055eSArd Biesheuvel	lwr	$d3,12+LSB($inp)
843*a11d055eSArd Biesheuvel#endif
844*a11d055eSArd Biesheuvel#ifdef	MIPSEB
845*a11d055eSArd Biesheuvel# if defined(_MIPS_ARCH_MIPS32R2)
846*a11d055eSArd Biesheuvel	wsbh	$d0,$d0			# byte swap
847*a11d055eSArd Biesheuvel	wsbh	$d1,$d1
848*a11d055eSArd Biesheuvel	wsbh	$d2,$d2
849*a11d055eSArd Biesheuvel	wsbh	$d3,$d3
850*a11d055eSArd Biesheuvel	rotr	$d0,$d0,16
851*a11d055eSArd Biesheuvel	rotr	$d1,$d1,16
852*a11d055eSArd Biesheuvel	rotr	$d2,$d2,16
853*a11d055eSArd Biesheuvel	rotr	$d3,$d3,16
854*a11d055eSArd Biesheuvel# else
855*a11d055eSArd Biesheuvel	srl	$at,$d0,24		# byte swap
856*a11d055eSArd Biesheuvel	srl	$t0,$d0,8
857*a11d055eSArd Biesheuvel	andi	$t1,$d0,0xFF00
858*a11d055eSArd Biesheuvel	sll	$d0,$d0,24
859*a11d055eSArd Biesheuvel	andi	$t0,0xFF00
860*a11d055eSArd Biesheuvel	sll	$t1,$t1,8
861*a11d055eSArd Biesheuvel	or	$d0,$at
862*a11d055eSArd Biesheuvel	 srl	$at,$d1,24
863*a11d055eSArd Biesheuvel	or	$t0,$t1
864*a11d055eSArd Biesheuvel	 srl	$t1,$d1,8
865*a11d055eSArd Biesheuvel	or	$d0,$t0
866*a11d055eSArd Biesheuvel	 andi	$t0,$d1,0xFF00
867*a11d055eSArd Biesheuvel	 sll	$d1,$d1,24
868*a11d055eSArd Biesheuvel	 andi	$t1,0xFF00
869*a11d055eSArd Biesheuvel	 sll	$t0,$t0,8
870*a11d055eSArd Biesheuvel	 or	$d1,$at
871*a11d055eSArd Biesheuvel	srl	$at,$d2,24
872*a11d055eSArd Biesheuvel	 or	$t1,$t0
873*a11d055eSArd Biesheuvel	srl	$t0,$d2,8
874*a11d055eSArd Biesheuvel	 or	$d1,$t1
875*a11d055eSArd Biesheuvel	andi	$t1,$d2,0xFF00
876*a11d055eSArd Biesheuvel	sll	$d2,$d2,24
877*a11d055eSArd Biesheuvel	andi	$t0,0xFF00
878*a11d055eSArd Biesheuvel	sll	$t1,$t1,8
879*a11d055eSArd Biesheuvel	or	$d2,$at
880*a11d055eSArd Biesheuvel	 srl	$at,$d3,24
881*a11d055eSArd Biesheuvel	or	$t0,$t1
882*a11d055eSArd Biesheuvel	 srl	$t1,$d3,8
883*a11d055eSArd Biesheuvel	or	$d2,$t0
884*a11d055eSArd Biesheuvel	 andi	$t0,$d3,0xFF00
885*a11d055eSArd Biesheuvel	 sll	$d3,$d3,24
886*a11d055eSArd Biesheuvel	 andi	$t1,0xFF00
887*a11d055eSArd Biesheuvel	 sll	$t0,$t0,8
888*a11d055eSArd Biesheuvel	 or	$d3,$at
889*a11d055eSArd Biesheuvel	 or	$t1,$t0
890*a11d055eSArd Biesheuvel	 or	$d3,$t1
891*a11d055eSArd Biesheuvel# endif
892*a11d055eSArd Biesheuvel#endif
893*a11d055eSArd Biesheuvel	srl	$t0,$h4,2		# modulo-scheduled reduction
894*a11d055eSArd Biesheuvel	andi	$h4,$h4,3
895*a11d055eSArd Biesheuvel	sll	$at,$t0,2
896*a11d055eSArd Biesheuvel
897*a11d055eSArd Biesheuvel	addu	$d0,$d0,$h0		# accumulate input
898*a11d055eSArd Biesheuvel	 addu	$t0,$t0,$at
899*a11d055eSArd Biesheuvel	sltu	$h0,$d0,$h0
900*a11d055eSArd Biesheuvel	addu	$d0,$d0,$t0		# ... and residue
901*a11d055eSArd Biesheuvel	sltu	$at,$d0,$t0
902*a11d055eSArd Biesheuvel
903*a11d055eSArd Biesheuvel	addu	$d1,$d1,$h1
904*a11d055eSArd Biesheuvel	 addu	$h0,$h0,$at		# carry
905*a11d055eSArd Biesheuvel	sltu	$h1,$d1,$h1
906*a11d055eSArd Biesheuvel	addu	$d1,$d1,$h0
907*a11d055eSArd Biesheuvel	sltu	$h0,$d1,$h0
908*a11d055eSArd Biesheuvel
909*a11d055eSArd Biesheuvel	addu	$d2,$d2,$h2
910*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$h0		# carry
911*a11d055eSArd Biesheuvel	sltu	$h2,$d2,$h2
912*a11d055eSArd Biesheuvel	addu	$d2,$d2,$h1
913*a11d055eSArd Biesheuvel	sltu	$h1,$d2,$h1
914*a11d055eSArd Biesheuvel
915*a11d055eSArd Biesheuvel	addu	$d3,$d3,$h3
916*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$h1		# carry
917*a11d055eSArd Biesheuvel	sltu	$h3,$d3,$h3
918*a11d055eSArd Biesheuvel	addu	$d3,$d3,$h2
919*a11d055eSArd Biesheuvel
920*a11d055eSArd Biesheuvel#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
921*a11d055eSArd Biesheuvel	multu	$r0,$d0			# d0*r0
922*a11d055eSArd Biesheuvel	 sltu	$h2,$d3,$h2
923*a11d055eSArd Biesheuvel	maddu	$rs3,$d1		# d1*s3
924*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$h2		# carry
925*a11d055eSArd Biesheuvel	maddu	$rs2,$d2		# d2*s2
926*a11d055eSArd Biesheuvel	 addu	$h4,$h4,$padbit
927*a11d055eSArd Biesheuvel	maddu	$rs1,$d3		# d3*s1
928*a11d055eSArd Biesheuvel	 addu	$h4,$h4,$h3
929*a11d055eSArd Biesheuvel	mfhi	$at
930*a11d055eSArd Biesheuvel	mflo	$h0
931*a11d055eSArd Biesheuvel
932*a11d055eSArd Biesheuvel	multu	$r1,$d0			# d0*r1
933*a11d055eSArd Biesheuvel	maddu	$r0,$d1			# d1*r0
934*a11d055eSArd Biesheuvel	maddu	$rs3,$d2		# d2*s3
935*a11d055eSArd Biesheuvel	maddu	$rs2,$d3		# d3*s2
936*a11d055eSArd Biesheuvel	maddu	$rs1,$h4		# h4*s1
937*a11d055eSArd Biesheuvel	maddu	$at,$one		# hi*1
938*a11d055eSArd Biesheuvel	mfhi	$at
939*a11d055eSArd Biesheuvel	mflo	$h1
940*a11d055eSArd Biesheuvel
941*a11d055eSArd Biesheuvel	multu	$r2,$d0			# d0*r2
942*a11d055eSArd Biesheuvel	maddu	$r1,$d1			# d1*r1
943*a11d055eSArd Biesheuvel	maddu	$r0,$d2			# d2*r0
944*a11d055eSArd Biesheuvel	maddu	$rs3,$d3		# d3*s3
945*a11d055eSArd Biesheuvel	maddu	$rs2,$h4		# h4*s2
946*a11d055eSArd Biesheuvel	maddu	$at,$one		# hi*1
947*a11d055eSArd Biesheuvel	mfhi	$at
948*a11d055eSArd Biesheuvel	mflo	$h2
949*a11d055eSArd Biesheuvel
950*a11d055eSArd Biesheuvel	mul	$t0,$r0,$h4		# h4*r0
951*a11d055eSArd Biesheuvel
952*a11d055eSArd Biesheuvel	multu	$r3,$d0			# d0*r3
953*a11d055eSArd Biesheuvel	maddu	$r2,$d1			# d1*r2
954*a11d055eSArd Biesheuvel	maddu	$r1,$d2			# d2*r1
955*a11d055eSArd Biesheuvel	maddu	$r0,$d3			# d3*r0
956*a11d055eSArd Biesheuvel	maddu	$rs3,$h4		# h4*s3
957*a11d055eSArd Biesheuvel	maddu	$at,$one		# hi*1
958*a11d055eSArd Biesheuvel	mfhi	$at
959*a11d055eSArd Biesheuvel	mflo	$h3
960*a11d055eSArd Biesheuvel
961*a11d055eSArd Biesheuvel	 addiu	$inp,$inp,16
962*a11d055eSArd Biesheuvel
963*a11d055eSArd Biesheuvel	addu	$h4,$t0,$at
964*a11d055eSArd Biesheuvel#else
965*a11d055eSArd Biesheuvel	multu	($r0,$d0)		# d0*r0
966*a11d055eSArd Biesheuvel	mflo	($h0,$r0,$d0)
967*a11d055eSArd Biesheuvel	mfhi	($h1,$r0,$d0)
968*a11d055eSArd Biesheuvel
969*a11d055eSArd Biesheuvel	 sltu	$h2,$d3,$h2
970*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$h2		# carry
971*a11d055eSArd Biesheuvel
972*a11d055eSArd Biesheuvel	multu	($rs3,$d1)		# d1*s3
973*a11d055eSArd Biesheuvel	mflo	($at,$rs3,$d1)
974*a11d055eSArd Biesheuvel	mfhi	($t0,$rs3,$d1)
975*a11d055eSArd Biesheuvel
976*a11d055eSArd Biesheuvel	 addu	$h4,$h4,$padbit
977*a11d055eSArd Biesheuvel	 addiu	$inp,$inp,16
978*a11d055eSArd Biesheuvel	 addu	$h4,$h4,$h3
979*a11d055eSArd Biesheuvel
980*a11d055eSArd Biesheuvel	multu	($rs2,$d2)		# d2*s2
981*a11d055eSArd Biesheuvel	mflo	($a3,$rs2,$d2)
982*a11d055eSArd Biesheuvel	mfhi	($t1,$rs2,$d2)
983*a11d055eSArd Biesheuvel	 addu	$h0,$h0,$at
984*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$t0
985*a11d055eSArd Biesheuvel	multu	($rs1,$d3)		# d3*s1
986*a11d055eSArd Biesheuvel	 sltu	$at,$h0,$at
987*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$at
988*a11d055eSArd Biesheuvel
989*a11d055eSArd Biesheuvel	mflo	($at,$rs1,$d3)
990*a11d055eSArd Biesheuvel	mfhi	($t0,$rs1,$d3)
991*a11d055eSArd Biesheuvel	 addu	$h0,$h0,$a3
992*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$t1
993*a11d055eSArd Biesheuvel	multu	($r1,$d0)		# d0*r1
994*a11d055eSArd Biesheuvel	 sltu	$a3,$h0,$a3
995*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$a3
996*a11d055eSArd Biesheuvel
997*a11d055eSArd Biesheuvel
998*a11d055eSArd Biesheuvel	mflo	($a3,$r1,$d0)
999*a11d055eSArd Biesheuvel	mfhi	($h2,$r1,$d0)
1000*a11d055eSArd Biesheuvel	 addu	$h0,$h0,$at
1001*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$t0
1002*a11d055eSArd Biesheuvel	multu	($r0,$d1)		# d1*r0
1003*a11d055eSArd Biesheuvel	 sltu	$at,$h0,$at
1004*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$at
1005*a11d055eSArd Biesheuvel
1006*a11d055eSArd Biesheuvel	mflo	($at,$r0,$d1)
1007*a11d055eSArd Biesheuvel	mfhi	($t0,$r0,$d1)
1008*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$a3
1009*a11d055eSArd Biesheuvel	 sltu	$a3,$h1,$a3
1010*a11d055eSArd Biesheuvel	multu	($rs3,$d2)		# d2*s3
1011*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$a3
1012*a11d055eSArd Biesheuvel
1013*a11d055eSArd Biesheuvel	mflo	($a3,$rs3,$d2)
1014*a11d055eSArd Biesheuvel	mfhi	($t1,$rs3,$d2)
1015*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$at
1016*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$t0
1017*a11d055eSArd Biesheuvel	multu	($rs2,$d3)		# d3*s2
1018*a11d055eSArd Biesheuvel	 sltu	$at,$h1,$at
1019*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$at
1020*a11d055eSArd Biesheuvel
1021*a11d055eSArd Biesheuvel	mflo	($at,$rs2,$d3)
1022*a11d055eSArd Biesheuvel	mfhi	($t0,$rs2,$d3)
1023*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$a3
1024*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$t1
1025*a11d055eSArd Biesheuvel	multu	($rs1,$h4)		# h4*s1
1026*a11d055eSArd Biesheuvel	 sltu	$a3,$h1,$a3
1027*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$a3
1028*a11d055eSArd Biesheuvel
1029*a11d055eSArd Biesheuvel	mflo	($a3,$rs1,$h4)
1030*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$at
1031*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$t0
1032*a11d055eSArd Biesheuvel	multu	($r2,$d0)		# d0*r2
1033*a11d055eSArd Biesheuvel	 sltu	$at,$h1,$at
1034*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$at
1035*a11d055eSArd Biesheuvel
1036*a11d055eSArd Biesheuvel
1037*a11d055eSArd Biesheuvel	mflo	($at,$r2,$d0)
1038*a11d055eSArd Biesheuvel	mfhi	($h3,$r2,$d0)
1039*a11d055eSArd Biesheuvel	 addu	$h1,$h1,$a3
1040*a11d055eSArd Biesheuvel	 sltu	$a3,$h1,$a3
1041*a11d055eSArd Biesheuvel	multu	($r1,$d1)		# d1*r1
1042*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$a3
1043*a11d055eSArd Biesheuvel
1044*a11d055eSArd Biesheuvel	mflo	($a3,$r1,$d1)
1045*a11d055eSArd Biesheuvel	mfhi	($t1,$r1,$d1)
1046*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$at
1047*a11d055eSArd Biesheuvel	 sltu	$at,$h2,$at
1048*a11d055eSArd Biesheuvel	multu	($r0,$d2)		# d2*r0
1049*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$at
1050*a11d055eSArd Biesheuvel
1051*a11d055eSArd Biesheuvel	mflo	($at,$r0,$d2)
1052*a11d055eSArd Biesheuvel	mfhi	($t0,$r0,$d2)
1053*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$a3
1054*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$t1
1055*a11d055eSArd Biesheuvel	multu	($rs3,$d3)		# d3*s3
1056*a11d055eSArd Biesheuvel	 sltu	$a3,$h2,$a3
1057*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$a3
1058*a11d055eSArd Biesheuvel
1059*a11d055eSArd Biesheuvel	mflo	($a3,$rs3,$d3)
1060*a11d055eSArd Biesheuvel	mfhi	($t1,$rs3,$d3)
1061*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$at
1062*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$t0
1063*a11d055eSArd Biesheuvel	multu	($rs2,$h4)		# h4*s2
1064*a11d055eSArd Biesheuvel	 sltu	$at,$h2,$at
1065*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$at
1066*a11d055eSArd Biesheuvel
1067*a11d055eSArd Biesheuvel	mflo	($at,$rs2,$h4)
1068*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$a3
1069*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$t1
1070*a11d055eSArd Biesheuvel	multu	($r3,$d0)		# d0*r3
1071*a11d055eSArd Biesheuvel	 sltu	$a3,$h2,$a3
1072*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$a3
1073*a11d055eSArd Biesheuvel
1074*a11d055eSArd Biesheuvel
1075*a11d055eSArd Biesheuvel	mflo	($a3,$r3,$d0)
1076*a11d055eSArd Biesheuvel	mfhi	($t1,$r3,$d0)
1077*a11d055eSArd Biesheuvel	 addu	$h2,$h2,$at
1078*a11d055eSArd Biesheuvel	 sltu	$at,$h2,$at
1079*a11d055eSArd Biesheuvel	multu	($r2,$d1)		# d1*r2
1080*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$at
1081*a11d055eSArd Biesheuvel
1082*a11d055eSArd Biesheuvel	mflo	($at,$r2,$d1)
1083*a11d055eSArd Biesheuvel	mfhi	($t0,$r2,$d1)
1084*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$a3
1085*a11d055eSArd Biesheuvel	 sltu	$a3,$h3,$a3
1086*a11d055eSArd Biesheuvel	multu	($r0,$d3)		# d3*r0
1087*a11d055eSArd Biesheuvel	 addu	$t1,$t1,$a3
1088*a11d055eSArd Biesheuvel
1089*a11d055eSArd Biesheuvel	mflo	($a3,$r0,$d3)
1090*a11d055eSArd Biesheuvel	mfhi	($d3,$r0,$d3)
1091*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$at
1092*a11d055eSArd Biesheuvel	 addu	$t1,$t1,$t0
1093*a11d055eSArd Biesheuvel	multu	($r1,$d2)		# d2*r1
1094*a11d055eSArd Biesheuvel	 sltu	$at,$h3,$at
1095*a11d055eSArd Biesheuvel	 addu	$t1,$t1,$at
1096*a11d055eSArd Biesheuvel
1097*a11d055eSArd Biesheuvel	mflo	($at,$r1,$d2)
1098*a11d055eSArd Biesheuvel	mfhi	($t0,$r1,$d2)
1099*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$a3
1100*a11d055eSArd Biesheuvel	 addu	$t1,$t1,$d3
1101*a11d055eSArd Biesheuvel	multu	($rs3,$h4)		# h4*s3
1102*a11d055eSArd Biesheuvel	 sltu	$a3,$h3,$a3
1103*a11d055eSArd Biesheuvel	 addu	$t1,$t1,$a3
1104*a11d055eSArd Biesheuvel
1105*a11d055eSArd Biesheuvel	mflo	($a3,$rs3,$h4)
1106*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$at
1107*a11d055eSArd Biesheuvel	 addu	$t1,$t1,$t0
1108*a11d055eSArd Biesheuvel	multu	($r0,$h4)		# h4*r0
1109*a11d055eSArd Biesheuvel	 sltu	$at,$h3,$at
1110*a11d055eSArd Biesheuvel	 addu	$t1,$t1,$at
1111*a11d055eSArd Biesheuvel
1112*a11d055eSArd Biesheuvel
1113*a11d055eSArd Biesheuvel	mflo	($h4,$r0,$h4)
1114*a11d055eSArd Biesheuvel	 addu	$h3,$h3,$a3
1115*a11d055eSArd Biesheuvel	 sltu	$a3,$h3,$a3
1116*a11d055eSArd Biesheuvel	 addu	$t1,$t1,$a3
1117*a11d055eSArd Biesheuvel	addu	$h4,$h4,$t1
1118*a11d055eSArd Biesheuvel
1119*a11d055eSArd Biesheuvel	li	$padbit,1		# if we loop, padbit is 1
1120*a11d055eSArd Biesheuvel#endif
1121*a11d055eSArd Biesheuvel	bne	$inp,$len,.Loop
1122*a11d055eSArd Biesheuvel
1123*a11d055eSArd Biesheuvel	sw	$h0,0($ctx)		# store hash value
1124*a11d055eSArd Biesheuvel	sw	$h1,4($ctx)
1125*a11d055eSArd Biesheuvel	sw	$h2,8($ctx)
1126*a11d055eSArd Biesheuvel	sw	$h3,12($ctx)
1127*a11d055eSArd Biesheuvel	sw	$h4,16($ctx)
1128*a11d055eSArd Biesheuvel
1129*a11d055eSArd Biesheuvel	.set	noreorder
1130*a11d055eSArd Biesheuvel.Labort:
1131*a11d055eSArd Biesheuvel	lw	$s11,4*11($sp)
1132*a11d055eSArd Biesheuvel	lw	$s10,4*10($sp)
1133*a11d055eSArd Biesheuvel	lw	$s9, 4*9($sp)
1134*a11d055eSArd Biesheuvel	lw	$s8, 4*8($sp)
1135*a11d055eSArd Biesheuvel	lw	$s7, 4*7($sp)
1136*a11d055eSArd Biesheuvel	lw	$s6, 4*6($sp)
1137*a11d055eSArd Biesheuvel	lw	$s5, 4*5($sp)
1138*a11d055eSArd Biesheuvel	lw	$s4, 4*4($sp)
1139*a11d055eSArd Biesheuvel___
# Epilogue mirror of the conditional prologue: only the nubi flavour
# restores $s0-$s3.
1140*a11d055eSArd Biesheuvel$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
1141*a11d055eSArd Biesheuvel	lw	$s3, 4*3($sp)
1142*a11d055eSArd Biesheuvel	lw	$s2, 4*2($sp)
1143*a11d055eSArd Biesheuvel	lw	$s1, 4*1($sp)
1144*a11d055eSArd Biesheuvel	lw	$s0, 4*0($sp)
1145*a11d055eSArd Biesheuvel___
# Return.  .set noreorder is still in effect, so the addu that releases the
# 4*12-byte frame is placed directly after the jr, in its delay slot.
1146*a11d055eSArd Biesheuvel$code.=<<___;
1147*a11d055eSArd Biesheuvel	jr	$ra
1148*a11d055eSArd Biesheuvel	addu	$sp,$sp,4*12
1149*a11d055eSArd Biesheuvel.end	poly1305_blocks
1150*a11d055eSArd Biesheuvel___
1151*a11d055eSArd Biesheuvel}
1152*a11d055eSArd Biesheuvel{
# poly1305_emit: emits the routine that finalises the MAC.  The generated
# code
#   - loads the five hash words from $ctx;
#   - folds the bits of the top word above bit 1 back into the low words,
#     multiplied by 5 ((h4 >> 2) + (h4 & ~3));
#   - computes h+5 alongside h and uses a branch-free xor/and/xor select to
#     keep h+5 only when that sum reached bit 2 of the top word;
#   - adds the four nonce words with carry propagation (the final carry is
#     dropped);
#   - writes the 16-byte result to $mac with byte stores, least significant
#     byte of each word first.
# Arguments arrive in $a0-$a2; $a3 holds the top hash word, and $ctx itself is
# reused as a scratch register once the hash has been loaded.  $in0-$in3 and
# $tmp0-$tmp3 are inherited from the enclosing scope (declared outside this
# block).
1153*a11d055eSArd Biesheuvelmy ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
1154*a11d055eSArd Biesheuvel
1155*a11d055eSArd Biesheuvel$code.=<<___;
1156*a11d055eSArd Biesheuvel.align	5
1157*a11d055eSArd Biesheuvel.globl	poly1305_emit
1158*a11d055eSArd Biesheuvel.ent	poly1305_emit
1159*a11d055eSArd Biesheuvelpoly1305_emit:
1160*a11d055eSArd Biesheuvel	.frame	$sp,0,$ra
1161*a11d055eSArd Biesheuvel	.set	reorder
1162*a11d055eSArd Biesheuvel
1163*a11d055eSArd Biesheuvel	lw	$tmp4,16($ctx)
1164*a11d055eSArd Biesheuvel	lw	$tmp0,0($ctx)
1165*a11d055eSArd Biesheuvel	lw	$tmp1,4($ctx)
1166*a11d055eSArd Biesheuvel	lw	$tmp2,8($ctx)
1167*a11d055eSArd Biesheuvel	lw	$tmp3,12($ctx)
1168*a11d055eSArd Biesheuvel
1169*a11d055eSArd Biesheuvel	li	$in0,-4			# final reduction
1170*a11d055eSArd Biesheuvel	srl	$ctx,$tmp4,2
1171*a11d055eSArd Biesheuvel	and	$in0,$in0,$tmp4
1172*a11d055eSArd Biesheuvel	andi	$tmp4,$tmp4,3
1173*a11d055eSArd Biesheuvel	addu	$ctx,$ctx,$in0
1174*a11d055eSArd Biesheuvel
1175*a11d055eSArd Biesheuvel	addu	$tmp0,$tmp0,$ctx
1176*a11d055eSArd Biesheuvel	sltu	$ctx,$tmp0,$ctx
1177*a11d055eSArd Biesheuvel	 addiu	$in0,$tmp0,5		# compare to modulus
1178*a11d055eSArd Biesheuvel	addu	$tmp1,$tmp1,$ctx
1179*a11d055eSArd Biesheuvel	 sltiu	$in1,$in0,5
1180*a11d055eSArd Biesheuvel	sltu	$ctx,$tmp1,$ctx
1181*a11d055eSArd Biesheuvel	 addu	$in1,$in1,$tmp1
1182*a11d055eSArd Biesheuvel	addu	$tmp2,$tmp2,$ctx
1183*a11d055eSArd Biesheuvel	 sltu	$in2,$in1,$tmp1
1184*a11d055eSArd Biesheuvel	sltu	$ctx,$tmp2,$ctx
1185*a11d055eSArd Biesheuvel	 addu	$in2,$in2,$tmp2
1186*a11d055eSArd Biesheuvel	addu	$tmp3,$tmp3,$ctx
1187*a11d055eSArd Biesheuvel	 sltu	$in3,$in2,$tmp2
1188*a11d055eSArd Biesheuvel	sltu	$ctx,$tmp3,$ctx
1189*a11d055eSArd Biesheuvel	 addu	$in3,$in3,$tmp3
1190*a11d055eSArd Biesheuvel	addu	$tmp4,$tmp4,$ctx
1191*a11d055eSArd Biesheuvel	 sltu	$ctx,$in3,$tmp3
1192*a11d055eSArd Biesheuvel	 addu	$ctx,$tmp4
1193*a11d055eSArd Biesheuvel
1194*a11d055eSArd Biesheuvel	srl	$ctx,2			# see if it carried/borrowed
1195*a11d055eSArd Biesheuvel	subu	$ctx,$zero,$ctx
1196*a11d055eSArd Biesheuvel
1197*a11d055eSArd Biesheuvel	xor	$in0,$tmp0
1198*a11d055eSArd Biesheuvel	xor	$in1,$tmp1
1199*a11d055eSArd Biesheuvel	xor	$in2,$tmp2
1200*a11d055eSArd Biesheuvel	xor	$in3,$tmp3
1201*a11d055eSArd Biesheuvel	and	$in0,$ctx
1202*a11d055eSArd Biesheuvel	and	$in1,$ctx
1203*a11d055eSArd Biesheuvel	and	$in2,$ctx
1204*a11d055eSArd Biesheuvel	and	$in3,$ctx
1205*a11d055eSArd Biesheuvel	xor	$in0,$tmp0
1206*a11d055eSArd Biesheuvel	xor	$in1,$tmp1
1207*a11d055eSArd Biesheuvel	xor	$in2,$tmp2
1208*a11d055eSArd Biesheuvel	xor	$in3,$tmp3
1209*a11d055eSArd Biesheuvel
1210*a11d055eSArd Biesheuvel	lw	$tmp0,0($nonce)		# load nonce
1211*a11d055eSArd Biesheuvel	lw	$tmp1,4($nonce)
1212*a11d055eSArd Biesheuvel	lw	$tmp2,8($nonce)
1213*a11d055eSArd Biesheuvel	lw	$tmp3,12($nonce)
1214*a11d055eSArd Biesheuvel
1215*a11d055eSArd Biesheuvel	addu	$in0,$tmp0		# accumulate nonce
1216*a11d055eSArd Biesheuvel	sltu	$ctx,$in0,$tmp0
1217*a11d055eSArd Biesheuvel
1218*a11d055eSArd Biesheuvel	addu	$in1,$tmp1
1219*a11d055eSArd Biesheuvel	sltu	$tmp1,$in1,$tmp1
1220*a11d055eSArd Biesheuvel	addu	$in1,$ctx
1221*a11d055eSArd Biesheuvel	sltu	$ctx,$in1,$ctx
1222*a11d055eSArd Biesheuvel	addu	$ctx,$tmp1
1223*a11d055eSArd Biesheuvel
1224*a11d055eSArd Biesheuvel	addu	$in2,$tmp2
1225*a11d055eSArd Biesheuvel	sltu	$tmp2,$in2,$tmp2
1226*a11d055eSArd Biesheuvel	addu	$in2,$ctx
1227*a11d055eSArd Biesheuvel	sltu	$ctx,$in2,$ctx
1228*a11d055eSArd Biesheuvel	addu	$ctx,$tmp2
1229*a11d055eSArd Biesheuvel
1230*a11d055eSArd Biesheuvel	addu	$in3,$tmp3
1231*a11d055eSArd Biesheuvel	addu	$in3,$ctx
1232*a11d055eSArd Biesheuvel
1233*a11d055eSArd Biesheuvel	srl	$tmp0,$in0,8		# write mac value
1234*a11d055eSArd Biesheuvel	srl	$tmp1,$in0,16
1235*a11d055eSArd Biesheuvel	srl	$tmp2,$in0,24
1236*a11d055eSArd Biesheuvel	sb	$in0, 0($mac)
1237*a11d055eSArd Biesheuvel	sb	$tmp0,1($mac)
1238*a11d055eSArd Biesheuvel	srl	$tmp0,$in1,8
1239*a11d055eSArd Biesheuvel	sb	$tmp1,2($mac)
1240*a11d055eSArd Biesheuvel	srl	$tmp1,$in1,16
1241*a11d055eSArd Biesheuvel	sb	$tmp2,3($mac)
1242*a11d055eSArd Biesheuvel	srl	$tmp2,$in1,24
1243*a11d055eSArd Biesheuvel	sb	$in1, 4($mac)
1244*a11d055eSArd Biesheuvel	sb	$tmp0,5($mac)
1245*a11d055eSArd Biesheuvel	srl	$tmp0,$in2,8
1246*a11d055eSArd Biesheuvel	sb	$tmp1,6($mac)
1247*a11d055eSArd Biesheuvel	srl	$tmp1,$in2,16
1248*a11d055eSArd Biesheuvel	sb	$tmp2,7($mac)
1249*a11d055eSArd Biesheuvel	srl	$tmp2,$in2,24
1250*a11d055eSArd Biesheuvel	sb	$in2, 8($mac)
1251*a11d055eSArd Biesheuvel	sb	$tmp0,9($mac)
1252*a11d055eSArd Biesheuvel	srl	$tmp0,$in3,8
1253*a11d055eSArd Biesheuvel	sb	$tmp1,10($mac)
1254*a11d055eSArd Biesheuvel	srl	$tmp1,$in3,16
1255*a11d055eSArd Biesheuvel	sb	$tmp2,11($mac)
1256*a11d055eSArd Biesheuvel	srl	$tmp2,$in3,24
1257*a11d055eSArd Biesheuvel	sb	$in3, 12($mac)
1258*a11d055eSArd Biesheuvel	sb	$tmp0,13($mac)
1259*a11d055eSArd Biesheuvel	sb	$tmp1,14($mac)
1260*a11d055eSArd Biesheuvel	sb	$tmp2,15($mac)
1261*a11d055eSArd Biesheuvel
1262*a11d055eSArd Biesheuvel	jr	$ra
1263*a11d055eSArd Biesheuvel.end	poly1305_emit
1264*a11d055eSArd Biesheuvel.rdata
1265*a11d055eSArd Biesheuvel.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
1266*a11d055eSArd Biesheuvel.align	2
1267*a11d055eSArd Biesheuvel___
1268*a11d055eSArd Biesheuvel}
1269*a11d055eSArd Biesheuvel}}}
1270*a11d055eSArd Biesheuvel
# Emit the generated assembly.  The last command-line argument, if present,
# names the output file; without one the code goes to the inherited STDOUT.
$output = pop;
if ($output) {
    # Three-argument open: the file name is never parsed for mode characters,
    # and a failure to create the file aborts the build instead of letting the
    # assembly be printed to whatever STDOUT happened to be.
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code or die "error writing to STDOUT: $!";
# Buffered write errors (e.g. a full disk) only surface at close time; fail
# loudly rather than leave a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
1274