1#!/usr/bin/env perl
2# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
3#
4# ====================================================================
5# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
6# project.
7# ====================================================================
8
9# Poly1305 hash for MIPS.
10#
11# May 2016
12#
13# Numbers are cycles per processed byte with poly1305_blocks alone.
14#
15#		IALU/gcc
16# R1x000	~5.5/+130%	(big-endian)
17# Octeon II	2.50/+70%	(little-endian)
18#
19# March 2019
20#
21# Add 32-bit code path.
22#
23# October 2019
24#
# Modulo-scheduling the reduction makes it possible to omit the
# dependency chain at the end of the inner loop, which improves
# performance. Also optimize the MIPS32R2 code path for the MIPS 1004K
# core. Per René von Dorst's suggestions.
28#
29#		IALU/gcc
30# R1x000	~9.8/?		(big-endian)
31# Octeon II	3.65/+140%	(little-endian)
32# MT7621/1004K	4.75/?		(little-endian)
33#
34######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
39#
# Symbolic register names, each holding the assembler spelling "$N" of
# the numbered register it stands for in the NUBI layout.
($zero, $at, $t0, $t1, $t2) = map { '$' . $_ } 0 .. 2, 24, 25;
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { '$' . $_ } 4 .. 11;
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
    map { '$' . $_ } 12 .. 23;
($gp, $tp, $sp, $fp, $ra) = map { '$' . $_ } 3, 28 .. 31;
44#
45# The return value is placed in $a0. Following coding rules facilitate
46# interoperability:
47#
48# - never ever touch $tp, "thread pointer", former $gp [o32 can be
49#   excluded from the rule, because it's specified volatile];
50# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
51#   old code];
52# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
53#
54# For reference here is register layout for N32/64 MIPS ABIs:
55#
56# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
57# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
58# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
59# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
60# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
61#
62# <appro@openssl.org>
63#
64######################################################################
65
# The first command-line argument selects the target ABI flavour; a
# missing (or false) argument means "64".
# Supported flavours are o32,n32,64,nubi32,nubi64.
$flavour = shift;
$flavour ||= "64";

# Register that receives the return value: $a0 for the NUBI flavours,
# $t0 (the former $v0) for every other flavour.
if ($flavour =~ /nubi/i) {
    $v0 = $a0;
} else {
    $v0 = $t0;
}
69
# Flavours whose name contains "64" or "n32" (case-insensitive) get the
# 64-bit code path; every other flavour falls through to the 32-bit
# branch of this if/else.
if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

# The first four argument registers as used by poly1305_init and
# poly1305_blocks below.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
# Scratch registers. $tmp0 is $at, which is why the generated code
# switches the assembler to ".set noat".
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

# Preprocessor preamble followed by poly1305_init.
#
# The preamble maps R3/R5/R6 onto the R2 feature macro, wraps the
# multiply/mflo/mfhi sequence in macros so that R6 can use its
# three-operand dmulu/dmuhu forms, renames the entry points with a
# _mips suffix when __KERNEL__ is defined, and defines the MSB/LSB byte
# offsets used by the ldl/ldr unaligned loads.
#
# poly1305_init zeroes the three 64-bit words at 0, 8 and 16 of the
# context. When the key pointer is non-zero it loads 16 key bytes
# (aligned loads plus shifts on R6, ldl/ldr otherwise), byte-swaps them
# on big-endian targets, masks them with 0x0ffffffc0fffffff and
# 0x0ffffffc0ffffffc, and stores the two results at 24 and 32 and
# r1 + (r1 >> 2) at 40. It returns 0 in $v0.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)		dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit   poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7		# $inp % 8
	dsubu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	 dsbh	$in1,$in1
	dshd	$in0,$in0
	 dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	 and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	 dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	 dsll	$tmp3,24
	and	$tmp2,$tmp0
	 and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	 and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	 dsrl	$in1,8
	dsll	$tmp2,8
	 dsll	$tmp4,8
	and	$in0,$tmp0
	 and	$in1,$tmp0
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	or	$in0,$tmp1
	 or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	 dsrl	$tmp3,$in1,32
	dsll	$in0,32
	 dsll	$in1,32
	or	$in0,$tmp1
	 or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32		# 0x0000000100000000
	daddiu	$tmp0,-63		# 0x00000000ffffffc1
	dsll	$tmp0,28		# 0x0ffffffc10000000
	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
# 64-bit poly1305_blocks.
#
# The public entry point converts the byte length to a count of 16-byte
# blocks and returns immediately when that count is zero; otherwise it
# branches to poly1305_blocks_internal, which saves the callee-saved
# registers it uses, loads the hash (0..16 of the context) and the key
# material (24..40), processes one 16-byte block per .Loop iteration and
# finally writes the hash back.
{
# Bitmap handed to ".mask": bits 16-17 ($s4,$s5) are always saved; the
# NUBI flavours also save bits 12-15 ($s0-$s3).
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

# Hash limbs, key words and the r1 + (r1 >> 2) multiple live in $s0-$s5.
# $d0 and $d1 reuse the input registers $in0 and $in1; $d2 is $t2.
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
# Right/left bit-shift counts for realigning unaligned input.
my ($shr,$shl) = ($s6,$s7);		# used on R6

# Entry point plus the part of the prologue common to all flavours. The
# R6 build uses an 8-slot frame because it also saves $s6 and $s7.
$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
# Only the NUBI flavours need to preserve $s0-$s3 ($12-$15).
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
# Main loop: load (and, on big-endian, byte-swap) one block, fold the
# previous iteration's bits above bit 1 of h2 back in while adding the
# input, then multiply by the key. The start of the epilogue follows.
$code.=<<___;
	.set	reorder

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$shr,$inp,7
	dsubu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
	subu	$shl,$zero,$shr
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	dsll	$len,4
	daddu	$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
# ifdef	MIPSEB
	dsllv	$in0,$in0,$shr
	dsrlv	$tmp3,$in1,$shl
	dsllv	$in1,$in1,$shr
	dsrlv	$tmp2,$tmp2,$shl
# else
	dsrlv	$in0,$in0,$shr
	dsllv	$tmp3,$in1,$shl
	dsrlv	$in1,$in1,$shr
	dsllv	$tmp2,$tmp2,$shl
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_inp:
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	 dsbh	$in1,$in1
	dshd	$in0,$in0
	 dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	 and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	 dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	 dsll	$tmp3,24
	and	$tmp2,$tmp0
	 and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	 and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	 dsrl	$in1,8
	dsll	$tmp2,8
	 dsll	$tmp4,8
	and	$in0,$tmp0
	 and	$in1,$tmp0
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	or	$in0,$tmp1
	 or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	 dsrl	$tmp3,$in1,32
	dsll	$in0,32
	 dsll	$in1,32
	or	$in0,$tmp1
	 or	$in1,$tmp3
# endif
#endif
	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
	andi	$h2,$h2,3
	dsll	$tmp0,$tmp1,2

	daddu	$d0,$h0,$in0		# accumulate input
	 daddu	$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	daddu	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	daddu	$d1,$h1,$in1
	daddu	$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	daddu	$d1,$tmp0

	dmultu	($r0,$d0)		# h0*r0
	 daddu	$d2,$h2,$padbit
	 sltu	$tmp0,$d1,$tmp0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	dmultu	($rs1,$d1)		# h1*5*r1
	 daddu	$d2,$tmp1
	 daddu	$d2,$tmp0
	mflo	($tmp0,$rs1,$d1)
	mfhi	($tmp1,$rs1,$d1)

	dmultu	($r1,$d0)		# h0*r1
	mflo	($tmp2,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	 daddu	$h0,$tmp0
	 daddu	$h1,$tmp1
	 sltu	$tmp0,$h0,$tmp0

	dmultu	($r0,$d1)		# h1*r0
	 daddu	$h1,$tmp0
	 daddu	$h1,$tmp2
	mflo	($tmp0,$r0,$d1)
	mfhi	($tmp1,$r0,$d1)

	dmultu	($rs1,$d2)		# h2*5*r1
	 sltu	$tmp2,$h1,$tmp2
	 daddu	$h2,$tmp2
	mflo	($tmp2,$rs1,$d2)

	dmultu	($r0,$d2)		# h2*r0
	 daddu	$h1,$tmp0
	 daddu	$h2,$tmp1
	mflo	($tmp3,$r0,$d2)
	 sltu	$tmp0,$h1,$tmp0
	 daddu	$h2,$tmp0

	daddu	$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	daddu	$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$s7,56($sp)
	ld	$s6,48($sp)
#endif
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
# Mirror of the NUBI-only part of the prologue.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
# Return; the stack pointer is restored by the instruction that follows
# "jr" while ".set noreorder" is in effect.
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
}
# 64-bit poly1305_emit.
#
# Loads the three hash words from the context, folds the bits above
# bit 1 of the top word back into the low words, adds 5 and uses the
# resulting carry to select (branch-free, via xor/and/xor) between the
# original and the incremented value, adds the 128-bit nonce read as
# four 32-bit words, and stores the 16 result bytes to $mac one byte at
# a time, least significant byte first.
{
# Argument registers of poly1305_emit; these shadow the outer $ctx and
# friends for the duration of this block.
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	li	$in0,-4			# final reduction
	dsrl	$in1,$tmp2,2
	and	$in0,$tmp2
	andi	$tmp2,$tmp2,3
	daddu	$in0,$in1

	daddu	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	 daddiu	$in0,$tmp0,5		# compare to modulus
	daddu	$tmp1,$tmp1,$in1
	 sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	 daddu	$in1,$tmp1,$tmp3
	daddu	$tmp2,$tmp2,$tmp4
	 sltu	$tmp3,$in1,$tmp3
	 daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	and	$in0,$tmp2
	and	$in1,$tmp2
	xor	$in0,$tmp0
	xor	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
# Every flavour that did not match /64|n32/ above gets the 32-bit code.
}}} else {{{
######################################################################
# 32-bit code path
#

# The first four argument registers as used by poly1305_init and
# poly1305_blocks below.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
# Input words and scratch registers. $tmp0 is $at, which is why the
# generated code switches the assembler to ".set noat".
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

# Preprocessor preamble followed by poly1305_init.
#
# The preamble maps R3/R5/R6 onto the R2 feature macro, wraps the
# multiply/mflo/mfhi sequence in macros so that R6 can use its
# three-operand mulu/muhu forms, renames the entry points with a _mips
# suffix when __KERNEL__ is defined, and defines the MSB/LSB byte
# offsets used by the lwl/lwr unaligned loads.
#
# poly1305_init zeroes the five 32-bit words at 0..16 of the context.
# When the key pointer is non-zero it loads four key words (aligned
# loads plus shifts on R6, lwl/lwr otherwise), byte-swaps them on
# big-endian targets, masks the first with 0x0fffffff and the other
# three with 0x0ffffffc, stores them at 20..32, and stores
# rN + (rN >> 2) for N = 1..3 at 36..44. It returns 0 in $v0.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt)	mulu	rd,rs,rt
# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
#else
# define multu(rs,rt)	multu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit   poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$tmp0,$inp,3		# $inp % 4
	subu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	sllv	$in0,$in0,$tmp0
	srlv	$tmp3,$in1,$tmp1
	sllv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	srlv	$tmp3,$in2,$tmp1
	sllv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	srlv	$tmp3,$in3,$tmp1
	sllv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	srlv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# else
	srlv	$in0,$in0,$tmp0
	sllv	$tmp3,$in1,$tmp1
	srlv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllv	$tmp3,$in2,$tmp1
	srlv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllv	$tmp3,$in3,$tmp1
	srlv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl	$in0,0+MSB($inp)
	lwl	$in1,4+MSB($inp)
	lwl	$in2,8+MSB($inp)
	lwl	$in3,12+MSB($inp)
	lwr	$in0,0+LSB($inp)
	lwr	$in1,4+LSB($inp)
	lwr	$in2,8+LSB($inp)
	lwr	$in3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$in0,$in0		# byte swap
	wsbh	$in1,$in1
	wsbh	$in2,$in2
	wsbh	$in3,$in3
	rotr	$in0,$in0,16
	rotr	$in1,$in1,16
	rotr	$in2,$in2,16
	rotr	$in3,$in3,16
# else
	srl	$tmp0,$in0,24		# byte swap
	srl	$tmp1,$in0,8
	andi	$tmp2,$in0,0xFF00
	sll	$in0,$in0,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in0,$tmp0
	 srl	$tmp0,$in1,24
	or	$tmp1,$tmp2
	 srl	$tmp2,$in1,8
	or	$in0,$tmp1
	 andi	$tmp1,$in1,0xFF00
	 sll	$in1,$in1,24
	 andi	$tmp2,0xFF00
	 sll	$tmp1,$tmp1,8
	 or	$in1,$tmp0
	srl	$tmp0,$in2,24
	 or	$tmp2,$tmp1
	srl	$tmp1,$in2,8
	 or	$in1,$tmp2
	andi	$tmp2,$in2,0xFF00
	sll	$in2,$in2,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in2,$tmp0
	 srl	$tmp0,$in3,24
	or	$tmp1,$tmp2
	 srl	$tmp2,$in3,8
	or	$in2,$tmp1
	 andi	$tmp1,$in3,0xFF00
	 sll	$in3,$in3,24
	 andi	$tmp2,0xFF00
	 sll	$tmp1,$tmp1,8
	 or	$in3,$tmp0
	 or	$tmp2,$tmp1
	 or	$in3,$tmp2
# endif
#endif
	lui	$tmp0,0x0fff
	ori	$tmp0,0xffff		# 0x0fffffff
	and	$in0,$in0,$tmp0
	subu	$tmp0,3			# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srl	$tmp1,$in1,2
	srl	$tmp2,$in2,2
	srl	$tmp3,$in3,2
	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addu	$in2,$in2,$tmp2
	addu	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$v0,0
	jr	$ra
.end	poly1305_init
___
# 32-bit poly1305_blocks.
#
# Saves the callee-saved registers it uses, converts the byte length to
# a count of 16-byte blocks and jumps to .Labort (the epilogue) when
# that count is zero. Otherwise it loads the five hash words (0..16 of
# the context) and the key material (20..44), processes one 16-byte
# block per .Loop iteration and writes the hash back.
#
# Two multiply strategies are generated: a multu/maddu accumulator
# version for MIPS32R2 (but not R6), and a version built on the
# multu()/mflo()/mfhi() macros for everything else.
{
# Bitmap handed to ".mask": bits 16-23 ($s4-$s11) are always saved; the
# NUBI flavours also save bits 12-15 ($s0-$s3).
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";

# Hash words, key words and the rN + (rN >> 2) multiples occupy all
# twelve $s registers.
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
# Accumulated input words; same registers as $in0-$in3.
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
# $shr and $one share $t2: the R6 build overwrites the constant 1 with
# the shift count, and only the R2 (non-R6) multiply path reads $one.
my $shr = $t2;		# used on R6
my $one = $t2;		# used on R2

# Prologue common to all flavours. The frame is 12 words: .frame, the
# stack adjustment and the epilogue's "addu" must all agree on 4*12,
# and ".mask ...,-4" places the highest saved register ($s11) in the
# top slot, 4*11($sp).
$code.=<<___;
.globl	poly1305_blocks
.align	5
.ent	poly1305_blocks
poly1305_blocks:
	.frame	$sp,12*4,$ra
	.mask	$SAVED_REGS_MASK,-4
	.set	noreorder
	subu	$sp, $sp,4*12
	sw	$s11,4*11($sp)
	sw	$s10,4*10($sp)
	sw	$s9, 4*9($sp)
	sw	$s8, 4*8($sp)
	sw	$s7, 4*7($sp)
	sw	$s6, 4*6($sp)
	sw	$s5, 4*5($sp)
	sw	$s4, 4*4($sp)
___
# Only the NUBI flavours need to preserve $s0-$s3 ($12-$15).
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sw	$s3, 4*3($sp)
	sw	$s2, 4*2($sp)
	sw	$s1, 4*1($sp)
	sw	$s0, 4*0($sp)
___
# Main loop and the start of the epilogue. The non-R2 multiply path
# uses $a3 (the $padbit register) as scratch, hence the "li $padbit,1"
# before looping.
$code.=<<___;
	.set	reorder

	srl	$len,4			# number of complete blocks
	li	$one,1
	beqz	$len,.Labort

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$shr,$inp,3
	subu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
#endif

	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	sll	$len,4
	addu	$len,$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6)
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
	beqz	$shr,.Laligned_inp

	lw	$t0,16($inp)
	subu	$t1,$zero,$shr
# ifdef	MIPSEB
	sllv	$d0,$d0,$shr
	srlv	$at,$d1,$t1
	sllv	$d1,$d1,$shr
	or	$d0,$d0,$at
	srlv	$at,$d2,$t1
	sllv	$d2,$d2,$shr
	or	$d1,$d1,$at
	srlv	$at,$d3,$t1
	sllv	$d3,$d3,$shr
	or	$d2,$d2,$at
	srlv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# else
	srlv	$d0,$d0,$shr
	sllv	$at,$d1,$t1
	srlv	$d1,$d1,$shr
	or	$d0,$d0,$at
	sllv	$at,$d2,$t1
	srlv	$d2,$d2,$shr
	or	$d1,$d1,$at
	sllv	$at,$d3,$t1
	srlv	$d3,$d3,$shr
	or	$d2,$d2,$at
	sllv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# endif
.Laligned_inp:
#else
	lwl	$d0,0+MSB($inp)		# load input
	lwl	$d1,4+MSB($inp)
	lwl	$d2,8+MSB($inp)
	lwl	$d3,12+MSB($inp)
	lwr	$d0,0+LSB($inp)
	lwr	$d1,4+LSB($inp)
	lwr	$d2,8+LSB($inp)
	lwr	$d3,12+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$d0,$d0			# byte swap
	wsbh	$d1,$d1
	wsbh	$d2,$d2
	wsbh	$d3,$d3
	rotr	$d0,$d0,16
	rotr	$d1,$d1,16
	rotr	$d2,$d2,16
	rotr	$d3,$d3,16
# else
	srl	$at,$d0,24		# byte swap
	srl	$t0,$d0,8
	andi	$t1,$d0,0xFF00
	sll	$d0,$d0,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d0,$at
	 srl	$at,$d1,24
	or	$t0,$t1
	 srl	$t1,$d1,8
	or	$d0,$t0
	 andi	$t0,$d1,0xFF00
	 sll	$d1,$d1,24
	 andi	$t1,0xFF00
	 sll	$t0,$t0,8
	 or	$d1,$at
	srl	$at,$d2,24
	 or	$t1,$t0
	srl	$t0,$d2,8
	 or	$d1,$t1
	andi	$t1,$d2,0xFF00
	sll	$d2,$d2,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d2,$at
	 srl	$at,$d3,24
	or	$t0,$t1
	 srl	$t1,$d3,8
	or	$d2,$t0
	 andi	$t0,$d3,0xFF00
	 sll	$d3,$d3,24
	 andi	$t1,0xFF00
	 sll	$t0,$t0,8
	 or	$d3,$at
	 or	$t1,$t0
	 or	$d3,$t1
# endif
#endif
	srl	$t0,$h4,2		# modulo-scheduled reduction
	andi	$h4,$h4,3
	sll	$at,$t0,2

	addu	$d0,$d0,$h0		# accumulate input
	 addu	$t0,$t0,$at
	sltu	$h0,$d0,$h0
	addu	$d0,$d0,$t0		# ... and residue
	sltu	$at,$d0,$t0

	addu	$d1,$d1,$h1
	 addu	$h0,$h0,$at		# carry
	sltu	$h1,$d1,$h1
	addu	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addu	$d2,$d2,$h2
	 addu	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addu	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addu	$d3,$d3,$h3
	 addu	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addu	$d3,$d3,$h2

#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
	multu	$r0,$d0			# d0*r0
	 sltu	$h2,$d3,$h2
	maddu	$rs3,$d1		# d1*s3
	 addu	$h3,$h3,$h2		# carry
	maddu	$rs2,$d2		# d2*s2
	 addu	$h4,$h4,$padbit
	maddu	$rs1,$d3		# d3*s1
	 addu	$h4,$h4,$h3
	mfhi	$at
	mflo	$h0

	multu	$r1,$d0			# d0*r1
	maddu	$r0,$d1			# d1*r0
	maddu	$rs3,$d2		# d2*s3
	maddu	$rs2,$d3		# d3*s2
	maddu	$rs1,$h4		# h4*s1
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h1

	multu	$r2,$d0			# d0*r2
	maddu	$r1,$d1			# d1*r1
	maddu	$r0,$d2			# d2*r0
	maddu	$rs3,$d3		# d3*s3
	maddu	$rs2,$h4		# h4*s2
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h2

	mul	$t0,$r0,$h4		# h4*r0

	multu	$r3,$d0			# d0*r3
	maddu	$r2,$d1			# d1*r2
	maddu	$r1,$d2			# d2*r1
	maddu	$r0,$d3			# d3*r0
	maddu	$rs3,$h4		# h4*s3
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h3

	 addiu	$inp,$inp,16

	addu	$h4,$t0,$at
#else
	multu	($r0,$d0)		# d0*r0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	 sltu	$h2,$d3,$h2
	 addu	$h3,$h3,$h2		# carry

	multu	($rs3,$d1)		# d1*s3
	mflo	($at,$rs3,$d1)
	mfhi	($t0,$rs3,$d1)

	 addu	$h4,$h4,$padbit
	 addiu	$inp,$inp,16
	 addu	$h4,$h4,$h3

	multu	($rs2,$d2)		# d2*s2
	mflo	($a3,$rs2,$d2)
	mfhi	($t1,$rs2,$d2)
	 addu	$h0,$h0,$at
	 addu	$h1,$h1,$t0
	multu	($rs1,$d3)		# d3*s1
	 sltu	$at,$h0,$at
	 addu	$h1,$h1,$at

	mflo	($at,$rs1,$d3)
	mfhi	($t0,$rs1,$d3)
	 addu	$h0,$h0,$a3
	 addu	$h1,$h1,$t1
	multu	($r1,$d0)		# d0*r1
	 sltu	$a3,$h0,$a3
	 addu	$h1,$h1,$a3


	mflo	($a3,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	 addu	$h0,$h0,$at
	 addu	$h1,$h1,$t0
	multu	($r0,$d1)		# d1*r0
	 sltu	$at,$h0,$at
	 addu	$h1,$h1,$at

	mflo	($at,$r0,$d1)
	mfhi	($t0,$r0,$d1)
	 addu	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	multu	($rs3,$d2)		# d2*s3
	 addu	$h2,$h2,$a3

	mflo	($a3,$rs3,$d2)
	mfhi	($t1,$rs3,$d2)
	 addu	$h1,$h1,$at
	 addu	$h2,$h2,$t0
	multu	($rs2,$d3)		# d3*s2
	 sltu	$at,$h1,$at
	 addu	$h2,$h2,$at

	mflo	($at,$rs2,$d3)
	mfhi	($t0,$rs2,$d3)
	 addu	$h1,$h1,$a3
	 addu	$h2,$h2,$t1
	multu	($rs1,$h4)		# h4*s1
	 sltu	$a3,$h1,$a3
	 addu	$h2,$h2,$a3

	mflo	($a3,$rs1,$h4)
	 addu	$h1,$h1,$at
	 addu	$h2,$h2,$t0
	multu	($r2,$d0)		# d0*r2
	 sltu	$at,$h1,$at
	 addu	$h2,$h2,$at


	mflo	($at,$r2,$d0)
	mfhi	($h3,$r2,$d0)
	 addu	$h1,$h1,$a3
	 sltu	$a3,$h1,$a3
	multu	($r1,$d1)		# d1*r1
	 addu	$h2,$h2,$a3

	mflo	($a3,$r1,$d1)
	mfhi	($t1,$r1,$d1)
	 addu	$h2,$h2,$at
	 sltu	$at,$h2,$at
	multu	($r0,$d2)		# d2*r0
	 addu	$h3,$h3,$at

	mflo	($at,$r0,$d2)
	mfhi	($t0,$r0,$d2)
	 addu	$h2,$h2,$a3
	 addu	$h3,$h3,$t1
	multu	($rs3,$d3)		# d3*s3
	 sltu	$a3,$h2,$a3
	 addu	$h3,$h3,$a3

	mflo	($a3,$rs3,$d3)
	mfhi	($t1,$rs3,$d3)
	 addu	$h2,$h2,$at
	 addu	$h3,$h3,$t0
	multu	($rs2,$h4)		# h4*s2
	 sltu	$at,$h2,$at
	 addu	$h3,$h3,$at

	mflo	($at,$rs2,$h4)
	 addu	$h2,$h2,$a3
	 addu	$h3,$h3,$t1
	multu	($r3,$d0)		# d0*r3
	 sltu	$a3,$h2,$a3
	 addu	$h3,$h3,$a3


	mflo	($a3,$r3,$d0)
	mfhi	($t1,$r3,$d0)
	 addu	$h2,$h2,$at
	 sltu	$at,$h2,$at
	multu	($r2,$d1)		# d1*r2
	 addu	$h3,$h3,$at

	mflo	($at,$r2,$d1)
	mfhi	($t0,$r2,$d1)
	 addu	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	multu	($r0,$d3)		# d3*r0
	 addu	$t1,$t1,$a3

	mflo	($a3,$r0,$d3)
	mfhi	($d3,$r0,$d3)
	 addu	$h3,$h3,$at
	 addu	$t1,$t1,$t0
	multu	($r1,$d2)		# d2*r1
	 sltu	$at,$h3,$at
	 addu	$t1,$t1,$at

	mflo	($at,$r1,$d2)
	mfhi	($t0,$r1,$d2)
	 addu	$h3,$h3,$a3
	 addu	$t1,$t1,$d3
	multu	($rs3,$h4)		# h4*s3
	 sltu	$a3,$h3,$a3
	 addu	$t1,$t1,$a3

	mflo	($a3,$rs3,$h4)
	 addu	$h3,$h3,$at
	 addu	$t1,$t1,$t0
	multu	($r0,$h4)		# h4*r0
	 sltu	$at,$h3,$at
	 addu	$t1,$t1,$at


	mflo	($h4,$r0,$h4)
	 addu	$h3,$h3,$a3
	 sltu	$a3,$h3,$a3
	 addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1		# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
# Mirror of the NUBI-only part of the prologue.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
# Return; the stack pointer is restored by the instruction that follows
# "jr" while ".set noreorder" is in effect.
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
}
# 32-bit poly1305_emit.
#
# Loads the five hash words from the context, folds the bits above
# bit 1 of the top word back into the low words, adds 5 and uses the
# resulting carry to select (branch-free, via xor/and/xor) between the
# original and the incremented value, adds the four nonce words with
# carry propagation, and stores the 16 result bytes to $mac one byte at
# a time, least significant byte of each word first.
{
# Argument registers of poly1305_emit, plus $a3 as a fifth temporary.
# Once the hash has been loaded, $ctx itself is reused as a scratch
# register for carries and the selection mask.
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	li	$in0,-4			# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	 addiu	$in0,$tmp0,5		# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	 sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	 addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	 sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	 addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	 sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	 addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	 sltu	$ctx,$in3,$tmp3
	 addu	$ctx,$tmp4

	srl	$ctx,2			# see if it carried/borrowed
	subu	$ctx,$zero,$ctx

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}}
1270
# Emit the generated assembly.
#
# The last remaining command-line argument, when present and true, names
# the output file and STDOUT is redirected to it; otherwise the code is
# written to the inherited STDOUT. The three-argument form of open keeps
# characters in the file name from being interpreted as part of the
# mode, and both the redirect and the final flush are checked so that a
# truncated or missing output file is reported instead of silently
# accepted.
$output = pop @ARGV;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output for writing: $!";
}
print $code;
close STDOUT
    or die "error writing " . ($output ? $output : "standard output") . ": $!";
1274