xref: /openbmc/linux/arch/arm/crypto/poly1305-armv4.pl (revision c95baf12f5077419db01313ab61c2aac007d40cd)
1*a6b803b3SArd Biesheuvel#!/usr/bin/env perl
2*a6b803b3SArd Biesheuvel# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
3*a6b803b3SArd Biesheuvel#
4*a6b803b3SArd Biesheuvel# ====================================================================
5*a6b803b3SArd Biesheuvel# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
6*a6b803b3SArd Biesheuvel# project.
7*a6b803b3SArd Biesheuvel# ====================================================================
8*a6b803b3SArd Biesheuvel#
9*a6b803b3SArd Biesheuvel#			IALU(*)/gcc-4.4		NEON
10*a6b803b3SArd Biesheuvel#
11*a6b803b3SArd Biesheuvel# ARM11xx(ARMv6)	7.78/+100%		-
12*a6b803b3SArd Biesheuvel# Cortex-A5		6.35/+130%		3.00
13*a6b803b3SArd Biesheuvel# Cortex-A8		6.25/+115%		2.36
14*a6b803b3SArd Biesheuvel# Cortex-A9		5.10/+95%		2.55
15*a6b803b3SArd Biesheuvel# Cortex-A15		3.85/+85%		1.25(**)
16*a6b803b3SArd Biesheuvel# Snapdragon S4		5.70/+100%		1.48(**)
17*a6b803b3SArd Biesheuvel#
18*a6b803b3SArd Biesheuvel# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
19*a6b803b3SArd Biesheuvel# (**)	these are trade-off results, they can be improved by ~8% but at
20*a6b803b3SArd Biesheuvel#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
21*a6b803b3SArd Biesheuvel#	to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
22*a6b803b3SArd Biesheuvel
23*a6b803b3SArd Biesheuvel$flavour = shift;
24*a6b803b3SArd Biesheuvelif ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
25*a6b803b3SArd Biesheuvelelse { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
26*a6b803b3SArd Biesheuvel
27*a6b803b3SArd Biesheuvelif ($flavour && $flavour ne "void") {
28*a6b803b3SArd Biesheuvel    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
29*a6b803b3SArd Biesheuvel    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
30*a6b803b3SArd Biesheuvel    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
31*a6b803b3SArd Biesheuvel    die "can't locate arm-xlate.pl";
32*a6b803b3SArd Biesheuvel
33*a6b803b3SArd Biesheuvel    open STDOUT,"| \"$^X\" $xlate $flavour $output";
34*a6b803b3SArd Biesheuvel} else {
35*a6b803b3SArd Biesheuvel    open STDOUT,">$output";
36*a6b803b3SArd Biesheuvel}
37*a6b803b3SArd Biesheuvel
38*a6b803b3SArd Biesheuvel($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
39*a6b803b3SArd Biesheuvel
40*a6b803b3SArd Biesheuvel$code.=<<___;
41*a6b803b3SArd Biesheuvel#ifndef	__KERNEL__
42*a6b803b3SArd Biesheuvel# include "arm_arch.h"
43*a6b803b3SArd Biesheuvel#else
44*a6b803b3SArd Biesheuvel# define __ARM_ARCH__ __LINUX_ARM_ARCH__
45*a6b803b3SArd Biesheuvel# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
46*a6b803b3SArd Biesheuvel# define poly1305_init   poly1305_init_arm
47*a6b803b3SArd Biesheuvel# define poly1305_blocks poly1305_blocks_arm
48*a6b803b3SArd Biesheuvel# define poly1305_emit   poly1305_emit_arm
49*a6b803b3SArd Biesheuvel.globl	poly1305_blocks_neon
50*a6b803b3SArd Biesheuvel#endif
51*a6b803b3SArd Biesheuvel
52*a6b803b3SArd Biesheuvel#if defined(__thumb2__)
53*a6b803b3SArd Biesheuvel.syntax	unified
54*a6b803b3SArd Biesheuvel.thumb
55*a6b803b3SArd Biesheuvel#else
56*a6b803b3SArd Biesheuvel.code	32
57*a6b803b3SArd Biesheuvel#endif
58*a6b803b3SArd Biesheuvel
59*a6b803b3SArd Biesheuvel.text
60*a6b803b3SArd Biesheuvel
61*a6b803b3SArd Biesheuvel.globl	poly1305_emit
62*a6b803b3SArd Biesheuvel.globl	poly1305_blocks
63*a6b803b3SArd Biesheuvel.globl	poly1305_init
64*a6b803b3SArd Biesheuvel.type	poly1305_init,%function
65*a6b803b3SArd Biesheuvel.align	5
66*a6b803b3SArd Biesheuvelpoly1305_init:
67*a6b803b3SArd Biesheuvel.Lpoly1305_init:
68*a6b803b3SArd Biesheuvel	stmdb	sp!,{r4-r11}
69*a6b803b3SArd Biesheuvel
70*a6b803b3SArd Biesheuvel	eor	r3,r3,r3
71*a6b803b3SArd Biesheuvel	cmp	$inp,#0
72*a6b803b3SArd Biesheuvel	str	r3,[$ctx,#0]		@ zero hash value
73*a6b803b3SArd Biesheuvel	str	r3,[$ctx,#4]
74*a6b803b3SArd Biesheuvel	str	r3,[$ctx,#8]
75*a6b803b3SArd Biesheuvel	str	r3,[$ctx,#12]
76*a6b803b3SArd Biesheuvel	str	r3,[$ctx,#16]
77*a6b803b3SArd Biesheuvel	str	r3,[$ctx,#36]		@ clear is_base2_26
78*a6b803b3SArd Biesheuvel	add	$ctx,$ctx,#20
79*a6b803b3SArd Biesheuvel
80*a6b803b3SArd Biesheuvel#ifdef	__thumb2__
81*a6b803b3SArd Biesheuvel	it	eq
82*a6b803b3SArd Biesheuvel#endif
83*a6b803b3SArd Biesheuvel	moveq	r0,#0
84*a6b803b3SArd Biesheuvel	beq	.Lno_key
85*a6b803b3SArd Biesheuvel
86*a6b803b3SArd Biesheuvel#if	__ARM_MAX_ARCH__>=7
87*a6b803b3SArd Biesheuvel	mov	r3,#-1
88*a6b803b3SArd Biesheuvel	str	r3,[$ctx,#28]		@ impossible key power value
89*a6b803b3SArd Biesheuvel# ifndef __KERNEL__
90*a6b803b3SArd Biesheuvel	adr	r11,.Lpoly1305_init
91*a6b803b3SArd Biesheuvel	ldr	r12,.LOPENSSL_armcap
92*a6b803b3SArd Biesheuvel# endif
93*a6b803b3SArd Biesheuvel#endif
94*a6b803b3SArd Biesheuvel	ldrb	r4,[$inp,#0]
95*a6b803b3SArd Biesheuvel	mov	r10,#0x0fffffff
96*a6b803b3SArd Biesheuvel	ldrb	r5,[$inp,#1]
97*a6b803b3SArd Biesheuvel	and	r3,r10,#-4		@ 0x0ffffffc
98*a6b803b3SArd Biesheuvel	ldrb	r6,[$inp,#2]
99*a6b803b3SArd Biesheuvel	ldrb	r7,[$inp,#3]
100*a6b803b3SArd Biesheuvel	orr	r4,r4,r5,lsl#8
101*a6b803b3SArd Biesheuvel	ldrb	r5,[$inp,#4]
102*a6b803b3SArd Biesheuvel	orr	r4,r4,r6,lsl#16
103*a6b803b3SArd Biesheuvel	ldrb	r6,[$inp,#5]
104*a6b803b3SArd Biesheuvel	orr	r4,r4,r7,lsl#24
105*a6b803b3SArd Biesheuvel	ldrb	r7,[$inp,#6]
106*a6b803b3SArd Biesheuvel	and	r4,r4,r10
107*a6b803b3SArd Biesheuvel
108*a6b803b3SArd Biesheuvel#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
109*a6b803b3SArd Biesheuvel# if !defined(_WIN32)
110*a6b803b3SArd Biesheuvel	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
111*a6b803b3SArd Biesheuvel# endif
112*a6b803b3SArd Biesheuvel# if defined(__APPLE__) || defined(_WIN32)
113*a6b803b3SArd Biesheuvel	ldr	r12,[r12]
114*a6b803b3SArd Biesheuvel# endif
115*a6b803b3SArd Biesheuvel#endif
116*a6b803b3SArd Biesheuvel	ldrb	r8,[$inp,#7]
117*a6b803b3SArd Biesheuvel	orr	r5,r5,r6,lsl#8
118*a6b803b3SArd Biesheuvel	ldrb	r6,[$inp,#8]
119*a6b803b3SArd Biesheuvel	orr	r5,r5,r7,lsl#16
120*a6b803b3SArd Biesheuvel	ldrb	r7,[$inp,#9]
121*a6b803b3SArd Biesheuvel	orr	r5,r5,r8,lsl#24
122*a6b803b3SArd Biesheuvel	ldrb	r8,[$inp,#10]
123*a6b803b3SArd Biesheuvel	and	r5,r5,r3
124*a6b803b3SArd Biesheuvel
125*a6b803b3SArd Biesheuvel#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
126*a6b803b3SArd Biesheuvel	tst	r12,#ARMV7_NEON		@ check for NEON
127*a6b803b3SArd Biesheuvel# ifdef	__thumb2__
128*a6b803b3SArd Biesheuvel	adr	r9,.Lpoly1305_blocks_neon
129*a6b803b3SArd Biesheuvel	adr	r11,.Lpoly1305_blocks
130*a6b803b3SArd Biesheuvel	it	ne
131*a6b803b3SArd Biesheuvel	movne	r11,r9
132*a6b803b3SArd Biesheuvel	adr	r12,.Lpoly1305_emit
133*a6b803b3SArd Biesheuvel	orr	r11,r11,#1		@ thumb-ify addresses
134*a6b803b3SArd Biesheuvel	orr	r12,r12,#1
135*a6b803b3SArd Biesheuvel# else
136*a6b803b3SArd Biesheuvel	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
137*a6b803b3SArd Biesheuvel	ite	eq
138*a6b803b3SArd Biesheuvel	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
139*a6b803b3SArd Biesheuvel	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
140*a6b803b3SArd Biesheuvel# endif
141*a6b803b3SArd Biesheuvel#endif
142*a6b803b3SArd Biesheuvel	ldrb	r9,[$inp,#11]
143*a6b803b3SArd Biesheuvel	orr	r6,r6,r7,lsl#8
144*a6b803b3SArd Biesheuvel	ldrb	r7,[$inp,#12]
145*a6b803b3SArd Biesheuvel	orr	r6,r6,r8,lsl#16
146*a6b803b3SArd Biesheuvel	ldrb	r8,[$inp,#13]
147*a6b803b3SArd Biesheuvel	orr	r6,r6,r9,lsl#24
148*a6b803b3SArd Biesheuvel	ldrb	r9,[$inp,#14]
149*a6b803b3SArd Biesheuvel	and	r6,r6,r3
150*a6b803b3SArd Biesheuvel
151*a6b803b3SArd Biesheuvel	ldrb	r10,[$inp,#15]
152*a6b803b3SArd Biesheuvel	orr	r7,r7,r8,lsl#8
153*a6b803b3SArd Biesheuvel	str	r4,[$ctx,#0]
154*a6b803b3SArd Biesheuvel	orr	r7,r7,r9,lsl#16
155*a6b803b3SArd Biesheuvel	str	r5,[$ctx,#4]
156*a6b803b3SArd Biesheuvel	orr	r7,r7,r10,lsl#24
157*a6b803b3SArd Biesheuvel	str	r6,[$ctx,#8]
158*a6b803b3SArd Biesheuvel	and	r7,r7,r3
159*a6b803b3SArd Biesheuvel	str	r7,[$ctx,#12]
160*a6b803b3SArd Biesheuvel#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
161*a6b803b3SArd Biesheuvel	stmia	r2,{r11,r12}		@ fill functions table
162*a6b803b3SArd Biesheuvel	mov	r0,#1
163*a6b803b3SArd Biesheuvel#else
164*a6b803b3SArd Biesheuvel	mov	r0,#0
165*a6b803b3SArd Biesheuvel#endif
166*a6b803b3SArd Biesheuvel.Lno_key:
167*a6b803b3SArd Biesheuvel	ldmia	sp!,{r4-r11}
168*a6b803b3SArd Biesheuvel#if	__ARM_ARCH__>=5
169*a6b803b3SArd Biesheuvel	ret				@ bx	lr
170*a6b803b3SArd Biesheuvel#else
171*a6b803b3SArd Biesheuvel	tst	lr,#1
172*a6b803b3SArd Biesheuvel	moveq	pc,lr			@ be binary compatible with V4, yet
173*a6b803b3SArd Biesheuvel	bx	lr			@ interoperable with Thumb ISA:-)
174*a6b803b3SArd Biesheuvel#endif
175*a6b803b3SArd Biesheuvel.size	poly1305_init,.-poly1305_init
176*a6b803b3SArd Biesheuvel___
177*a6b803b3SArd Biesheuvel{
178*a6b803b3SArd Biesheuvelmy ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
179*a6b803b3SArd Biesheuvelmy ($s1,$s2,$s3)=($r1,$r2,$r3);
180*a6b803b3SArd Biesheuvel
181*a6b803b3SArd Biesheuvel$code.=<<___;
182*a6b803b3SArd Biesheuvel.type	poly1305_blocks,%function
183*a6b803b3SArd Biesheuvel.align	5
184*a6b803b3SArd Biesheuvelpoly1305_blocks:
185*a6b803b3SArd Biesheuvel.Lpoly1305_blocks:
186*a6b803b3SArd Biesheuvel	stmdb	sp!,{r3-r11,lr}
187*a6b803b3SArd Biesheuvel
188*a6b803b3SArd Biesheuvel	ands	$len,$len,#-16
189*a6b803b3SArd Biesheuvel	beq	.Lno_data
190*a6b803b3SArd Biesheuvel
191*a6b803b3SArd Biesheuvel	add	$len,$len,$inp		@ end pointer
192*a6b803b3SArd Biesheuvel	sub	sp,sp,#32
193*a6b803b3SArd Biesheuvel
194*a6b803b3SArd Biesheuvel#if __ARM_ARCH__<7
195*a6b803b3SArd Biesheuvel	ldmia	$ctx,{$h0-$r3}		@ load context
196*a6b803b3SArd Biesheuvel	add	$ctx,$ctx,#20
197*a6b803b3SArd Biesheuvel	str	$len,[sp,#16]		@ offload stuff
198*a6b803b3SArd Biesheuvel	str	$ctx,[sp,#12]
199*a6b803b3SArd Biesheuvel#else
200*a6b803b3SArd Biesheuvel	ldr	lr,[$ctx,#36]		@ is_base2_26
201*a6b803b3SArd Biesheuvel	ldmia	$ctx!,{$h0-$h4}		@ load hash value
202*a6b803b3SArd Biesheuvel	str	$len,[sp,#16]		@ offload stuff
203*a6b803b3SArd Biesheuvel	str	$ctx,[sp,#12]
204*a6b803b3SArd Biesheuvel
205*a6b803b3SArd Biesheuvel	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
206*a6b803b3SArd Biesheuvel	mov	$r1,$h1,lsr#6
207*a6b803b3SArd Biesheuvel	adcs	$r1,$r1,$h2,lsl#20
208*a6b803b3SArd Biesheuvel	mov	$r2,$h2,lsr#12
209*a6b803b3SArd Biesheuvel	adcs	$r2,$r2,$h3,lsl#14
210*a6b803b3SArd Biesheuvel	mov	$r3,$h3,lsr#18
211*a6b803b3SArd Biesheuvel	adcs	$r3,$r3,$h4,lsl#8
212*a6b803b3SArd Biesheuvel	mov	$len,#0
213*a6b803b3SArd Biesheuvel	teq	lr,#0
214*a6b803b3SArd Biesheuvel	str	$len,[$ctx,#16]		@ clear is_base2_26
215*a6b803b3SArd Biesheuvel	adc	$len,$len,$h4,lsr#24
216*a6b803b3SArd Biesheuvel
217*a6b803b3SArd Biesheuvel	itttt	ne
218*a6b803b3SArd Biesheuvel	movne	$h0,$r0			@ choose between radixes
219*a6b803b3SArd Biesheuvel	movne	$h1,$r1
220*a6b803b3SArd Biesheuvel	movne	$h2,$r2
221*a6b803b3SArd Biesheuvel	movne	$h3,$r3
222*a6b803b3SArd Biesheuvel	ldmia	$ctx,{$r0-$r3}		@ load key
223*a6b803b3SArd Biesheuvel	it	ne
224*a6b803b3SArd Biesheuvel	movne	$h4,$len
225*a6b803b3SArd Biesheuvel#endif
226*a6b803b3SArd Biesheuvel
227*a6b803b3SArd Biesheuvel	mov	lr,$inp
228*a6b803b3SArd Biesheuvel	cmp	$padbit,#0
229*a6b803b3SArd Biesheuvel	str	$r1,[sp,#20]
230*a6b803b3SArd Biesheuvel	str	$r2,[sp,#24]
231*a6b803b3SArd Biesheuvel	str	$r3,[sp,#28]
232*a6b803b3SArd Biesheuvel	b	.Loop
233*a6b803b3SArd Biesheuvel
234*a6b803b3SArd Biesheuvel.align	4
235*a6b803b3SArd Biesheuvel.Loop:
236*a6b803b3SArd Biesheuvel#if __ARM_ARCH__<7
237*a6b803b3SArd Biesheuvel	ldrb	r0,[lr],#16		@ load input
238*a6b803b3SArd Biesheuvel# ifdef	__thumb2__
239*a6b803b3SArd Biesheuvel	it	hi
240*a6b803b3SArd Biesheuvel# endif
241*a6b803b3SArd Biesheuvel	addhi	$h4,$h4,#1		@ 1<<128
242*a6b803b3SArd Biesheuvel	ldrb	r1,[lr,#-15]
243*a6b803b3SArd Biesheuvel	ldrb	r2,[lr,#-14]
244*a6b803b3SArd Biesheuvel	ldrb	r3,[lr,#-13]
245*a6b803b3SArd Biesheuvel	orr	r1,r0,r1,lsl#8
246*a6b803b3SArd Biesheuvel	ldrb	r0,[lr,#-12]
247*a6b803b3SArd Biesheuvel	orr	r2,r1,r2,lsl#16
248*a6b803b3SArd Biesheuvel	ldrb	r1,[lr,#-11]
249*a6b803b3SArd Biesheuvel	orr	r3,r2,r3,lsl#24
250*a6b803b3SArd Biesheuvel	ldrb	r2,[lr,#-10]
251*a6b803b3SArd Biesheuvel	adds	$h0,$h0,r3		@ accumulate input
252*a6b803b3SArd Biesheuvel
253*a6b803b3SArd Biesheuvel	ldrb	r3,[lr,#-9]
254*a6b803b3SArd Biesheuvel	orr	r1,r0,r1,lsl#8
255*a6b803b3SArd Biesheuvel	ldrb	r0,[lr,#-8]
256*a6b803b3SArd Biesheuvel	orr	r2,r1,r2,lsl#16
257*a6b803b3SArd Biesheuvel	ldrb	r1,[lr,#-7]
258*a6b803b3SArd Biesheuvel	orr	r3,r2,r3,lsl#24
259*a6b803b3SArd Biesheuvel	ldrb	r2,[lr,#-6]
260*a6b803b3SArd Biesheuvel	adcs	$h1,$h1,r3
261*a6b803b3SArd Biesheuvel
262*a6b803b3SArd Biesheuvel	ldrb	r3,[lr,#-5]
263*a6b803b3SArd Biesheuvel	orr	r1,r0,r1,lsl#8
264*a6b803b3SArd Biesheuvel	ldrb	r0,[lr,#-4]
265*a6b803b3SArd Biesheuvel	orr	r2,r1,r2,lsl#16
266*a6b803b3SArd Biesheuvel	ldrb	r1,[lr,#-3]
267*a6b803b3SArd Biesheuvel	orr	r3,r2,r3,lsl#24
268*a6b803b3SArd Biesheuvel	ldrb	r2,[lr,#-2]
269*a6b803b3SArd Biesheuvel	adcs	$h2,$h2,r3
270*a6b803b3SArd Biesheuvel
271*a6b803b3SArd Biesheuvel	ldrb	r3,[lr,#-1]
272*a6b803b3SArd Biesheuvel	orr	r1,r0,r1,lsl#8
273*a6b803b3SArd Biesheuvel	str	lr,[sp,#8]		@ offload input pointer
274*a6b803b3SArd Biesheuvel	orr	r2,r1,r2,lsl#16
275*a6b803b3SArd Biesheuvel	add	$s1,$r1,$r1,lsr#2
276*a6b803b3SArd Biesheuvel	orr	r3,r2,r3,lsl#24
277*a6b803b3SArd Biesheuvel#else
278*a6b803b3SArd Biesheuvel	ldr	r0,[lr],#16		@ load input
279*a6b803b3SArd Biesheuvel	it	hi
280*a6b803b3SArd Biesheuvel	addhi	$h4,$h4,#1		@ padbit
281*a6b803b3SArd Biesheuvel	ldr	r1,[lr,#-12]
282*a6b803b3SArd Biesheuvel	ldr	r2,[lr,#-8]
283*a6b803b3SArd Biesheuvel	ldr	r3,[lr,#-4]
284*a6b803b3SArd Biesheuvel# ifdef	__ARMEB__
285*a6b803b3SArd Biesheuvel	rev	r0,r0
286*a6b803b3SArd Biesheuvel	rev	r1,r1
287*a6b803b3SArd Biesheuvel	rev	r2,r2
288*a6b803b3SArd Biesheuvel	rev	r3,r3
289*a6b803b3SArd Biesheuvel# endif
290*a6b803b3SArd Biesheuvel	adds	$h0,$h0,r0		@ accumulate input
291*a6b803b3SArd Biesheuvel	str	lr,[sp,#8]		@ offload input pointer
292*a6b803b3SArd Biesheuvel	adcs	$h1,$h1,r1
293*a6b803b3SArd Biesheuvel	add	$s1,$r1,$r1,lsr#2
294*a6b803b3SArd Biesheuvel	adcs	$h2,$h2,r2
295*a6b803b3SArd Biesheuvel#endif
296*a6b803b3SArd Biesheuvel	add	$s2,$r2,$r2,lsr#2
297*a6b803b3SArd Biesheuvel	adcs	$h3,$h3,r3
298*a6b803b3SArd Biesheuvel	add	$s3,$r3,$r3,lsr#2
299*a6b803b3SArd Biesheuvel
300*a6b803b3SArd Biesheuvel	umull	r2,r3,$h1,$r0
301*a6b803b3SArd Biesheuvel	 adc	$h4,$h4,#0
302*a6b803b3SArd Biesheuvel	umull	r0,r1,$h0,$r0
303*a6b803b3SArd Biesheuvel	umlal	r2,r3,$h4,$s1
304*a6b803b3SArd Biesheuvel	umlal	r0,r1,$h3,$s1
305*a6b803b3SArd Biesheuvel	ldr	$r1,[sp,#20]		@ reload $r1
306*a6b803b3SArd Biesheuvel	umlal	r2,r3,$h2,$s3
307*a6b803b3SArd Biesheuvel	umlal	r0,r1,$h1,$s3
308*a6b803b3SArd Biesheuvel	umlal	r2,r3,$h3,$s2
309*a6b803b3SArd Biesheuvel	umlal	r0,r1,$h2,$s2
310*a6b803b3SArd Biesheuvel	umlal	r2,r3,$h0,$r1
311*a6b803b3SArd Biesheuvel	str	r0,[sp,#0]		@ future $h0
312*a6b803b3SArd Biesheuvel	 mul	r0,$s2,$h4
313*a6b803b3SArd Biesheuvel	ldr	$r2,[sp,#24]		@ reload $r2
314*a6b803b3SArd Biesheuvel	adds	r2,r2,r1		@ d1+=d0>>32
315*a6b803b3SArd Biesheuvel	 eor	r1,r1,r1
316*a6b803b3SArd Biesheuvel	adc	lr,r3,#0		@ future $h2
317*a6b803b3SArd Biesheuvel	str	r2,[sp,#4]		@ future $h1
318*a6b803b3SArd Biesheuvel
319*a6b803b3SArd Biesheuvel	mul	r2,$s3,$h4
320*a6b803b3SArd Biesheuvel	eor	r3,r3,r3
321*a6b803b3SArd Biesheuvel	umlal	r0,r1,$h3,$s3
322*a6b803b3SArd Biesheuvel	ldr	$r3,[sp,#28]		@ reload $r3
323*a6b803b3SArd Biesheuvel	umlal	r2,r3,$h3,$r0
324*a6b803b3SArd Biesheuvel	umlal	r0,r1,$h2,$r0
325*a6b803b3SArd Biesheuvel	umlal	r2,r3,$h2,$r1
326*a6b803b3SArd Biesheuvel	umlal	r0,r1,$h1,$r1
327*a6b803b3SArd Biesheuvel	umlal	r2,r3,$h1,$r2
328*a6b803b3SArd Biesheuvel	umlal	r0,r1,$h0,$r2
329*a6b803b3SArd Biesheuvel	umlal	r2,r3,$h0,$r3
330*a6b803b3SArd Biesheuvel	ldr	$h0,[sp,#0]
331*a6b803b3SArd Biesheuvel	mul	$h4,$r0,$h4
332*a6b803b3SArd Biesheuvel	ldr	$h1,[sp,#4]
333*a6b803b3SArd Biesheuvel
334*a6b803b3SArd Biesheuvel	adds	$h2,lr,r0		@ d2+=d1>>32
335*a6b803b3SArd Biesheuvel	ldr	lr,[sp,#8]		@ reload input pointer
336*a6b803b3SArd Biesheuvel	adc	r1,r1,#0
337*a6b803b3SArd Biesheuvel	adds	$h3,r2,r1		@ d3+=d2>>32
338*a6b803b3SArd Biesheuvel	ldr	r0,[sp,#16]		@ reload end pointer
339*a6b803b3SArd Biesheuvel	adc	r3,r3,#0
340*a6b803b3SArd Biesheuvel	add	$h4,$h4,r3		@ h4+=d3>>32
341*a6b803b3SArd Biesheuvel
342*a6b803b3SArd Biesheuvel	and	r1,$h4,#-4
343*a6b803b3SArd Biesheuvel	and	$h4,$h4,#3
344*a6b803b3SArd Biesheuvel	add	r1,r1,r1,lsr#2		@ *=5
345*a6b803b3SArd Biesheuvel	adds	$h0,$h0,r1
346*a6b803b3SArd Biesheuvel	adcs	$h1,$h1,#0
347*a6b803b3SArd Biesheuvel	adcs	$h2,$h2,#0
348*a6b803b3SArd Biesheuvel	adcs	$h3,$h3,#0
349*a6b803b3SArd Biesheuvel	adc	$h4,$h4,#0
350*a6b803b3SArd Biesheuvel
351*a6b803b3SArd Biesheuvel	cmp	r0,lr			@ done yet?
352*a6b803b3SArd Biesheuvel	bhi	.Loop
353*a6b803b3SArd Biesheuvel
354*a6b803b3SArd Biesheuvel	ldr	$ctx,[sp,#12]
355*a6b803b3SArd Biesheuvel	add	sp,sp,#32
356*a6b803b3SArd Biesheuvel	stmdb	$ctx,{$h0-$h4}		@ store the result
357*a6b803b3SArd Biesheuvel
358*a6b803b3SArd Biesheuvel.Lno_data:
359*a6b803b3SArd Biesheuvel#if	__ARM_ARCH__>=5
360*a6b803b3SArd Biesheuvel	ldmia	sp!,{r3-r11,pc}
361*a6b803b3SArd Biesheuvel#else
362*a6b803b3SArd Biesheuvel	ldmia	sp!,{r3-r11,lr}
363*a6b803b3SArd Biesheuvel	tst	lr,#1
364*a6b803b3SArd Biesheuvel	moveq	pc,lr			@ be binary compatible with V4, yet
365*a6b803b3SArd Biesheuvel	bx	lr			@ interoperable with Thumb ISA:-)
366*a6b803b3SArd Biesheuvel#endif
367*a6b803b3SArd Biesheuvel.size	poly1305_blocks,.-poly1305_blocks
368*a6b803b3SArd Biesheuvel___
369*a6b803b3SArd Biesheuvel}
370*a6b803b3SArd Biesheuvel{
371*a6b803b3SArd Biesheuvelmy ($ctx,$mac,$nonce)=map("r$_",(0..2));
372*a6b803b3SArd Biesheuvelmy ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
373*a6b803b3SArd Biesheuvelmy $g4=$ctx;
374*a6b803b3SArd Biesheuvel
375*a6b803b3SArd Biesheuvel$code.=<<___;
376*a6b803b3SArd Biesheuvel.type	poly1305_emit,%function
377*a6b803b3SArd Biesheuvel.align	5
378*a6b803b3SArd Biesheuvelpoly1305_emit:
379*a6b803b3SArd Biesheuvel.Lpoly1305_emit:
380*a6b803b3SArd Biesheuvel	stmdb	sp!,{r4-r11}
381*a6b803b3SArd Biesheuvel
382*a6b803b3SArd Biesheuvel	ldmia	$ctx,{$h0-$h4}
383*a6b803b3SArd Biesheuvel
384*a6b803b3SArd Biesheuvel#if __ARM_ARCH__>=7
385*a6b803b3SArd Biesheuvel	ldr	ip,[$ctx,#36]		@ is_base2_26
386*a6b803b3SArd Biesheuvel
387*a6b803b3SArd Biesheuvel	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
388*a6b803b3SArd Biesheuvel	mov	$g1,$h1,lsr#6
389*a6b803b3SArd Biesheuvel	adcs	$g1,$g1,$h2,lsl#20
390*a6b803b3SArd Biesheuvel	mov	$g2,$h2,lsr#12
391*a6b803b3SArd Biesheuvel	adcs	$g2,$g2,$h3,lsl#14
392*a6b803b3SArd Biesheuvel	mov	$g3,$h3,lsr#18
393*a6b803b3SArd Biesheuvel	adcs	$g3,$g3,$h4,lsl#8
394*a6b803b3SArd Biesheuvel	mov	$g4,#0
395*a6b803b3SArd Biesheuvel	adc	$g4,$g4,$h4,lsr#24
396*a6b803b3SArd Biesheuvel
397*a6b803b3SArd Biesheuvel	tst	ip,ip
398*a6b803b3SArd Biesheuvel	itttt	ne
399*a6b803b3SArd Biesheuvel	movne	$h0,$g0
400*a6b803b3SArd Biesheuvel	movne	$h1,$g1
401*a6b803b3SArd Biesheuvel	movne	$h2,$g2
402*a6b803b3SArd Biesheuvel	movne	$h3,$g3
403*a6b803b3SArd Biesheuvel	it	ne
404*a6b803b3SArd Biesheuvel	movne	$h4,$g4
405*a6b803b3SArd Biesheuvel#endif
406*a6b803b3SArd Biesheuvel
407*a6b803b3SArd Biesheuvel	adds	$g0,$h0,#5		@ compare to modulus
408*a6b803b3SArd Biesheuvel	adcs	$g1,$h1,#0
409*a6b803b3SArd Biesheuvel	adcs	$g2,$h2,#0
410*a6b803b3SArd Biesheuvel	adcs	$g3,$h3,#0
411*a6b803b3SArd Biesheuvel	adc	$g4,$h4,#0
412*a6b803b3SArd Biesheuvel	tst	$g4,#4			@ did it carry/borrow?
413*a6b803b3SArd Biesheuvel
414*a6b803b3SArd Biesheuvel#ifdef	__thumb2__
415*a6b803b3SArd Biesheuvel	it	ne
416*a6b803b3SArd Biesheuvel#endif
417*a6b803b3SArd Biesheuvel	movne	$h0,$g0
418*a6b803b3SArd Biesheuvel	ldr	$g0,[$nonce,#0]
419*a6b803b3SArd Biesheuvel#ifdef	__thumb2__
420*a6b803b3SArd Biesheuvel	it	ne
421*a6b803b3SArd Biesheuvel#endif
422*a6b803b3SArd Biesheuvel	movne	$h1,$g1
423*a6b803b3SArd Biesheuvel	ldr	$g1,[$nonce,#4]
424*a6b803b3SArd Biesheuvel#ifdef	__thumb2__
425*a6b803b3SArd Biesheuvel	it	ne
426*a6b803b3SArd Biesheuvel#endif
427*a6b803b3SArd Biesheuvel	movne	$h2,$g2
428*a6b803b3SArd Biesheuvel	ldr	$g2,[$nonce,#8]
429*a6b803b3SArd Biesheuvel#ifdef	__thumb2__
430*a6b803b3SArd Biesheuvel	it	ne
431*a6b803b3SArd Biesheuvel#endif
432*a6b803b3SArd Biesheuvel	movne	$h3,$g3
433*a6b803b3SArd Biesheuvel	ldr	$g3,[$nonce,#12]
434*a6b803b3SArd Biesheuvel
435*a6b803b3SArd Biesheuvel	adds	$h0,$h0,$g0
436*a6b803b3SArd Biesheuvel	adcs	$h1,$h1,$g1
437*a6b803b3SArd Biesheuvel	adcs	$h2,$h2,$g2
438*a6b803b3SArd Biesheuvel	adc	$h3,$h3,$g3
439*a6b803b3SArd Biesheuvel
440*a6b803b3SArd Biesheuvel#if __ARM_ARCH__>=7
441*a6b803b3SArd Biesheuvel# ifdef __ARMEB__
442*a6b803b3SArd Biesheuvel	rev	$h0,$h0
443*a6b803b3SArd Biesheuvel	rev	$h1,$h1
444*a6b803b3SArd Biesheuvel	rev	$h2,$h2
445*a6b803b3SArd Biesheuvel	rev	$h3,$h3
446*a6b803b3SArd Biesheuvel# endif
447*a6b803b3SArd Biesheuvel	str	$h0,[$mac,#0]
448*a6b803b3SArd Biesheuvel	str	$h1,[$mac,#4]
449*a6b803b3SArd Biesheuvel	str	$h2,[$mac,#8]
450*a6b803b3SArd Biesheuvel	str	$h3,[$mac,#12]
451*a6b803b3SArd Biesheuvel#else
452*a6b803b3SArd Biesheuvel	strb	$h0,[$mac,#0]
453*a6b803b3SArd Biesheuvel	mov	$h0,$h0,lsr#8
454*a6b803b3SArd Biesheuvel	strb	$h1,[$mac,#4]
455*a6b803b3SArd Biesheuvel	mov	$h1,$h1,lsr#8
456*a6b803b3SArd Biesheuvel	strb	$h2,[$mac,#8]
457*a6b803b3SArd Biesheuvel	mov	$h2,$h2,lsr#8
458*a6b803b3SArd Biesheuvel	strb	$h3,[$mac,#12]
459*a6b803b3SArd Biesheuvel	mov	$h3,$h3,lsr#8
460*a6b803b3SArd Biesheuvel
461*a6b803b3SArd Biesheuvel	strb	$h0,[$mac,#1]
462*a6b803b3SArd Biesheuvel	mov	$h0,$h0,lsr#8
463*a6b803b3SArd Biesheuvel	strb	$h1,[$mac,#5]
464*a6b803b3SArd Biesheuvel	mov	$h1,$h1,lsr#8
465*a6b803b3SArd Biesheuvel	strb	$h2,[$mac,#9]
466*a6b803b3SArd Biesheuvel	mov	$h2,$h2,lsr#8
467*a6b803b3SArd Biesheuvel	strb	$h3,[$mac,#13]
468*a6b803b3SArd Biesheuvel	mov	$h3,$h3,lsr#8
469*a6b803b3SArd Biesheuvel
470*a6b803b3SArd Biesheuvel	strb	$h0,[$mac,#2]
471*a6b803b3SArd Biesheuvel	mov	$h0,$h0,lsr#8
472*a6b803b3SArd Biesheuvel	strb	$h1,[$mac,#6]
473*a6b803b3SArd Biesheuvel	mov	$h1,$h1,lsr#8
474*a6b803b3SArd Biesheuvel	strb	$h2,[$mac,#10]
475*a6b803b3SArd Biesheuvel	mov	$h2,$h2,lsr#8
476*a6b803b3SArd Biesheuvel	strb	$h3,[$mac,#14]
477*a6b803b3SArd Biesheuvel	mov	$h3,$h3,lsr#8
478*a6b803b3SArd Biesheuvel
479*a6b803b3SArd Biesheuvel	strb	$h0,[$mac,#3]
480*a6b803b3SArd Biesheuvel	strb	$h1,[$mac,#7]
481*a6b803b3SArd Biesheuvel	strb	$h2,[$mac,#11]
482*a6b803b3SArd Biesheuvel	strb	$h3,[$mac,#15]
483*a6b803b3SArd Biesheuvel#endif
484*a6b803b3SArd Biesheuvel	ldmia	sp!,{r4-r11}
485*a6b803b3SArd Biesheuvel#if	__ARM_ARCH__>=5
486*a6b803b3SArd Biesheuvel	ret				@ bx	lr
487*a6b803b3SArd Biesheuvel#else
488*a6b803b3SArd Biesheuvel	tst	lr,#1
489*a6b803b3SArd Biesheuvel	moveq	pc,lr			@ be binary compatible with V4, yet
490*a6b803b3SArd Biesheuvel	bx	lr			@ interoperable with Thumb ISA:-)
491*a6b803b3SArd Biesheuvel#endif
492*a6b803b3SArd Biesheuvel.size	poly1305_emit,.-poly1305_emit
493*a6b803b3SArd Biesheuvel___
494*a6b803b3SArd Biesheuvel{
495*a6b803b3SArd Biesheuvelmy ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
496*a6b803b3SArd Biesheuvelmy ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
497*a6b803b3SArd Biesheuvelmy ($T0,$T1,$MASK) = map("q$_",(15,4,0));
498*a6b803b3SArd Biesheuvel
499*a6b803b3SArd Biesheuvelmy ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
500*a6b803b3SArd Biesheuvel
501*a6b803b3SArd Biesheuvel$code.=<<___;
502*a6b803b3SArd Biesheuvel#if	__ARM_MAX_ARCH__>=7
503*a6b803b3SArd Biesheuvel.fpu	neon
504*a6b803b3SArd Biesheuvel
505*a6b803b3SArd Biesheuvel.type	poly1305_init_neon,%function
506*a6b803b3SArd Biesheuvel.align	5
507*a6b803b3SArd Biesheuvelpoly1305_init_neon:
508*a6b803b3SArd Biesheuvel.Lpoly1305_init_neon:
509*a6b803b3SArd Biesheuvel	ldr	r3,[$ctx,#48]		@ first table element
510*a6b803b3SArd Biesheuvel	cmp	r3,#-1			@ is value impossible?
511*a6b803b3SArd Biesheuvel	bne	.Lno_init_neon
512*a6b803b3SArd Biesheuvel
513*a6b803b3SArd Biesheuvel	ldr	r4,[$ctx,#20]		@ load key base 2^32
514*a6b803b3SArd Biesheuvel	ldr	r5,[$ctx,#24]
515*a6b803b3SArd Biesheuvel	ldr	r6,[$ctx,#28]
516*a6b803b3SArd Biesheuvel	ldr	r7,[$ctx,#32]
517*a6b803b3SArd Biesheuvel
518*a6b803b3SArd Biesheuvel	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
519*a6b803b3SArd Biesheuvel	mov	r3,r4,lsr#26
520*a6b803b3SArd Biesheuvel	mov	r4,r5,lsr#20
521*a6b803b3SArd Biesheuvel	orr	r3,r3,r5,lsl#6
522*a6b803b3SArd Biesheuvel	mov	r5,r6,lsr#14
523*a6b803b3SArd Biesheuvel	orr	r4,r4,r6,lsl#12
524*a6b803b3SArd Biesheuvel	mov	r6,r7,lsr#8
525*a6b803b3SArd Biesheuvel	orr	r5,r5,r7,lsl#18
526*a6b803b3SArd Biesheuvel	and	r3,r3,#0x03ffffff
527*a6b803b3SArd Biesheuvel	and	r4,r4,#0x03ffffff
528*a6b803b3SArd Biesheuvel	and	r5,r5,#0x03ffffff
529*a6b803b3SArd Biesheuvel
530*a6b803b3SArd Biesheuvel	vdup.32	$R0,r2			@ r^1 in both lanes
531*a6b803b3SArd Biesheuvel	add	r2,r3,r3,lsl#2		@ *5
532*a6b803b3SArd Biesheuvel	vdup.32	$R1,r3
533*a6b803b3SArd Biesheuvel	add	r3,r4,r4,lsl#2
534*a6b803b3SArd Biesheuvel	vdup.32	$S1,r2
535*a6b803b3SArd Biesheuvel	vdup.32	$R2,r4
536*a6b803b3SArd Biesheuvel	add	r4,r5,r5,lsl#2
537*a6b803b3SArd Biesheuvel	vdup.32	$S2,r3
538*a6b803b3SArd Biesheuvel	vdup.32	$R3,r5
539*a6b803b3SArd Biesheuvel	add	r5,r6,r6,lsl#2
540*a6b803b3SArd Biesheuvel	vdup.32	$S3,r4
541*a6b803b3SArd Biesheuvel	vdup.32	$R4,r6
542*a6b803b3SArd Biesheuvel	vdup.32	$S4,r5
543*a6b803b3SArd Biesheuvel
544*a6b803b3SArd Biesheuvel	mov	$zeros,#2		@ counter
545*a6b803b3SArd Biesheuvel
546*a6b803b3SArd Biesheuvel.Lsquare_neon:
547*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
548*a6b803b3SArd Biesheuvel	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
549*a6b803b3SArd Biesheuvel	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
550*a6b803b3SArd Biesheuvel	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
551*a6b803b3SArd Biesheuvel	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
552*a6b803b3SArd Biesheuvel	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
553*a6b803b3SArd Biesheuvel
554*a6b803b3SArd Biesheuvel	vmull.u32	$D0,$R0,${R0}[1]
555*a6b803b3SArd Biesheuvel	vmull.u32	$D1,$R1,${R0}[1]
556*a6b803b3SArd Biesheuvel	vmull.u32	$D2,$R2,${R0}[1]
557*a6b803b3SArd Biesheuvel	vmull.u32	$D3,$R3,${R0}[1]
558*a6b803b3SArd Biesheuvel	vmull.u32	$D4,$R4,${R0}[1]
559*a6b803b3SArd Biesheuvel
560*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$R4,${S1}[1]
561*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$R0,${R1}[1]
562*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$R1,${R1}[1]
563*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$R2,${R1}[1]
564*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$R3,${R1}[1]
565*a6b803b3SArd Biesheuvel
566*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$R3,${S2}[1]
567*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$R4,${S2}[1]
568*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$R1,${R2}[1]
569*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$R0,${R2}[1]
570*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$R2,${R2}[1]
571*a6b803b3SArd Biesheuvel
572*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$R2,${S3}[1]
573*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$R0,${R3}[1]
574*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$R3,${S3}[1]
575*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$R4,${S3}[1]
576*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$R1,${R3}[1]
577*a6b803b3SArd Biesheuvel
578*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$R4,${S4}[1]
579*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$R1,${S4}[1]
580*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$R2,${S4}[1]
581*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$R3,${S4}[1]
582*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$R0,${R4}[1]
583*a6b803b3SArd Biesheuvel
584*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
585*a6b803b3SArd Biesheuvel	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
586*a6b803b3SArd Biesheuvel	@ and P. Schwabe
587*a6b803b3SArd Biesheuvel	@
588*a6b803b3SArd Biesheuvel	@ H0>>+H1>>+H2>>+H3>>+H4
589*a6b803b3SArd Biesheuvel	@ H3>>+H4>>*5+H0>>+H1
590*a6b803b3SArd Biesheuvel	@
591*a6b803b3SArd Biesheuvel	@ Trivia.
592*a6b803b3SArd Biesheuvel	@
593*a6b803b3SArd Biesheuvel	@ Result of multiplication of n-bit number by m-bit number is
594*a6b803b3SArd Biesheuvel	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
595*a6b803b3SArd Biesheuvel	@ m-bit number multiplied by 2^n is still n+m bits wide.
596*a6b803b3SArd Biesheuvel	@
597*a6b803b3SArd Biesheuvel	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
598*a6b803b3SArd Biesheuvel	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
599*a6b803b3SArd Biesheuvel	@ one is n+1 bits wide.
600*a6b803b3SArd Biesheuvel	@
601*a6b803b3SArd Biesheuvel	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
602*a6b803b3SArd Biesheuvel	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
603*a6b803b3SArd Biesheuvel	@ can be 27. However! In cases when their width exceeds 26 bits
604*a6b803b3SArd Biesheuvel	@ they are limited by 2^26+2^6. This in turn means that *sum*
605*a6b803b3SArd Biesheuvel	@ of the products with these values can still be viewed as sum
606*a6b803b3SArd Biesheuvel	@ of 52-bit numbers as long as the amount of addends is not a
607*a6b803b3SArd Biesheuvel	@ power of 2. For example,
608*a6b803b3SArd Biesheuvel	@
609*a6b803b3SArd Biesheuvel	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
610*a6b803b3SArd Biesheuvel	@
611*a6b803b3SArd Biesheuvel	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
612*a6b803b3SArd Biesheuvel	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
613*a6b803b3SArd Biesheuvel	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
614*a6b803b3SArd Biesheuvel	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
615*a6b803b3SArd Biesheuvel	@ which is less than 32 * (2^52) or 2^57. And when processing
616*a6b803b3SArd Biesheuvel	@ data we are looking at triple as many addends...
617*a6b803b3SArd Biesheuvel	@
618*a6b803b3SArd Biesheuvel	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
619*a6b803b3SArd Biesheuvel	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
620*a6b803b3SArd Biesheuvel	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
621*a6b803b3SArd Biesheuvel	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
622*a6b803b3SArd Biesheuvel	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
623*a6b803b3SArd Biesheuvel	@ This means that result of reduction have to be compressed upon
624*a6b803b3SArd Biesheuvel	@ loop wrap-around. This can be done in the process of reduction
625*a6b803b3SArd Biesheuvel	@ to minimize amount of instructions [as well as amount of
626*a6b803b3SArd Biesheuvel	@ 128-bit instructions, which benefits low-end processors], but
627*a6b803b3SArd Biesheuvel	@ one has to watch for H2 (which is narrower than H0) and 5*H4
628*a6b803b3SArd Biesheuvel	@ not being wider than 58 bits, so that result of right shift
629*a6b803b3SArd Biesheuvel	@ by 26 bits fits in 32 bits. This is also useful on x86,
630*a6b803b3SArd Biesheuvel	@ because it allows to use paddd in place for paddq, which
631*a6b803b3SArd Biesheuvel	@ benefits Atom, where paddq is ridiculously slow.
632*a6b803b3SArd Biesheuvel
633*a6b803b3SArd Biesheuvel	vshr.u64	$T0,$D3,#26
634*a6b803b3SArd Biesheuvel	vmovn.i64	$D3#lo,$D3
635*a6b803b3SArd Biesheuvel	 vshr.u64	$T1,$D0,#26
636*a6b803b3SArd Biesheuvel	 vmovn.i64	$D0#lo,$D0
637*a6b803b3SArd Biesheuvel	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
638*a6b803b3SArd Biesheuvel	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
639*a6b803b3SArd Biesheuvel	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
640*a6b803b3SArd Biesheuvel	 vbic.i32	$D0#lo,#0xfc000000
641*a6b803b3SArd Biesheuvel
642*a6b803b3SArd Biesheuvel	vshrn.u64	$T0#lo,$D4,#26
643*a6b803b3SArd Biesheuvel	vmovn.i64	$D4#lo,$D4
644*a6b803b3SArd Biesheuvel	 vshr.u64	$T1,$D1,#26
645*a6b803b3SArd Biesheuvel	 vmovn.i64	$D1#lo,$D1
646*a6b803b3SArd Biesheuvel	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
647*a6b803b3SArd Biesheuvel	vbic.i32	$D4#lo,#0xfc000000
648*a6b803b3SArd Biesheuvel	 vbic.i32	$D1#lo,#0xfc000000
649*a6b803b3SArd Biesheuvel
650*a6b803b3SArd Biesheuvel	vadd.i32	$D0#lo,$D0#lo,$T0#lo
651*a6b803b3SArd Biesheuvel	vshl.u32	$T0#lo,$T0#lo,#2
652*a6b803b3SArd Biesheuvel	 vshrn.u64	$T1#lo,$D2,#26
653*a6b803b3SArd Biesheuvel	 vmovn.i64	$D2#lo,$D2
654*a6b803b3SArd Biesheuvel	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
655*a6b803b3SArd Biesheuvel	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
656*a6b803b3SArd Biesheuvel	 vbic.i32	$D2#lo,#0xfc000000
657*a6b803b3SArd Biesheuvel
658*a6b803b3SArd Biesheuvel	vshr.u32	$T0#lo,$D0#lo,#26
659*a6b803b3SArd Biesheuvel	vbic.i32	$D0#lo,#0xfc000000
660*a6b803b3SArd Biesheuvel	 vshr.u32	$T1#lo,$D3#lo,#26
661*a6b803b3SArd Biesheuvel	 vbic.i32	$D3#lo,#0xfc000000
662*a6b803b3SArd Biesheuvel	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
663*a6b803b3SArd Biesheuvel	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
664*a6b803b3SArd Biesheuvel
665*a6b803b3SArd Biesheuvel	subs		$zeros,$zeros,#1
666*a6b803b3SArd Biesheuvel	beq		.Lsquare_break_neon
667*a6b803b3SArd Biesheuvel
668*a6b803b3SArd Biesheuvel	add		$tbl0,$ctx,#(48+0*9*4)
669*a6b803b3SArd Biesheuvel	add		$tbl1,$ctx,#(48+1*9*4)
670*a6b803b3SArd Biesheuvel
671*a6b803b3SArd Biesheuvel	vtrn.32		$R0,$D0#lo		@ r^2:r^1
672*a6b803b3SArd Biesheuvel	vtrn.32		$R2,$D2#lo
673*a6b803b3SArd Biesheuvel	vtrn.32		$R3,$D3#lo
674*a6b803b3SArd Biesheuvel	vtrn.32		$R1,$D1#lo
675*a6b803b3SArd Biesheuvel	vtrn.32		$R4,$D4#lo
676*a6b803b3SArd Biesheuvel
677*a6b803b3SArd Biesheuvel	vshl.u32	$S2,$R2,#2		@ *5
678*a6b803b3SArd Biesheuvel	vshl.u32	$S3,$R3,#2
679*a6b803b3SArd Biesheuvel	vshl.u32	$S1,$R1,#2
680*a6b803b3SArd Biesheuvel	vshl.u32	$S4,$R4,#2
681*a6b803b3SArd Biesheuvel	vadd.i32	$S2,$S2,$R2
682*a6b803b3SArd Biesheuvel	vadd.i32	$S1,$S1,$R1
683*a6b803b3SArd Biesheuvel	vadd.i32	$S3,$S3,$R3
684*a6b803b3SArd Biesheuvel	vadd.i32	$S4,$S4,$R4
685*a6b803b3SArd Biesheuvel
686*a6b803b3SArd Biesheuvel	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
687*a6b803b3SArd Biesheuvel	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
688*a6b803b3SArd Biesheuvel	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
689*a6b803b3SArd Biesheuvel	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
690*a6b803b3SArd Biesheuvel	vst1.32		{${S4}[0]},[$tbl0,:32]
691*a6b803b3SArd Biesheuvel	vst1.32		{${S4}[1]},[$tbl1,:32]
692*a6b803b3SArd Biesheuvel
693*a6b803b3SArd Biesheuvel	b		.Lsquare_neon
694*a6b803b3SArd Biesheuvel
695*a6b803b3SArd Biesheuvel.align	4
696*a6b803b3SArd Biesheuvel.Lsquare_break_neon:
697*a6b803b3SArd Biesheuvel	add		$tbl0,$ctx,#(48+2*4*9)
698*a6b803b3SArd Biesheuvel	add		$tbl1,$ctx,#(48+3*4*9)
699*a6b803b3SArd Biesheuvel
700*a6b803b3SArd Biesheuvel	vmov		$R0,$D0#lo		@ r^4:r^3
701*a6b803b3SArd Biesheuvel	vshl.u32	$S1,$D1#lo,#2		@ *5
702*a6b803b3SArd Biesheuvel	vmov		$R1,$D1#lo
703*a6b803b3SArd Biesheuvel	vshl.u32	$S2,$D2#lo,#2
704*a6b803b3SArd Biesheuvel	vmov		$R2,$D2#lo
705*a6b803b3SArd Biesheuvel	vshl.u32	$S3,$D3#lo,#2
706*a6b803b3SArd Biesheuvel	vmov		$R3,$D3#lo
707*a6b803b3SArd Biesheuvel	vshl.u32	$S4,$D4#lo,#2
708*a6b803b3SArd Biesheuvel	vmov		$R4,$D4#lo
709*a6b803b3SArd Biesheuvel	vadd.i32	$S1,$S1,$D1#lo
710*a6b803b3SArd Biesheuvel	vadd.i32	$S2,$S2,$D2#lo
711*a6b803b3SArd Biesheuvel	vadd.i32	$S3,$S3,$D3#lo
712*a6b803b3SArd Biesheuvel	vadd.i32	$S4,$S4,$D4#lo
713*a6b803b3SArd Biesheuvel
714*a6b803b3SArd Biesheuvel	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
715*a6b803b3SArd Biesheuvel	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
716*a6b803b3SArd Biesheuvel	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
717*a6b803b3SArd Biesheuvel	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
718*a6b803b3SArd Biesheuvel	vst1.32		{${S4}[0]},[$tbl0]
719*a6b803b3SArd Biesheuvel	vst1.32		{${S4}[1]},[$tbl1]
720*a6b803b3SArd Biesheuvel
721*a6b803b3SArd Biesheuvel.Lno_init_neon:
722*a6b803b3SArd Biesheuvel	ret				@ bx	lr
723*a6b803b3SArd Biesheuvel.size	poly1305_init_neon,.-poly1305_init_neon
724*a6b803b3SArd Biesheuvel
725*a6b803b3SArd Biesheuvel.type	poly1305_blocks_neon,%function
726*a6b803b3SArd Biesheuvel.align	5
727*a6b803b3SArd Biesheuvelpoly1305_blocks_neon:
728*a6b803b3SArd Biesheuvel.Lpoly1305_blocks_neon:
729*a6b803b3SArd Biesheuvel	ldr	ip,[$ctx,#36]		@ is_base2_26
730*a6b803b3SArd Biesheuvel
731*a6b803b3SArd Biesheuvel	cmp	$len,#64
732*a6b803b3SArd Biesheuvel	blo	.Lpoly1305_blocks
733*a6b803b3SArd Biesheuvel
734*a6b803b3SArd Biesheuvel	stmdb	sp!,{r4-r7}
735*a6b803b3SArd Biesheuvel	vstmdb	sp!,{d8-d15}		@ ABI specification says so
736*a6b803b3SArd Biesheuvel
737*a6b803b3SArd Biesheuvel	tst	ip,ip			@ is_base2_26?
738*a6b803b3SArd Biesheuvel	bne	.Lbase2_26_neon
739*a6b803b3SArd Biesheuvel
740*a6b803b3SArd Biesheuvel	stmdb	sp!,{r1-r3,lr}
741*a6b803b3SArd Biesheuvel	bl	.Lpoly1305_init_neon
742*a6b803b3SArd Biesheuvel
743*a6b803b3SArd Biesheuvel	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
744*a6b803b3SArd Biesheuvel	ldr	r5,[$ctx,#4]
745*a6b803b3SArd Biesheuvel	ldr	r6,[$ctx,#8]
746*a6b803b3SArd Biesheuvel	ldr	r7,[$ctx,#12]
747*a6b803b3SArd Biesheuvel	ldr	ip,[$ctx,#16]
748*a6b803b3SArd Biesheuvel
749*a6b803b3SArd Biesheuvel	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
750*a6b803b3SArd Biesheuvel	mov	r3,r4,lsr#26
751*a6b803b3SArd Biesheuvel	 veor	$D0#lo,$D0#lo,$D0#lo
752*a6b803b3SArd Biesheuvel	mov	r4,r5,lsr#20
753*a6b803b3SArd Biesheuvel	orr	r3,r3,r5,lsl#6
754*a6b803b3SArd Biesheuvel	 veor	$D1#lo,$D1#lo,$D1#lo
755*a6b803b3SArd Biesheuvel	mov	r5,r6,lsr#14
756*a6b803b3SArd Biesheuvel	orr	r4,r4,r6,lsl#12
757*a6b803b3SArd Biesheuvel	 veor	$D2#lo,$D2#lo,$D2#lo
758*a6b803b3SArd Biesheuvel	mov	r6,r7,lsr#8
759*a6b803b3SArd Biesheuvel	orr	r5,r5,r7,lsl#18
760*a6b803b3SArd Biesheuvel	 veor	$D3#lo,$D3#lo,$D3#lo
761*a6b803b3SArd Biesheuvel	and	r3,r3,#0x03ffffff
762*a6b803b3SArd Biesheuvel	orr	r6,r6,ip,lsl#24
763*a6b803b3SArd Biesheuvel	 veor	$D4#lo,$D4#lo,$D4#lo
764*a6b803b3SArd Biesheuvel	and	r4,r4,#0x03ffffff
765*a6b803b3SArd Biesheuvel	mov	r1,#1
766*a6b803b3SArd Biesheuvel	and	r5,r5,#0x03ffffff
767*a6b803b3SArd Biesheuvel	str	r1,[$ctx,#36]		@ set is_base2_26
768*a6b803b3SArd Biesheuvel
769*a6b803b3SArd Biesheuvel	vmov.32	$D0#lo[0],r2
770*a6b803b3SArd Biesheuvel	vmov.32	$D1#lo[0],r3
771*a6b803b3SArd Biesheuvel	vmov.32	$D2#lo[0],r4
772*a6b803b3SArd Biesheuvel	vmov.32	$D3#lo[0],r5
773*a6b803b3SArd Biesheuvel	vmov.32	$D4#lo[0],r6
774*a6b803b3SArd Biesheuvel	adr	$zeros,.Lzeros
775*a6b803b3SArd Biesheuvel
776*a6b803b3SArd Biesheuvel	ldmia	sp!,{r1-r3,lr}
777*a6b803b3SArd Biesheuvel	b	.Lhash_loaded
778*a6b803b3SArd Biesheuvel
779*a6b803b3SArd Biesheuvel.align	4
780*a6b803b3SArd Biesheuvel.Lbase2_26_neon:
781*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
782*a6b803b3SArd Biesheuvel	@ load hash value
783*a6b803b3SArd Biesheuvel
784*a6b803b3SArd Biesheuvel	veor		$D0#lo,$D0#lo,$D0#lo
785*a6b803b3SArd Biesheuvel	veor		$D1#lo,$D1#lo,$D1#lo
786*a6b803b3SArd Biesheuvel	veor		$D2#lo,$D2#lo,$D2#lo
787*a6b803b3SArd Biesheuvel	veor		$D3#lo,$D3#lo,$D3#lo
788*a6b803b3SArd Biesheuvel	veor		$D4#lo,$D4#lo,$D4#lo
789*a6b803b3SArd Biesheuvel	vld4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
790*a6b803b3SArd Biesheuvel	adr		$zeros,.Lzeros
791*a6b803b3SArd Biesheuvel	vld1.32		{$D4#lo[0]},[$ctx]
792*a6b803b3SArd Biesheuvel	sub		$ctx,$ctx,#16		@ rewind
793*a6b803b3SArd Biesheuvel
794*a6b803b3SArd Biesheuvel.Lhash_loaded:
795*a6b803b3SArd Biesheuvel	add		$in2,$inp,#32
796*a6b803b3SArd Biesheuvel	mov		$padbit,$padbit,lsl#24
797*a6b803b3SArd Biesheuvel	tst		$len,#31
798*a6b803b3SArd Biesheuvel	beq		.Leven
799*a6b803b3SArd Biesheuvel
800*a6b803b3SArd Biesheuvel	vld4.32		{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
801*a6b803b3SArd Biesheuvel	vmov.32		$H4#lo[0],$padbit
802*a6b803b3SArd Biesheuvel	sub		$len,$len,#16
803*a6b803b3SArd Biesheuvel	add		$in2,$inp,#32
804*a6b803b3SArd Biesheuvel
805*a6b803b3SArd Biesheuvel# ifdef	__ARMEB__
806*a6b803b3SArd Biesheuvel	vrev32.8	$H0,$H0
807*a6b803b3SArd Biesheuvel	vrev32.8	$H3,$H3
808*a6b803b3SArd Biesheuvel	vrev32.8	$H1,$H1
809*a6b803b3SArd Biesheuvel	vrev32.8	$H2,$H2
810*a6b803b3SArd Biesheuvel# endif
811*a6b803b3SArd Biesheuvel	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
812*a6b803b3SArd Biesheuvel	vshl.u32	$H3#lo,$H3#lo,#18
813*a6b803b3SArd Biesheuvel
814*a6b803b3SArd Biesheuvel	vsri.u32	$H3#lo,$H2#lo,#14
815*a6b803b3SArd Biesheuvel	vshl.u32	$H2#lo,$H2#lo,#12
816*a6b803b3SArd Biesheuvel	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi
817*a6b803b3SArd Biesheuvel
818*a6b803b3SArd Biesheuvel	vbic.i32	$H3#lo,#0xfc000000
819*a6b803b3SArd Biesheuvel	vsri.u32	$H2#lo,$H1#lo,#20
820*a6b803b3SArd Biesheuvel	vshl.u32	$H1#lo,$H1#lo,#6
821*a6b803b3SArd Biesheuvel
822*a6b803b3SArd Biesheuvel	vbic.i32	$H2#lo,#0xfc000000
823*a6b803b3SArd Biesheuvel	vsri.u32	$H1#lo,$H0#lo,#26
824*a6b803b3SArd Biesheuvel	vadd.i32	$H3#hi,$H3#lo,$D3#lo
825*a6b803b3SArd Biesheuvel
826*a6b803b3SArd Biesheuvel	vbic.i32	$H0#lo,#0xfc000000
827*a6b803b3SArd Biesheuvel	vbic.i32	$H1#lo,#0xfc000000
828*a6b803b3SArd Biesheuvel	vadd.i32	$H2#hi,$H2#lo,$D2#lo
829*a6b803b3SArd Biesheuvel
830*a6b803b3SArd Biesheuvel	vadd.i32	$H0#hi,$H0#lo,$D0#lo
831*a6b803b3SArd Biesheuvel	vadd.i32	$H1#hi,$H1#lo,$D1#lo
832*a6b803b3SArd Biesheuvel
833*a6b803b3SArd Biesheuvel	mov		$tbl1,$zeros
834*a6b803b3SArd Biesheuvel	add		$tbl0,$ctx,#48
835*a6b803b3SArd Biesheuvel
836*a6b803b3SArd Biesheuvel	cmp		$len,$len
837*a6b803b3SArd Biesheuvel	b		.Long_tail
838*a6b803b3SArd Biesheuvel
839*a6b803b3SArd Biesheuvel.align	4
840*a6b803b3SArd Biesheuvel.Leven:
841*a6b803b3SArd Biesheuvel	subs		$len,$len,#64
842*a6b803b3SArd Biesheuvel	it		lo
843*a6b803b3SArd Biesheuvel	movlo		$in2,$zeros
844*a6b803b3SArd Biesheuvel
845*a6b803b3SArd Biesheuvel	vmov.i32	$H4,#1<<24		@ padbit, yes, always
846*a6b803b3SArd Biesheuvel	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
847*a6b803b3SArd Biesheuvel	add		$inp,$inp,#64
848*a6b803b3SArd Biesheuvel	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
849*a6b803b3SArd Biesheuvel	add		$in2,$in2,#64
850*a6b803b3SArd Biesheuvel	itt		hi
851*a6b803b3SArd Biesheuvel	addhi		$tbl1,$ctx,#(48+1*9*4)
852*a6b803b3SArd Biesheuvel	addhi		$tbl0,$ctx,#(48+3*9*4)
853*a6b803b3SArd Biesheuvel
854*a6b803b3SArd Biesheuvel# ifdef	__ARMEB__
855*a6b803b3SArd Biesheuvel	vrev32.8	$H0,$H0
856*a6b803b3SArd Biesheuvel	vrev32.8	$H3,$H3
857*a6b803b3SArd Biesheuvel	vrev32.8	$H1,$H1
858*a6b803b3SArd Biesheuvel	vrev32.8	$H2,$H2
859*a6b803b3SArd Biesheuvel# endif
860*a6b803b3SArd Biesheuvel	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
861*a6b803b3SArd Biesheuvel	vshl.u32	$H3,$H3,#18
862*a6b803b3SArd Biesheuvel
863*a6b803b3SArd Biesheuvel	vsri.u32	$H3,$H2,#14
864*a6b803b3SArd Biesheuvel	vshl.u32	$H2,$H2,#12
865*a6b803b3SArd Biesheuvel
866*a6b803b3SArd Biesheuvel	vbic.i32	$H3,#0xfc000000
867*a6b803b3SArd Biesheuvel	vsri.u32	$H2,$H1,#20
868*a6b803b3SArd Biesheuvel	vshl.u32	$H1,$H1,#6
869*a6b803b3SArd Biesheuvel
870*a6b803b3SArd Biesheuvel	vbic.i32	$H2,#0xfc000000
871*a6b803b3SArd Biesheuvel	vsri.u32	$H1,$H0,#26
872*a6b803b3SArd Biesheuvel
873*a6b803b3SArd Biesheuvel	vbic.i32	$H0,#0xfc000000
874*a6b803b3SArd Biesheuvel	vbic.i32	$H1,#0xfc000000
875*a6b803b3SArd Biesheuvel
876*a6b803b3SArd Biesheuvel	bls		.Lskip_loop
877*a6b803b3SArd Biesheuvel
878*a6b803b3SArd Biesheuvel	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
879*a6b803b3SArd Biesheuvel	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
880*a6b803b3SArd Biesheuvel	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
881*a6b803b3SArd Biesheuvel	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
882*a6b803b3SArd Biesheuvel	b		.Loop_neon
883*a6b803b3SArd Biesheuvel
884*a6b803b3SArd Biesheuvel.align	5
885*a6b803b3SArd Biesheuvel.Loop_neon:
886*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
887*a6b803b3SArd Biesheuvel	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
888*a6b803b3SArd Biesheuvel	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
889*a6b803b3SArd Biesheuvel	@   \___________________/
890*a6b803b3SArd Biesheuvel	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
891*a6b803b3SArd Biesheuvel	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
892*a6b803b3SArd Biesheuvel	@   \___________________/ \____________________/
893*a6b803b3SArd Biesheuvel	@
894*a6b803b3SArd Biesheuvel	@ Note that we start with inp[2:3]*r^2. This is because it
895*a6b803b3SArd Biesheuvel	@ doesn't depend on reduction in previous iteration.
896*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
897*a6b803b3SArd Biesheuvel	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
898*a6b803b3SArd Biesheuvel	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
899*a6b803b3SArd Biesheuvel	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
900*a6b803b3SArd Biesheuvel	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
901*a6b803b3SArd Biesheuvel	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
902*a6b803b3SArd Biesheuvel
903*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
904*a6b803b3SArd Biesheuvel	@ inp[2:3]*r^2
905*a6b803b3SArd Biesheuvel
906*a6b803b3SArd Biesheuvel	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
907*a6b803b3SArd Biesheuvel	vmull.u32	$D2,$H2#hi,${R0}[1]
908*a6b803b3SArd Biesheuvel	vadd.i32	$H0#lo,$H0#lo,$D0#lo
909*a6b803b3SArd Biesheuvel	vmull.u32	$D0,$H0#hi,${R0}[1]
910*a6b803b3SArd Biesheuvel	vadd.i32	$H3#lo,$H3#lo,$D3#lo
911*a6b803b3SArd Biesheuvel	vmull.u32	$D3,$H3#hi,${R0}[1]
912*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H1#hi,${R1}[1]
913*a6b803b3SArd Biesheuvel	vadd.i32	$H1#lo,$H1#lo,$D1#lo
914*a6b803b3SArd Biesheuvel	vmull.u32	$D1,$H1#hi,${R0}[1]
915*a6b803b3SArd Biesheuvel
916*a6b803b3SArd Biesheuvel	vadd.i32	$H4#lo,$H4#lo,$D4#lo
917*a6b803b3SArd Biesheuvel	vmull.u32	$D4,$H4#hi,${R0}[1]
918*a6b803b3SArd Biesheuvel	subs		$len,$len,#64
919*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H4#hi,${S1}[1]
920*a6b803b3SArd Biesheuvel	it		lo
921*a6b803b3SArd Biesheuvel	movlo		$in2,$zeros
922*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H2#hi,${R1}[1]
923*a6b803b3SArd Biesheuvel	vld1.32		${S4}[1],[$tbl1,:32]
924*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H0#hi,${R1}[1]
925*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H3#hi,${R1}[1]
926*a6b803b3SArd Biesheuvel
927*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H3#hi,${S2}[1]
928*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H1#hi,${R2}[1]
929*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H2#hi,${R2}[1]
930*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H4#hi,${S2}[1]
931*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H0#hi,${R2}[1]
932*a6b803b3SArd Biesheuvel
933*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H0#hi,${R3}[1]
934*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H2#hi,${S3}[1]
935*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H1#hi,${R3}[1]
936*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H3#hi,${S3}[1]
937*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H4#hi,${S3}[1]
938*a6b803b3SArd Biesheuvel
939*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H4#hi,${S4}[1]
940*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H1#hi,${S4}[1]
941*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H0#hi,${R4}[1]
942*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H2#hi,${S4}[1]
943*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H3#hi,${S4}[1]
944*a6b803b3SArd Biesheuvel
945*a6b803b3SArd Biesheuvel	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
946*a6b803b3SArd Biesheuvel	add		$in2,$in2,#64
947*a6b803b3SArd Biesheuvel
948*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
949*a6b803b3SArd Biesheuvel	@ (hash+inp[0:1])*r^4 and accumulate
950*a6b803b3SArd Biesheuvel
951*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H3#lo,${R0}[0]
952*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H0#lo,${R0}[0]
953*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H4#lo,${R0}[0]
954*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H1#lo,${R0}[0]
955*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H2#lo,${R0}[0]
956*a6b803b3SArd Biesheuvel	vld1.32		${S4}[0],[$tbl0,:32]
957*a6b803b3SArd Biesheuvel
958*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H2#lo,${R1}[0]
959*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H4#lo,${S1}[0]
960*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H3#lo,${R1}[0]
961*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H0#lo,${R1}[0]
962*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H1#lo,${R1}[0]
963*a6b803b3SArd Biesheuvel
964*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H1#lo,${R2}[0]
965*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H3#lo,${S2}[0]
966*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H2#lo,${R2}[0]
967*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H4#lo,${S2}[0]
968*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H0#lo,${R2}[0]
969*a6b803b3SArd Biesheuvel
970*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H0#lo,${R3}[0]
971*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H2#lo,${S3}[0]
972*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H1#lo,${R3}[0]
973*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H3#lo,${S3}[0]
974*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H4#lo,${S4}[0]
975*a6b803b3SArd Biesheuvel
976*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H4#lo,${S3}[0]
977*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H1#lo,${S4}[0]
978*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H0#lo,${R4}[0]
979*a6b803b3SArd Biesheuvel	vmov.i32	$H4,#1<<24		@ padbit, yes, always
980*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H2#lo,${S4}[0]
981*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H3#lo,${S4}[0]
982*a6b803b3SArd Biesheuvel
983*a6b803b3SArd Biesheuvel	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
984*a6b803b3SArd Biesheuvel	add		$inp,$inp,#64
985*a6b803b3SArd Biesheuvel# ifdef	__ARMEB__
986*a6b803b3SArd Biesheuvel	vrev32.8	$H0,$H0
987*a6b803b3SArd Biesheuvel	vrev32.8	$H1,$H1
988*a6b803b3SArd Biesheuvel	vrev32.8	$H2,$H2
989*a6b803b3SArd Biesheuvel	vrev32.8	$H3,$H3
990*a6b803b3SArd Biesheuvel# endif
991*a6b803b3SArd Biesheuvel
992*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
993*a6b803b3SArd Biesheuvel	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
994*a6b803b3SArd Biesheuvel	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
995*a6b803b3SArd Biesheuvel
996*a6b803b3SArd Biesheuvel	vshr.u64	$T0,$D3,#26
997*a6b803b3SArd Biesheuvel	vmovn.i64	$D3#lo,$D3
998*a6b803b3SArd Biesheuvel	 vshr.u64	$T1,$D0,#26
999*a6b803b3SArd Biesheuvel	 vmovn.i64	$D0#lo,$D0
1000*a6b803b3SArd Biesheuvel	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
1001*a6b803b3SArd Biesheuvel	vbic.i32	$D3#lo,#0xfc000000
1002*a6b803b3SArd Biesheuvel	  vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
1003*a6b803b3SArd Biesheuvel	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
1004*a6b803b3SArd Biesheuvel	  vshl.u32	$H3,$H3,#18
1005*a6b803b3SArd Biesheuvel	 vbic.i32	$D0#lo,#0xfc000000
1006*a6b803b3SArd Biesheuvel
1007*a6b803b3SArd Biesheuvel	vshrn.u64	$T0#lo,$D4,#26
1008*a6b803b3SArd Biesheuvel	vmovn.i64	$D4#lo,$D4
1009*a6b803b3SArd Biesheuvel	 vshr.u64	$T1,$D1,#26
1010*a6b803b3SArd Biesheuvel	 vmovn.i64	$D1#lo,$D1
1011*a6b803b3SArd Biesheuvel	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
1012*a6b803b3SArd Biesheuvel	  vsri.u32	$H3,$H2,#14
1013*a6b803b3SArd Biesheuvel	vbic.i32	$D4#lo,#0xfc000000
1014*a6b803b3SArd Biesheuvel	  vshl.u32	$H2,$H2,#12
1015*a6b803b3SArd Biesheuvel	 vbic.i32	$D1#lo,#0xfc000000
1016*a6b803b3SArd Biesheuvel
1017*a6b803b3SArd Biesheuvel	vadd.i32	$D0#lo,$D0#lo,$T0#lo
1018*a6b803b3SArd Biesheuvel	vshl.u32	$T0#lo,$T0#lo,#2
1019*a6b803b3SArd Biesheuvel	  vbic.i32	$H3,#0xfc000000
1020*a6b803b3SArd Biesheuvel	 vshrn.u64	$T1#lo,$D2,#26
1021*a6b803b3SArd Biesheuvel	 vmovn.i64	$D2#lo,$D2
1022*a6b803b3SArd Biesheuvel	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
1023*a6b803b3SArd Biesheuvel	  vsri.u32	$H2,$H1,#20
1024*a6b803b3SArd Biesheuvel	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
1025*a6b803b3SArd Biesheuvel	  vshl.u32	$H1,$H1,#6
1026*a6b803b3SArd Biesheuvel	 vbic.i32	$D2#lo,#0xfc000000
1027*a6b803b3SArd Biesheuvel	  vbic.i32	$H2,#0xfc000000
1028*a6b803b3SArd Biesheuvel
1029*a6b803b3SArd Biesheuvel	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
1030*a6b803b3SArd Biesheuvel	vmovn.i64	$D0#lo,$D0
1031*a6b803b3SArd Biesheuvel	  vsri.u32	$H1,$H0,#26
1032*a6b803b3SArd Biesheuvel	  vbic.i32	$H0,#0xfc000000
1033*a6b803b3SArd Biesheuvel	 vshr.u32	$T1#lo,$D3#lo,#26
1034*a6b803b3SArd Biesheuvel	 vbic.i32	$D3#lo,#0xfc000000
1035*a6b803b3SArd Biesheuvel	vbic.i32	$D0#lo,#0xfc000000
1036*a6b803b3SArd Biesheuvel	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
1037*a6b803b3SArd Biesheuvel	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
1038*a6b803b3SArd Biesheuvel	  vbic.i32	$H1,#0xfc000000
1039*a6b803b3SArd Biesheuvel
1040*a6b803b3SArd Biesheuvel	bhi		.Loop_neon
1041*a6b803b3SArd Biesheuvel
1042*a6b803b3SArd Biesheuvel.Lskip_loop:
1043*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1044*a6b803b3SArd Biesheuvel	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1045*a6b803b3SArd Biesheuvel
1046*a6b803b3SArd Biesheuvel	add		$tbl1,$ctx,#(48+0*9*4)
1047*a6b803b3SArd Biesheuvel	add		$tbl0,$ctx,#(48+1*9*4)
1048*a6b803b3SArd Biesheuvel	adds		$len,$len,#32
1049*a6b803b3SArd Biesheuvel	it		ne
1050*a6b803b3SArd Biesheuvel	movne		$len,#0
1051*a6b803b3SArd Biesheuvel	bne		.Long_tail
1052*a6b803b3SArd Biesheuvel
1053*a6b803b3SArd Biesheuvel	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
1054*a6b803b3SArd Biesheuvel	vadd.i32	$H0#hi,$H0#lo,$D0#lo
1055*a6b803b3SArd Biesheuvel	vadd.i32	$H3#hi,$H3#lo,$D3#lo
1056*a6b803b3SArd Biesheuvel	vadd.i32	$H1#hi,$H1#lo,$D1#lo
1057*a6b803b3SArd Biesheuvel	vadd.i32	$H4#hi,$H4#lo,$D4#lo
1058*a6b803b3SArd Biesheuvel
1059*a6b803b3SArd Biesheuvel.Long_tail:
1060*a6b803b3SArd Biesheuvel	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
1061*a6b803b3SArd Biesheuvel	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2
1062*a6b803b3SArd Biesheuvel
1063*a6b803b3SArd Biesheuvel	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
1064*a6b803b3SArd Biesheuvel	vmull.u32	$D2,$H2#hi,$R0
1065*a6b803b3SArd Biesheuvel	vadd.i32	$H0#lo,$H0#lo,$D0#lo
1066*a6b803b3SArd Biesheuvel	vmull.u32	$D0,$H0#hi,$R0
1067*a6b803b3SArd Biesheuvel	vadd.i32	$H3#lo,$H3#lo,$D3#lo
1068*a6b803b3SArd Biesheuvel	vmull.u32	$D3,$H3#hi,$R0
1069*a6b803b3SArd Biesheuvel	vadd.i32	$H1#lo,$H1#lo,$D1#lo
1070*a6b803b3SArd Biesheuvel	vmull.u32	$D1,$H1#hi,$R0
1071*a6b803b3SArd Biesheuvel	vadd.i32	$H4#lo,$H4#lo,$D4#lo
1072*a6b803b3SArd Biesheuvel	vmull.u32	$D4,$H4#hi,$R0
1073*a6b803b3SArd Biesheuvel
1074*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H4#hi,$S1
1075*a6b803b3SArd Biesheuvel	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1076*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H2#hi,$R1
1077*a6b803b3SArd Biesheuvel	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1078*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H0#hi,$R1
1079*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H3#hi,$R1
1080*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H1#hi,$R1
1081*a6b803b3SArd Biesheuvel
1082*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H1#hi,$R2
1083*a6b803b3SArd Biesheuvel	vld1.32		${S4}[1],[$tbl1,:32]
1084*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H3#hi,$S2
1085*a6b803b3SArd Biesheuvel	vld1.32		${S4}[0],[$tbl0,:32]
1086*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H2#hi,$R2
1087*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H4#hi,$S2
1088*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H0#hi,$R2
1089*a6b803b3SArd Biesheuvel
1090*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H0#hi,$R3
1091*a6b803b3SArd Biesheuvel	 it		ne
1092*a6b803b3SArd Biesheuvel	 addne		$tbl1,$ctx,#(48+2*9*4)
1093*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H2#hi,$S3
1094*a6b803b3SArd Biesheuvel	 it		ne
1095*a6b803b3SArd Biesheuvel	 addne		$tbl0,$ctx,#(48+3*9*4)
1096*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H1#hi,$R3
1097*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H3#hi,$S3
1098*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H4#hi,$S3
1099*a6b803b3SArd Biesheuvel
1100*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H4#hi,$S4
1101*a6b803b3SArd Biesheuvel	 vorn		$MASK,$MASK,$MASK	@ all-ones, can be redundant
1102*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H1#hi,$S4
1103*a6b803b3SArd Biesheuvel	 vshr.u64	$MASK,$MASK,#38
1104*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H0#hi,$R4
1105*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H2#hi,$S4
1106*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H3#hi,$S4
1107*a6b803b3SArd Biesheuvel
1108*a6b803b3SArd Biesheuvel	beq		.Lshort_tail
1109*a6b803b3SArd Biesheuvel
1110*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1111*a6b803b3SArd Biesheuvel	@ (hash+inp[0:1])*r^4:r^3 and accumulate
1112*a6b803b3SArd Biesheuvel
1113*a6b803b3SArd Biesheuvel	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
1114*a6b803b3SArd Biesheuvel	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
1115*a6b803b3SArd Biesheuvel
1116*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H2#lo,$R0
1117*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H0#lo,$R0
1118*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H3#lo,$R0
1119*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H1#lo,$R0
1120*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H4#lo,$R0
1121*a6b803b3SArd Biesheuvel
1122*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H4#lo,$S1
1123*a6b803b3SArd Biesheuvel	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1124*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H2#lo,$R1
1125*a6b803b3SArd Biesheuvel	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1126*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H0#lo,$R1
1127*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H3#lo,$R1
1128*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H1#lo,$R1
1129*a6b803b3SArd Biesheuvel
1130*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H1#lo,$R2
1131*a6b803b3SArd Biesheuvel	vld1.32		${S4}[1],[$tbl1,:32]
1132*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H3#lo,$S2
1133*a6b803b3SArd Biesheuvel	vld1.32		${S4}[0],[$tbl0,:32]
1134*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H2#lo,$R2
1135*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H4#lo,$S2
1136*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H0#lo,$R2
1137*a6b803b3SArd Biesheuvel
1138*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H0#lo,$R3
1139*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H2#lo,$S3
1140*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H1#lo,$R3
1141*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H3#lo,$S3
1142*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H4#lo,$S3
1143*a6b803b3SArd Biesheuvel
1144*a6b803b3SArd Biesheuvel	vmlal.u32	$D3,$H4#lo,$S4
1145*a6b803b3SArd Biesheuvel	 vorn		$MASK,$MASK,$MASK	@ all-ones
1146*a6b803b3SArd Biesheuvel	vmlal.u32	$D0,$H1#lo,$S4
1147*a6b803b3SArd Biesheuvel	 vshr.u64	$MASK,$MASK,#38
1148*a6b803b3SArd Biesheuvel	vmlal.u32	$D4,$H0#lo,$R4
1149*a6b803b3SArd Biesheuvel	vmlal.u32	$D1,$H2#lo,$S4
1150*a6b803b3SArd Biesheuvel	vmlal.u32	$D2,$H3#lo,$S4
1151*a6b803b3SArd Biesheuvel
1152*a6b803b3SArd Biesheuvel.Lshort_tail:
1153*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1154*a6b803b3SArd Biesheuvel	@ horizontal addition
1155*a6b803b3SArd Biesheuvel
1156*a6b803b3SArd Biesheuvel	vadd.i64	$D3#lo,$D3#lo,$D3#hi
1157*a6b803b3SArd Biesheuvel	vadd.i64	$D0#lo,$D0#lo,$D0#hi
1158*a6b803b3SArd Biesheuvel	vadd.i64	$D4#lo,$D4#lo,$D4#hi
1159*a6b803b3SArd Biesheuvel	vadd.i64	$D1#lo,$D1#lo,$D1#hi
1160*a6b803b3SArd Biesheuvel	vadd.i64	$D2#lo,$D2#lo,$D2#hi
1161*a6b803b3SArd Biesheuvel
1162*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1163*a6b803b3SArd Biesheuvel	@ lazy reduction, but without narrowing
1164*a6b803b3SArd Biesheuvel
1165*a6b803b3SArd Biesheuvel	vshr.u64	$T0,$D3,#26
1166*a6b803b3SArd Biesheuvel	vand.i64	$D3,$D3,$MASK
1167*a6b803b3SArd Biesheuvel	 vshr.u64	$T1,$D0,#26
1168*a6b803b3SArd Biesheuvel	 vand.i64	$D0,$D0,$MASK
1169*a6b803b3SArd Biesheuvel	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
1170*a6b803b3SArd Biesheuvel	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
1171*a6b803b3SArd Biesheuvel
1172*a6b803b3SArd Biesheuvel	vshr.u64	$T0,$D4,#26
1173*a6b803b3SArd Biesheuvel	vand.i64	$D4,$D4,$MASK
1174*a6b803b3SArd Biesheuvel	 vshr.u64	$T1,$D1,#26
1175*a6b803b3SArd Biesheuvel	 vand.i64	$D1,$D1,$MASK
1176*a6b803b3SArd Biesheuvel	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
1177*a6b803b3SArd Biesheuvel
1178*a6b803b3SArd Biesheuvel	vadd.i64	$D0,$D0,$T0
1179*a6b803b3SArd Biesheuvel	vshl.u64	$T0,$T0,#2
1180*a6b803b3SArd Biesheuvel	 vshr.u64	$T1,$D2,#26
1181*a6b803b3SArd Biesheuvel	 vand.i64	$D2,$D2,$MASK
1182*a6b803b3SArd Biesheuvel	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
1183*a6b803b3SArd Biesheuvel	 vadd.i64	$D3,$D3,$T1		@ h2 -> h3
1184*a6b803b3SArd Biesheuvel
1185*a6b803b3SArd Biesheuvel	vshr.u64	$T0,$D0,#26
1186*a6b803b3SArd Biesheuvel	vand.i64	$D0,$D0,$MASK
1187*a6b803b3SArd Biesheuvel	 vshr.u64	$T1,$D3,#26
1188*a6b803b3SArd Biesheuvel	 vand.i64	$D3,$D3,$MASK
1189*a6b803b3SArd Biesheuvel	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
1190*a6b803b3SArd Biesheuvel	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4
1191*a6b803b3SArd Biesheuvel
1192*a6b803b3SArd Biesheuvel	cmp		$len,#0
1193*a6b803b3SArd Biesheuvel	bne		.Leven
1194*a6b803b3SArd Biesheuvel
1195*a6b803b3SArd Biesheuvel	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1196*a6b803b3SArd Biesheuvel	@ store hash value
1197*a6b803b3SArd Biesheuvel
1198*a6b803b3SArd Biesheuvel	vst4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
1199*a6b803b3SArd Biesheuvel	vst1.32		{$D4#lo[0]},[$ctx]
1200*a6b803b3SArd Biesheuvel
1201*a6b803b3SArd Biesheuvel	vldmia	sp!,{d8-d15}			@ epilogue
1202*a6b803b3SArd Biesheuvel	ldmia	sp!,{r4-r7}
1203*a6b803b3SArd Biesheuvel	ret					@ bx	lr
1204*a6b803b3SArd Biesheuvel.size	poly1305_blocks_neon,.-poly1305_blocks_neon
1205*a6b803b3SArd Biesheuvel
1206*a6b803b3SArd Biesheuvel.align	5
1207*a6b803b3SArd Biesheuvel.Lzeros:
1208*a6b803b3SArd Biesheuvel.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1209*a6b803b3SArd Biesheuvel#ifndef	__KERNEL__
1210*a6b803b3SArd Biesheuvel.LOPENSSL_armcap:
1211*a6b803b3SArd Biesheuvel# ifdef	_WIN32
1212*a6b803b3SArd Biesheuvel.word	OPENSSL_armcap_P
1213*a6b803b3SArd Biesheuvel# else
1214*a6b803b3SArd Biesheuvel.word	OPENSSL_armcap_P-.Lpoly1305_init
1215*a6b803b3SArd Biesheuvel# endif
1216*a6b803b3SArd Biesheuvel.comm	OPENSSL_armcap_P,4,4
1217*a6b803b3SArd Biesheuvel.hidden	OPENSSL_armcap_P
1218*a6b803b3SArd Biesheuvel#endif
1219*a6b803b3SArd Biesheuvel#endif
1220*a6b803b3SArd Biesheuvel___
1221*a6b803b3SArd Biesheuvel}	}
1222*a6b803b3SArd Biesheuvel$code.=<<___;
1223*a6b803b3SArd Biesheuvel.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
1224*a6b803b3SArd Biesheuvel.align	2
1225*a6b803b3SArd Biesheuvel___
1226*a6b803b3SArd Biesheuvel
1227*a6b803b3SArd Biesheuvelforeach (split("\n",$code)) {
1228*a6b803b3SArd Biesheuvel	s/\`([^\`]*)\`/eval $1/geo;
1229*a6b803b3SArd Biesheuvel
1230*a6b803b3SArd Biesheuvel	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
1231*a6b803b3SArd Biesheuvel	s/\bret\b/bx	lr/go						or
1232*a6b803b3SArd Biesheuvel	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
1233*a6b803b3SArd Biesheuvel
1234*a6b803b3SArd Biesheuvel	print $_,"\n";
1235*a6b803b3SArd Biesheuvel}
1236*a6b803b3SArd Biesheuvelclose STDOUT; # enforce flush
1237