#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.35/+130%		3.00
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
# (**)	these are trade-off results, they can be improved by ~8% but at
#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
#	to improve Cortex-A9 result, but then A5/A7 lose more than 20%;

# Command line handling.
#
# The first argument is the "flavour" unless it already looks like a
# file name (something of the form name.ext), in which case it is taken
# as the output file and no flavour is set.  Otherwise the remaining
# arguments are consumed until one looks like a file name; that one
# becomes the output file.  $output stays false when no such argument
# is present.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

# Output setup.
#
# With a flavour other than "void" the generated text is piped through
# arm-xlate.pl, looked up next to this script and then in
# ../../perlasm/.  Without one it is written straight to $output.  When
# no output file was named STDOUT is left exactly as inherited, so the
# script can still be run with its output redirected by the caller.
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Quote the interpreter, translator and output paths so that paths
    # containing spaces survive the shell, and only pass an output
    # argument when one was actually given.
    my $out_arg = (defined $output && $output ne "") ? " \"$output\"" : "";
    open STDOUT,"| \"$^X\" \"$xlate\" $flavour$out_arg"
	or die "can't call $xlate: $!";
} elsif (defined $output && $output ne "") {
    # Three-argument open: the file name is never parsed for a mode.
    open STDOUT,">",$output
	or die "can't open $output: $!";
}

# Argument registers of poly1305_init and poly1305_blocks, r0-r3 in
# this order.
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

# poly1305_init
#
# Context words touched here, as byte offsets from the incoming $ctx:
#    0..16  hash value (five words), zeroed
#   20..32  key (four words), stored after clamping
#   36      is_base2_26 flag, cleared
#   48      set to -1 ("impossible key power value") when
#           __ARM_MAX_ARCH__>=7; poly1305_init_neon compares the word
#           at this offset against -1
#
# A NULL key pointer ($inp) returns 0 right after the zeroing.
# Otherwise the 16 key bytes are assembled byte by byte into four
# little-endian words; the first is masked with 0x0fffffff and the
# other three with 0x0ffffffc.
#
# Outside the kernel with __ARM_MAX_ARCH__>=7 the routine tests the
# ARMV7_NEON bit of OPENSSL_armcap_P, writes the addresses of the
# selected blocks routine and of poly1305_emit to the two-word table
# at r2 and returns 1.  In every other configuration it returns 0.
#
# Under __KERNEL__ the three entry points are renamed with an _arm
# suffix via #define and poly1305_blocks_neon is exported.
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define poly1305_init   poly1305_init_arm
# define poly1305_blocks poly1305_blocks_arm
# define poly1305_emit   poly1305_emit_arm
.globl	poly1305_blocks_neon
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4-r11}

	eor	r3,r3,r3
	cmp	$inp,#0
	str	r3,[$ctx,#0]		@ zero hash value
	str	r3,[$ctx,#4]
	str	r3,[$ctx,#8]
	str	r3,[$ctx,#12]
	str	r3,[$ctx,#16]
	str	r3,[$ctx,#36]		@ clear is_base2_26
	add	$ctx,$ctx,#20

#ifdef __thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if __ARM_MAX_ARCH__>=7
	mov	r3,#-1
	str	r3,[$ctx,#28]		@ impossible key power value
# ifndef __KERNEL__
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
# endif
#endif
	ldrb	r4,[$inp,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[$inp,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[$inp,#2]
	ldrb	r7,[$inp,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[$inp,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[$inp,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[$inp,#6]
	and	r4,r4,r10

#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[$inp,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[$inp,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[$inp,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[$inp,#10]
	and	r5,r5,r3

#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef __thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	it	ne
	movne	r11,r9
	adr	r12,.Lpoly1305_emit
	orr	r11,r11,#1		@ thumb-ify addresses
	orr	r12,r12,#1
# else
	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	ite	eq
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[$inp,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[$inp,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[$inp,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[$inp,#14]
	and	r6,r6,r3

	ldrb	r10,[$inp,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[$ctx,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[$ctx,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[$ctx,#8]
	and	r7,r7,r3
	str	r7,[$ctx,#12]
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
___

# poly1305_blocks
#
# Register plan: $h0-$h4 = r4-r8 (hash), $r0-$r3 = r9-r12 (key).
# $s1-$s3 are aliases of $r1-$r3: each s_i = r_i + (r_i >> 2) is
# computed into the register holding r_i, which is why r1-r3 are
# parked on the stack at sp+20/24/28 and reloaded inside the loop.
#
# $len is rounded down to a multiple of 16; the routine returns at once
# when nothing is left.  Stack scratch area (32 bytes):
#   sp+0, sp+4  future h0, h1
#   sp+8        input pointer
#   sp+12       context pointer (already advanced by 20)
#   sp+16       end-of-input pointer
#   sp+20..28   r1, r2, r3
#
# For __ARM_ARCH__>=7 the hash is converted from base 2^26 to base 2^32
# when the is_base2_26 flag at ctx+36 is non-zero, and the flag is
# cleared in either case.
#
# NOTE(review): "addhi" takes its flags from "cmp $padbit,#0" on the
# first iteration only; on later iterations they come from the
# end-of-loop "cmp r0,lr", which is always "hi" when the loop repeats.
# A zero padbit therefore appears to be honoured for the first block
# only -- confirm that callers pass a single block in that case.
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);

$code.=<<___;
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3-r11,lr}

	ands	$len,$len,#-16
	beq	.Lno_data

	add	$len,$len,$inp		@ end pointer
	sub	sp,sp,#32

#if __ARM_ARCH__<7
	ldmia	$ctx,{$h0-$r3}		@ load context
	add	$ctx,$ctx,#20
	str	$len,[sp,#16]		@ offload stuff
	str	$ctx,[sp,#12]
#else
	ldr	lr,[$ctx,#36]		@ is_base2_26
	ldmia	$ctx!,{$h0-$h4}		@ load hash value
	str	$len,[sp,#16]		@ offload stuff
	str	$ctx,[sp,#12]

	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$r1,$h1,lsr#6
	adcs	$r1,$r1,$h2,lsl#20
	mov	$r2,$h2,lsr#12
	adcs	$r2,$r2,$h3,lsl#14
	mov	$r3,$h3,lsr#18
	adcs	$r3,$r3,$h4,lsl#8
	mov	$len,#0
	teq	lr,#0
	str	$len,[$ctx,#16]		@ clear is_base2_26
	adc	$len,$len,$h4,lsr#24

	itttt	ne
	movne	$h0,$r0			@ choose between radixes
	movne	$h1,$r1
	movne	$h2,$r2
	movne	$h3,$r3
	ldmia	$ctx,{$r0-$r3}		@ load key
	it	ne
	movne	$h4,$len
#endif

	mov	lr,$inp
	cmp	$padbit,#0
	str	$r1,[sp,#20]
	str	$r2,[sp,#24]
	str	$r3,[sp,#28]
	b	.Loop

.align	4
.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef __thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	$h0,$h0,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	$h1,$h1,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	$h2,$h2,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	$s1,$r1,$r1,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
	it	hi
	addhi	$h4,$h4,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef __ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	$h0,$h0,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	$h1,$h1,r1
	add	$s1,$r1,$r1,lsr#2
	adcs	$h2,$h2,r2
#endif
	add	$s2,$r2,$r2,lsr#2
	adcs	$h3,$h3,r3
	add	$s3,$r3,$r3,lsr#2

	umull	r2,r3,$h1,$r0
	adc	$h4,$h4,#0
	umull	r0,r1,$h0,$r0
	umlal	r2,r3,$h4,$s1
	umlal	r0,r1,$h3,$s1
	ldr	$r1,[sp,#20]		@ reload $r1
	umlal	r2,r3,$h2,$s3
	umlal	r0,r1,$h1,$s3
	umlal	r2,r3,$h3,$s2
	umlal	r0,r1,$h2,$s2
	umlal	r2,r3,$h0,$r1
	str	r0,[sp,#0]		@ future $h0
	mul	r0,$s2,$h4
	ldr	$r2,[sp,#24]		@ reload $r2
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future $h2
	str	r2,[sp,#4]		@ future $h1

	mul	r2,$s3,$h4
	eor	r3,r3,r3
	umlal	r0,r1,$h3,$s3
	ldr	$r3,[sp,#28]		@ reload $r3
	umlal	r2,r3,$h3,$r0
	umlal	r0,r1,$h2,$r0
	umlal	r2,r3,$h2,$r1
	umlal	r0,r1,$h1,$r1
	umlal	r2,r3,$h1,$r2
	umlal	r0,r1,$h0,$r2
	umlal	r2,r3,$h0,$r3
	ldr	$h0,[sp,#0]
	mul	$h4,$r0,$h4
	ldr	$h1,[sp,#4]

	adds	$h2,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	$h3,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	$h4,$h4,r3		@ h4+=d3>>32

	and	r1,$h4,#-4
	and	$h4,$h4,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	$h0,$h0,r1
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	$ctx,[sp,#12]
	add	sp,sp,#32
	stmdb	$ctx,{$h0-$h4}		@ store the result

.Lno_data:
#if __ARM_ARCH__>=5
	ldmia	sp!,{r3-r11,pc}
#else
	ldmia	sp!,{r3-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
___
}

# poly1305_emit
#
# Arguments: $ctx = r0, $mac = r1 (16-byte output), $nonce = r2 (four
# words added to the hash).  $h0-$h4 = r3-r7, $g0-$g3 = r8-r11, and $g4
# reuses r0, so the context pointer is overwritten once the hash (and,
# for __ARM_ARCH__>=7, the is_base2_26 flag) has been loaded.
#
# For __ARM_ARCH__>=7 the hash is first converted from base 2^26 to
# base 2^32 when is_base2_26 is non-zero.  Then g = h + 5 is computed;
# if bit 2 of the top word of g is set, g replaces h.  The nonce is
# added and the low 128 bits are written to $mac: as four words (byte
# swapped under __ARMEB__) for __ARM_ARCH__>=7, byte by byte otherwise.
#
# The scope opened here is not closed before the NEON section below
# opens its own; presumably both are closed later in the file.
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$ctx;

$code.=<<___;
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4-r11}

	ldmia	$ctx,{$h0-$h4}

#if __ARM_ARCH__>=7
	ldr	ip,[$ctx,#36]		@ is_base2_26

	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$g1,$h1,lsr#6
	adcs	$g1,$g1,$h2,lsl#20
	mov	$g2,$h2,lsr#12
	adcs	$g2,$g2,$h3,lsl#14
	mov	$g3,$h3,lsr#18
	adcs	$g3,$g3,$h4,lsl#8
	mov	$g4,#0
	adc	$g4,$g4,$h4,lsr#24

	tst	ip,ip
	itttt	ne
	movne	$h0,$g0
	movne	$h1,$g1
	movne	$h2,$g2
	movne	$h3,$g3
	it	ne
	movne	$h4,$g4
#endif

	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

#ifdef __thumb2__
	it	ne
#endif
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]
#else
	strb	$h0,[$mac,#0]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#4]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#8]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#12]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#1]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#5]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#9]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#13]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#2]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#6]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#10]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#14]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#3]
	strb	$h1,[$mac,#7]
	strb	$h2,[$mac,#11]
	strb	$h3,[$mac,#15]
#endif
	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
___

# NEON section: register names used by the code that follows.
# The first map() yields ten names (d0-d9) for nine variables, so d9 is
# simply discarded: $R0=d0, $R1=d1, $S1=d2, ... $R4=d7, $S4=d8.
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
503*a6b803b3SArd Biesheuvel.fpu neon 504*a6b803b3SArd Biesheuvel 505*a6b803b3SArd Biesheuvel.type poly1305_init_neon,%function 506*a6b803b3SArd Biesheuvel.align 5 507*a6b803b3SArd Biesheuvelpoly1305_init_neon: 508*a6b803b3SArd Biesheuvel.Lpoly1305_init_neon: 509*a6b803b3SArd Biesheuvel ldr r3,[$ctx,#48] @ first table element 510*a6b803b3SArd Biesheuvel cmp r3,#-1 @ is value impossible? 511*a6b803b3SArd Biesheuvel bne .Lno_init_neon 512*a6b803b3SArd Biesheuvel 513*a6b803b3SArd Biesheuvel ldr r4,[$ctx,#20] @ load key base 2^32 514*a6b803b3SArd Biesheuvel ldr r5,[$ctx,#24] 515*a6b803b3SArd Biesheuvel ldr r6,[$ctx,#28] 516*a6b803b3SArd Biesheuvel ldr r7,[$ctx,#32] 517*a6b803b3SArd Biesheuvel 518*a6b803b3SArd Biesheuvel and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 519*a6b803b3SArd Biesheuvel mov r3,r4,lsr#26 520*a6b803b3SArd Biesheuvel mov r4,r5,lsr#20 521*a6b803b3SArd Biesheuvel orr r3,r3,r5,lsl#6 522*a6b803b3SArd Biesheuvel mov r5,r6,lsr#14 523*a6b803b3SArd Biesheuvel orr r4,r4,r6,lsl#12 524*a6b803b3SArd Biesheuvel mov r6,r7,lsr#8 525*a6b803b3SArd Biesheuvel orr r5,r5,r7,lsl#18 526*a6b803b3SArd Biesheuvel and r3,r3,#0x03ffffff 527*a6b803b3SArd Biesheuvel and r4,r4,#0x03ffffff 528*a6b803b3SArd Biesheuvel and r5,r5,#0x03ffffff 529*a6b803b3SArd Biesheuvel 530*a6b803b3SArd Biesheuvel vdup.32 $R0,r2 @ r^1 in both lanes 531*a6b803b3SArd Biesheuvel add r2,r3,r3,lsl#2 @ *5 532*a6b803b3SArd Biesheuvel vdup.32 $R1,r3 533*a6b803b3SArd Biesheuvel add r3,r4,r4,lsl#2 534*a6b803b3SArd Biesheuvel vdup.32 $S1,r2 535*a6b803b3SArd Biesheuvel vdup.32 $R2,r4 536*a6b803b3SArd Biesheuvel add r4,r5,r5,lsl#2 537*a6b803b3SArd Biesheuvel vdup.32 $S2,r3 538*a6b803b3SArd Biesheuvel vdup.32 $R3,r5 539*a6b803b3SArd Biesheuvel add r5,r6,r6,lsl#2 540*a6b803b3SArd Biesheuvel vdup.32 $S3,r4 541*a6b803b3SArd Biesheuvel vdup.32 $R4,r6 542*a6b803b3SArd Biesheuvel vdup.32 $S4,r5 543*a6b803b3SArd Biesheuvel 544*a6b803b3SArd Biesheuvel mov $zeros,#2 @ counter 545*a6b803b3SArd Biesheuvel 546*a6b803b3SArd 
Biesheuvel.Lsquare_neon: 547*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 548*a6b803b3SArd Biesheuvel @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 549*a6b803b3SArd Biesheuvel @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 550*a6b803b3SArd Biesheuvel @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 551*a6b803b3SArd Biesheuvel @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 552*a6b803b3SArd Biesheuvel @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 553*a6b803b3SArd Biesheuvel 554*a6b803b3SArd Biesheuvel vmull.u32 $D0,$R0,${R0}[1] 555*a6b803b3SArd Biesheuvel vmull.u32 $D1,$R1,${R0}[1] 556*a6b803b3SArd Biesheuvel vmull.u32 $D2,$R2,${R0}[1] 557*a6b803b3SArd Biesheuvel vmull.u32 $D3,$R3,${R0}[1] 558*a6b803b3SArd Biesheuvel vmull.u32 $D4,$R4,${R0}[1] 559*a6b803b3SArd Biesheuvel 560*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$R4,${S1}[1] 561*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$R0,${R1}[1] 562*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$R1,${R1}[1] 563*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$R2,${R1}[1] 564*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$R3,${R1}[1] 565*a6b803b3SArd Biesheuvel 566*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$R3,${S2}[1] 567*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$R4,${S2}[1] 568*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$R1,${R2}[1] 569*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$R0,${R2}[1] 570*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$R2,${R2}[1] 571*a6b803b3SArd Biesheuvel 572*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$R2,${S3}[1] 573*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$R0,${R3}[1] 574*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$R3,${S3}[1] 575*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$R4,${S3}[1] 576*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$R1,${R3}[1] 577*a6b803b3SArd Biesheuvel 578*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$R4,${S4}[1] 579*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$R1,${S4}[1] 580*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$R2,${S4}[1] 581*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$R3,${S4}[1] 582*a6b803b3SArd Biesheuvel 
vmlal.u32 $D4,$R0,${R4}[1] 583*a6b803b3SArd Biesheuvel 584*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 585*a6b803b3SArd Biesheuvel @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 586*a6b803b3SArd Biesheuvel @ and P. Schwabe 587*a6b803b3SArd Biesheuvel @ 588*a6b803b3SArd Biesheuvel @ H0>>+H1>>+H2>>+H3>>+H4 589*a6b803b3SArd Biesheuvel @ H3>>+H4>>*5+H0>>+H1 590*a6b803b3SArd Biesheuvel @ 591*a6b803b3SArd Biesheuvel @ Trivia. 592*a6b803b3SArd Biesheuvel @ 593*a6b803b3SArd Biesheuvel @ Result of multiplication of n-bit number by m-bit number is 594*a6b803b3SArd Biesheuvel @ n+m bits wide. However! Even though 2^n is a n+1-bit number, 595*a6b803b3SArd Biesheuvel @ m-bit number multiplied by 2^n is still n+m bits wide. 596*a6b803b3SArd Biesheuvel @ 597*a6b803b3SArd Biesheuvel @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, 598*a6b803b3SArd Biesheuvel @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit 599*a6b803b3SArd Biesheuvel @ one is n+1 bits wide. 600*a6b803b3SArd Biesheuvel @ 601*a6b803b3SArd Biesheuvel @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that 602*a6b803b3SArd Biesheuvel @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 603*a6b803b3SArd Biesheuvel @ can be 27. However! In cases when their width exceeds 26 bits 604*a6b803b3SArd Biesheuvel @ they are limited by 2^26+2^6. This in turn means that *sum* 605*a6b803b3SArd Biesheuvel @ of the products with these values can still be viewed as sum 606*a6b803b3SArd Biesheuvel @ of 52-bit numbers as long as the amount of addends is not a 607*a6b803b3SArd Biesheuvel @ power of 2. 
For example, 608*a6b803b3SArd Biesheuvel @ 609*a6b803b3SArd Biesheuvel @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, 610*a6b803b3SArd Biesheuvel @ 611*a6b803b3SArd Biesheuvel @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or 612*a6b803b3SArd Biesheuvel @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than 613*a6b803b3SArd Biesheuvel @ 8 * (2^52) or 2^55. However, the value is then multiplied by 614*a6b803b3SArd Biesheuvel @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), 615*a6b803b3SArd Biesheuvel @ which is less than 32 * (2^52) or 2^57. And when processing 616*a6b803b3SArd Biesheuvel @ data we are looking at triple as many addends... 617*a6b803b3SArd Biesheuvel @ 618*a6b803b3SArd Biesheuvel @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and 619*a6b803b3SArd Biesheuvel @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the 620*a6b803b3SArd Biesheuvel @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while 621*a6b803b3SArd Biesheuvel @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 622*a6b803b3SArd Biesheuvel @ instruction accepts 2x32-bit input and writes 2x64-bit result. 623*a6b803b3SArd Biesheuvel @ This means that result of reduction have to be compressed upon 624*a6b803b3SArd Biesheuvel @ loop wrap-around. This can be done in the process of reduction 625*a6b803b3SArd Biesheuvel @ to minimize amount of instructions [as well as amount of 626*a6b803b3SArd Biesheuvel @ 128-bit instructions, which benefits low-end processors], but 627*a6b803b3SArd Biesheuvel @ one has to watch for H2 (which is narrower than H0) and 5*H4 628*a6b803b3SArd Biesheuvel @ not being wider than 58 bits, so that result of right shift 629*a6b803b3SArd Biesheuvel @ by 26 bits fits in 32 bits. This is also useful on x86, 630*a6b803b3SArd Biesheuvel @ because it allows to use paddd in place for paddq, which 631*a6b803b3SArd Biesheuvel @ benefits Atom, where paddq is ridiculously slow. 
632*a6b803b3SArd Biesheuvel 633*a6b803b3SArd Biesheuvel vshr.u64 $T0,$D3,#26 634*a6b803b3SArd Biesheuvel vmovn.i64 $D3#lo,$D3 635*a6b803b3SArd Biesheuvel vshr.u64 $T1,$D0,#26 636*a6b803b3SArd Biesheuvel vmovn.i64 $D0#lo,$D0 637*a6b803b3SArd Biesheuvel vadd.i64 $D4,$D4,$T0 @ h3 -> h4 638*a6b803b3SArd Biesheuvel vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff 639*a6b803b3SArd Biesheuvel vadd.i64 $D1,$D1,$T1 @ h0 -> h1 640*a6b803b3SArd Biesheuvel vbic.i32 $D0#lo,#0xfc000000 641*a6b803b3SArd Biesheuvel 642*a6b803b3SArd Biesheuvel vshrn.u64 $T0#lo,$D4,#26 643*a6b803b3SArd Biesheuvel vmovn.i64 $D4#lo,$D4 644*a6b803b3SArd Biesheuvel vshr.u64 $T1,$D1,#26 645*a6b803b3SArd Biesheuvel vmovn.i64 $D1#lo,$D1 646*a6b803b3SArd Biesheuvel vadd.i64 $D2,$D2,$T1 @ h1 -> h2 647*a6b803b3SArd Biesheuvel vbic.i32 $D4#lo,#0xfc000000 648*a6b803b3SArd Biesheuvel vbic.i32 $D1#lo,#0xfc000000 649*a6b803b3SArd Biesheuvel 650*a6b803b3SArd Biesheuvel vadd.i32 $D0#lo,$D0#lo,$T0#lo 651*a6b803b3SArd Biesheuvel vshl.u32 $T0#lo,$T0#lo,#2 652*a6b803b3SArd Biesheuvel vshrn.u64 $T1#lo,$D2,#26 653*a6b803b3SArd Biesheuvel vmovn.i64 $D2#lo,$D2 654*a6b803b3SArd Biesheuvel vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 655*a6b803b3SArd Biesheuvel vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 656*a6b803b3SArd Biesheuvel vbic.i32 $D2#lo,#0xfc000000 657*a6b803b3SArd Biesheuvel 658*a6b803b3SArd Biesheuvel vshr.u32 $T0#lo,$D0#lo,#26 659*a6b803b3SArd Biesheuvel vbic.i32 $D0#lo,#0xfc000000 660*a6b803b3SArd Biesheuvel vshr.u32 $T1#lo,$D3#lo,#26 661*a6b803b3SArd Biesheuvel vbic.i32 $D3#lo,#0xfc000000 662*a6b803b3SArd Biesheuvel vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 663*a6b803b3SArd Biesheuvel vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 664*a6b803b3SArd Biesheuvel 665*a6b803b3SArd Biesheuvel subs $zeros,$zeros,#1 666*a6b803b3SArd Biesheuvel beq .Lsquare_break_neon 667*a6b803b3SArd Biesheuvel 668*a6b803b3SArd Biesheuvel add $tbl0,$ctx,#(48+0*9*4) 669*a6b803b3SArd Biesheuvel add $tbl1,$ctx,#(48+1*9*4) 670*a6b803b3SArd Biesheuvel 
671*a6b803b3SArd Biesheuvel vtrn.32 $R0,$D0#lo @ r^2:r^1 672*a6b803b3SArd Biesheuvel vtrn.32 $R2,$D2#lo 673*a6b803b3SArd Biesheuvel vtrn.32 $R3,$D3#lo 674*a6b803b3SArd Biesheuvel vtrn.32 $R1,$D1#lo 675*a6b803b3SArd Biesheuvel vtrn.32 $R4,$D4#lo 676*a6b803b3SArd Biesheuvel 677*a6b803b3SArd Biesheuvel vshl.u32 $S2,$R2,#2 @ *5 678*a6b803b3SArd Biesheuvel vshl.u32 $S3,$R3,#2 679*a6b803b3SArd Biesheuvel vshl.u32 $S1,$R1,#2 680*a6b803b3SArd Biesheuvel vshl.u32 $S4,$R4,#2 681*a6b803b3SArd Biesheuvel vadd.i32 $S2,$S2,$R2 682*a6b803b3SArd Biesheuvel vadd.i32 $S1,$S1,$R1 683*a6b803b3SArd Biesheuvel vadd.i32 $S3,$S3,$R3 684*a6b803b3SArd Biesheuvel vadd.i32 $S4,$S4,$R4 685*a6b803b3SArd Biesheuvel 686*a6b803b3SArd Biesheuvel vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 687*a6b803b3SArd Biesheuvel vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! 688*a6b803b3SArd Biesheuvel vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 689*a6b803b3SArd Biesheuvel vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
690*a6b803b3SArd Biesheuvel vst1.32 {${S4}[0]},[$tbl0,:32] 691*a6b803b3SArd Biesheuvel vst1.32 {${S4}[1]},[$tbl1,:32] 692*a6b803b3SArd Biesheuvel 693*a6b803b3SArd Biesheuvel b .Lsquare_neon 694*a6b803b3SArd Biesheuvel 695*a6b803b3SArd Biesheuvel.align 4 696*a6b803b3SArd Biesheuvel.Lsquare_break_neon: 697*a6b803b3SArd Biesheuvel add $tbl0,$ctx,#(48+2*4*9) 698*a6b803b3SArd Biesheuvel add $tbl1,$ctx,#(48+3*4*9) 699*a6b803b3SArd Biesheuvel 700*a6b803b3SArd Biesheuvel vmov $R0,$D0#lo @ r^4:r^3 701*a6b803b3SArd Biesheuvel vshl.u32 $S1,$D1#lo,#2 @ *5 702*a6b803b3SArd Biesheuvel vmov $R1,$D1#lo 703*a6b803b3SArd Biesheuvel vshl.u32 $S2,$D2#lo,#2 704*a6b803b3SArd Biesheuvel vmov $R2,$D2#lo 705*a6b803b3SArd Biesheuvel vshl.u32 $S3,$D3#lo,#2 706*a6b803b3SArd Biesheuvel vmov $R3,$D3#lo 707*a6b803b3SArd Biesheuvel vshl.u32 $S4,$D4#lo,#2 708*a6b803b3SArd Biesheuvel vmov $R4,$D4#lo 709*a6b803b3SArd Biesheuvel vadd.i32 $S1,$S1,$D1#lo 710*a6b803b3SArd Biesheuvel vadd.i32 $S2,$S2,$D2#lo 711*a6b803b3SArd Biesheuvel vadd.i32 $S3,$S3,$D3#lo 712*a6b803b3SArd Biesheuvel vadd.i32 $S4,$S4,$D4#lo 713*a6b803b3SArd Biesheuvel 714*a6b803b3SArd Biesheuvel vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 715*a6b803b3SArd Biesheuvel vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! 716*a6b803b3SArd Biesheuvel vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 717*a6b803b3SArd Biesheuvel vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
718*a6b803b3SArd Biesheuvel vst1.32 {${S4}[0]},[$tbl0] 719*a6b803b3SArd Biesheuvel vst1.32 {${S4}[1]},[$tbl1] 720*a6b803b3SArd Biesheuvel 721*a6b803b3SArd Biesheuvel.Lno_init_neon: 722*a6b803b3SArd Biesheuvel ret @ bx lr 723*a6b803b3SArd Biesheuvel.size poly1305_init_neon,.-poly1305_init_neon 724*a6b803b3SArd Biesheuvel 725*a6b803b3SArd Biesheuvel.type poly1305_blocks_neon,%function 726*a6b803b3SArd Biesheuvel.align 5 727*a6b803b3SArd Biesheuvelpoly1305_blocks_neon: 728*a6b803b3SArd Biesheuvel.Lpoly1305_blocks_neon: 729*a6b803b3SArd Biesheuvel ldr ip,[$ctx,#36] @ is_base2_26 730*a6b803b3SArd Biesheuvel 731*a6b803b3SArd Biesheuvel cmp $len,#64 732*a6b803b3SArd Biesheuvel blo .Lpoly1305_blocks 733*a6b803b3SArd Biesheuvel 734*a6b803b3SArd Biesheuvel stmdb sp!,{r4-r7} 735*a6b803b3SArd Biesheuvel vstmdb sp!,{d8-d15} @ ABI specification says so 736*a6b803b3SArd Biesheuvel 737*a6b803b3SArd Biesheuvel tst ip,ip @ is_base2_26? 738*a6b803b3SArd Biesheuvel bne .Lbase2_26_neon 739*a6b803b3SArd Biesheuvel 740*a6b803b3SArd Biesheuvel stmdb sp!,{r1-r3,lr} 741*a6b803b3SArd Biesheuvel bl .Lpoly1305_init_neon 742*a6b803b3SArd Biesheuvel 743*a6b803b3SArd Biesheuvel ldr r4,[$ctx,#0] @ load hash value base 2^32 744*a6b803b3SArd Biesheuvel ldr r5,[$ctx,#4] 745*a6b803b3SArd Biesheuvel ldr r6,[$ctx,#8] 746*a6b803b3SArd Biesheuvel ldr r7,[$ctx,#12] 747*a6b803b3SArd Biesheuvel ldr ip,[$ctx,#16] 748*a6b803b3SArd Biesheuvel 749*a6b803b3SArd Biesheuvel and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 750*a6b803b3SArd Biesheuvel mov r3,r4,lsr#26 751*a6b803b3SArd Biesheuvel veor $D0#lo,$D0#lo,$D0#lo 752*a6b803b3SArd Biesheuvel mov r4,r5,lsr#20 753*a6b803b3SArd Biesheuvel orr r3,r3,r5,lsl#6 754*a6b803b3SArd Biesheuvel veor $D1#lo,$D1#lo,$D1#lo 755*a6b803b3SArd Biesheuvel mov r5,r6,lsr#14 756*a6b803b3SArd Biesheuvel orr r4,r4,r6,lsl#12 757*a6b803b3SArd Biesheuvel veor $D2#lo,$D2#lo,$D2#lo 758*a6b803b3SArd Biesheuvel mov r6,r7,lsr#8 759*a6b803b3SArd Biesheuvel orr r5,r5,r7,lsl#18 760*a6b803b3SArd 
Biesheuvel veor $D3#lo,$D3#lo,$D3#lo 761*a6b803b3SArd Biesheuvel and r3,r3,#0x03ffffff 762*a6b803b3SArd Biesheuvel orr r6,r6,ip,lsl#24 763*a6b803b3SArd Biesheuvel veor $D4#lo,$D4#lo,$D4#lo 764*a6b803b3SArd Biesheuvel and r4,r4,#0x03ffffff 765*a6b803b3SArd Biesheuvel mov r1,#1 766*a6b803b3SArd Biesheuvel and r5,r5,#0x03ffffff 767*a6b803b3SArd Biesheuvel str r1,[$ctx,#36] @ set is_base2_26 768*a6b803b3SArd Biesheuvel 769*a6b803b3SArd Biesheuvel vmov.32 $D0#lo[0],r2 770*a6b803b3SArd Biesheuvel vmov.32 $D1#lo[0],r3 771*a6b803b3SArd Biesheuvel vmov.32 $D2#lo[0],r4 772*a6b803b3SArd Biesheuvel vmov.32 $D3#lo[0],r5 773*a6b803b3SArd Biesheuvel vmov.32 $D4#lo[0],r6 774*a6b803b3SArd Biesheuvel adr $zeros,.Lzeros 775*a6b803b3SArd Biesheuvel 776*a6b803b3SArd Biesheuvel ldmia sp!,{r1-r3,lr} 777*a6b803b3SArd Biesheuvel b .Lhash_loaded 778*a6b803b3SArd Biesheuvel 779*a6b803b3SArd Biesheuvel.align 4 780*a6b803b3SArd Biesheuvel.Lbase2_26_neon: 781*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 782*a6b803b3SArd Biesheuvel @ load hash value 783*a6b803b3SArd Biesheuvel 784*a6b803b3SArd Biesheuvel veor $D0#lo,$D0#lo,$D0#lo 785*a6b803b3SArd Biesheuvel veor $D1#lo,$D1#lo,$D1#lo 786*a6b803b3SArd Biesheuvel veor $D2#lo,$D2#lo,$D2#lo 787*a6b803b3SArd Biesheuvel veor $D3#lo,$D3#lo,$D3#lo 788*a6b803b3SArd Biesheuvel veor $D4#lo,$D4#lo,$D4#lo 789*a6b803b3SArd Biesheuvel vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! 790*a6b803b3SArd Biesheuvel adr $zeros,.Lzeros 791*a6b803b3SArd Biesheuvel vld1.32 {$D4#lo[0]},[$ctx] 792*a6b803b3SArd Biesheuvel sub $ctx,$ctx,#16 @ rewind 793*a6b803b3SArd Biesheuvel 794*a6b803b3SArd Biesheuvel.Lhash_loaded: 795*a6b803b3SArd Biesheuvel add $in2,$inp,#32 796*a6b803b3SArd Biesheuvel mov $padbit,$padbit,lsl#24 797*a6b803b3SArd Biesheuvel tst $len,#31 798*a6b803b3SArd Biesheuvel beq .Leven 799*a6b803b3SArd Biesheuvel 800*a6b803b3SArd Biesheuvel vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! 
801*a6b803b3SArd Biesheuvel vmov.32 $H4#lo[0],$padbit 802*a6b803b3SArd Biesheuvel sub $len,$len,#16 803*a6b803b3SArd Biesheuvel add $in2,$inp,#32 804*a6b803b3SArd Biesheuvel 805*a6b803b3SArd Biesheuvel# ifdef __ARMEB__ 806*a6b803b3SArd Biesheuvel vrev32.8 $H0,$H0 807*a6b803b3SArd Biesheuvel vrev32.8 $H3,$H3 808*a6b803b3SArd Biesheuvel vrev32.8 $H1,$H1 809*a6b803b3SArd Biesheuvel vrev32.8 $H2,$H2 810*a6b803b3SArd Biesheuvel# endif 811*a6b803b3SArd Biesheuvel vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 812*a6b803b3SArd Biesheuvel vshl.u32 $H3#lo,$H3#lo,#18 813*a6b803b3SArd Biesheuvel 814*a6b803b3SArd Biesheuvel vsri.u32 $H3#lo,$H2#lo,#14 815*a6b803b3SArd Biesheuvel vshl.u32 $H2#lo,$H2#lo,#12 816*a6b803b3SArd Biesheuvel vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi 817*a6b803b3SArd Biesheuvel 818*a6b803b3SArd Biesheuvel vbic.i32 $H3#lo,#0xfc000000 819*a6b803b3SArd Biesheuvel vsri.u32 $H2#lo,$H1#lo,#20 820*a6b803b3SArd Biesheuvel vshl.u32 $H1#lo,$H1#lo,#6 821*a6b803b3SArd Biesheuvel 822*a6b803b3SArd Biesheuvel vbic.i32 $H2#lo,#0xfc000000 823*a6b803b3SArd Biesheuvel vsri.u32 $H1#lo,$H0#lo,#26 824*a6b803b3SArd Biesheuvel vadd.i32 $H3#hi,$H3#lo,$D3#lo 825*a6b803b3SArd Biesheuvel 826*a6b803b3SArd Biesheuvel vbic.i32 $H0#lo,#0xfc000000 827*a6b803b3SArd Biesheuvel vbic.i32 $H1#lo,#0xfc000000 828*a6b803b3SArd Biesheuvel vadd.i32 $H2#hi,$H2#lo,$D2#lo 829*a6b803b3SArd Biesheuvel 830*a6b803b3SArd Biesheuvel vadd.i32 $H0#hi,$H0#lo,$D0#lo 831*a6b803b3SArd Biesheuvel vadd.i32 $H1#hi,$H1#lo,$D1#lo 832*a6b803b3SArd Biesheuvel 833*a6b803b3SArd Biesheuvel mov $tbl1,$zeros 834*a6b803b3SArd Biesheuvel add $tbl0,$ctx,#48 835*a6b803b3SArd Biesheuvel 836*a6b803b3SArd Biesheuvel cmp $len,$len 837*a6b803b3SArd Biesheuvel b .Long_tail 838*a6b803b3SArd Biesheuvel 839*a6b803b3SArd Biesheuvel.align 4 840*a6b803b3SArd Biesheuvel.Leven: 841*a6b803b3SArd Biesheuvel subs $len,$len,#64 842*a6b803b3SArd Biesheuvel it lo 843*a6b803b3SArd Biesheuvel movlo $in2,$zeros 
844*a6b803b3SArd Biesheuvel 845*a6b803b3SArd Biesheuvel vmov.i32 $H4,#1<<24 @ padbit, yes, always 846*a6b803b3SArd Biesheuvel vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] 847*a6b803b3SArd Biesheuvel add $inp,$inp,#64 848*a6b803b3SArd Biesheuvel vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) 849*a6b803b3SArd Biesheuvel add $in2,$in2,#64 850*a6b803b3SArd Biesheuvel itt hi 851*a6b803b3SArd Biesheuvel addhi $tbl1,$ctx,#(48+1*9*4) 852*a6b803b3SArd Biesheuvel addhi $tbl0,$ctx,#(48+3*9*4) 853*a6b803b3SArd Biesheuvel 854*a6b803b3SArd Biesheuvel# ifdef __ARMEB__ 855*a6b803b3SArd Biesheuvel vrev32.8 $H0,$H0 856*a6b803b3SArd Biesheuvel vrev32.8 $H3,$H3 857*a6b803b3SArd Biesheuvel vrev32.8 $H1,$H1 858*a6b803b3SArd Biesheuvel vrev32.8 $H2,$H2 859*a6b803b3SArd Biesheuvel# endif 860*a6b803b3SArd Biesheuvel vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 861*a6b803b3SArd Biesheuvel vshl.u32 $H3,$H3,#18 862*a6b803b3SArd Biesheuvel 863*a6b803b3SArd Biesheuvel vsri.u32 $H3,$H2,#14 864*a6b803b3SArd Biesheuvel vshl.u32 $H2,$H2,#12 865*a6b803b3SArd Biesheuvel 866*a6b803b3SArd Biesheuvel vbic.i32 $H3,#0xfc000000 867*a6b803b3SArd Biesheuvel vsri.u32 $H2,$H1,#20 868*a6b803b3SArd Biesheuvel vshl.u32 $H1,$H1,#6 869*a6b803b3SArd Biesheuvel 870*a6b803b3SArd Biesheuvel vbic.i32 $H2,#0xfc000000 871*a6b803b3SArd Biesheuvel vsri.u32 $H1,$H0,#26 872*a6b803b3SArd Biesheuvel 873*a6b803b3SArd Biesheuvel vbic.i32 $H0,#0xfc000000 874*a6b803b3SArd Biesheuvel vbic.i32 $H1,#0xfc000000 875*a6b803b3SArd Biesheuvel 876*a6b803b3SArd Biesheuvel bls .Lskip_loop 877*a6b803b3SArd Biesheuvel 878*a6b803b3SArd Biesheuvel vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 879*a6b803b3SArd Biesheuvel vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 880*a6b803b3SArd Biesheuvel vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 881*a6b803b3SArd Biesheuvel vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
882*a6b803b3SArd Biesheuvel b .Loop_neon 883*a6b803b3SArd Biesheuvel 884*a6b803b3SArd Biesheuvel.align 5 885*a6b803b3SArd Biesheuvel.Loop_neon: 886*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 887*a6b803b3SArd Biesheuvel @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 888*a6b803b3SArd Biesheuvel @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 889*a6b803b3SArd Biesheuvel @ \___________________/ 890*a6b803b3SArd Biesheuvel @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 891*a6b803b3SArd Biesheuvel @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 892*a6b803b3SArd Biesheuvel @ \___________________/ \____________________/ 893*a6b803b3SArd Biesheuvel @ 894*a6b803b3SArd Biesheuvel @ Note that we start with inp[2:3]*r^2. This is because it 895*a6b803b3SArd Biesheuvel @ doesn't depend on reduction in previous iteration. 896*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 897*a6b803b3SArd Biesheuvel @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 898*a6b803b3SArd Biesheuvel @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 899*a6b803b3SArd Biesheuvel @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 900*a6b803b3SArd Biesheuvel @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 901*a6b803b3SArd Biesheuvel @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 902*a6b803b3SArd Biesheuvel 903*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 904*a6b803b3SArd Biesheuvel @ inp[2:3]*r^2 905*a6b803b3SArd Biesheuvel 906*a6b803b3SArd Biesheuvel vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] 907*a6b803b3SArd Biesheuvel vmull.u32 $D2,$H2#hi,${R0}[1] 908*a6b803b3SArd Biesheuvel vadd.i32 $H0#lo,$H0#lo,$D0#lo 909*a6b803b3SArd Biesheuvel vmull.u32 $D0,$H0#hi,${R0}[1] 910*a6b803b3SArd Biesheuvel vadd.i32 $H3#lo,$H3#lo,$D3#lo 911*a6b803b3SArd Biesheuvel vmull.u32 $D3,$H3#hi,${R0}[1] 912*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H1#hi,${R1}[1] 
913*a6b803b3SArd Biesheuvel vadd.i32 $H1#lo,$H1#lo,$D1#lo 914*a6b803b3SArd Biesheuvel vmull.u32 $D1,$H1#hi,${R0}[1] 915*a6b803b3SArd Biesheuvel 916*a6b803b3SArd Biesheuvel vadd.i32 $H4#lo,$H4#lo,$D4#lo 917*a6b803b3SArd Biesheuvel vmull.u32 $D4,$H4#hi,${R0}[1] 918*a6b803b3SArd Biesheuvel subs $len,$len,#64 919*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H4#hi,${S1}[1] 920*a6b803b3SArd Biesheuvel it lo 921*a6b803b3SArd Biesheuvel movlo $in2,$zeros 922*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H2#hi,${R1}[1] 923*a6b803b3SArd Biesheuvel vld1.32 ${S4}[1],[$tbl1,:32] 924*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H0#hi,${R1}[1] 925*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H3#hi,${R1}[1] 926*a6b803b3SArd Biesheuvel 927*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H3#hi,${S2}[1] 928*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H1#hi,${R2}[1] 929*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H2#hi,${R2}[1] 930*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H4#hi,${S2}[1] 931*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H0#hi,${R2}[1] 932*a6b803b3SArd Biesheuvel 933*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H0#hi,${R3}[1] 934*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H2#hi,${S3}[1] 935*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H1#hi,${R3}[1] 936*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H3#hi,${S3}[1] 937*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H4#hi,${S3}[1] 938*a6b803b3SArd Biesheuvel 939*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H4#hi,${S4}[1] 940*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H1#hi,${S4}[1] 941*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H0#hi,${R4}[1] 942*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H2#hi,${S4}[1] 943*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H3#hi,${S4}[1] 944*a6b803b3SArd Biesheuvel 945*a6b803b3SArd Biesheuvel vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) 946*a6b803b3SArd Biesheuvel add $in2,$in2,#64 947*a6b803b3SArd Biesheuvel 948*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 949*a6b803b3SArd Biesheuvel @ (hash+inp[0:1])*r^4 and accumulate 950*a6b803b3SArd 
Biesheuvel 951*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H3#lo,${R0}[0] 952*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H0#lo,${R0}[0] 953*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H4#lo,${R0}[0] 954*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H1#lo,${R0}[0] 955*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H2#lo,${R0}[0] 956*a6b803b3SArd Biesheuvel vld1.32 ${S4}[0],[$tbl0,:32] 957*a6b803b3SArd Biesheuvel 958*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H2#lo,${R1}[0] 959*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H4#lo,${S1}[0] 960*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H3#lo,${R1}[0] 961*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H0#lo,${R1}[0] 962*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H1#lo,${R1}[0] 963*a6b803b3SArd Biesheuvel 964*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H1#lo,${R2}[0] 965*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H3#lo,${S2}[0] 966*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H2#lo,${R2}[0] 967*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H4#lo,${S2}[0] 968*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H0#lo,${R2}[0] 969*a6b803b3SArd Biesheuvel 970*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H0#lo,${R3}[0] 971*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H2#lo,${S3}[0] 972*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H1#lo,${R3}[0] 973*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H3#lo,${S3}[0] 974*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H4#lo,${S4}[0] 975*a6b803b3SArd Biesheuvel 976*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H4#lo,${S3}[0] 977*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H1#lo,${S4}[0] 978*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H0#lo,${R4}[0] 979*a6b803b3SArd Biesheuvel vmov.i32 $H4,#1<<24 @ padbit, yes, always 980*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H2#lo,${S4}[0] 981*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H3#lo,${S4}[0] 982*a6b803b3SArd Biesheuvel 983*a6b803b3SArd Biesheuvel vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] 984*a6b803b3SArd Biesheuvel add $inp,$inp,#64 985*a6b803b3SArd Biesheuvel# ifdef __ARMEB__ 986*a6b803b3SArd Biesheuvel vrev32.8 $H0,$H0 987*a6b803b3SArd Biesheuvel vrev32.8 $H1,$H1 
988*a6b803b3SArd Biesheuvel vrev32.8 $H2,$H2 989*a6b803b3SArd Biesheuvel vrev32.8 $H3,$H3 990*a6b803b3SArd Biesheuvel# endif 991*a6b803b3SArd Biesheuvel 992*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 993*a6b803b3SArd Biesheuvel @ lazy reduction interleaved with base 2^32 -> base 2^26 of 994*a6b803b3SArd Biesheuvel @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 995*a6b803b3SArd Biesheuvel 996*a6b803b3SArd Biesheuvel vshr.u64 $T0,$D3,#26 997*a6b803b3SArd Biesheuvel vmovn.i64 $D3#lo,$D3 998*a6b803b3SArd Biesheuvel vshr.u64 $T1,$D0,#26 999*a6b803b3SArd Biesheuvel vmovn.i64 $D0#lo,$D0 1000*a6b803b3SArd Biesheuvel vadd.i64 $D4,$D4,$T0 @ h3 -> h4 1001*a6b803b3SArd Biesheuvel vbic.i32 $D3#lo,#0xfc000000 1002*a6b803b3SArd Biesheuvel vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 1003*a6b803b3SArd Biesheuvel vadd.i64 $D1,$D1,$T1 @ h0 -> h1 1004*a6b803b3SArd Biesheuvel vshl.u32 $H3,$H3,#18 1005*a6b803b3SArd Biesheuvel vbic.i32 $D0#lo,#0xfc000000 1006*a6b803b3SArd Biesheuvel 1007*a6b803b3SArd Biesheuvel vshrn.u64 $T0#lo,$D4,#26 1008*a6b803b3SArd Biesheuvel vmovn.i64 $D4#lo,$D4 1009*a6b803b3SArd Biesheuvel vshr.u64 $T1,$D1,#26 1010*a6b803b3SArd Biesheuvel vmovn.i64 $D1#lo,$D1 1011*a6b803b3SArd Biesheuvel vadd.i64 $D2,$D2,$T1 @ h1 -> h2 1012*a6b803b3SArd Biesheuvel vsri.u32 $H3,$H2,#14 1013*a6b803b3SArd Biesheuvel vbic.i32 $D4#lo,#0xfc000000 1014*a6b803b3SArd Biesheuvel vshl.u32 $H2,$H2,#12 1015*a6b803b3SArd Biesheuvel vbic.i32 $D1#lo,#0xfc000000 1016*a6b803b3SArd Biesheuvel 1017*a6b803b3SArd Biesheuvel vadd.i32 $D0#lo,$D0#lo,$T0#lo 1018*a6b803b3SArd Biesheuvel vshl.u32 $T0#lo,$T0#lo,#2 1019*a6b803b3SArd Biesheuvel vbic.i32 $H3,#0xfc000000 1020*a6b803b3SArd Biesheuvel vshrn.u64 $T1#lo,$D2,#26 1021*a6b803b3SArd Biesheuvel vmovn.i64 $D2#lo,$D2 1022*a6b803b3SArd Biesheuvel vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] 1023*a6b803b3SArd Biesheuvel vsri.u32 $H2,$H1,#20 1024*a6b803b3SArd Biesheuvel vadd.i32 
$D3#lo,$D3#lo,$T1#lo @ h2 -> h3 1025*a6b803b3SArd Biesheuvel vshl.u32 $H1,$H1,#6 1026*a6b803b3SArd Biesheuvel vbic.i32 $D2#lo,#0xfc000000 1027*a6b803b3SArd Biesheuvel vbic.i32 $H2,#0xfc000000 1028*a6b803b3SArd Biesheuvel 1029*a6b803b3SArd Biesheuvel vshrn.u64 $T0#lo,$D0,#26 @ re-narrow 1030*a6b803b3SArd Biesheuvel vmovn.i64 $D0#lo,$D0 1031*a6b803b3SArd Biesheuvel vsri.u32 $H1,$H0,#26 1032*a6b803b3SArd Biesheuvel vbic.i32 $H0,#0xfc000000 1033*a6b803b3SArd Biesheuvel vshr.u32 $T1#lo,$D3#lo,#26 1034*a6b803b3SArd Biesheuvel vbic.i32 $D3#lo,#0xfc000000 1035*a6b803b3SArd Biesheuvel vbic.i32 $D0#lo,#0xfc000000 1036*a6b803b3SArd Biesheuvel vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 1037*a6b803b3SArd Biesheuvel vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 1038*a6b803b3SArd Biesheuvel vbic.i32 $H1,#0xfc000000 1039*a6b803b3SArd Biesheuvel 1040*a6b803b3SArd Biesheuvel bhi .Loop_neon 1041*a6b803b3SArd Biesheuvel 1042*a6b803b3SArd Biesheuvel.Lskip_loop: 1043*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1044*a6b803b3SArd Biesheuvel @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1045*a6b803b3SArd Biesheuvel 1046*a6b803b3SArd Biesheuvel add $tbl1,$ctx,#(48+0*9*4) 1047*a6b803b3SArd Biesheuvel add $tbl0,$ctx,#(48+1*9*4) 1048*a6b803b3SArd Biesheuvel adds $len,$len,#32 1049*a6b803b3SArd Biesheuvel it ne 1050*a6b803b3SArd Biesheuvel movne $len,#0 1051*a6b803b3SArd Biesheuvel bne .Long_tail 1052*a6b803b3SArd Biesheuvel 1053*a6b803b3SArd Biesheuvel vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi 1054*a6b803b3SArd Biesheuvel vadd.i32 $H0#hi,$H0#lo,$D0#lo 1055*a6b803b3SArd Biesheuvel vadd.i32 $H3#hi,$H3#lo,$D3#lo 1056*a6b803b3SArd Biesheuvel vadd.i32 $H1#hi,$H1#lo,$D1#lo 1057*a6b803b3SArd Biesheuvel vadd.i32 $H4#hi,$H4#lo,$D4#lo 1058*a6b803b3SArd Biesheuvel 1059*a6b803b3SArd Biesheuvel.Long_tail: 1060*a6b803b3SArd Biesheuvel vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! 
@ load r^1 1061*a6b803b3SArd Biesheuvel vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 1062*a6b803b3SArd Biesheuvel 1063*a6b803b3SArd Biesheuvel vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant 1064*a6b803b3SArd Biesheuvel vmull.u32 $D2,$H2#hi,$R0 1065*a6b803b3SArd Biesheuvel vadd.i32 $H0#lo,$H0#lo,$D0#lo 1066*a6b803b3SArd Biesheuvel vmull.u32 $D0,$H0#hi,$R0 1067*a6b803b3SArd Biesheuvel vadd.i32 $H3#lo,$H3#lo,$D3#lo 1068*a6b803b3SArd Biesheuvel vmull.u32 $D3,$H3#hi,$R0 1069*a6b803b3SArd Biesheuvel vadd.i32 $H1#lo,$H1#lo,$D1#lo 1070*a6b803b3SArd Biesheuvel vmull.u32 $D1,$H1#hi,$R0 1071*a6b803b3SArd Biesheuvel vadd.i32 $H4#lo,$H4#lo,$D4#lo 1072*a6b803b3SArd Biesheuvel vmull.u32 $D4,$H4#hi,$R0 1073*a6b803b3SArd Biesheuvel 1074*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H4#hi,$S1 1075*a6b803b3SArd Biesheuvel vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 1076*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H2#hi,$R1 1077*a6b803b3SArd Biesheuvel vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
1078*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H0#hi,$R1 1079*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H3#hi,$R1 1080*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H1#hi,$R1 1081*a6b803b3SArd Biesheuvel 1082*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H1#hi,$R2 1083*a6b803b3SArd Biesheuvel vld1.32 ${S4}[1],[$tbl1,:32] 1084*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H3#hi,$S2 1085*a6b803b3SArd Biesheuvel vld1.32 ${S4}[0],[$tbl0,:32] 1086*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H2#hi,$R2 1087*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H4#hi,$S2 1088*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H0#hi,$R2 1089*a6b803b3SArd Biesheuvel 1090*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H0#hi,$R3 1091*a6b803b3SArd Biesheuvel it ne 1092*a6b803b3SArd Biesheuvel addne $tbl1,$ctx,#(48+2*9*4) 1093*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H2#hi,$S3 1094*a6b803b3SArd Biesheuvel it ne 1095*a6b803b3SArd Biesheuvel addne $tbl0,$ctx,#(48+3*9*4) 1096*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H1#hi,$R3 1097*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H3#hi,$S3 1098*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H4#hi,$S3 1099*a6b803b3SArd Biesheuvel 1100*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H4#hi,$S4 1101*a6b803b3SArd Biesheuvel vorn $MASK,$MASK,$MASK @ all-ones, can be redundant 1102*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H1#hi,$S4 1103*a6b803b3SArd Biesheuvel vshr.u64 $MASK,$MASK,#38 1104*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H0#hi,$R4 1105*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H2#hi,$S4 1106*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H3#hi,$S4 1107*a6b803b3SArd Biesheuvel 1108*a6b803b3SArd Biesheuvel beq .Lshort_tail 1109*a6b803b3SArd Biesheuvel 1110*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1111*a6b803b3SArd Biesheuvel @ (hash+inp[0:1])*r^4:r^3 and accumulate 1112*a6b803b3SArd Biesheuvel 1113*a6b803b3SArd Biesheuvel vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 1114*a6b803b3SArd Biesheuvel vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 
@ load r^4 1115*a6b803b3SArd Biesheuvel 1116*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H2#lo,$R0 1117*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H0#lo,$R0 1118*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H3#lo,$R0 1119*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H1#lo,$R0 1120*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H4#lo,$R0 1121*a6b803b3SArd Biesheuvel 1122*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H4#lo,$S1 1123*a6b803b3SArd Biesheuvel vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 1124*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H2#lo,$R1 1125*a6b803b3SArd Biesheuvel vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 1126*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H0#lo,$R1 1127*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H3#lo,$R1 1128*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H1#lo,$R1 1129*a6b803b3SArd Biesheuvel 1130*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H1#lo,$R2 1131*a6b803b3SArd Biesheuvel vld1.32 ${S4}[1],[$tbl1,:32] 1132*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H3#lo,$S2 1133*a6b803b3SArd Biesheuvel vld1.32 ${S4}[0],[$tbl0,:32] 1134*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H2#lo,$R2 1135*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H4#lo,$S2 1136*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H0#lo,$R2 1137*a6b803b3SArd Biesheuvel 1138*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H0#lo,$R3 1139*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H2#lo,$S3 1140*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H1#lo,$R3 1141*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H3#lo,$S3 1142*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H4#lo,$S3 1143*a6b803b3SArd Biesheuvel 1144*a6b803b3SArd Biesheuvel vmlal.u32 $D3,$H4#lo,$S4 1145*a6b803b3SArd Biesheuvel vorn $MASK,$MASK,$MASK @ all-ones 1146*a6b803b3SArd Biesheuvel vmlal.u32 $D0,$H1#lo,$S4 1147*a6b803b3SArd Biesheuvel vshr.u64 $MASK,$MASK,#38 1148*a6b803b3SArd Biesheuvel vmlal.u32 $D4,$H0#lo,$R4 1149*a6b803b3SArd Biesheuvel vmlal.u32 $D1,$H2#lo,$S4 1150*a6b803b3SArd Biesheuvel vmlal.u32 $D2,$H3#lo,$S4 1151*a6b803b3SArd Biesheuvel 1152*a6b803b3SArd Biesheuvel.Lshort_tail: 1153*a6b803b3SArd 
Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1154*a6b803b3SArd Biesheuvel @ horizontal addition 1155*a6b803b3SArd Biesheuvel 1156*a6b803b3SArd Biesheuvel vadd.i64 $D3#lo,$D3#lo,$D3#hi 1157*a6b803b3SArd Biesheuvel vadd.i64 $D0#lo,$D0#lo,$D0#hi 1158*a6b803b3SArd Biesheuvel vadd.i64 $D4#lo,$D4#lo,$D4#hi 1159*a6b803b3SArd Biesheuvel vadd.i64 $D1#lo,$D1#lo,$D1#hi 1160*a6b803b3SArd Biesheuvel vadd.i64 $D2#lo,$D2#lo,$D2#hi 1161*a6b803b3SArd Biesheuvel 1162*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1163*a6b803b3SArd Biesheuvel @ lazy reduction, but without narrowing 1164*a6b803b3SArd Biesheuvel 1165*a6b803b3SArd Biesheuvel vshr.u64 $T0,$D3,#26 1166*a6b803b3SArd Biesheuvel vand.i64 $D3,$D3,$MASK 1167*a6b803b3SArd Biesheuvel vshr.u64 $T1,$D0,#26 1168*a6b803b3SArd Biesheuvel vand.i64 $D0,$D0,$MASK 1169*a6b803b3SArd Biesheuvel vadd.i64 $D4,$D4,$T0 @ h3 -> h4 1170*a6b803b3SArd Biesheuvel vadd.i64 $D1,$D1,$T1 @ h0 -> h1 1171*a6b803b3SArd Biesheuvel 1172*a6b803b3SArd Biesheuvel vshr.u64 $T0,$D4,#26 1173*a6b803b3SArd Biesheuvel vand.i64 $D4,$D4,$MASK 1174*a6b803b3SArd Biesheuvel vshr.u64 $T1,$D1,#26 1175*a6b803b3SArd Biesheuvel vand.i64 $D1,$D1,$MASK 1176*a6b803b3SArd Biesheuvel vadd.i64 $D2,$D2,$T1 @ h1 -> h2 1177*a6b803b3SArd Biesheuvel 1178*a6b803b3SArd Biesheuvel vadd.i64 $D0,$D0,$T0 1179*a6b803b3SArd Biesheuvel vshl.u64 $T0,$T0,#2 1180*a6b803b3SArd Biesheuvel vshr.u64 $T1,$D2,#26 1181*a6b803b3SArd Biesheuvel vand.i64 $D2,$D2,$MASK 1182*a6b803b3SArd Biesheuvel vadd.i64 $D0,$D0,$T0 @ h4 -> h0 1183*a6b803b3SArd Biesheuvel vadd.i64 $D3,$D3,$T1 @ h2 -> h3 1184*a6b803b3SArd Biesheuvel 1185*a6b803b3SArd Biesheuvel vshr.u64 $T0,$D0,#26 1186*a6b803b3SArd Biesheuvel vand.i64 $D0,$D0,$MASK 1187*a6b803b3SArd Biesheuvel vshr.u64 $T1,$D3,#26 1188*a6b803b3SArd Biesheuvel vand.i64 $D3,$D3,$MASK 1189*a6b803b3SArd Biesheuvel vadd.i64 $D1,$D1,$T0 @ h0 -> h1 1190*a6b803b3SArd Biesheuvel vadd.i64 $D4,$D4,$T1 @ h3 -> h4 
1191*a6b803b3SArd Biesheuvel 1192*a6b803b3SArd Biesheuvel cmp $len,#0 1193*a6b803b3SArd Biesheuvel bne .Leven 1194*a6b803b3SArd Biesheuvel 1195*a6b803b3SArd Biesheuvel @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1196*a6b803b3SArd Biesheuvel @ store hash value 1197*a6b803b3SArd Biesheuvel 1198*a6b803b3SArd Biesheuvel vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! 1199*a6b803b3SArd Biesheuvel vst1.32 {$D4#lo[0]},[$ctx] 1200*a6b803b3SArd Biesheuvel 1201*a6b803b3SArd Biesheuvel vldmia sp!,{d8-d15} @ epilogue 1202*a6b803b3SArd Biesheuvel ldmia sp!,{r4-r7} 1203*a6b803b3SArd Biesheuvel ret @ bx lr 1204*a6b803b3SArd Biesheuvel.size poly1305_blocks_neon,.-poly1305_blocks_neon 1205*a6b803b3SArd Biesheuvel 1206*a6b803b3SArd Biesheuvel.align 5 1207*a6b803b3SArd Biesheuvel.Lzeros: 1208*a6b803b3SArd Biesheuvel.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 1209*a6b803b3SArd Biesheuvel#ifndef __KERNEL__ 1210*a6b803b3SArd Biesheuvel.LOPENSSL_armcap: 1211*a6b803b3SArd Biesheuvel# ifdef _WIN32 1212*a6b803b3SArd Biesheuvel.word OPENSSL_armcap_P 1213*a6b803b3SArd Biesheuvel# else 1214*a6b803b3SArd Biesheuvel.word OPENSSL_armcap_P-.Lpoly1305_init 1215*a6b803b3SArd Biesheuvel# endif 1216*a6b803b3SArd Biesheuvel.comm OPENSSL_armcap_P,4,4 1217*a6b803b3SArd Biesheuvel.hidden OPENSSL_armcap_P 1218*a6b803b3SArd Biesheuvel#endif 1219*a6b803b3SArd Biesheuvel#endif 1220*a6b803b3SArd Biesheuvel___ 1221*a6b803b3SArd Biesheuvel} } 1222*a6b803b3SArd Biesheuvel$code.=<<___; 1223*a6b803b3SArd Biesheuvel.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" 1224*a6b803b3SArd Biesheuvel.align 2 1225*a6b803b3SArd Biesheuvel___ 1226*a6b803b3SArd Biesheuvel 1227*a6b803b3SArd Biesheuvelforeach (split("\n",$code)) { 1228*a6b803b3SArd Biesheuvel s/\`([^\`]*)\`/eval $1/geo; 1229*a6b803b3SArd Biesheuvel 1230*a6b803b3SArd Biesheuvel s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or 1231*a6b803b3SArd Biesheuvel s/\bret\b/bx lr/go or 1232*a6b803b3SArd Biesheuvel 
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 1233*a6b803b3SArd Biesheuvel 1234*a6b803b3SArd Biesheuvel print $_,"\n"; 1235*a6b803b3SArd Biesheuvel} 1236*a6b803b3SArd Biesheuvelclose STDOUT; # enforce flush 1237