/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX/scalar instructions
#  - 26-bit limbs
#  - handles multiple 64-byte blocks
#
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (r * a) % p
# a += s
#
# Improve performance by breaking the polynomial down into a sum of products:
#     h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
#
# 07/22/21 - this revision is based on the above sum of products.  Setup r^4, r^3, r^2, r and s3, s2, s1, s0
#            to 9 vectors for multiplications.
#
# setup r^4, r^3, r^2, r vectors
#    vs    [r^1, r^3, r^2, r^4]
#    vs0 = [r0,.....]
#    vs1 = [r1,.....]
#    vs2 = [r2,.....]
#    vs3 = [r3,.....]
#    vs4 = [r4,.....]
#    vs5 = [r1*5,...]
#    vs6 = [r2*5,...]
#    vs7 = [r3*5,...]
#    vs8 = [r4*5,...]
#
# Each word in a vector holds one member of "r/s" in [a * r/s]:
#
#    r0, r4*5, r3*5, r2*5, r1*5;
#    r1, r0,   r4*5, r3*5, r2*5;
#    r2, r1,   r0,   r4*5, r3*5;
#    r3, r2,   r1,   r0,   r4*5;
#    r4, r3,   r2,   r1,   r0  ;
#
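# For reference, the clamp step above corresponds to this C sketch
# (illustrative only, not part of the build; the two masks match the
# rmask constants at the end of this file):
#
#	#include <stdint.h>
#
#	static void poly1305_clamp(uint64_t r[2])
#	{
#		r[0] &= 0x0ffffffc0fffffffULL;	/* low  64 bits of r */
#		r[1] &= 0x0ffffffc0ffffffcULL;	/* high 64 bits of r */
#	}
#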
#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#  k = 32 bytes key
#  r3 = k (r, s)
#  r4 = m
#  r5 = mlen
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"

.text

.macro SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm

.macro SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	stvx	\VRS, 16, \FRAME
.endm

.macro SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	stxvx	\VSX, 16, \FRAME
.endm

.macro RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm

.macro RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	lvx	\VRS, 16, \FRAME
.endm

.macro RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	lxvx	\VSX, 16, \FRAME
.endm

.macro SAVE_REGS
	mflr	0
	std	0, 16(1)
	stdu	1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi	9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS
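# Frame layout used by SAVE_REGS/RESTORE_REGS (752-byte frame, offsets as
# coded above):
#   112(1)..255(1)  non-volatile GPRs r14-r31
#   256(1)..447(1)  non-volatile VRs  v20-v31  (16 bytes each)
#   448(1)..735(1)  VSX registers     vs14-vs31 (16 bytes each)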

.macro RESTORE_REGS
	addi	9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi	1, 1, 752
	ld	0, 16(1)
	mtlr	0
.endm # RESTORE_REGS

#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
#
#    [r^2, r^3, r^1, r^4]
#    [m3,  m2,  m4,  m1]
#
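# The five rows above are the schoolbook product of two 5 x 26-bit numbers
# with the 2^130 overflow folded back in via s[i] = r[i]*5 (2^130 == 5 mod p).
# A C sketch of one such multiply (illustrative only, not part of the build):
#
#	#include <stdint.h>
#
#	static void poly1305_mul_26(uint64_t p[5], const uint32_t a[5],
#				    const uint32_t r[5])
#	{
#		uint64_t s1 = (uint64_t)r[1] * 5, s2 = (uint64_t)r[2] * 5;
#		uint64_t s3 = (uint64_t)r[3] * 5, s4 = (uint64_t)r[4] * 5;
#
#		p[0] = (uint64_t)a[0]*r[0] + a[1]*s4 + a[2]*s3 + a[3]*s2 + a[4]*s1;
#		p[1] = (uint64_t)a[0]*r[1] + (uint64_t)a[1]*r[0] + a[2]*s4 + a[3]*s3 + a[4]*s2;
#		p[2] = (uint64_t)a[0]*r[2] + (uint64_t)a[1]*r[1] + (uint64_t)a[2]*r[0] + a[3]*s4 + a[4]*s3;
#		p[3] = (uint64_t)a[0]*r[3] + (uint64_t)a[1]*r[2] + (uint64_t)a[2]*r[1] + (uint64_t)a[3]*r[0] + a[4]*s4;
#		p[4] = (uint64_t)a[0]*r[4] + (uint64_t)a[1]*r[3] + (uint64_t)a[2]*r[2] + (uint64_t)a[3]*r[1] + (uint64_t)a[4]*r[0];
#	}
#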
# multiply odd and even words
.macro mul_odd
	vmulouw	14, 4, 26
	vmulouw	10, 5, 3
	vmulouw	11, 6, 2
	vmulouw	12, 7, 1
	vmulouw	13, 8, 0
	vmulouw	15, 4, 27
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vmulouw	10, 5, 26
	vmulouw	11, 6, 3
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vmulouw	12, 7, 2
	vmulouw	13, 8, 1
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1
	vmulouw	16, 4, 28
	vmulouw	10, 5, 27
	vmulouw	11, 6, 26
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vmulouw	12, 7, 3
	vmulouw	13, 8, 2
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2
	vmulouw	17, 4, 29
	vmulouw	10, 5, 28
	vmulouw	11, 6, 27
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vmulouw	12, 7, 26
	vmulouw	13, 8, 3
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3
	vmulouw	18, 4, 30
	vmulouw	10, 5, 29
	vmulouw	11, 6, 28
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vmulouw	12, 7, 27
	vmulouw	13, 8, 26
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm
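# mul_odd/mul_even rely on vmulouw/vmuleuw producing, in each 64-bit
# doubleword of the result, the 32x32->64 product of the odd (resp. even)
# word elements of the sources. A C sketch (element numbering follows the
# ISA's big-endian convention; illustrative only):
#
#	#include <stdint.h>
#
#	static void vmulouw_ref(uint64_t d[2], const uint32_t a[4],
#				const uint32_t b[4])
#	{
#		d[0] = (uint64_t)a[1] * b[1];	/* odd word of doubleword 0 */
#		d[1] = (uint64_t)a[3] * b[3];	/* odd word of doubleword 1 */
#	}
#	/* vmuleuw is the same with elements 0 and 2. */
#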

.macro mul_even
	vmuleuw	9, 4, 26
	vmuleuw	10, 5, 3
	vmuleuw	11, 6, 2
	vmuleuw	12, 7, 1
	vmuleuw	13, 8, 0
	vaddudm	14, 14, 9
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0

	vmuleuw	9, 4, 27
	vmuleuw	10, 5, 26
	vmuleuw	11, 6, 3
	vmuleuw	12, 7, 2
	vmuleuw	13, 8, 1
	vaddudm	15, 15, 9
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1

	vmuleuw	9, 4, 28
	vmuleuw	10, 5, 27
	vmuleuw	11, 6, 26
	vmuleuw	12, 7, 3
	vmuleuw	13, 8, 2
	vaddudm	16, 16, 9
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2

	vmuleuw	9, 4, 29
	vmuleuw	10, 5, 28
	vmuleuw	11, 6, 27
	vmuleuw	12, 7, 26
	vmuleuw	13, 8, 3
	vaddudm	17, 17, 9
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3

	vmuleuw	9, 4, 30
	vmuleuw	10, 5, 29
	vmuleuw	11, 6, 28
	vmuleuw	12, 7, 27
	vmuleuw	13, 8, 26
	vaddudm	18, 18, 9
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm

#
# poly1305_setup_r
#
# setup r^4, r^3, r^2, r vectors
#    [r, r^3, r^2, r^4]
#    vs0 = [r0,...]
#    vs1 = [r1,...]
#    vs2 = [r2,...]
#    vs3 = [r3,...]
#    vs4 = [r4,...]
#    vs5 = [r1*5,...]
#    vs6 = [r2*5,...]
#    vs7 = [r3*5,...]
#    vs8 = [r4*5,...]
#
#    r0, r4*5, r3*5, r2*5, r1*5;
#    r1, r0,   r4*5, r3*5, r2*5;
#    r2, r1,   r0,   r4*5, r3*5;
#    r3, r2,   r1,   r0,   r4*5;
#    r4, r3,   r2,   r1,   r0  ;
#
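# The vspltisb/vsld/vaddudm triples in the macro below compute each r[i]*5
# lane as (r[i] << 2) + r[i]. Equivalent C (illustrative only):
#
#	#include <stdint.h>
#
#	static inline uint64_t times5(uint64_t x)
#	{
#		return (x << 2) + x;	/* 5*x without a multiply */
#	}
#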
.macro poly1305_setup_r

	# save r
	xxlor	26, 58, 58
	xxlor	27, 59, 59
	xxlor	28, 60, 60
	xxlor	29, 61, 61
	xxlor	30, 62, 62

	xxlxor	31, 31, 31

#    [r, r^3, r^2, r^4]
	# compute r^2
	vmr	4, 26
	vmr	5, 27
	vmr	6, 28
	vmr	7, 29
	vmr	8, 30
	bl	do_mul		# r^2 r^1
	xxpermdi 58, 58, 36, 0x3		# r0
	xxpermdi 59, 59, 37, 0x3		# r1
	xxpermdi 60, 60, 38, 0x3		# r2
	xxpermdi 61, 61, 39, 0x3		# r3
	xxpermdi 62, 62, 40, 0x3		# r4
	xxpermdi 36, 36, 36, 0x3
	xxpermdi 37, 37, 37, 0x3
	xxpermdi 38, 38, 38, 0x3
	xxpermdi 39, 39, 39, 0x3
	xxpermdi 40, 40, 40, 0x3
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	bl	do_mul		# r^4 r^3
	vmrgow	26, 26, 4
	vmrgow	27, 27, 5
	vmrgow	28, 28, 6
	vmrgow	29, 29, 7
	vmrgow	30, 30, 8
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	# r^2 r^4
	xxlor	0, 58, 58
	xxlor	1, 59, 59
	xxlor	2, 60, 60
	xxlor	3, 61, 61
	xxlor	4, 62, 62
	xxlor	5, 32, 32
	xxlor	6, 33, 33
	xxlor	7, 34, 34
	xxlor	8, 35, 35

	vspltw	9, 26, 3
	vspltw	10, 26, 2
	vmrgow	26, 10, 9
	vspltw	9, 27, 3
	vspltw	10, 27, 2
	vmrgow	27, 10, 9
	vspltw	9, 28, 3
	vspltw	10, 28, 2
	vmrgow	28, 10, 9
	vspltw	9, 29, 3
	vspltw	10, 29, 2
	vmrgow	29, 10, 9
	vspltw	9, 30, 3
	vspltw	10, 30, 2
	vmrgow	30, 10, 9

	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30
.endm

SYM_FUNC_START_LOCAL(do_mul)
	mul_odd

	# do reduction ( h %= p )
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11
	blr
SYM_FUNC_END(do_mul)
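# do_mul's reduction above is one pass of the standard 26-bit carry chain;
# the asm interleaves two carry sequences, but up to the order of carries the
# net effect matches this C sketch (illustrative only, not part of the build):
#
#	#include <stdint.h>
#
#	static void carry_reduce_26(uint64_t h[5])
#	{
#		uint64_t c;
#
#		c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;
#		c = h[1] >> 26; h[1] &= 0x3ffffff; h[2] += c;
#		c = h[2] >> 26; h[2] &= 0x3ffffff; h[3] += c;
#		c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;
#		c = h[4] >> 26; h[4] &= 0x3ffffff; h[0] += c * 5; /* 2^130 == 5 mod p */
#		c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;
#	}
#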

#
# init key
#
.macro do_poly1305_init
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l

	ld	11, 0(10)
	ld	12, 8(10)

	li	14, 16
	li	15, 32
	addis	10, 2, cnum@toc@ha
	addi	10, 10, cnum@toc@l
	lvx	25, 0, 10	# v25 - mask
	lvx	31, 14, 10	# v31 = 26 (0x1a), limb shift count
	lvx	19, 15, 10	# v19 = 1 << 24
	lxv	24, 48(10)	# vs24
	lxv	25, 64(10)	# vs25

	# initialize
	# load key from r3 to vectors
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11
	and.	10, 10, 12

	# break 26 bits
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	58, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	59, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	60, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	61, 0, 17
	mtvsrdd	62, 0, 18

	# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
	li	9, 5
	mtvsrdd	36, 0, 9
	vmulouw	0, 27, 4		# v0 = rr0
	vmulouw	1, 28, 4		# v1 = rr1
	vmulouw	2, 29, 4		# v2 = rr2
	vmulouw	3, 30, 4		# v3 = rr3
.endm

#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#  k = 32 bytes key
#  r3 = k (r, s)
#  r4 = m
#  r5 = mlen
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
	cmpdi	5, 64
	blt	Out_no_poly1305

	SAVE_REGS

	do_poly1305_init

	li	21, 0	# offset into the message

	poly1305_setup_r

	# load previous H state
	# break/convert H to 26 bits
	ld	9, 0(3)
	ld	10, 8(3)
	ld	19, 16(3)
	sldi	19, 19, 24
	mtvsrdd	41, 0, 19
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	36, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	37, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	38, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	39, 0, 17
	mtvsrdd	40, 0, 18
	vor	8, 8, 9

	# input m1 m2
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3
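# The extrdi/insrdi (scalar) and vsrd/vsld/vand (vector) sequences above all
# perform the same radix-2^26 split of a 128-bit value. C sketch
# (illustrative only, not part of the build):
#
#	#include <stdint.h>
#
#	static void split_26(uint64_t lo, uint64_t hi, uint64_t a[5])
#	{
#		a[0] = lo & 0x3ffffff;
#		a[1] = (lo >> 26) & 0x3ffffff;
#		a[2] = ((lo >> 52) | (hi << 12)) & 0x3ffffff;
#		a[3] = (hi >> 14) & 0x3ffffff;
#		a[4] = hi >> 40;
#	}
#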

	vaddudm	20, 4, 9
	vaddudm	21, 5, 10
	vaddudm	22, 6, 11
	vaddudm	23, 7, 12
	vaddudm	24, 8, 13

	# m3 m4
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vspltisb 13, 14
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
	vmrgow	4, 9, 20
	vmrgow	5, 10, 21
	vmrgow	6, 11, 22
	vmrgow	7, 12, 23
	vmrgow	8, 13, 24
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	li	9, 64
	divdu	31, 5, 9

	cmpdi	31, 0
	ble	Skip_block_loop

	mtctr	31

# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
# Rewrite the polynomial sum of products as follows:
# h1 = (h0 + m1) * r^2,	h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2,	h4 = (h2 + m4) * r^2  --> (h0 + m1) r^4 + m3 r^2, (h0 + m2) r^4 + m4 r^2
# .... Repeat
# h5 = (h3 + m5) * r^2,	h6 = (h4 + m6) * r^2  -->
# h7 = (h5 + m7) * r^2,	h8 = (h6 + m8) * r^1  --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
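# The rewrite above is a two-lane Horner scheme: both lanes advance by r^2
# each iteration and a final multiply by (r^2, r) merges them. A scalar C
# model over a toy 61-bit prime (stand-in for 2^130 - 5; illustrative only,
# inputs assumed already < P):
#
#	#include <stdint.h>
#	typedef unsigned __int128 u128;
#	#define P ((uint64_t)0x1fffffffffffffff)	/* 2^61 - 1 */
#
#	static uint64_t fmul(uint64_t a, uint64_t b)
#	{
#		return (uint64_t)(((u128)a * b) % P);
#	}
#
#	/* h' = (h + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r (mod P) */
#	static uint64_t poly_4blocks(uint64_t h, const uint64_t m[4], uint64_t r)
#	{
#		uint64_t r2 = fmul(r, r);
#		uint64_t e = (h + m[0]) % P;	/* lane for m1, m3 */
#		uint64_t o = m[1];		/* lane for m2, m4 */
#
#		e = (fmul(e, r2) + m[2]) % P;
#		o = (fmul(o, r2) + m[3]) % P;
#		return (fmul(e, r2) + fmul(o, r)) % P;
#	}
#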
loop_4blocks:

	# Multiply odd words and even words
	mul_odd
	mul_even
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11

	# input m1 m2 m3 m4
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	17, 11, 12, 17
	vperm	18, 11, 12, 18

	vand	20, 14, 25	# a0
	vand	9, 17, 25	# a0
	vsrd	21, 14, 31	# >> 26
	vsrd	22, 21, 31	# 12 bits left
	vsrd	10, 17, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left

	vand	21, 21, 25	# a1
	vand	10, 10, 25	# a1

	vspltisb 13, 12
	vand	16, 15, 25
	vsld	23, 16, 13
	vor	22, 22, 23
	vand	22, 22, 25	# a2
	vand	16, 18, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	23, 15, 13	# >> 14
	vsrd	24, 23, 31	# >> 26, a4
	vand	23, 23, 25	# a3
	vsrd	12, 18, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	4, 4, 20
	vaddudm	5, 5, 21
	vaddudm	6, 6, 22
	vaddudm	7, 7, 23
	vaddudm	8, 8, 24

	# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
	vmrgow	4, 9, 4
	vmrgow	5, 10, 5
	vmrgow	6, 11, 6
	vmrgow	7, 12, 7
	vmrgow	8, 13, 8
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	bdnz	loop_4blocks

Skip_block_loop:
	xxlor	58, 0, 0
	xxlor	59, 1, 1
	xxlor	60, 2, 2
	xxlor	61, 3, 3
	xxlor	62, 4, 4
	xxlor	32, 5, 5
	xxlor	33, 6, 6
	xxlor	34, 7, 7
	xxlor	35, 8, 8

	# Multiply odd words and even words
	mul_odd
	mul_even

	# Sum the products.
	xxpermdi 41, 31, 46, 0
	xxpermdi 42, 31, 47, 0
	vaddudm	4, 14, 9
	xxpermdi 36, 31, 36, 3
	vaddudm	5, 15, 10
	xxpermdi 37, 31, 37, 3
	xxpermdi 43, 31, 48, 0
	vaddudm	6, 16, 11
	xxpermdi 38, 31, 38, 3
	xxpermdi 44, 31, 49, 0
	vaddudm	7, 17, 12
	xxpermdi 39, 31, 39, 3
	xxpermdi 45, 31, 50, 0
	vaddudm	8, 18, 13
	xxpermdi 40, 31, 40, 3

	# carry reduction
	vspltisb 9, 2
	vsrd	10, 4, 31
	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	8, 8, 11
	vsrd	12, 8, 31
	vaddudm	5, 5, 10

	vsrd	11, 5, 31
	vand	8, 8, 25
	vand	5, 5, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 6, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vsrd	10, 5, 31
	vand	5, 5, 25
	vaddudm	6, 6, 10
	vaddudm	8, 8, 11

	b	do_final_update
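# do_final_update below is the inverse of the radix-2^26 split: it packs the
# five 26-bit limbs back into the state's two 64-bit words plus the high
# word. C sketch (illustrative only, not part of the build):
#
#	#include <stdint.h>
#
#	static void combine_26(const uint64_t a[5], uint64_t *lo, uint64_t *hi,
#			       uint64_t *h2)
#	{
#		*lo = a[0] | (a[1] << 26) | (a[2] << 52);
#		*hi = (a[2] >> 12) | (a[3] << 14) | (a[4] << 40);
#		*h2 = a[4] >> 24;
#	}
#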

do_final_update:
	# combine 26 bit limbs
	# v4, v5, v6, v7 and v8 are 26 bit vectors
	vsld	5, 5, 31
	vor	20, 4, 5
	vspltisb 11, 12
	vsrd	12, 6, 11
	vsld	6, 6, 31
	vsld	6, 6, 31
	vor	20, 20, 6
	vspltisb 11, 14
	vsld	7, 7, 11
	vor	21, 7, 12
	mfvsrld	16, 40	# save last 2 bytes
	vsld	8, 8, 11
	vsld	8, 8, 31
	vor	21, 21, 8
	mfvsrld	17, 52
	mfvsrld	19, 53
	srdi	16, 16, 24

	std	17, 0(3)
	std	19, 8(3)
	stw	16, 16(3)

Out_loop:
	li	3, 0

	RESTORE_REGS

	blr

Out_no_poly1305:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_p10le_4blocks)

#
# =======================================================================
# The following functions implement poly1305 using 64 x 64 bit multiplies.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
	# mask 0x0FFFFFFC0FFFFFFC
	# mask 0x0FFFFFFC0FFFFFFF
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l
	ld	11, 0(10)
	ld	12, 8(10)

	# initialize
	# load key from r3
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11	# clamp mask r0
	and.	10, 10, 12	# clamp mask r1

	srdi	21, 10, 2
	add	19, 21, 10	# s1: r19 = r1 + (r1 >> 2), i.e. (r1 >> 2) * 5

	# setup r and s
	li	25, 0
	mtvsrdd	32+0, 9, 19	# r0, s1
	mtvsrdd	32+1, 10, 9	# r1, r0
	mtvsrdd	32+2, 19, 25	# s1
	mtvsrdd	32+3, 9, 25	# r0

	blr
SYM_FUNC_END(Poly1305_init_64)

# Poly1305_mult
# v6 = (h0, h1), v8 = h2
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7, v10, v11
#
SYM_FUNC_START_LOCAL(Poly1305_mult)
	#
	# d0 = h0 * r0 + h1 * s1
	vmsumudm	7, 6, 0, 9	# h0 * r0, h1 * s1

	# d1 = h0 * r1 + h1 * r0 + h2 * s1
	vmsumudm	11, 6, 1, 9	# h0 * r1, h1 * r0
	vmsumudm	10, 8, 2, 11	# d1 += h2 * s1

	# d2 = h2 * r0
	vmsumudm	11, 8, 3, 9	# d2 = h2 * r0
	blr
SYM_FUNC_END(Poly1305_mult)
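# With unsigned __int128 for the products, Poly1305_mult above is
# (illustrative only; s1 = r1 + (r1 >> 2) equals (r1 >> 2) * 5 because the
# clamp clears r1's low two bits):
#
#	#include <stdint.h>
#	typedef unsigned __int128 u128;
#
#	static void poly1305_mul_64(u128 d[3], const uint64_t h[3],
#				    uint64_t r0, uint64_t r1)
#	{
#		uint64_t s1 = r1 + (r1 >> 2);
#
#		d[0] = (u128)h[0]*r0 + (u128)h[1]*s1;
#		d[1] = (u128)h[0]*r1 + (u128)h[1]*r0 + (u128)h[2]*s1;
#		d[2] = (u128)h[2]*r0;
#	}
#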

#
# carry reduction
# h %= p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
SYM_FUNC_START_LOCAL(Carry_reduction)
	mfvsrld	27, 32+7
	mfvsrld	28, 32+10
	mfvsrld	29, 32+11
	mfvsrd	20, 32+7	# h0.h
	mfvsrd	21, 32+10	# h1.h

	addc	28, 28, 20
	adde	29, 29, 21
	srdi	22, 29, 0x2
	sldi	23, 22, 0x2
	add	23, 23, 22	# (h2 >> 2) * 5
	addc	27, 27, 23	# h0
	addze	28, 28		# h1
	andi.	29, 29, 0x3	# h2
	blr
SYM_FUNC_END(Carry_reduction)
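# Carry_reduction above in C form (illustrative only; it mirrors the
# addc/adde/addze sequence, including dropping the final carry out of h1,
# which cannot occur for in-range inputs):
#
#	#include <stdint.h>
#	typedef unsigned __int128 u128;
#
#	static void poly1305_reduce_64(uint64_t h[3], const u128 d[3])
#	{
#		u128 t = (u128)(uint64_t)d[1] + (uint64_t)(d[0] >> 64);
#		uint64_t h0 = (uint64_t)d[0];
#		uint64_t h1 = (uint64_t)t;
#		uint64_t h2 = (uint64_t)d[2] + (uint64_t)(d[1] >> 64) +
#			      (uint64_t)(t >> 64);
#
#		t = (u128)h0 + (h2 >> 2) * 5;	/* fold 2^130*c == 5*c */
#		h[0] = (uint64_t)t;
#		h[1] = h1 + (uint64_t)(t >> 64);
#		h[2] = h2 & 3;
#	}
#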

#
# poly1305 multiplication
# h *= r, h %= p
#	d0 = h0 * r0 + h1 * s1
#	d1 = h0 * r1 + h1 * r0 + h2 * s1
#	d2 = h2 * r0
#
#
# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
#  - no highbit if final leftover block (highbit = 0)
#
SYM_FUNC_START(poly1305_64s)
	cmpdi	5, 0
	ble	Out_no_poly1305_64

	mflr	0
	std	0, 16(1)
	stdu	1,-400(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	# Init poly1305
	bl	Poly1305_init_64

	li	25, 0	# offset to inp and outp

	add	11, 25, 4

	# load h
	# h0, h1, h2
	ld	27, 0(3)
	ld	28, 8(3)
	lwz	29, 16(3)

	li	30, 16
	divdu	31, 5, 30

	mtctr	31

	mr	24, 6	# highbit

Loop_block_64:
	vxor	9, 9, 9

	ld	20, 0(11)
	ld	21, 8(11)
	addi	11, 11, 16

	addc	27, 27, 20
	adde	28, 28, 21
	adde	29, 29, 24

	li	22, 0
	mtvsrdd	32+6, 27, 28	# h0, h1
	mtvsrdd	32+8, 29, 22	# h2

	bl	Poly1305_mult

	bl	Carry_reduction

	bdnz	Loop_block_64

	std	27, 0(3)
	std	28, 8(3)
	stw	29, 16(3)

	li	3, 0

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi	1, 1, 400
	ld	0, 16(1)
	mtlr	0

	blr

Out_no_poly1305_64:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_64s)
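# poly1305_emit_64 below computes the final mac = ((h % p) + s) mod 2^128 by
# the usual trick: add 5 and check whether bit 130 becomes set; if so, h was
# >= p and the incremented value (mod 2^130) is the reduced h. C sketch
# (illustrative only, not part of the build):
#
#	#include <stdint.h>
#	typedef unsigned __int128 u128;
#
#	static void poly1305_emit_ref(uint64_t mac[2], const uint64_t h[3],
#				      const uint64_t s[2])
#	{
#		u128 t = (u128)h[0] + 5;
#		uint64_t g0 = (uint64_t)t;
#		t = (u128)h[1] + (uint64_t)(t >> 64);
#		uint64_t g1 = (uint64_t)t;
#		uint64_t g2 = h[2] + (uint64_t)(t >> 64);
#		int ge_p = (g2 >> 2) != 0;	/* did h + 5 reach 2^130? */
#
#		t = (u128)(ge_p ? g0 : h[0]) + s[0];
#		mac[0] = (uint64_t)t;
#		mac[1] = (ge_p ? g1 : h[1]) + s[1] + (uint64_t)(t >> 64);
#	}
#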

#
# Input: r3 = h, r4 = s, r5 = mac
# mac = h + s
#
SYM_FUNC_START(poly1305_emit_64)
	ld	10, 0(3)
	ld	11, 8(3)
	ld	12, 16(3)

	# compare modulus
	# h + 5 + (-p)
	mr	6, 10
	mr	7, 11
	mr	8, 12
	addic.	6, 6, 5
	addze	7, 7
	addze	8, 8
	srdi	9, 8, 2	# overflow?
	cmpdi	9, 0
	beq	Skip_h64
	mr	10, 6
	mr	11, 7
	mr	12, 8

Skip_h64:
	ld	6, 0(4)
	ld	7, 8(4)
	addc	10, 10, 6
	adde	11, 11, 7
	addze	12, 12

	std	10, 0(5)
	std	11, 8(5)
	blr
SYM_FUNC_END(poly1305_emit_64)

SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte	0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long	0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
.long	0x1a, 0x00, 0x1a, 0x00
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000
.long	0x00010203, 0x04050607, 0x10111213, 0x14151617
.long	0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
SYM_DATA_END(RMASK)