/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX/scalar instructions
#  - 26-bit limbs
#  - handles multiple 64-byte blocks
#
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (a * r) % p
# a += s
# (see the reference sketch below)
#
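# Illustrative only: a minimal C-style sketch of the per-block update above,
# assuming a generic wide-integer type; acc, blk and tag are placeholder names,
# not symbols used anywhere in this file.
#
#	// acc = 0;                                  /* accumulator "a" */
#	// for each 16-byte block blk:
#	//	acc += blk + (1 << 128);             /* a += m, with the pad bit */
#	//	acc = (acc * r) % ((1 << 130) - 5);  /* a = (a * r) % p */
#	// tag = (acc + s) & ((1 << 128) - 1);       /* final a += s, low 128 bits */
#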
# Improve performance by breaking the polynomial down into a sum of products:
#     h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
#
#  07/22/21 - this revision is based on the above sum of products.  Setup r^4, r^3, r^2, r and s3, s2, s1, s0
#             to 9 vectors for multiplications.
#
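# Worked derivation (Horner form) showing why four blocks collapse to the sum
# of products above; h0 is the incoming accumulator and m1..m4 are the blocks:
#
#	h1 = (h0 + m1) * r
#	h2 = (h1 + m2) * r
#	h3 = (h2 + m3) * r
#	h4 = (h3 + m4) * r
#	   = h0*r^4 + m1*r^4 + m2*r^3 + m3*r^2 + m4*r
#
# so folding h0 into the first block lets one multiply by [r^4, r^3, r^2, r]
# absorb four blocks at a time.
#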
# setup r^4, r^3, r^2, r vectors
#    vs    [r^1, r^3, r^2, r^4]
#    vs0 = [r0,.....]
#    vs1 = [r1,.....]
#    vs2 = [r2,.....]
#    vs3 = [r3,.....]
#    vs4 = [r4,.....]
#    vs5 = [r4*5,...]
#    vs6 = [r3*5,...]
#    vs7 = [r2*5,...]
#    vs8 = [r1*5,...]
#
#  Each word in a vector holds one member of "r/s" used in [a * r/s].
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
#  k = 32 bytes key
#  r3 = k (r, s)
#  r4 = mlen
#  r5 = m
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"

.text

.macro	SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm

.macro	SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	stvx	\VRS, 16, \FRAME
.endm

.macro	SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	stxvx	\VSX, 16, \FRAME
.endm

.macro	RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm

.macro	RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	lvx	\VRS, 16, \FRAME
.endm

.macro	RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	lxvx	\VSX, 16, \FRAME
.endm

.macro SAVE_REGS
	mflr 0
	std 0, 16(1)
	stdu 1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi	9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
	addi	9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi    1, 1, 752
	ld 0, 16(1)
	mtlr 0
.endm # RESTORE_REGS

#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
#
#    [r^2, r^3, r^1, r^4]
#    [m3,  m2,  m4,  m1]
#
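# Illustrative only: the same schoolbook product as a C-style double loop over
# 26-bit limbs; a[], r[] and p[] here are hypothetical arrays, not symbols used
# in the code below:
#
#	// for (i = 0; i < 5; i++)
#	//	for (j = 0; j < 5; j++)
#	//		p[i] += (u64)a[j] *
#	//			(j <= i ? r[i - j] : 5 * r[5 + i - j]);
#
# Each partial product of 26-bit limbs (including the *5 variants) stays well
# below 2^56, so five of them can be summed in a 64-bit lane before the carry
# reduction.
#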
# multiply odd and even words
.macro mul_odd
	vmulouw	14, 4, 26
	vmulouw	10, 5, 3
	vmulouw	11, 6, 2
	vmulouw	12, 7, 1
	vmulouw	13, 8, 0
	vmulouw	15, 4, 27
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vmulouw	10, 5, 26
	vmulouw	11, 6, 3
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vmulouw	12, 7, 2
	vmulouw	13, 8, 1
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1
	vmulouw	16, 4, 28
	vmulouw	10, 5, 27
	vmulouw	11, 6, 26
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vmulouw	12, 7, 3
	vmulouw	13, 8, 2
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2
	vmulouw	17, 4, 29
	vmulouw	10, 5, 28
	vmulouw	11, 6, 27
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vmulouw	12, 7, 26
	vmulouw	13, 8, 3
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3
	vmulouw	18, 4, 30
	vmulouw	10, 5, 29
	vmulouw	11, 6, 28
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vmulouw	12, 7, 27
	vmulouw	13, 8, 26
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm

.macro mul_even
	vmuleuw	9, 4, 26
	vmuleuw	10, 5, 3
	vmuleuw	11, 6, 2
	vmuleuw	12, 7, 1
	vmuleuw	13, 8, 0
	vaddudm	14, 14, 9
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0

	vmuleuw	9, 4, 27
	vmuleuw	10, 5, 26
	vmuleuw	11, 6, 3
	vmuleuw	12, 7, 2
	vmuleuw	13, 8, 1
	vaddudm	15, 15, 9
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1

	vmuleuw	9, 4, 28
	vmuleuw	10, 5, 27
	vmuleuw	11, 6, 26
	vmuleuw	12, 7, 3
	vmuleuw	13, 8, 2
	vaddudm	16, 16, 9
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2

	vmuleuw	9, 4, 29
	vmuleuw	10, 5, 28
	vmuleuw	11, 6, 27
	vmuleuw	12, 7, 26
	vmuleuw	13, 8, 3
	vaddudm	17, 17, 9
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3

	vmuleuw	9, 4, 30
	vmuleuw	10, 5, 29
	vmuleuw	11, 6, 28
	vmuleuw	12, 7, 27
	vmuleuw	13, 8, 26
	vaddudm	18, 18, 9
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm

#
# poly1305_setup_r
#
# setup r^4, r^3, r^2, r vectors
#    [r, r^3, r^2, r^4]
#    vs0 = [r0,...]
#    vs1 = [r1,...]
#    vs2 = [r2,...]
#    vs3 = [r3,...]
#    vs4 = [r4,...]
#    vs5 = [r4*5,...]
#    vs6 = [r3*5,...]
#    vs7 = [r2*5,...]
#    vs8 = [r1*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
.macro poly1305_setup_r

	# save r
	xxlor	26, 58, 58
	xxlor	27, 59, 59
	xxlor	28, 60, 60
	xxlor	29, 61, 61
	xxlor	30, 62, 62

	xxlxor	31, 31, 31

#    [r, r^3, r^2, r^4]
	# compute r^2
	vmr	4, 26
	vmr	5, 27
	vmr	6, 28
	vmr	7, 29
	vmr	8, 30
	bl	do_mul		# r^2 r^1
	xxpermdi 58, 58, 36, 0x3		# r0
	xxpermdi 59, 59, 37, 0x3		# r1
	xxpermdi 60, 60, 38, 0x3		# r2
	xxpermdi 61, 61, 39, 0x3		# r3
	xxpermdi 62, 62, 40, 0x3		# r4
	xxpermdi 36, 36, 36, 0x3
	xxpermdi 37, 37, 37, 0x3
	xxpermdi 38, 38, 38, 0x3
	xxpermdi 39, 39, 39, 0x3
	xxpermdi 40, 40, 40, 0x3
	vspltisb 13, 2
	vsld	9, 27, 13	# 5*r^k = (r^k << 2) + r^k
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	bl	do_mul		# r^4 r^3
	vmrgow	26, 26, 4
	vmrgow	27, 27, 5
	vmrgow	28, 28, 6
	vmrgow	29, 29, 7
	vmrgow	30, 30, 8
	vspltisb 13, 2
	vsld	9, 27, 13	# 5*r^k = (r^k << 2) + r^k
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	# r^2 r^4
	xxlor	0, 58, 58
	xxlor	1, 59, 59
	xxlor	2, 60, 60
	xxlor	3, 61, 61
	xxlor	4, 62, 62
	xxlor	5, 32, 32
	xxlor	6, 33, 33
	xxlor	7, 34, 34
	xxlor	8, 35, 35

	vspltw	9, 26, 3
	vspltw	10, 26, 2
	vmrgow	26, 10, 9
	vspltw	9, 27, 3
	vspltw	10, 27, 2
	vmrgow	27, 10, 9
	vspltw	9, 28, 3
	vspltw	10, 28, 2
	vmrgow	28, 10, 9
	vspltw	9, 29, 3
	vspltw	10, 29, 2
	vmrgow	29, 10, 9
	vspltw	9, 30, 3
	vspltw	10, 30, 2
	vmrgow	30, 10, 9

	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30
.endm

SYM_FUNC_START_LOCAL(do_mul)
	mul_odd

	# do reduction ( h %= p )
	# carry reduction (v25 = 26-bit limb mask, v31 = 26)
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11
	blr
SYM_FUNC_END(do_mul)

#
# init key
#
.macro do_poly1305_init
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l

	ld	11, 0(10)
	ld	12, 8(10)

	li	14, 16
	li	15, 32
	addis	10, 2, cnum@toc@ha
	addi	10, 10, cnum@toc@l
	lvx	25, 0, 10	# v25 = 0x3ffffff (26-bit limb mask)
	lvx	31, 14, 10	# v31 = 26 (0x1a), limb shift amount
	lvx	19, 15, 10	# v19 = 1 << 24
	lxv	24, 48(10)	# vs24 = vperm byte-select pattern
	lxv	25, 64(10)	# vs25 = vperm byte-select pattern

	# initialize
	# load key from r3 to vectors
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11
	and.	10, 10, 12

	# break r into 26-bit limbs
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	58, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	59, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	60, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	61, 0, 17
	mtvsrdd	62, 0, 18

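	# Illustrative only: the extrdi/insrdi sequence above, written as C on
	# the two clamped 64-bit halves of r (lo, hi are placeholder names):
	#
	#	// t0 =  lo        & 0x3ffffff;
	#	// t1 = (lo >> 26) & 0x3ffffff;
	#	// t2 = (lo >> 52) | ((hi & 0x3fff) << 12);
	#	// t3 = (hi >> 14) & 0x3ffffff;
	#	// t4 =  hi >> 40;		/* 24 significant bits */
	#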
	# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
	li	9, 5
	mtvsrdd	36, 0, 9
	vmulouw	0, 27, 4		# v0 = rr0
	vmulouw	1, 28, 4		# v1 = rr1
	vmulouw	2, 29, 4		# v2 = rr2
	vmulouw	3, 30, 4		# v3 = rr3
.endm

#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
#  k = 32 bytes key
#  r3 = k (r, s)
#  r4 = mlen
#  r5 = m
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
	cmpdi	5, 64
	blt	Out_no_poly1305

	SAVE_REGS

	do_poly1305_init

	li	21, 0	# counter to message

	poly1305_setup_r

	# load previous H state from the context at r3 and
	# break/convert it into 26-bit limbs
	ld	9, 0(3)
	ld	10, 8(3)
	ld	19, 16(3)
	sldi	19, 19, 24
	mtvsrdd	41, 0, 19
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	36, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	37, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	38, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	39, 0, 17
	mtvsrdd	40, 0, 18
	vor	8, 8, 9

	# input m1 m2
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	20, 4, 9
	vaddudm	21, 5, 10
	vaddudm	22, 6, 11
	vaddudm	23, 7, 12
	vaddudm	24, 8, 13

	# m3 m4
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vspltisb 13, 14
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	# Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
	vmrgow	4, 9, 20
	vmrgow	5, 10, 21
	vmrgow	6, 11, 22
	vmrgow	7, 12, 23
	vmrgow	8, 13, 24
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	li      9, 64
	divdu   31, 5, 9

	cmpdi	31, 0
	ble	Skip_block_loop

	mtctr	31

# h4 =   m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
# Rewrite the polynomial sum of products as follows:
# h1 = (h0 + m1) * r^2,	h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2,	h4 = (h2 + m4) * r^2  --> h3 = (h0 + m1) * r^4 + m3 * r^2, h4 = (h0 + m2) * r^4 + m4 * r^2
#  .... Repeat
# h5 = (h3 + m5) * r^2,	h6 = (h4 + m6) * r^2  -->
# h7 = (h5 + m7) * r^2,	h8 = (h6 + m8) * r^1  --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
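# Illustrative only: an algebraic check of the grouping above for one 4-block
# group (lane0/lane1 are just names for the two interleaved chains):
#
#	// lane0 = ((h0 + m1) * r2 + m3) * r2;	/* (h0 + m1)*r^4 + m3*r^2 */
#	// lane1 = ((     m2) * r2 + m4) * r;	/*       m2*r^3  + m4*r   */
#	// h     = lane0 + lane1;		/* the full Horner result */
#
# The remaining per-lane powers of r are applied by the final multiply after
# the block loop (Skip_block_loop below).
#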
loop_4blocks:

	# Multiply odd words and even words
	mul_odd
	mul_even
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11

	# input m1  m2  m3  m4
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	17, 11, 12, 17
	vperm	18, 11, 12, 18

	vand	20, 14, 25	# a0
	vand	9, 17, 25	# a0
	vsrd	21, 14, 31	# >> 26
	vsrd	22, 21, 31	# 12 bits left
	vsrd	10, 17, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left

	vand	21, 21, 25	# a1
	vand	10, 10, 25	# a1

	vspltisb 13, 12
	vand	16, 15, 25
	vsld	23, 16, 13
	vor	22, 22, 23
	vand	22, 22, 25	# a2
	vand	16, 18, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	23, 15, 13	# >> 14
	vsrd	24, 23, 31	# >> 26, a4
	vand	23, 23, 25	# a3
	vsrd	12, 18, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	4, 4, 20
	vaddudm	5, 5, 21
	vaddudm	6, 6, 22
	vaddudm	7, 7, 23
	vaddudm	8, 8, 24

	# Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
	vmrgow	4, 9, 4
	vmrgow	5, 10, 5
	vmrgow	6, 11, 6
	vmrgow	7, 12, 7
	vmrgow	8, 13, 8
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	bdnz	loop_4blocks

Skip_block_loop:
	xxlor	58, 0, 0
	xxlor	59, 1, 1
	xxlor	60, 2, 2
	xxlor	61, 3, 3
	xxlor	62, 4, 4
	xxlor	32, 5, 5
	xxlor	33, 6, 6
	xxlor	34, 7, 7
	xxlor	35, 8, 8

	# Multiply odd words and even words
	mul_odd
	mul_even

	# Sum the products.
	xxpermdi 41, 31, 46, 0
	xxpermdi 42, 31, 47, 0
	vaddudm	4, 14, 9
	xxpermdi 36, 31, 36, 3
	vaddudm	5, 15, 10
	xxpermdi 37, 31, 37, 3
	xxpermdi 43, 31, 48, 0
	vaddudm	6, 16, 11
	xxpermdi 38, 31, 38, 3
	xxpermdi 44, 31, 49, 0
	vaddudm	7, 17, 12
	xxpermdi 39, 31, 39, 3
	xxpermdi 45, 31, 50, 0
	vaddudm	8, 18, 13
	xxpermdi 40, 31, 40, 3

	# carry reduction
	vspltisb 9, 2
	vsrd	10, 4, 31
	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	8, 8, 11
	vsrd	12, 8, 31
	vaddudm	5, 5, 10

	vsrd	11, 5, 31
	vand	8, 8, 25
	vand	5, 5, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 6, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vsrd	10, 5, 31
	vand	5, 5, 25
	vaddudm	6, 6, 10
	vaddudm	8, 8, 11

	b	do_final_update

do_final_update:
	# combine 26-bit limbs
	# v4, v5, v6, v7 and v8 are 26-bit limb vectors
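	# Illustrative only: the shifts and ORs below repack the limbs as
	#
	#	// h0 =  a0        | (a1 << 26) | (a2 << 52);
	#	// h1 = (a2 >> 12) | (a3 << 14) | (a4 << 40);
	#	// h2 =  a4 >> 24;
	#
	# where a0..a4 stand for the 26-bit limbs held in v4..v8.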
	vsld	5, 5, 31
	vor	20, 4, 5
	vspltisb 11, 12
	vsrd	12, 6, 11
	vsld	6, 6, 31
	vsld	6, 6, 31
	vor	20, 20, 6
	vspltisb 11, 14
	vsld	7, 7, 11
	vor	21, 7, 12
	mfvsrld	16, 40		# grab the top limb (a4) before it is shifted
	vsld	8, 8, 11
	vsld	8, 8, 31
	vor	21, 21, 8
	mfvsrld	17, 52
	mfvsrld	19, 53
	srdi	16, 16, 24

	std	17, 0(3)
	std	19, 8(3)
	stw	16, 16(3)

Out_loop:
	li	3, 0

	RESTORE_REGS

	blr

Out_no_poly1305:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_p10le_4blocks)

#
# =======================================================================
# The following functions implement poly1305 using 64x64-bit multiplications.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
	#  mask 0x0FFFFFFC0FFFFFFC
	#  mask 0x0FFFFFFC0FFFFFFF
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l
	ld	11, 0(10)
	ld	12, 8(10)

	# initialize
	# load key from r3
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11	# clamp mask r0
	and.	10, 10, 12	# clamp mask r1

	srdi	21, 10, 2
	add	19, 21, 10	# s1: r19 = r1 + (r1 >> 2) = (r1 >> 2) * 5

	# setup r and s
	li	25, 0
	mtvsrdd 32+0, 9, 19	# r0, s1
	mtvsrdd 32+1, 10, 9	# r1, r0
	mtvsrdd 32+2, 19, 25	# s1
	mtvsrdd 32+3, 9, 25	# r0

	blr
SYM_FUNC_END(Poly1305_init_64)

# Poly1305_mult
# v6 = (h0, h1), v8 = h2
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7, v10, v11
#
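# Illustrative only: the three dot products below, written in C with a 128-bit
# type (h0/h1/h2 and r0/r1/s1 are the scalar values named in the comments):
#
#	// typedef unsigned __int128 u128;
#	// u128 d0 = (u128)h0 * r0 + (u128)h1 * s1;
#	// u128 d1 = (u128)h0 * r1 + (u128)h1 * r0 + (u128)h2 * s1;
#	// u128 d2 = (u128)h2 * r0;
#
# Each vmsumudm sums two 64x64-bit products (plus an optional prior sum), which
# is how d0, d1 and d2 end up in v7, v10 and v11.
#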
SYM_FUNC_START_LOCAL(Poly1305_mult)
	#
	#	d0 = h0 * r0 + h1 * s1
	vmsumudm	7, 6, 0, 9		# h0 * r0, h1 * s1

	#	d1 = h0 * r1 + h1 * r0 + h2 * s1
	vmsumudm	11, 6, 1, 9		# h0 * r1, h1 * r0
	vmsumudm	10, 8, 2, 11		# d1 += h2 * s1

	#	d2 = h2 * r0
	vmsumudm	11, 8, 3, 9		# d2 = h2 * r0
	blr
SYM_FUNC_END(Poly1305_mult)

#
# carry reduction
# h %= p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
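# Illustrative only: the fold below relies on 2^130 = 5 (mod p).  In C terms,
# with h2 holding everything at and above bit 128:
#
#	// h0 += (h2 >> 2) * 5;	/* fold bits >= 2^130 back in, times 5 */
#	// h1 += carry;
#	// h2 &= 3;		/* keep only bits 128..129 */
#
# This keeps h small enough (h2 <= 3 plus carries) to absorb the next block's
# additions without overflow.
#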
SYM_FUNC_START_LOCAL(Carry_reduction)
	mfvsrld	27, 32+7
	mfvsrld	28, 32+10
	mfvsrld	29, 32+11
	mfvsrd	20, 32+7	# h0.h
	mfvsrd	21, 32+10	# h1.h

	addc	28, 28, 20
	adde	29, 29, 21
	srdi	22, 29, 0x2
	sldi	23, 22, 0x2
	add	23, 23, 22	# (h2 >> 2) * 5
	addc	27, 27, 23	# h0
	addze	28, 28		# h1
	andi.	29, 29, 0x3	# h2
	blr
SYM_FUNC_END(Carry_reduction)

#
# poly1305 multiplication
# h *= r, h %= p
#	d0 = h0 * r0 + h1 * s1
#	d1 = h0 * r1 + h1 * r0 + h2 * s1
#	d2 = h2 * r0
#
#
# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
#   - no highbit if final leftover block (highbit = 0)
#
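# Illustrative only: each iteration of the loop below corresponds to
#
#	// h0 += le64(src);		/* with carries into h1, h2 */
#	// h1 += le64(src + 8);
#	// h2 += highbit;		/* 1 for full blocks, 0 for a padded last block */
#	// (h0, h1, h2) = mult_and_reduce(h0, h1, h2);	/* Poly1305_mult + Carry_reduction */
#
# where le64() and mult_and_reduce() are descriptive placeholders, not real
# helpers in this file.
#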
SYM_FUNC_START(poly1305_64s)
	cmpdi	5, 0
	ble	Out_no_poly1305_64

	mflr 0
	std 0, 16(1)
	stdu 1,-400(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	# Init poly1305
	bl Poly1305_init_64

	li 25, 0			# offset to inp and outp

	add 11, 25, 4

	# load h
	# h0, h1, h2
	ld	27, 0(3)
	ld	28, 8(3)
	lwz	29, 16(3)

	li	30, 16
	divdu	31, 5, 30

	mtctr	31

	mr	24, 6		# highbit

Loop_block_64:
	vxor	9, 9, 9

	ld	20, 0(11)
	ld	21, 8(11)
	addi	11, 11, 16

	addc	27, 27, 20
	adde	28, 28, 21
	adde	29, 29, 24

	li	22, 0
	mtvsrdd	32+6, 27, 28	# h0, h1
	mtvsrdd	32+8, 29, 22	# h2

	bl	Poly1305_mult

	bl	Carry_reduction

	bdnz	Loop_block_64

	std	27, 0(3)
	std	28, 8(3)
	stw	29, 16(3)

	li	3, 0

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi    1, 1, 400
	ld 0, 16(1)
	mtlr 0

	blr

Out_no_poly1305_64:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_64s)

#
# Input: r3 = h, r4 = s, r5 = mac
# mac = h + s
#
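# Illustrative only, in C-like terms (g is a placeholder for the h + 5 probe):
#
#	// g = h + 5;			/* 130-bit addition */
#	// if (g >> 130)		/* h >= p = 2^130 - 5, so reduce */
#	//	h = g;			/* low 128 bits now equal h - p */
#	// mac = (h + s) mod 2^128;
#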
SYM_FUNC_START(poly1305_emit_64)
	ld	10, 0(3)
	ld	11, 8(3)
	ld	12, 16(3)

	# compare modulus
	# h + 5 + (-p)
	mr	6, 10
	mr	7, 11
	mr	8, 12
	addic.	6, 6, 5
	addze	7, 7
	addze	8, 8
	srdi	9, 8, 2		# overflow?
	cmpdi	9, 0
	beq	Skip_h64
	mr	10, 6
	mr	11, 7
	mr	12, 8

Skip_h64:
	ld	6, 0(4)
	ld	7, 8(4)
	addc	10, 10, 6
	adde	11, 11, 7
	addze	12, 12

	std	10, 0(5)
	std	11, 8(5)
	blr
SYM_FUNC_END(poly1305_emit_64)

SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte	0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long	0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000	# 26-bit limb mask (v25)
.long	0x1a, 0x00, 0x1a, 0x00				# 26 = 0x1a, limb shift amount (v31)
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	# 1 << 24, pad bit in the top limb (v19)
.long	0x00010203, 0x04050607, 0x10111213, 0x14151617	# vperm byte-select pattern (vs24)
.long	0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f	# vperm byte-select pattern (vs25)
SYM_DATA_END(RMASK)