// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
 *
 * Copyright 2023 WANG Xuerui <git@xen0n.name>
 *
 * Based on the generic RAID-6 code (int.uc):
 *
 * Copyright 2002-2004 H. Peter Anvin
 */

#include <linux/raid/pq.h>
#include "loongarch.h"

/*
 * The vector algorithms are currently priority 0, which means the generic
 * scalar algorithms are not disabled when vector support is present.
 * This mirrors the similar LoongArch RAID5 XOR code, and the main reason is
 * repeated here: it cannot be ruled out at this point that some future
 * (maybe reduced) models could run the vector algorithms slower than the
 * scalar ones, whether for errata or micro-op reasons. It may be appropriate
 * to revisit this after one or two more uarch generations.
 */
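/*
 * In terms of the selection logic in lib/raid6/algos.c, keeping priority 0
 * (the same as the generic int*.uc routines) means the LSX/LASX
 * implementations do not outrank the scalar ones and still have to win the
 * boot-time benchmark to be chosen.
 */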

#ifdef CONFIG_CPU_HAS_LSX
#define NSIZE 16

static int raid6_has_lsx(void)
{
	return cpu_has_lsx;
}

static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	/*
	 * $vr0, $vr1, $vr2, $vr3: wp
	 * $vr4, $vr5, $vr6, $vr7: wq
	 * $vr8, $vr9, $vr10, $vr11: wd
	 * $vr12, $vr13, $vr14, $vr15: w2
	 * $vr16, $vr17, $vr18, $vr19: w1
	 */
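	/*
	 * Per NSIZE-byte lane, each pass of the inner loop below computes
	 * the usual RAID-6 recurrence from int.uc:
	 *
	 *	wp ^= wd;
	 *	wq  = 2 * wq ^ wd;	(in GF(2^8), polynomial 0x11d)
	 *
	 * with the multiplication by 2 open-coded byte-wise, roughly:
	 *
	 *	w2 = (wq & 0x80) ? 0x1d : 0x00;		vslti.b + vandi.b
	 *	w1 = (wq << 1) & 0xff;			vslli.b
	 *	wq = w1 ^ w2 ^ wd;			vxor.v
	 */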
	for (d = 0; d < bytes; d += NSIZE*4) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
		asm volatile("vori.b $vr4, $vr0, 0");
		asm volatile("vori.b $vr5, $vr1, 0");
		asm volatile("vori.b $vr6, $vr2, 0");
		asm volatile("vori.b $vr7, $vr3, 0");
		for (z = z0-1; z >= 0; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("vxor.v $vr0, $vr0, $vr8");
			asm volatile("vxor.v $vr1, $vr1, $vr9");
			asm volatile("vxor.v $vr2, $vr2, $vr10");
			asm volatile("vxor.v $vr3, $vr3, $vr11");
			/* w2$$ = MASK(wq$$); */
			asm volatile("vslti.b $vr12, $vr4, 0");
			asm volatile("vslti.b $vr13, $vr5, 0");
			asm volatile("vslti.b $vr14, $vr6, 0");
			asm volatile("vslti.b $vr15, $vr7, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("vslli.b $vr16, $vr4, 1");
			asm volatile("vslli.b $vr17, $vr5, 1");
			asm volatile("vslli.b $vr18, $vr6, 1");
			asm volatile("vslli.b $vr19, $vr7, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("vandi.b $vr12, $vr12, 0x1d");
			asm volatile("vandi.b $vr13, $vr13, 0x1d");
			asm volatile("vandi.b $vr14, $vr14, 0x1d");
			asm volatile("vandi.b $vr15, $vr15, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("vxor.v $vr16, $vr16, $vr12");
			asm volatile("vxor.v $vr17, $vr17, $vr13");
			asm volatile("vxor.v $vr18, $vr18, $vr14");
			asm volatile("vxor.v $vr19, $vr19, $vr15");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("vxor.v $vr4, $vr16, $vr8");
			asm volatile("vxor.v $vr5, $vr17, $vr9");
			asm volatile("vxor.v $vr6, $vr18, $vr10");
			asm volatile("vxor.v $vr7, $vr19, $vr11");
		}
		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
		asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
		asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
		asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
		asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
		asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
		asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
		asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
		asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
	}

	kernel_fpu_end();
}

static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
				   size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */
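	/*
	 * Unlike the gen_syndrome() routine above, this only folds the
	 * contribution of data disks start..stop (inclusive) into the
	 * already existing P/Q pages: wp/wq are accumulated over that range
	 * and finally XORed into p[] and q[], mirroring the xor_syndrome
	 * logic in int.uc.
	 */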

	kernel_fpu_begin();

	/*
	 * $vr0, $vr1, $vr2, $vr3: wp
	 * $vr4, $vr5, $vr6, $vr7: wq
	 * $vr8, $vr9, $vr10, $vr11: wd
	 * $vr12, $vr13, $vr14, $vr15: w2
	 * $vr16, $vr17, $vr18, $vr19: w1
	 */
	for (d = 0; d < bytes; d += NSIZE*4) {
		/* P/Q data pages */
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
		asm volatile("vori.b $vr4, $vr0, 0");
		asm volatile("vori.b $vr5, $vr1, 0");
		asm volatile("vori.b $vr6, $vr2, 0");
		asm volatile("vori.b $vr7, $vr3, 0");
		for (z = z0-1; z >= start; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("vxor.v $vr0, $vr0, $vr8");
			asm volatile("vxor.v $vr1, $vr1, $vr9");
			asm volatile("vxor.v $vr2, $vr2, $vr10");
			asm volatile("vxor.v $vr3, $vr3, $vr11");
			/* w2$$ = MASK(wq$$); */
			asm volatile("vslti.b $vr12, $vr4, 0");
			asm volatile("vslti.b $vr13, $vr5, 0");
			asm volatile("vslti.b $vr14, $vr6, 0");
			asm volatile("vslti.b $vr15, $vr7, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("vslli.b $vr16, $vr4, 1");
			asm volatile("vslli.b $vr17, $vr5, 1");
			asm volatile("vslli.b $vr18, $vr6, 1");
			asm volatile("vslli.b $vr19, $vr7, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("vandi.b $vr12, $vr12, 0x1d");
			asm volatile("vandi.b $vr13, $vr13, 0x1d");
			asm volatile("vandi.b $vr14, $vr14, 0x1d");
			asm volatile("vandi.b $vr15, $vr15, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("vxor.v $vr16, $vr16, $vr12");
			asm volatile("vxor.v $vr17, $vr17, $vr13");
			asm volatile("vxor.v $vr18, $vr18, $vr14");
			asm volatile("vxor.v $vr19, $vr19, $vr15");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("vxor.v $vr4, $vr16, $vr8");
			asm volatile("vxor.v $vr5, $vr17, $vr9");
			asm volatile("vxor.v $vr6, $vr18, $vr10");
			asm volatile("vxor.v $vr7, $vr19, $vr11");
		}

		/* P/Q left side optimization */
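		/*
		 * Data disks below `start` contribute nothing here, so wq
		 * only needs the per-disk multiplication by 2 in GF(2^8) to
		 * keep the syndrome weights aligned; wp is left untouched.
		 */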
		for (z = start-1; z >= 0; z--) {
			/* w2$$ = MASK(wq$$); */
			asm volatile("vslti.b $vr12, $vr4, 0");
			asm volatile("vslti.b $vr13, $vr5, 0");
			asm volatile("vslti.b $vr14, $vr6, 0");
			asm volatile("vslti.b $vr15, $vr7, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("vslli.b $vr16, $vr4, 1");
			asm volatile("vslli.b $vr17, $vr5, 1");
			asm volatile("vslli.b $vr18, $vr6, 1");
			asm volatile("vslli.b $vr19, $vr7, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("vandi.b $vr12, $vr12, 0x1d");
			asm volatile("vandi.b $vr13, $vr13, 0x1d");
			asm volatile("vandi.b $vr14, $vr14, 0x1d");
			asm volatile("vandi.b $vr15, $vr15, 0x1d");
			/* wq$$ = w1$$ ^ w2$$; */
			asm volatile("vxor.v $vr4, $vr16, $vr12");
			asm volatile("vxor.v $vr5, $vr17, $vr13");
			asm volatile("vxor.v $vr6, $vr18, $vr14");
			asm volatile("vxor.v $vr7, $vr19, $vr15");
		}
		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 */
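		/*
		 * One asm statement with "+m" (read-write) memory operands
		 * is used for this read-xor-write step, so the compiler
		 * knows the P/Q blocks are both consumed and updated in
		 * place rather than merely overwritten.
		 */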
		asm volatile(
			"vld $vr20, %0\n\t"
			"vld $vr21, %1\n\t"
			"vld $vr22, %2\n\t"
			"vld $vr23, %3\n\t"
			"vld $vr24, %4\n\t"
			"vld $vr25, %5\n\t"
			"vld $vr26, %6\n\t"
			"vld $vr27, %7\n\t"
			"vxor.v $vr20, $vr20, $vr0\n\t"
			"vxor.v $vr21, $vr21, $vr1\n\t"
			"vxor.v $vr22, $vr22, $vr2\n\t"
			"vxor.v $vr23, $vr23, $vr3\n\t"
			"vxor.v $vr24, $vr24, $vr4\n\t"
			"vxor.v $vr25, $vr25, $vr5\n\t"
			"vxor.v $vr26, $vr26, $vr6\n\t"
			"vxor.v $vr27, $vr27, $vr7\n\t"
			"vst $vr20, %0\n\t"
			"vst $vr21, %1\n\t"
			"vst $vr22, %2\n\t"
			"vst $vr23, %3\n\t"
			"vst $vr24, %4\n\t"
			"vst $vr25, %5\n\t"
			"vst $vr26, %6\n\t"
			"vst $vr27, %7\n\t"
			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
			  "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
			  "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
		);
	}

	kernel_fpu_end();
}

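/*
 * Positional initializers below follow struct raid6_calls in
 * <linux/raid/pq.h>: gen_syndrome, xor_syndrome, valid, name. For the
 * boot-time selection to consider these descriptors, they are expected to
 * be listed in the raid6_algos[] table in lib/raid6/algos.c.
 */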
const struct raid6_calls raid6_lsx = {
	raid6_lsx_gen_syndrome,
	raid6_lsx_xor_syndrome,
	raid6_has_lsx,
	"lsx",
	.priority = 0 /* see the comment near the top of the file for reason */
};

#undef NSIZE
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
#define NSIZE 32

static int raid6_has_lasx(void)
{
	return cpu_has_lasx;
}

static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	/*
	 * $xr0, $xr1: wp
	 * $xr2, $xr3: wq
	 * $xr4, $xr5: wd
	 * $xr6, $xr7: w2
	 * $xr8, $xr9: w1
	 */
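	/*
	 * Same algorithm as the LSX version above, just with 256-bit LASX
	 * registers: two 32-byte vectors per pass, i.e. the same 64 bytes
	 * per loop iteration as the 4 x 16-byte LSX variant.
	 */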
	for (d = 0; d < bytes; d += NSIZE*2) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("xvori.b $xr2, $xr0, 0");
		asm volatile("xvori.b $xr3, $xr1, 0");
		for (z = z0-1; z >= 0; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("xvxor.v $xr0, $xr0, $xr4");
			asm volatile("xvxor.v $xr1, $xr1, $xr5");
			/* w2$$ = MASK(wq$$); */
			asm volatile("xvslti.b $xr6, $xr2, 0");
			asm volatile("xvslti.b $xr7, $xr3, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("xvslli.b $xr8, $xr2, 1");
			asm volatile("xvslli.b $xr9, $xr3, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("xvxor.v $xr8, $xr8, $xr6");
			asm volatile("xvxor.v $xr9, $xr9, $xr7");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("xvxor.v $xr2, $xr8, $xr4");
			asm volatile("xvxor.v $xr3, $xr9, $xr5");
		}
		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
		asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
		asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
		asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
		asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
	}

	kernel_fpu_end();
}

static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
				    size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	/*
	 * $xr0, $xr1: wp
	 * $xr2, $xr3: wq
	 * $xr4, $xr5: wd
	 * $xr6, $xr7: w2
	 * $xr8, $xr9: w1
	 */
	for (d = 0; d < bytes; d += NSIZE*2) {
		/* P/Q data pages */
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("xvori.b $xr2, $xr0, 0");
		asm volatile("xvori.b $xr3, $xr1, 0");
		for (z = z0-1; z >= start; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("xvxor.v $xr0, $xr0, $xr4");
			asm volatile("xvxor.v $xr1, $xr1, $xr5");
			/* w2$$ = MASK(wq$$); */
			asm volatile("xvslti.b $xr6, $xr2, 0");
			asm volatile("xvslti.b $xr7, $xr3, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("xvslli.b $xr8, $xr2, 1");
			asm volatile("xvslli.b $xr9, $xr3, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("xvxor.v $xr8, $xr8, $xr6");
			asm volatile("xvxor.v $xr9, $xr9, $xr7");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("xvxor.v $xr2, $xr8, $xr4");
			asm volatile("xvxor.v $xr3, $xr9, $xr5");
		}

		/* P/Q left side optimization */
		for (z = start-1; z >= 0; z--) {
			/* w2$$ = MASK(wq$$); */
			asm volatile("xvslti.b $xr6, $xr2, 0");
			asm volatile("xvslti.b $xr7, $xr3, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("xvslli.b $xr8, $xr2, 1");
			asm volatile("xvslli.b $xr9, $xr3, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
			/* wq$$ = w1$$ ^ w2$$; */
			asm volatile("xvxor.v $xr2, $xr8, $xr6");
			asm volatile("xvxor.v $xr3, $xr9, $xr7");
		}
		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 */
		asm volatile(
			"xvld $xr10, %0\n\t"
			"xvld $xr11, %1\n\t"
			"xvld $xr12, %2\n\t"
			"xvld $xr13, %3\n\t"
			"xvxor.v $xr10, $xr10, $xr0\n\t"
			"xvxor.v $xr11, $xr11, $xr1\n\t"
			"xvxor.v $xr12, $xr12, $xr2\n\t"
			"xvxor.v $xr13, $xr13, $xr3\n\t"
			"xvst $xr10, %0\n\t"
			"xvst $xr11, %1\n\t"
			"xvst $xr12, %2\n\t"
			"xvst $xr13, %3\n\t"
			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
		);
	}

	kernel_fpu_end();
}

const struct raid6_calls raid6_lasx = {
	raid6_lasx_gen_syndrome,
	raid6_lasx_xor_syndrome,
	raid6_has_lasx,
	"lasx",
	.priority = 0 /* see the comment near the top of the file for reason */
};
#undef NSIZE
#endif /* CONFIG_CPU_HAS_LASX */