// SPDX-License-Identifier: GPL-2.0-only
/*
 * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
 *
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 *
 * Originally based on recov_avx2.c and recov_ssse3.c:
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 */

#include <linux/raid/pq.h>
#include "loongarch.h"

/*
 * Unlike with the syndrome calculation algorithms, there is no boot-time
 * selection of recovery algorithms by benchmarking, so we have to specify
 * the priorities and hope that future cores will all have decent vector
 * support (i.e. LASX is never slower than LSX, let alone the scalar code).
 */

#ifdef CONFIG_CPU_HAS_LSX
static int raid6_has_lsx(void)
{
	return cpu_has_lsx;
}

static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
				  int failb, void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */

	p = (u8 *)ptrs[disks - 2];
	q = (u8 *)ptrs[disks - 1];

	/*
	 * Compute syndrome with zero for the missing data pages
	 * Use the dead data pages as temporary storage for
	 * delta p and delta q
	 */
	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks - 2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks - 1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dp;
	ptrs[failb] = dq;
	ptrs[disks - 2] = p;
	ptrs[disks - 1] = q;

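	/*
	 * With both failed blocks replaced by the zero page, the freshly
	 * generated P/Q differ from the on-disk P/Q by exactly the two
	 * missing data blocks Da and Db:
	 *
	 *   P + Pxy = Da + Db
	 *   Q + Qxy = g^faila * Da + g^failb * Db	(over GF(256))
	 *
	 * Solving that system needs two constant multipliers:
	 *   pbmul: multiply by (g^(failb - faila) + 1)^-1
	 *   qmul:  multiply by (g^faila + g^failb)^-1
	 */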
	/* Now, pick the proper data tables */
	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];

	kernel_fpu_begin();

	/*
	 * vr20, vr21: qmul
	 * vr22, vr23: pbmul
	 */
	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));

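	/*
	 * Process 64 bytes (four 16-byte LSX vectors) per iteration; as in
	 * the x86 recovery code this assumes bytes is a multiple of 64,
	 * which holds for the whole pages the callers pass in.
	 */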
	while (bytes) {
		/* vr4 - vr7: Q */
		asm volatile("vld $vr4, %0" : : "m" (q[0]));
		asm volatile("vld $vr5, %0" : : "m" (q[16]));
		asm volatile("vld $vr6, %0" : : "m" (q[32]));
		asm volatile("vld $vr7, %0" : : "m" (q[48]));
		/* vr4 - vr7: Q + Qxy */
		asm volatile("vld $vr8, %0" : : "m" (dq[0]));
		asm volatile("vld $vr9, %0" : : "m" (dq[16]));
		asm volatile("vld $vr10, %0" : : "m" (dq[32]));
		asm volatile("vld $vr11, %0" : : "m" (dq[48]));
		asm volatile("vxor.v $vr4, $vr4, $vr8");
		asm volatile("vxor.v $vr5, $vr5, $vr9");
		asm volatile("vxor.v $vr6, $vr6, $vr10");
		asm volatile("vxor.v $vr7, $vr7, $vr11");
		/* vr0 - vr3: P */
		asm volatile("vld $vr0, %0" : : "m" (p[0]));
		asm volatile("vld $vr1, %0" : : "m" (p[16]));
		asm volatile("vld $vr2, %0" : : "m" (p[32]));
		asm volatile("vld $vr3, %0" : : "m" (p[48]));
		/* vr0 - vr3: P + Pxy */
		asm volatile("vld $vr8, %0" : : "m" (dp[0]));
		asm volatile("vld $vr9, %0" : : "m" (dp[16]));
		asm volatile("vld $vr10, %0" : : "m" (dp[32]));
		asm volatile("vld $vr11, %0" : : "m" (dp[48]));
		asm volatile("vxor.v $vr0, $vr0, $vr8");
		asm volatile("vxor.v $vr1, $vr1, $vr9");
		asm volatile("vxor.v $vr2, $vr2, $vr10");
		asm volatile("vxor.v $vr3, $vr3, $vr11");

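		/*
		 * Multiply (Q + Qxy) by the qmul constant: split each byte
		 * into its high and low nibbles and use vshuf.b as a
		 * 16-entry byte table lookup (the LSX counterpart of x86
		 * pshufb), then XOR the two partial products together.
		 */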
		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
		asm volatile("vsrli.b $vr8, $vr4, 4");
		asm volatile("vsrli.b $vr9, $vr5, 4");
		asm volatile("vsrli.b $vr10, $vr6, 4");
		asm volatile("vsrli.b $vr11, $vr7, 4");
		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
		asm volatile("vandi.b $vr4, $vr4, 0x0f");
		asm volatile("vandi.b $vr5, $vr5, 0x0f");
		asm volatile("vandi.b $vr6, $vr6, 0x0f");
		asm volatile("vandi.b $vr7, $vr7, 0x0f");
		/* lookup from qmul[0] */
		asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
		asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
		asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
		asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
		/* lookup from qmul[16] */
		asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
		asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
		asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
		asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
		/* vr16 - vr19: B(Q + Qxy) */
		asm volatile("vxor.v $vr16, $vr8, $vr4");
		asm volatile("vxor.v $vr17, $vr9, $vr5");
		asm volatile("vxor.v $vr18, $vr10, $vr6");
		asm volatile("vxor.v $vr19, $vr11, $vr7");

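		/* Multiply (P + Pxy) by the pbmul constant the same way. */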
		/* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
		asm volatile("vsrli.b $vr4, $vr0, 4");
		asm volatile("vsrli.b $vr5, $vr1, 4");
		asm volatile("vsrli.b $vr6, $vr2, 4");
		asm volatile("vsrli.b $vr7, $vr3, 4");
		/* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
		asm volatile("vandi.b $vr12, $vr0, 0x0f");
		asm volatile("vandi.b $vr13, $vr1, 0x0f");
		asm volatile("vandi.b $vr14, $vr2, 0x0f");
		asm volatile("vandi.b $vr15, $vr3, 0x0f");
		/* lookup from pbmul[0] */
		asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
		asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
		asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
		asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
		/* lookup from pbmul[16] */
		asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
		asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
		asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
		asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
		/* vr4 - vr7: A(P + Pxy) */
		asm volatile("vxor.v $vr4, $vr4, $vr12");
		asm volatile("vxor.v $vr5, $vr5, $vr13");
		asm volatile("vxor.v $vr6, $vr6, $vr14");
		asm volatile("vxor.v $vr7, $vr7, $vr15");

		/* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */
		asm volatile("vxor.v $vr4, $vr4, $vr16");
		asm volatile("vxor.v $vr5, $vr5, $vr17");
		asm volatile("vxor.v $vr6, $vr6, $vr18");
		asm volatile("vxor.v $vr7, $vr7, $vr19");
		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
		asm volatile("vst $vr7, %0" : "=m" (dq[48]));

		/* vr0 - vr3: P + Pxy + Dx = Dy */
		asm volatile("vxor.v $vr0, $vr0, $vr4");
		asm volatile("vxor.v $vr1, $vr1, $vr5");
		asm volatile("vxor.v $vr2, $vr2, $vr6");
		asm volatile("vxor.v $vr3, $vr3, $vr7");
		asm volatile("vst $vr0, %0" : "=m" (dp[0]));
		asm volatile("vst $vr1, %0" : "=m" (dp[16]));
		asm volatile("vst $vr2, %0" : "=m" (dp[32]));
		asm volatile("vst $vr3, %0" : "=m" (dp[48]));

		bytes -= 64;
		p += 64;
		q += 64;
		dp += 64;
		dq += 64;
	}

	kernel_fpu_end();
}

static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
				  void **ptrs)
{
	u8 *p, *q, *dq;
	const u8 *qmul;		/* Q multiplier table */

	p = (u8 *)ptrs[disks - 2];
	q = (u8 *)ptrs[disks - 1];

	/*
	 * Compute syndrome with zero for the missing data page
	 * Use the dead data page as temporary storage for delta q
	 */
	dq = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks - 1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dq;
	ptrs[disks - 1] = q;

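	/*
	 * Regenerating the syndrome with the faila block zeroed leaves
	 * q ^ dq == g^faila * Dx, so multiplying by qmul == (g^faila)^-1
	 * recovers Dx; p meanwhile holds P + Dx and is repaired by XORing
	 * Dx back in at the end of the loop.
	 */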
	/* Now, pick the proper data tables */
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];

	kernel_fpu_begin();

	/* vr22, vr23: qmul */
	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));

	while (bytes) {
		/* vr0 - vr3: P + Dx */
		asm volatile("vld $vr0, %0" : : "m" (p[0]));
		asm volatile("vld $vr1, %0" : : "m" (p[16]));
		asm volatile("vld $vr2, %0" : : "m" (p[32]));
		asm volatile("vld $vr3, %0" : : "m" (p[48]));
		/* vr4 - vr7: Qx */
		asm volatile("vld $vr4, %0" : : "m" (dq[0]));
		asm volatile("vld $vr5, %0" : : "m" (dq[16]));
		asm volatile("vld $vr6, %0" : : "m" (dq[32]));
		asm volatile("vld $vr7, %0" : : "m" (dq[48]));
		/* vr4 - vr7: Q + Qx */
		asm volatile("vld $vr8, %0" : : "m" (q[0]));
		asm volatile("vld $vr9, %0" : : "m" (q[16]));
		asm volatile("vld $vr10, %0" : : "m" (q[32]));
		asm volatile("vld $vr11, %0" : : "m" (q[48]));
		asm volatile("vxor.v $vr4, $vr4, $vr8");
		asm volatile("vxor.v $vr5, $vr5, $vr9");
		asm volatile("vxor.v $vr6, $vr6, $vr10");
		asm volatile("vxor.v $vr7, $vr7, $vr11");

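		/* Nibble-wise table-lookup multiply by qmul, as in the two-data case above. */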
		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
		asm volatile("vsrli.b $vr8, $vr4, 4");
		asm volatile("vsrli.b $vr9, $vr5, 4");
		asm volatile("vsrli.b $vr10, $vr6, 4");
		asm volatile("vsrli.b $vr11, $vr7, 4");
		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
		asm volatile("vandi.b $vr4, $vr4, 0x0f");
		asm volatile("vandi.b $vr5, $vr5, 0x0f");
		asm volatile("vandi.b $vr6, $vr6, 0x0f");
		asm volatile("vandi.b $vr7, $vr7, 0x0f");
		/* lookup from qmul[0] */
		asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
		asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
		asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
		asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
		/* lookup from qmul[16] */
		asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
		asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
		asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
		asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
		/* vr4 - vr7: qmul(Q + Qx) = Dx */
		asm volatile("vxor.v $vr4, $vr4, $vr8");
		asm volatile("vxor.v $vr5, $vr5, $vr9");
		asm volatile("vxor.v $vr6, $vr6, $vr10");
		asm volatile("vxor.v $vr7, $vr7, $vr11");
		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
		asm volatile("vst $vr7, %0" : "=m" (dq[48]));

		/* vr0 - vr3: P + Dx + Dx = P */
		asm volatile("vxor.v $vr0, $vr0, $vr4");
		asm volatile("vxor.v $vr1, $vr1, $vr5");
		asm volatile("vxor.v $vr2, $vr2, $vr6");
		asm volatile("vxor.v $vr3, $vr3, $vr7");
		asm volatile("vst $vr0, %0" : "=m" (p[0]));
		asm volatile("vst $vr1, %0" : "=m" (p[16]));
		asm volatile("vst $vr2, %0" : "=m" (p[32]));
		asm volatile("vst $vr3, %0" : "=m" (p[48]));

		bytes -= 64;
		p += 64;
		q += 64;
		dq += 64;
	}

	kernel_fpu_end();
}

const struct raid6_recov_calls raid6_recov_lsx = {
	.data2 = raid6_2data_recov_lsx,
	.datap = raid6_datap_recov_lsx,
	.valid = raid6_has_lsx,
	.name = "lsx",
	.priority = 1,
};
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
static int raid6_has_lasx(void)
{
	return cpu_has_lasx;
}

static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
				   int failb, void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */

	p = (u8 *)ptrs[disks - 2];
	q = (u8 *)ptrs[disks - 1];

	/*
	 * Compute syndrome with zero for the missing data pages
	 * Use the dead data pages as temporary storage for
	 * delta p and delta q
	 */
	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks - 2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks - 1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dp;
	ptrs[failb] = dq;
	ptrs[disks - 2] = p;
	ptrs[disks - 1] = q;

	/* Now, pick the proper data tables */
	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];

	kernel_fpu_begin();

	/*
	 * xr20, xr21: qmul
	 * xr22, xr23: pbmul
	 */
	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
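	/*
	 * xvshuf.b does its lookups within each 128-bit lane independently
	 * (like vpshufb on AVX2), so replicate the 16-byte tables from the
	 * low lane into the high lane as well.
	 */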
	asm volatile("xvreplve0.q $xr20, $xr20");
	asm volatile("xvreplve0.q $xr21, $xr21");
	asm volatile("xvreplve0.q $xr22, $xr22");
	asm volatile("xvreplve0.q $xr23, $xr23");

	while (bytes) {
		/* xr0, xr1: Q */
		asm volatile("xvld $xr0, %0" : : "m" (q[0]));
		asm volatile("xvld $xr1, %0" : : "m" (q[32]));
		/* xr0, xr1: Q + Qxy */
		asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
		asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
		asm volatile("xvxor.v $xr0, $xr0, $xr4");
		asm volatile("xvxor.v $xr1, $xr1, $xr5");
		/* xr2, xr3: P */
		asm volatile("xvld $xr2, %0" : : "m" (p[0]));
		asm volatile("xvld $xr3, %0" : : "m" (p[32]));
		/* xr2, xr3: P + Pxy */
		asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
		asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
		asm volatile("xvxor.v $xr2, $xr2, $xr4");
		asm volatile("xvxor.v $xr3, $xr3, $xr5");

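		/*
		 * Same nibble-split table-lookup multiplies as in the LSX
		 * variant, but on two 256-bit vectors per step.
		 */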
		/* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
		asm volatile("xvsrli.b $xr4, $xr0, 4");
		asm volatile("xvsrli.b $xr5, $xr1, 4");
		/* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
		asm volatile("xvandi.b $xr0, $xr0, 0x0f");
		asm volatile("xvandi.b $xr1, $xr1, 0x0f");
		/* lookup from qmul[0] */
		asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
		asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
		/* lookup from qmul[16] */
		asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
		asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
		/* xr6, xr7: B(Q + Qxy) */
		asm volatile("xvxor.v $xr6, $xr4, $xr0");
		asm volatile("xvxor.v $xr7, $xr5, $xr1");

		/* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
		asm volatile("xvsrli.b $xr4, $xr2, 4");
		asm volatile("xvsrli.b $xr5, $xr3, 4");
		/* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
		asm volatile("xvandi.b $xr0, $xr2, 0x0f");
		asm volatile("xvandi.b $xr1, $xr3, 0x0f");
		/* lookup from pbmul[0] */
		asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
		asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
		/* lookup from pbmul[16] */
		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
		/* xr0, xr1: A(P + Pxy) */
		asm volatile("xvxor.v $xr0, $xr0, $xr4");
		asm volatile("xvxor.v $xr1, $xr1, $xr5");

		/* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
		asm volatile("xvxor.v $xr0, $xr0, $xr6");
		asm volatile("xvxor.v $xr1, $xr1, $xr7");

		/* xr2, xr3: P + Pxy + Dx = Dy */
		asm volatile("xvxor.v $xr2, $xr2, $xr0");
		asm volatile("xvxor.v $xr3, $xr3, $xr1");

		asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
		asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
		asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
		asm volatile("xvst $xr3, %0" : "=m" (dp[32]));

		bytes -= 64;
		p += 64;
		q += 64;
		dp += 64;
		dq += 64;
	}

	kernel_fpu_end();
}

static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
				   void **ptrs)
{
	u8 *p, *q, *dq;
	const u8 *qmul;		/* Q multiplier table */

	p = (u8 *)ptrs[disks - 2];
	q = (u8 *)ptrs[disks - 1];

	/*
	 * Compute syndrome with zero for the missing data page
	 * Use the dead data page as temporary storage for delta q
	 */
	dq = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks - 1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dq;
	ptrs[disks - 1] = q;

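	/* As in the LSX variant: qmul == (g^faila)^-1, so qmul(q ^ dq) == Dx. */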
	/* Now, pick the proper data tables */
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];

	kernel_fpu_begin();

	/* xr22, xr23: qmul */
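	/* Broadcast each 16-byte table to both 128-bit lanes for xvshuf.b. */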
	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
	asm volatile("xvreplve0.q $xr22, $xr22");
	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
	asm volatile("xvreplve0.q $xr23, $xr23");

	while (bytes) {
		/* xr0, xr1: P + Dx */
		asm volatile("xvld $xr0, %0" : : "m" (p[0]));
		asm volatile("xvld $xr1, %0" : : "m" (p[32]));
		/* xr2, xr3: Qx */
		asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
		asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
		/* xr2, xr3: Q + Qx */
		asm volatile("xvld $xr4, %0" : : "m" (q[0]));
		asm volatile("xvld $xr5, %0" : : "m" (q[32]));
		asm volatile("xvxor.v $xr2, $xr2, $xr4");
		asm volatile("xvxor.v $xr3, $xr3, $xr5");

		/* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
		asm volatile("xvsrli.b $xr4, $xr2, 4");
		asm volatile("xvsrli.b $xr5, $xr3, 4");
		/* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
		asm volatile("xvandi.b $xr2, $xr2, 0x0f");
		asm volatile("xvandi.b $xr3, $xr3, 0x0f");
		/* lookup from qmul[0] */
		asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
		asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
		/* lookup from qmul[16] */
		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
		/* xr2, xr3: qmul(Q + Qx) = Dx */
		asm volatile("xvxor.v $xr2, $xr2, $xr4");
		asm volatile("xvxor.v $xr3, $xr3, $xr5");

		/* xr0, xr1: P + Dx + Dx = P */
		asm volatile("xvxor.v $xr0, $xr0, $xr2");
		asm volatile("xvxor.v $xr1, $xr1, $xr3");

		asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
		asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
		asm volatile("xvst $xr0, %0" : "=m" (p[0]));
		asm volatile("xvst $xr1, %0" : "=m" (p[32]));

		bytes -= 64;
		p += 64;
		q += 64;
		dq += 64;
	}

	kernel_fpu_end();
}

const struct raid6_recov_calls raid6_recov_lasx = {
	.data2 = raid6_2data_recov_lasx,
	.datap = raid6_datap_recov_lasx,
	.valid = raid6_has_lasx,
	.name = "lasx",
	.priority = 2,
};
#endif /* CONFIG_CPU_HAS_LASX */