xref: /openbmc/linux/lib/raid6/avx2.c (revision d4576c56)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* -*- linux-c -*- ------------------------------------------------------- *
3  *
4  *   Copyright (C) 2012 Intel Corporation
5  *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
6  *
7  *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
8  *
9  * ----------------------------------------------------------------------- */
10 
11 /*
12  * AVX2 implementation of RAID-6 syndrome functions
13  *
14  */
15 
16 #include <linux/raid/pq.h>
17 #include "x86.h"
18 
19 static const struct raid6_avx2_constants {
20 	u64 x1d[4];
21 } raid6_avx2_constants __aligned(32) = {
22 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
23 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
24 };
25 
26 static int raid6_have_avx2(void)
27 {
28 	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
29 }
30 
31 /*
32  * Plain AVX2 implementation
33  */
34 static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
35 {
36 	u8 **dptr = (u8 **)ptrs;
37 	u8 *p, *q;
38 	int d, z, z0;
39 
40 	z0 = disks - 3;		/* Highest data disk */
41 	p = dptr[z0+1];		/* XOR parity */
42 	q = dptr[z0+2];		/* RS syndrome */
43 
44 	kernel_fpu_begin();
45 
46 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
47 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */
48 
49 	for (d = 0; d < bytes; d += 32) {
50 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
51 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
52 		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
53 		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
54 		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
55 		for (z = z0-2; z >= 0; z--) {
56 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
57 			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
58 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
59 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
60 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
61 			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
62 			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
63 			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
64 		}
65 		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
66 		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
67 		asm volatile("vpand %ymm0,%ymm5,%ymm5");
68 		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
69 		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
70 		asm volatile("vpxor %ymm6,%ymm4,%ymm4");
71 
72 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
73 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
74 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
75 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
76 	}
77 
78 	asm volatile("sfence" : : : "memory");
79 	kernel_fpu_end();
80 }
81 
82 static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
83 				     size_t bytes, void **ptrs)
84 {
85 	u8 **dptr = (u8 **)ptrs;
86 	u8 *p, *q;
87 	int d, z, z0;
88 
89 	z0 = stop;		/* P/Q right side optimization */
90 	p = dptr[disks-2];	/* XOR parity */
91 	q = dptr[disks-1];	/* RS syndrome */
92 
93 	kernel_fpu_begin();
94 
95 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
96 
97 	for (d = 0 ; d < bytes ; d += 32) {
98 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
99 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
100 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
101 		/* P/Q data pages */
102 		for (z = z0-1 ; z >= start ; z--) {
103 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
104 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
105 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
106 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
107 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
108 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
109 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
110 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
111 		}
112 		/* P/Q left side optimization */
113 		for (z = start-1 ; z >= 0 ; z--) {
114 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
115 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
116 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
117 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
118 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
119 		}
120 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
121 		/* Don't use movntdq for r/w memory area < cache line */
122 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
123 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
124 	}
125 
126 	asm volatile("sfence" : : : "memory");
127 	kernel_fpu_end();
128 }
129 
130 const struct raid6_calls raid6_avx2x1 = {
131 	raid6_avx21_gen_syndrome,
132 	raid6_avx21_xor_syndrome,
133 	raid6_have_avx2,
134 	"avx2x1",
135 	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
136 };
137 
138 /*
139  * Unrolled-by-2 AVX2 implementation
140  */
141 static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
142 {
143 	u8 **dptr = (u8 **)ptrs;
144 	u8 *p, *q;
145 	int d, z, z0;
146 
147 	z0 = disks - 3;		/* Highest data disk */
148 	p = dptr[z0+1];		/* XOR parity */
149 	q = dptr[z0+2];		/* RS syndrome */
150 
151 	kernel_fpu_begin();
152 
153 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
154 	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
155 
156 	/* We uniformly assume a single prefetch covers at least 32 bytes */
157 	for (d = 0; d < bytes; d += 64) {
158 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
159 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
160 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
161 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
162 		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
163 		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
164 		for (z = z0-1; z >= 0; z--) {
165 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
166 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
167 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
168 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
169 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
170 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
171 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
172 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
173 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
174 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
175 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
176 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
177 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
178 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
179 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
180 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
181 		}
182 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
183 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
184 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
185 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
186 	}
187 
188 	asm volatile("sfence" : : : "memory");
189 	kernel_fpu_end();
190 }
191 
192 static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
193 				     size_t bytes, void **ptrs)
194 {
195 	u8 **dptr = (u8 **)ptrs;
196 	u8 *p, *q;
197 	int d, z, z0;
198 
199 	z0 = stop;		/* P/Q right side optimization */
200 	p = dptr[disks-2];	/* XOR parity */
201 	q = dptr[disks-1];	/* RS syndrome */
202 
203 	kernel_fpu_begin();
204 
205 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
206 
207 	for (d = 0 ; d < bytes ; d += 64) {
208 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
209 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
210 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
211 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
212 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
213 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
214 		/* P/Q data pages */
215 		for (z = z0-1 ; z >= start ; z--) {
216 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
217 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
218 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
219 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
220 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
221 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
222 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
223 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
224 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
225 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
226 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
227 			asm volatile("vmovdqa %0,%%ymm7"
228 				     :: "m" (dptr[z][d+32]));
229 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
230 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
231 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
232 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
233 		}
234 		/* P/Q left side optimization */
235 		for (z = start-1 ; z >= 0 ; z--) {
236 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
237 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
238 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
239 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
240 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
241 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
242 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
243 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
244 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
245 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
246 		}
247 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
248 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
249 		/* Don't use movntdq for r/w memory area < cache line */
250 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
251 		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
252 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
253 		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
254 	}
255 
256 	asm volatile("sfence" : : : "memory");
257 	kernel_fpu_end();
258 }
259 
260 const struct raid6_calls raid6_avx2x2 = {
261 	raid6_avx22_gen_syndrome,
262 	raid6_avx22_xor_syndrome,
263 	raid6_have_avx2,
264 	"avx2x2",
265 	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
266 };
267 
268 #ifdef CONFIG_X86_64
269 
270 /*
271  * Unrolled-by-4 AVX2 implementation
272  */
273 static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
274 {
275 	u8 **dptr = (u8 **)ptrs;
276 	u8 *p, *q;
277 	int d, z, z0;
278 
279 	z0 = disks - 3;		/* Highest data disk */
280 	p = dptr[z0+1];		/* XOR parity */
281 	q = dptr[z0+2];		/* RS syndrome */
282 
283 	kernel_fpu_begin();
284 
285 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
286 	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
287 	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
288 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
289 	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
290 	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
291 	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
292 	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
293 	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
294 	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */
295 
296 	for (d = 0; d < bytes; d += 128) {
297 		for (z = z0; z >= 0; z--) {
298 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
299 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
300 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
301 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
302 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
303 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
304 			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
305 			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
306 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
307 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
308 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
309 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
310 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
311 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
312 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
313 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
314 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
315 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
316 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
317 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
318 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
319 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
320 			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
321 			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
322 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
323 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
324 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
325 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
326 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
327 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
328 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
329 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
330 		}
331 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
332 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
333 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
334 		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
335 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
336 		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
337 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
338 		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
339 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
340 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
341 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
342 		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
343 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
344 		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
345 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
346 		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
347 	}
348 
349 	asm volatile("sfence" : : : "memory");
350 	kernel_fpu_end();
351 }
352 
353 static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
354 				     size_t bytes, void **ptrs)
355 {
356 	u8 **dptr = (u8 **)ptrs;
357 	u8 *p, *q;
358 	int d, z, z0;
359 
360 	z0 = stop;		/* P/Q right side optimization */
361 	p = dptr[disks-2];	/* XOR parity */
362 	q = dptr[disks-1];	/* RS syndrome */
363 
364 	kernel_fpu_begin();
365 
366 	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
367 
368 	for (d = 0 ; d < bytes ; d += 128) {
369 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
370 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
371 		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
372 		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
373 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
374 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
375 		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
376 		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
377 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
378 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
379 		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
380 		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
381 		/* P/Q data pages */
382 		for (z = z0-1 ; z >= start ; z--) {
383 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
384 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
385 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
386 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
387 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
388 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
389 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
390 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
391 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
392 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
393 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
394 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
395 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
396 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
397 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
398 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
399 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
400 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
401 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
402 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
403 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
404 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
405 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
406 			asm volatile("vmovdqa %0,%%ymm7"
407 				     :: "m" (dptr[z][d+32]));
408 			asm volatile("vmovdqa %0,%%ymm13"
409 				     :: "m" (dptr[z][d+64]));
410 			asm volatile("vmovdqa %0,%%ymm15"
411 				     :: "m" (dptr[z][d+96]));
412 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
413 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
414 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
415 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
416 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
417 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
418 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
419 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
420 		}
421 		asm volatile("prefetchnta %0" :: "m" (q[d]));
422 		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
423 		/* P/Q left side optimization */
424 		for (z = start-1 ; z >= 0 ; z--) {
425 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
426 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
427 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
428 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
429 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
430 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
431 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
432 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
433 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
434 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
435 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
436 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
437 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
438 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
439 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
440 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
441 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
442 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
443 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
444 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
445 		}
446 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
447 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
448 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
449 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
450 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
451 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
452 		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
453 		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
454 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
455 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
456 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
457 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
458 	}
459 	asm volatile("sfence" : : : "memory");
460 	kernel_fpu_end();
461 }
462 
463 const struct raid6_calls raid6_avx2x4 = {
464 	raid6_avx24_gen_syndrome,
465 	raid6_avx24_xor_syndrome,
466 	raid6_have_avx2,
467 	"avx2x4",
468 	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
469 };
470 #endif /* CONFIG_X86_64 */
471