/* xref: /openbmc/linux/lib/raid6/avx2.c (revision 160b8e75) */
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */
21 
22 #ifdef CONFIG_AS_AVX2
23 
24 #include <linux/raid/pq.h>
25 #include "x86.h"
26 
27 static const struct raid6_avx2_constants {
28 	u64 x1d[4];
29 } raid6_avx2_constants __aligned(32) = {
30 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
31 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
32 };
33 
34 static int raid6_have_avx2(void)
35 {
36 	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
37 }
38 
39 /*
40  * Plain AVX2 implementation
41  */
42 static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
43 {
44 	u8 **dptr = (u8 **)ptrs;
45 	u8 *p, *q;
46 	int d, z, z0;
47 
48 	z0 = disks - 3;		/* Highest data disk */
49 	p = dptr[z0+1];		/* XOR parity */
50 	q = dptr[z0+2];		/* RS syndrome */
51 
52 	kernel_fpu_begin();
53 
54 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
55 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */
56 
57 	for (d = 0; d < bytes; d += 32) {
58 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
59 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
60 		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
61 		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
62 		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
63 		for (z = z0-2; z >= 0; z--) {
64 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
65 			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
66 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
67 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
68 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
69 			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
70 			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
71 			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
72 		}
73 		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
74 		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
75 		asm volatile("vpand %ymm0,%ymm5,%ymm5");
76 		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
77 		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
78 		asm volatile("vpxor %ymm6,%ymm4,%ymm4");
79 
80 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
81 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
82 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
83 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
84 	}
85 
86 	asm volatile("sfence" : : : "memory");
87 	kernel_fpu_end();
88 }
89 
90 static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
91 				     size_t bytes, void **ptrs)
92 {
93 	u8 **dptr = (u8 **)ptrs;
94 	u8 *p, *q;
95 	int d, z, z0;
96 
97 	z0 = stop;		/* P/Q right side optimization */
98 	p = dptr[disks-2];	/* XOR parity */
99 	q = dptr[disks-1];	/* RS syndrome */
100 
101 	kernel_fpu_begin();
102 
103 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
104 
105 	for (d = 0 ; d < bytes ; d += 32) {
106 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
107 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
108 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
109 		/* P/Q data pages */
110 		for (z = z0-1 ; z >= start ; z--) {
111 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
112 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
113 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
114 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
115 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
116 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
117 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
118 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
119 		}
120 		/* P/Q left side optimization */
121 		for (z = start-1 ; z >= 0 ; z--) {
122 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
123 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
124 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
125 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
126 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
127 		}
128 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
129 		/* Don't use movntdq for r/w memory area < cache line */
130 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
131 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
132 	}
133 
134 	asm volatile("sfence" : : : "memory");
135 	kernel_fpu_end();
136 }
137 
138 const struct raid6_calls raid6_avx2x1 = {
139 	raid6_avx21_gen_syndrome,
140 	raid6_avx21_xor_syndrome,
141 	raid6_have_avx2,
142 	"avx2x1",
143 	1			/* Has cache hints */
144 };
145 
146 /*
147  * Unrolled-by-2 AVX2 implementation
148  */
149 static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
150 {
151 	u8 **dptr = (u8 **)ptrs;
152 	u8 *p, *q;
153 	int d, z, z0;
154 
155 	z0 = disks - 3;		/* Highest data disk */
156 	p = dptr[z0+1];		/* XOR parity */
157 	q = dptr[z0+2];		/* RS syndrome */
158 
159 	kernel_fpu_begin();
160 
161 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
162 	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
163 
164 	/* We uniformly assume a single prefetch covers at least 32 bytes */
165 	for (d = 0; d < bytes; d += 64) {
166 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
167 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
168 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
169 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
170 		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
171 		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
172 		for (z = z0-1; z >= 0; z--) {
173 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
174 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
175 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
176 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
177 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
178 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
179 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
180 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
181 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
182 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
183 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
184 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
185 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
186 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
187 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
188 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
189 		}
190 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
191 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
192 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
193 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
194 	}
195 
196 	asm volatile("sfence" : : : "memory");
197 	kernel_fpu_end();
198 }
199 
200 static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
201 				     size_t bytes, void **ptrs)
202 {
203 	u8 **dptr = (u8 **)ptrs;
204 	u8 *p, *q;
205 	int d, z, z0;
206 
207 	z0 = stop;		/* P/Q right side optimization */
208 	p = dptr[disks-2];	/* XOR parity */
209 	q = dptr[disks-1];	/* RS syndrome */
210 
211 	kernel_fpu_begin();
212 
213 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
214 
215 	for (d = 0 ; d < bytes ; d += 64) {
216 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
217 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
218 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
219 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
220 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
221 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
222 		/* P/Q data pages */
223 		for (z = z0-1 ; z >= start ; z--) {
224 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
225 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
226 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
227 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
228 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
229 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
230 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
231 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
232 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
233 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
234 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
235 			asm volatile("vmovdqa %0,%%ymm7"
236 				     :: "m" (dptr[z][d+32]));
237 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
238 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
239 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
240 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
241 		}
242 		/* P/Q left side optimization */
243 		for (z = start-1 ; z >= 0 ; z--) {
244 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
245 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
246 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
247 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
248 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
249 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
250 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
251 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
252 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
253 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
254 		}
255 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
256 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
257 		/* Don't use movntdq for r/w memory area < cache line */
258 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
259 		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
260 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
261 		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
262 	}
263 
264 	asm volatile("sfence" : : : "memory");
265 	kernel_fpu_end();
266 }
267 
268 const struct raid6_calls raid6_avx2x2 = {
269 	raid6_avx22_gen_syndrome,
270 	raid6_avx22_xor_syndrome,
271 	raid6_have_avx2,
272 	"avx2x2",
273 	1			/* Has cache hints */
274 };
275 
276 #ifdef CONFIG_X86_64
277 
278 /*
279  * Unrolled-by-4 AVX2 implementation
280  */
281 static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
282 {
283 	u8 **dptr = (u8 **)ptrs;
284 	u8 *p, *q;
285 	int d, z, z0;
286 
287 	z0 = disks - 3;		/* Highest data disk */
288 	p = dptr[z0+1];		/* XOR parity */
289 	q = dptr[z0+2];		/* RS syndrome */
290 
291 	kernel_fpu_begin();
292 
293 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
294 	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
295 	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
296 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
297 	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
298 	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
299 	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
300 	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
301 	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
302 	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */
303 
304 	for (d = 0; d < bytes; d += 128) {
305 		for (z = z0; z >= 0; z--) {
306 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
307 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
308 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
309 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
310 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
311 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
312 			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
313 			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
314 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
315 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
316 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
317 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
318 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
319 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
320 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
321 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
322 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
323 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
324 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
325 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
326 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
327 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
328 			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
329 			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
330 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
331 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
332 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
333 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
334 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
335 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
336 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
337 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
338 		}
339 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
340 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
341 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
342 		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
343 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
344 		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
345 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
346 		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
347 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
348 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
349 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
350 		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
351 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
352 		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
353 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
354 		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
355 	}
356 
357 	asm volatile("sfence" : : : "memory");
358 	kernel_fpu_end();
359 }
360 
361 static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
362 				     size_t bytes, void **ptrs)
363 {
364 	u8 **dptr = (u8 **)ptrs;
365 	u8 *p, *q;
366 	int d, z, z0;
367 
368 	z0 = stop;		/* P/Q right side optimization */
369 	p = dptr[disks-2];	/* XOR parity */
370 	q = dptr[disks-1];	/* RS syndrome */
371 
372 	kernel_fpu_begin();
373 
374 	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
375 
376 	for (d = 0 ; d < bytes ; d += 128) {
377 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
378 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
379 		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
380 		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
381 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
382 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
383 		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
384 		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
385 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
386 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
387 		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
388 		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
389 		/* P/Q data pages */
390 		for (z = z0-1 ; z >= start ; z--) {
391 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
392 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
393 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
394 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
395 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
396 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
397 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
398 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
399 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
400 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
401 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
402 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
403 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
404 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
405 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
406 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
407 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
408 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
409 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
410 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
411 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
412 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
413 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
414 			asm volatile("vmovdqa %0,%%ymm7"
415 				     :: "m" (dptr[z][d+32]));
416 			asm volatile("vmovdqa %0,%%ymm13"
417 				     :: "m" (dptr[z][d+64]));
418 			asm volatile("vmovdqa %0,%%ymm15"
419 				     :: "m" (dptr[z][d+96]));
420 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
421 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
422 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
423 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
424 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
425 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
426 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
427 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
428 		}
429 		asm volatile("prefetchnta %0" :: "m" (q[d]));
430 		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
431 		/* P/Q left side optimization */
432 		for (z = start-1 ; z >= 0 ; z--) {
433 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
434 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
435 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
436 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
437 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
438 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
439 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
440 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
441 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
442 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
443 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
444 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
445 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
446 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
447 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
448 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
449 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
450 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
451 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
452 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
453 		}
454 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
455 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
456 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
457 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
458 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
459 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
460 		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
461 		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
462 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
463 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
464 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
465 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
466 	}
467 	asm volatile("sfence" : : : "memory");
468 	kernel_fpu_end();
469 }
470 
471 const struct raid6_calls raid6_avx2x4 = {
472 	raid6_avx24_gen_syndrome,
473 	raid6_avx24_xor_syndrome,
474 	raid6_have_avx2,
475 	"avx2x4",
476 	1			/* Has cache hints */
477 };
478 #endif
479 
480 #endif /* CONFIG_AS_AVX2 */
481