xref: /openbmc/linux/lib/raid6/avx512.c (revision 791d3ef2)
1 /* -*- linux-c -*- --------------------------------------------------------
2  *
3  *   Copyright (C) 2016 Intel Corporation
4  *
5  *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
6  *   Author: Megha Dey <megha.dey@linux.intel.com>
7  *
8  *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
9  *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
10  *
11  *   This program is free software; you can redistribute it and/or modify
12  *   it under the terms of the GNU General Public License as published by
13  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
14  *   Boston MA 02111-1307, USA; either version 2 of the License, or
15  *   (at your option) any later version; incorporated herein by reference.
16  *
17  * -----------------------------------------------------------------------
18  */
19 
20 /*
21  * AVX512 implementation of RAID-6 syndrome functions
22  *
23  */
24 
25 #ifdef CONFIG_AS_AVX512
26 
27 #include <linux/raid/pq.h>
28 #include "x86.h"
29 
30 static const struct raid6_avx512_constants {
31 	u64 x1d[8];
32 } raid6_avx512_constants __aligned(512/8) = {
33 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
34 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
35 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
36 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
37 };
38 
39 static int raid6_have_avx512(void)
40 {
41 	return boot_cpu_has(X86_FEATURE_AVX2) &&
42 		boot_cpu_has(X86_FEATURE_AVX) &&
43 		boot_cpu_has(X86_FEATURE_AVX512F) &&
44 		boot_cpu_has(X86_FEATURE_AVX512BW) &&
45 		boot_cpu_has(X86_FEATURE_AVX512VL) &&
46 		boot_cpu_has(X86_FEATURE_AVX512DQ);
47 }
48 
/*
 * Generate the P and Q blocks from all data blocks, 64 bytes (one zmm
 * register) per outer-loop iteration.
 *
 * disks: number of entries in ptrs; entries 0 .. disks-3 are read as
 *        data, entry disks-2 receives P and entry disks-1 receives Q.
 * bytes: length of each block; the loop advances in 64-byte steps.
 * ptrs:  array of block pointers, accessed here as byte pointers.
 *
 * Register use across the asm statements: zmm0 = 0x1d in every byte,
 * zmm1 = zero, zmm2 = P accumulator, zmm4 = Q accumulator, zmm5 =
 * scratch, zmm6 = data of the disk that has been loaded but not yet
 * folded into P and Q.  The asm statements depend on each other through
 * these registers, so their order must not change.
 *
 * NOTE(review): dptr[z0-1] is read unconditionally, so this presumably
 * expects at least two data disks - confirm against the callers.
 * NOTE(review): vmovdqa64/vmovntdq are aligned accesses and the loop
 * never handles a partial chunk, so buffers are presumably 64-byte
 * aligned and bytes a multiple of 64 - confirm.
 * NOTE(review): d is a signed int compared against the size_t bytes.
 */
49 static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
50 {
51 	u8 **dptr = (u8 **)ptrs;
52 	u8 *p, *q;
53 	int d, z, z0;
54 
55 	z0 = disks - 3;         /* Highest data disk */
56 	p = dptr[z0+1];         /* XOR parity */
57 	q = dptr[z0+2];         /* RS syndrome */
58 
59 	kernel_fpu_begin();
60 
	/* Load the 0x1d constant into zmm0 and clear zmm1 once, up front. */
61 	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
62 		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
63 		     :
64 		     : "m" (raid6_avx512_constants.x1d[0]));
65 
66 	for (d = 0; d < bytes; d += 64) {
		/*
		 * Start P and Q from the highest data disk and pre-load the
		 * next lower disk into zmm6.
		 */
67 		asm volatile("prefetchnta %0\n\t"
68 			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
69 			     "prefetchnta %1\n\t"
70 			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
71 			     "vmovdqa64 %1,%%zmm6"
72 			     :
73 			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
74 		for (z = z0-2; z >= 0; z--) {
			/*
			 * Q step: k1/zmm5 mark the bytes of Q whose top bit
			 * is set (0 > byte, signed compare), Q is doubled
			 * bytewise and 0x1d is XORed into the marked bytes.
			 * Then the pending data in zmm6 is XORed into P and
			 * Q and disk z is loaded into zmm6 for the next pass.
			 */
75 			asm volatile("prefetchnta %0\n\t"
76 				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
77 				     "vpmovm2b %%k1,%%zmm5\n\t"
78 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
79 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
80 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
81 				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
82 				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
83 				     "vmovdqa64 %0,%%zmm6"
84 				     :
85 				     : "m" (dptr[z][d]));
86 		}
		/*
		 * Final Q step for the data still pending in zmm6, then
		 * write P and Q with non-temporal stores and clear both
		 * accumulators.
		 */
87 		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
88 			     "vpmovm2b %%k1,%%zmm5\n\t"
89 			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
90 			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
91 			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
92 			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
93 			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
94 			     "vmovntdq %%zmm2,%0\n\t"
95 			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
96 			     "vmovntdq %%zmm4,%1\n\t"
97 			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
98 			     :
99 			     : "m" (p[d]), "m" (q[d]));
100 	}
101 
	/* Store fence after the non-temporal stores; also a compiler barrier. */
102 	asm volatile("sfence" : : : "memory");
103 	kernel_fpu_end();
104 }
105 
/*
 * Fold the data blocks start .. stop into the existing P and Q blocks,
 * 64 bytes (one zmm register) per outer-loop iteration.
 *
 * disks: number of entries in ptrs; entry disks-2 is P, disks-1 is Q.
 * start: lowest data disk whose contents are XORed in.
 * stop:  highest data disk whose contents are XORed in.
 * bytes: length of each block; the loop advances in 64-byte steps.
 * ptrs:  array of block pointers, accessed here as byte pointers.
 *
 * Register use across the asm statements: zmm0 = 0x1d in every byte,
 * zmm2 = P accumulator (seeded from the current P block), zmm4 = Q
 * accumulator, zmm5 = scratch / data.  The asm statements depend on
 * each other through these registers, so their order must not change.
 *
 * NOTE(review): vmovdqa64 is an aligned access and the loop never
 * handles a partial chunk, so buffers are presumably 64-byte aligned
 * and bytes a multiple of 64 - confirm against the callers.
 * NOTE(review): d is a signed int compared against the size_t bytes.
 */
106 static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
107 				       size_t bytes, void **ptrs)
108 {
109 	u8 **dptr = (u8 **)ptrs;
110 	u8 *p, *q;
111 	int d, z, z0;
112 
113 	z0 = stop;		/* P/Q right side optimization */
114 	p = dptr[disks-2];	/* XOR parity */
115 	q = dptr[disks-1];	/* RS syndrome */
116 
117 	kernel_fpu_begin();
118 
	/* Load the 0x1d constant into zmm0 once, up front. */
119 	asm volatile("vmovdqa64 %0,%%zmm0"
120 		     : : "m" (raid6_avx512_constants.x1d[0]));
121 
122 	for (d = 0 ; d < bytes ; d += 64) {
		/* Q starts as the data of disk z0; P is old P XOR that data. */
123 		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
124 			     "vmovdqa64 %1,%%zmm2\n\t"
125 			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
126 			     :
127 			     : "m" (dptr[z0][d]),  "m" (p[d]));
128 		/* P/Q data pages */
129 		for (z = z0-1 ; z >= start ; z--) {
			/*
			 * Q step (double each byte of Q, XOR 0x1d into the
			 * bytes whose top bit was set), then XOR the data of
			 * disk z into both P and Q.
			 */
130 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
131 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
132 				     "vpmovm2b %%k1,%%zmm5\n\t"
133 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
134 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
135 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
136 				     "vmovdqa64 %0,%%zmm5\n\t"
137 				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
138 				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
139 				     :
140 				     : "m" (dptr[z][d]));
141 		}
142 		/* P/Q left side optimization */
143 		for (z = start-1 ; z >= 0 ; z--) {
			/* Q step only: disks below start contribute no data. */
144 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
145 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
146 				     "vpmovm2b %%k1,%%zmm5\n\t"
147 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
148 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
149 				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
150 				     :
151 				     : );
152 		}
		/* XOR the accumulated Q into the old Q, then write Q and P back. */
153 		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
154 		/* Don't use movntdq for r/w memory area < cache line */
155 			     "vmovdqa64 %%zmm4,%0\n\t"
156 			     "vmovdqa64 %%zmm2,%1"
157 			     :
158 			     : "m" (q[d]), "m" (p[d]));
159 	}
160 
161 	asm volatile("sfence" : : : "memory");
162 	kernel_fpu_end();
163 }
164 
/*
 * Descriptor for the non-unrolled AVX512 variant (64 bytes per step).
 * The initializer is positional: syndrome generator, syndrome XOR
 * update, availability check, name string and a flag value.
 * NOTE(review): field names and order come from struct raid6_calls,
 * which is declared outside this file - confirm before reordering.
 */
165 const struct raid6_calls raid6_avx512x1 = {
166 	raid6_avx5121_gen_syndrome,
167 	raid6_avx5121_xor_syndrome,
168 	raid6_have_avx512,
169 	"avx512x1",
170 	1                       /* Has cache hints */
171 };
172 
173 /*
174  * Unrolled-by-2 AVX512 implementation
175  */
/*
 * Generate the P and Q blocks from all data blocks, 128 bytes (two zmm
 * registers) per outer-loop iteration.
 *
 * disks: number of entries in ptrs; entries 0 .. disks-3 are read as
 *        data, entry disks-2 receives P and entry disks-1 receives Q.
 * bytes: length of each block; the loop advances in 128-byte steps.
 * ptrs:  array of block pointers, accessed here as byte pointers.
 *
 * Register use across the asm statements: zmm0 = 0x1d in every byte,
 * zmm1 = zero, zmm2/zmm3 = P accumulators for bytes d and d+64,
 * zmm4/zmm6 = Q accumulators for bytes d and d+64, zmm5/zmm7 = scratch
 * and data.  The asm statements depend on each other through these
 * registers, so their order must not change.
 *
 * NOTE(review): vmovdqa64/vmovntdq are aligned accesses and the loop
 * never handles a partial chunk, so buffers are presumably 64-byte
 * aligned and bytes a multiple of 128 - confirm against the callers.
 * NOTE(review): d is a signed int compared against the size_t bytes.
 */
176 static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
177 {
178 	u8 **dptr = (u8 **)ptrs;
179 	u8 *p, *q;
180 	int d, z, z0;
181 
182 	z0 = disks - 3;         /* Highest data disk */
183 	p = dptr[z0+1];         /* XOR parity */
184 	q = dptr[z0+2];         /* RS syndrome */
185 
186 	kernel_fpu_begin();
187 
	/* Load the 0x1d constant into zmm0 and clear zmm1 once, up front. */
188 	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
189 		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
190 		     :
191 		     : "m" (raid6_avx512_constants.x1d[0]));
192 
193 	/* We uniformly assume a single prefetch covers at least 64 bytes */
194 	for (d = 0; d < bytes; d += 128) {
		/* Start both P and Q lanes from the highest data disk. */
195 		asm volatile("prefetchnta %0\n\t"
196 			     "prefetchnta %1\n\t"
197 			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
198 			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
199 			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
200 			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
201 			     :
202 			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
203 		for (z = z0-1; z >= 0; z--) {
			/*
			 * Q step on both lanes (double each byte of Q, XOR
			 * 0x1d into the bytes whose top bit was set), then
			 * load disk z and XOR its data into P and Q.
			 */
204 			asm volatile("prefetchnta %0\n\t"
205 				     "prefetchnta %1\n\t"
206 				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
207 				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
208 				     "vpmovm2b %%k1,%%zmm5\n\t"
209 				     "vpmovm2b %%k2,%%zmm7\n\t"
210 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
211 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
212 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
213 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
214 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
215 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
216 				     "vmovdqa64 %0,%%zmm5\n\t"
217 				     "vmovdqa64 %1,%%zmm7\n\t"
218 				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
219 				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
220 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
221 				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
222 				     :
223 				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
224 		}
		/* Write both P lanes and both Q lanes with non-temporal stores. */
225 		asm volatile("vmovntdq %%zmm2,%0\n\t"
226 			     "vmovntdq %%zmm3,%1\n\t"
227 			     "vmovntdq %%zmm4,%2\n\t"
228 			     "vmovntdq %%zmm6,%3"
229 			     :
230 			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
231 			       "m" (q[d+64]));
232 	}
233 
	/* Store fence after the non-temporal stores; also a compiler barrier. */
234 	asm volatile("sfence" : : : "memory");
235 	kernel_fpu_end();
236 }
237 
/*
 * Fold the data blocks start .. stop into the existing P and Q blocks,
 * 128 bytes (two zmm registers) per outer-loop iteration.
 *
 * disks: number of entries in ptrs; entry disks-2 is P, disks-1 is Q.
 * start: lowest data disk whose contents are XORed in.
 * stop:  highest data disk whose contents are XORed in.
 * bytes: length of each block; the loop advances in 128-byte steps.
 * ptrs:  array of block pointers, accessed here as byte pointers.
 *
 * Register use across the asm statements: zmm0 = 0x1d in every byte,
 * zmm2/zmm3 = P accumulators for bytes d and d+64 (seeded from the
 * current P block), zmm4/zmm6 = Q accumulators, zmm5/zmm7 = scratch and
 * data.  The asm statements depend on each other through these
 * registers, so their order must not change.
 *
 * NOTE(review): vmovdqa64 is an aligned access and the loop never
 * handles a partial chunk, so buffers are presumably 64-byte aligned
 * and bytes a multiple of 128 - confirm against the callers.
 * NOTE(review): d is a signed int compared against the size_t bytes.
 */
238 static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
239 				       size_t bytes, void **ptrs)
240 {
241 	u8 **dptr = (u8 **)ptrs;
242 	u8 *p, *q;
243 	int d, z, z0;
244 
245 	z0 = stop;		/* P/Q right side optimization */
246 	p = dptr[disks-2];	/* XOR parity */
247 	q = dptr[disks-1];	/* RS syndrome */
248 
249 	kernel_fpu_begin();
250 
	/* Load the 0x1d constant into zmm0 once, up front. */
251 	asm volatile("vmovdqa64 %0,%%zmm0"
252 		     : : "m" (raid6_avx512_constants.x1d[0]));
253 
254 	for (d = 0 ; d < bytes ; d += 128) {
		/* Q starts as the data of disk z0; P is old P XOR that data. */
255 		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
256 			     "vmovdqa64 %1,%%zmm6\n\t"
257 			     "vmovdqa64 %2,%%zmm2\n\t"
258 			     "vmovdqa64 %3,%%zmm3\n\t"
259 			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
260 			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
261 			     :
262 			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
263 			       "m" (p[d]), "m" (p[d+64]));
264 		/* P/Q data pages */
265 		for (z = z0-1 ; z >= start ; z--) {
			/*
			 * Q step on both lanes (double each byte of Q, XOR
			 * 0x1d into the bytes whose top bit was set), then
			 * XOR the data of disk z into both P and Q.
			 */
266 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
267 				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
268 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
269 				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
270 				     "vpmovm2b %%k1,%%zmm5\n\t"
271 				     "vpmovm2b %%k2,%%zmm7\n\t"
272 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
273 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
274 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
275 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
276 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
277 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
278 				     "vmovdqa64 %0,%%zmm5\n\t"
279 				     "vmovdqa64 %1,%%zmm7\n\t"
280 				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
281 				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
282 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
283 				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
284 				     :
285 				     : "m" (dptr[z][d]),  "m" (dptr[z][d+64]));
286 		}
287 		/* P/Q left side optimization */
288 		for (z = start-1 ; z >= 0 ; z--) {
			/* Q step only: disks below start contribute no data. */
289 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
290 				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
291 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
292 				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
293 				     "vpmovm2b %%k1,%%zmm5\n\t"
294 				     "vpmovm2b %%k2,%%zmm7\n\t"
295 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
296 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
297 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
298 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
299 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
300 				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
301 				     :
302 				     : );
303 		}
		/* XOR the accumulated Q into the old Q, then write Q and P back. */
304 		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
305 			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
306 			     /* Don't use movntdq for r/w
307 			      * memory area < cache line
308 			      */
309 			     "vmovdqa64 %%zmm4,%0\n\t"
310 			     "vmovdqa64 %%zmm6,%1\n\t"
311 			     "vmovdqa64 %%zmm2,%2\n\t"
312 			     "vmovdqa64 %%zmm3,%3"
313 			     :
314 			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
315 			       "m" (p[d+64]));
316 	}
317 
318 	asm volatile("sfence" : : : "memory");
319 	kernel_fpu_end();
320 }
321 
/*
 * Descriptor for the unrolled-by-2 AVX512 variant (128 bytes per step).
 * The initializer is positional: syndrome generator, syndrome XOR
 * update, availability check, name string and a flag value.
 * NOTE(review): field names and order come from struct raid6_calls,
 * which is declared outside this file - confirm before reordering.
 */
322 const struct raid6_calls raid6_avx512x2 = {
323 	raid6_avx5122_gen_syndrome,
324 	raid6_avx5122_xor_syndrome,
325 	raid6_have_avx512,
326 	"avx512x2",
327 	1                       /* Has cache hints */
328 };
329 
330 #ifdef CONFIG_X86_64
331 
332 /*
333  * Unrolled-by-4 AVX512 implementation
334  */
/*
 * Generate the P and Q blocks from all data blocks, 256 bytes (four zmm
 * registers) per outer-loop iteration.
 *
 * disks: number of entries in ptrs; entries 0 .. disks-3 are read as
 *        data, entry disks-2 receives P and entry disks-1 receives Q.
 * bytes: length of each block; the loop advances in 256-byte steps.
 * ptrs:  array of block pointers, accessed here as byte pointers.
 *
 * Register use across the asm statements: zmm0 = 0x1d in every byte,
 * zmm1 = zero, zmm2/zmm3/zmm10/zmm11 = P accumulators for bytes d,
 * d+64, d+128 and d+192, zmm4/zmm6/zmm12/zmm14 = the matching Q
 * accumulators, zmm5/zmm7/zmm13/zmm15 = scratch and data.  The
 * accumulators are zeroed before the loop and again after every store,
 * so the inner loop can treat the highest data disk like any other.
 * The asm statements depend on each other through these registers, so
 * their order must not change.
 *
 * NOTE(review): vmovdqa64/vmovntdq are aligned accesses and the loop
 * never handles a partial chunk, so buffers are presumably 64-byte
 * aligned and bytes a multiple of 256 - confirm against the callers.
 * NOTE(review): d is a signed int compared against the size_t bytes.
 * NOTE(review): one asm line in the inner loop ends in "\n" where all
 * of its neighbours end in "\n\t"; this only affects the indentation
 * of the generated assembly text.
 */
335 static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
336 {
337 	u8 **dptr = (u8 **)ptrs;
338 	u8 *p, *q;
339 	int d, z, z0;
340 
341 	z0 = disks - 3;         /* Highest data disk */
342 	p = dptr[z0+1];         /* XOR parity */
343 	q = dptr[z0+2];         /* RS syndrome */
344 
345 	kernel_fpu_begin();
346 
347 	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
348 		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
349 		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
350 		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
351 		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
352 		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
353 		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
354 		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
355 		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
356 		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
357 		     :
358 		     : "m" (raid6_avx512_constants.x1d[0]));
359 
360 	for (d = 0; d < bytes; d += 256) {
361 		for (z = z0; z >= 0; z--) {
		/*
		 * Q step on all four lanes (double each byte of Q, XOR 0x1d
		 * into the bytes whose top bit was set), then load disk z
		 * and XOR its data into P and Q.
		 */
362 		asm volatile("prefetchnta %0\n\t"
363 			     "prefetchnta %1\n\t"
364 			     "prefetchnta %2\n\t"
365 			     "prefetchnta %3\n\t"
366 			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
367 			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
368 			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
369 			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
370 			     "vpmovm2b %%k1,%%zmm5\n\t"
371 			     "vpmovm2b %%k2,%%zmm7\n\t"
372 			     "vpmovm2b %%k3,%%zmm13\n\t"
373 			     "vpmovm2b %%k4,%%zmm15\n\t"
374 			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
375 			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
376 			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
377 			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
378 			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
379 			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
380 			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
381 			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
382 			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
383 			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
384 			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
385 			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
386 			     "vmovdqa64 %0,%%zmm5\n\t"
387 			     "vmovdqa64 %1,%%zmm7\n\t"
388 			     "vmovdqa64 %2,%%zmm13\n\t"
389 			     "vmovdqa64 %3,%%zmm15\n\t"
390 			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
391 			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
392 			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
393 			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
394 			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
395 			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
396 			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
397 			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
398 			     :
399 			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
400 			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
401 		}
		/*
		 * Write the four P lanes and four Q lanes with non-temporal
		 * stores, clearing each accumulator right after it is stored
		 * so the next chunk starts from zero.
		 */
402 		asm volatile("vmovntdq %%zmm2,%0\n\t"
403 			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
404 			     "vmovntdq %%zmm3,%1\n\t"
405 			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
406 			     "vmovntdq %%zmm10,%2\n\t"
407 			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
408 			     "vmovntdq %%zmm11,%3\n\t"
409 			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
410 			     "vmovntdq %%zmm4,%4\n\t"
411 			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
412 			     "vmovntdq %%zmm6,%5\n\t"
413 			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
414 			     "vmovntdq %%zmm12,%6\n\t"
415 			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
416 			     "vmovntdq %%zmm14,%7\n\t"
417 			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
418 			     :
419 			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
420 			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
421 			       "m" (q[d+128]), "m" (q[d+192]));
422 	}
423 
	/* Store fence after the non-temporal stores; also a compiler barrier. */
424 	asm volatile("sfence" : : : "memory");
425 	kernel_fpu_end();
426 }
427 
/*
 * Fold the data blocks start .. stop into the existing P and Q blocks,
 * 256 bytes (four zmm registers) per outer-loop iteration.
 *
 * disks: number of entries in ptrs; entry disks-2 is P, disks-1 is Q.
 * start: lowest data disk whose contents are XORed in.
 * stop:  highest data disk whose contents are XORed in.
 * bytes: length of each block; the loop advances in 256-byte steps.
 * ptrs:  array of block pointers, accessed here as byte pointers.
 *
 * Register use across the asm statements: zmm0 = 0x1d in every byte,
 * zmm2/zmm3/zmm10/zmm11 = P accumulators for bytes d, d+64, d+128 and
 * d+192 (seeded from the current P block), zmm4/zmm6/zmm12/zmm14 = the
 * matching Q accumulators, zmm5/zmm7/zmm13/zmm15 = scratch and data.
 * The asm statements depend on each other through these registers, so
 * their order must not change.
 *
 * NOTE(review): vmovdqa64/vmovntdq are aligned accesses and the loop
 * never handles a partial chunk, so buffers are presumably 64-byte
 * aligned and bytes a multiple of 256 - confirm against the callers.
 * NOTE(review): d is a signed int compared against the size_t bytes.
 * NOTE(review): one vpaddb line in the data loop spells its first
 * operand "%%Zmm14" with a capital Z, unlike every other register
 * reference in this file - presumably a typo the assembler tolerates.
 * NOTE(review): the final write-back uses vmovntdq for P and Q even
 * though both were read in this iteration; the 1x and 2x variants use
 * vmovdqa64 there with a comment advising against movntdq for
 * read/write areas - confirm this difference is intended.
 */
428 static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
429 				       size_t bytes, void **ptrs)
430 {
431 	u8 **dptr = (u8 **)ptrs;
432 	u8 *p, *q;
433 	int d, z, z0;
434 
435 	z0 = stop;		/* P/Q right side optimization */
436 	p = dptr[disks-2];	/* XOR parity */
437 	q = dptr[disks-1];	/* RS syndrome */
438 
439 	kernel_fpu_begin();
440 
	/* Load the 0x1d constant into zmm0 once, up front. */
441 	asm volatile("vmovdqa64 %0,%%zmm0"
442 		     :: "m" (raid6_avx512_constants.x1d[0]));
443 
444 	for (d = 0 ; d < bytes ; d += 256) {
		/* Q starts as the data of disk z0; P is old P XOR that data. */
445 		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
446 			     "vmovdqa64 %1,%%zmm6\n\t"
447 			     "vmovdqa64 %2,%%zmm12\n\t"
448 			     "vmovdqa64 %3,%%zmm14\n\t"
449 			     "vmovdqa64 %4,%%zmm2\n\t"
450 			     "vmovdqa64 %5,%%zmm3\n\t"
451 			     "vmovdqa64 %6,%%zmm10\n\t"
452 			     "vmovdqa64 %7,%%zmm11\n\t"
453 			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
454 			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
455 			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
456 			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
457 			     :
458 			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
459 			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
460 			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
461 			       "m" (p[d+192]));
462 		/* P/Q data pages */
463 		for (z = z0-1 ; z >= start ; z--) {
			/*
			 * Q step on all four lanes (double each byte of Q,
			 * XOR 0x1d into the bytes whose top bit was set),
			 * then XOR the data of disk z into both P and Q.
			 * Only operands 0 and 2 (offsets d and d+128) are
			 * prefetched.
			 */
464 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
465 				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
466 				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
467 				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
468 				     "prefetchnta %0\n\t"
469 				     "prefetchnta %2\n\t"
470 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
471 				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
472 				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
473 				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
474 				     "vpmovm2b %%k1,%%zmm5\n\t"
475 				     "vpmovm2b %%k2,%%zmm7\n\t"
476 				     "vpmovm2b %%k3,%%zmm13\n\t"
477 				     "vpmovm2b %%k4,%%zmm15\n\t"
478 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
479 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
480 				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
481 				     "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t"
482 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
483 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
484 				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
485 				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
486 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
487 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
488 				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
489 				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
490 				     "vmovdqa64 %0,%%zmm5\n\t"
491 				     "vmovdqa64 %1,%%zmm7\n\t"
492 				     "vmovdqa64 %2,%%zmm13\n\t"
493 				     "vmovdqa64 %3,%%zmm15\n\t"
494 				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
495 				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
496 				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
497 				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
498 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
499 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
500 				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
501 				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
502 				     :
503 				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
504 				       "m" (dptr[z][d+128]),
505 				       "m" (dptr[z][d+192]));
506 		}
		/* Prefetch the old Q, which is read in the final asm statement. */
507 		asm volatile("prefetchnta %0\n\t"
508 			     "prefetchnta %1\n\t"
509 			     :
510 			     : "m" (q[d]), "m" (q[d+128]));
511 		/* P/Q left side optimization */
512 		for (z = start-1 ; z >= 0 ; z--) {
			/* Q step only: disks below start contribute no data. */
513 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
514 				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
515 				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
516 				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
517 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
518 				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
519 				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
520 				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
521 				     "vpmovm2b %%k1,%%zmm5\n\t"
522 				     "vpmovm2b %%k2,%%zmm7\n\t"
523 				     "vpmovm2b %%k3,%%zmm13\n\t"
524 				     "vpmovm2b %%k4,%%zmm15\n\t"
525 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
526 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
527 				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
528 				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
529 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
530 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
531 				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
532 				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
533 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
534 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
535 				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
536 				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
537 				     :
538 				     : );
539 		}
		/*
		 * Store the four P lanes, XOR the accumulated Q into the old
		 * Q and store the four Q lanes; all stores are non-temporal.
		 */
540 		asm volatile("vmovntdq %%zmm2,%0\n\t"
541 			     "vmovntdq %%zmm3,%1\n\t"
542 			     "vmovntdq %%zmm10,%2\n\t"
543 			     "vmovntdq %%zmm11,%3\n\t"
544 			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
545 			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
546 			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
547 			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
548 			     "vmovntdq %%zmm4,%4\n\t"
549 			     "vmovntdq %%zmm6,%5\n\t"
550 			     "vmovntdq %%zmm12,%6\n\t"
551 			     "vmovntdq %%zmm14,%7"
552 			     :
553 			     : "m" (p[d]),  "m" (p[d+64]), "m" (p[d+128]),
554 			       "m" (p[d+192]), "m" (q[d]),  "m" (q[d+64]),
555 			       "m" (q[d+128]), "m" (q[d+192]));
556 	}
	/* Store fence after the non-temporal stores; also a compiler barrier. */
557 	asm volatile("sfence" : : : "memory");
558 	kernel_fpu_end();
559 }
/*
 * Descriptor for the unrolled-by-4 AVX512 variant (256 bytes per step).
 * The initializer is positional: syndrome generator, syndrome XOR
 * update, availability check, name string and a flag value.
 * NOTE(review): field names and order come from struct raid6_calls,
 * which is declared outside this file - confirm before reordering.
 */
560 const struct raid6_calls raid6_avx512x4 = {
561 	raid6_avx5124_gen_syndrome,
562 	raid6_avx5124_xor_syndrome,
563 	raid6_have_avx512,
564 	"avx512x4",
565 	1                       /* Has cache hints */
566 };
567 #endif
568 
569 #endif /* CONFIG_AS_AVX512 */
570