// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- --------------------------------------------------------
 *
 *   Copyright (C) 2016 Intel Corporation
 *
 *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
 *   Author: Megha Dey <megha.dey@linux.intel.com>
 *
 *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 *
 */
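
/*
 * For each 64-byte column of the data disks D_0 .. D_n these routines
 * compute the two RAID-6 syndromes
 *
 *	P = D_0 ^ D_1 ^ ... ^ D_n
 *	Q = g^0*D_0 ^ g^1*D_1 ^ ... ^ g^n*D_n
 *
 * with multiplication in GF(2^8) (generator g = 2, reduction polynomial
 * 0x11d).  Q is evaluated by Horner's rule: starting from the highest
 * data disk, the accumulator is repeatedly multiplied by 2 and XORed
 * with the next lower disk's data.
 */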

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx512_constants {
	u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};
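
/*
 * The repeated 0x1d byte is the low half of the GF(2^8) reduction
 * polynomial 0x11d.  Each "multiply the vector by 2" step below builds a
 * mask of bytes whose top bit is set (vpcmpgtb/vpmovm2b against zero),
 * shifts every byte left by one (vpaddb), and conditionally folds in
 * 0x1d (vpandq + vpxorq).  The helper below is only an illustrative
 * scalar sketch of that per-byte operation; it is not referenced by the
 * SIMD paths.
 */
static inline u8 raid6_gfmul2_sketch(u8 b)
{
	/* Shift left by one, reduce mod 0x11d if the top bit was set */
	return (u8)((b << 1) ^ ((b & 0x80) ? 0x1d : 0));
}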

static int raid6_have_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

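	/*
	 * Per 64-byte column: zmm2 accumulates P, zmm4 accumulates Q,
	 * zmm6 holds the next (prefetched) data block and zmm5 is scratch
	 * for the GF(2^8) reduction mask.  Q follows Horner's rule:
	 * double the accumulator, then XOR in the next lower disk.
	 */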
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
			     "vmovdqa64 %1,%%zmm6"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm6"
				     :
				     : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm4,%1\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
			     :
			     : "m" (p[d]), "m" (q[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */
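
	/*
	 * Fold the contribution of data disks start..stop into the
	 * existing P and Q:
	 *
	 *	P ^= D_start ^ ... ^ D_stop
	 *	Q ^= g^start*D_start ^ ... ^ g^stop*D_stop
	 *
	 * Disks above stop are skipped entirely ("right side"); disks
	 * below start only contribute the g^z scaling, applied by
	 * doubling the Q accumulator without loading their data
	 * ("left side").
	 */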

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm2\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
			     :
			     : "m" (dptr[z0][d]),  "m" (p[d]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : "m" (dptr[z][d]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
		/* Don't use movntdq for r/w memory area < cache line */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm2,%1"
			     :
			     : "m" (q[d]), "m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
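
/*
 * Unlike raid6_avx5121_gen_syndrome(), which streams P and Q out with
 * non-temporal vmovntdq stores, this routine has just read the P/Q lines
 * it rewrites, which is presumably why ordinary vmovdqa64 stores are used
 * here (see the in-loop comment above).
 */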

const struct raid6_calls raid6_avx512x1 = {
	raid6_avx5121_gen_syndrome,
	raid6_avx5121_xor_syndrome,
	raid6_have_avx512,
	"avx512x1",
	.priority = 2		/* Prefer AVX512 over priority 1 (SSE2 and others) */
};
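
/*
 * The raid6_calls tables in this file are listed in raid6_algos[] in
 * lib/raid6/algos.c, where the best available implementation is selected
 * at initialization time; .priority = 2 prefers the AVX512 variants over
 * the priority 1 (SSE2 and other) implementations.
 */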

/*
 * Unrolled-by-2 AVX512 implementation
 */
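/*
 * Same algorithm as raid6_avx5121_gen_syndrome(), but each iteration
 * handles two independent 64-byte columns (d and d+64), giving two
 * independent dependency chains per loop.
 */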
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	/* We uniformly assume a single prefetch covers at least 64 bytes */
	for (d = 0; d < bytes; d += 128) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm4,%2\n\t"
			     "vmovntdq %%zmm6,%3"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
			       "m" (q[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm2\n\t"
			     "vmovdqa64 %3,%%zmm3\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (p[d]), "m" (p[d+64]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]),  "m" (dptr[z][d+64]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
			     /* Don't use movntdq for r/w
			      * memory area < cache line
			      */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm6,%1\n\t"
			     "vmovdqa64 %%zmm2,%2\n\t"
			     "vmovdqa64 %%zmm3,%3"
			     :
			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
			       "m" (p[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
	raid6_avx5122_gen_syndrome,
	raid6_avx5122_xor_syndrome,
	raid6_have_avx512,
	"avx512x2",
	.priority = 2		/* Prefer AVX512 over priority 1 (SSE2 and others) */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX512 implementation
 */
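/*
 * Four independent 64-byte columns per iteration (d, d+64, d+128, d+192),
 * using the additional zmm registers available in 64-bit mode.  The P/Q
 * accumulators are zeroed once up front and cleared again after each
 * store.
 */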
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 256) {
		for (z = z0; z >= 0; z--) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "prefetchnta %2\n\t"
			     "prefetchnta %3\n\t"
			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpmovm2b %%k2,%%zmm7\n\t"
			     "vpmovm2b %%k3,%%zmm13\n\t"
			     "vpmovm2b %%k4,%%zmm15\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
			     "vmovdqa64 %0,%%zmm5\n\t"
			     "vmovdqa64 %1,%%zmm7\n\t"
			     "vmovdqa64 %2,%%zmm13\n\t"
			     "vmovdqa64 %3,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
			     :
			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vmovntdq %%zmm14,%7\n\t"
			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     :: "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 256) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm12\n\t"
			     "vmovdqa64 %3,%%zmm14\n\t"
			     "vmovdqa64 %4,%%zmm2\n\t"
			     "vmovdqa64 %5,%%zmm3\n\t"
			     "vmovdqa64 %6,%%zmm10\n\t"
			     "vmovdqa64 %7,%%zmm11\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "prefetchnta %0\n\t"
				     "prefetchnta %2\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]),
				       "m" (dptr[z][d+192]));
		}
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     :
			     : "m" (q[d]), "m" (q[d+128]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : );
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vmovntdq %%zmm14,%7"
			     :
			     : "m" (p[d]),  "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]),  "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
	raid6_avx5124_gen_syndrome,
	raid6_avx5124_xor_syndrome,
	raid6_have_avx512,
	"avx512x4",
	.priority = 2		/* Prefer AVX512 over priority 1 (SSE2 and others) */
};
#endif

#endif /* CONFIG_AS_AVX512 */