// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants  __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};
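
/*
 * Note on the constant above: 0x1d is the low byte of the RAID-6 generator
 * polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d).  Multiplying a GF(2^8)
 * element by {02} is a left shift plus a conditional XOR with 0x1d when the
 * top bit was set.  Per byte, the pcmpgtb/paddb/pand/pxor sequences below
 * amount to this scalar sketch (illustrative only, not part of the original
 * file):
 *
 *	mask = (v & 0x80) ? 0xff : 0x00;	pcmpgtb against zero
 *	v    = (u8)(v + v);			paddb v,v
 *	v   ^= mask & 0x1d;			pand + pxor with x1d
 */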

static int raid6_have_sse2(void)
{
	/* Not really boot_cpu but "all_cpus" */
	return boot_cpu_has(X86_FEATURE_MMX) &&
		boot_cpu_has(X86_FEATURE_FXSR) &&
		boot_cpu_has(X86_FEATURE_XMM) &&
		boot_cpu_has(X86_FEATURE_XMM2);
}

/*
 * Plain SSE2 implementation
 */
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
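	/*
	 * Horner's rule over the data disks: Q starts as the highest disk,
	 * and each step multiplies the accumulator by {02} in GF(2^8) and
	 * XORs in the next lower disk, so Q ends up as the sum of
	 * dptr[z] * {02}^z.  P simply accumulates the XOR of all data disks.
	 */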

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm6,%xmm2");
			asm volatile("pxor %xmm6,%xmm4");
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

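		/*
		 * Stream the finished P/Q slices out with non-temporal
		 * stores: the parity pages are not read back here, so
		 * bypassing the cache avoids pollution.  movntdq stores are
		 * weakly ordered, which is why an sfence follows the loop.
		 */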
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}


static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */
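	/*
	 * Partial update: only the contribution of disks start..stop is
	 * folded into the existing P and Q pages.  Disks above stop are
	 * skipped entirely ("right side"), while the loop over disks below
	 * start only multiplies the Q accumulator by {02} so that each
	 * changed disk keeps its {02}^z coefficient ("left side").
	 */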

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("pxor %xmm4,%xmm2");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm5,%xmm4");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,
	raid6_sse21_xor_syndrome,
	raid6_have_sse2,
	"sse2x1",
	1			/* Has cache hints */
};

/*
 * Unrolled-by-2 SSE2 implementation
 */
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */

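	/*
	 * Unrolled by two: each iteration handles two independent 16-byte
	 * lanes (xmm2/xmm4 and xmm3/xmm6), which helps hide the latency of
	 * the GF(2^8) multiply sequence.
	 */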
	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,
	raid6_sse22_xor_syndrome,
	raid6_have_sse2,
	"sse2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 SSE2 implementation
 */
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
	asm volatile("pxor %xmm4,%xmm4"); 	/* Q[0] */
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm6,%xmm6"); 	/* Q[1] */
	asm volatile("pxor %xmm7,%xmm7"); 	/* Zero temp */
	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
	asm volatile("pxor %xmm12,%xmm12"); 	/* Q[2] */
	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
	asm volatile("pxor %xmm14,%xmm14"); 	/* Q[3] */
	asm volatile("pxor %xmm15,%xmm15"); 	/* Zero temp */

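	/*
	 * The four-way unroll also uses the high registers xmm10-xmm15,
	 * which only exist in 64-bit mode (hence the CONFIG_X86_64 guard
	 * above).  The P/Q accumulators start at zero, so the inner loop
	 * below runs the same multiply-and-XOR step over every data disk,
	 * 64 bytes per iteration.
	 */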
	for ( d = 0 ; d < bytes ; d += 64 ) {
		for ( z = z0 ; z >= 0 ; z-- ) {
			/* The second prefetch seems to improve performance... */
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 64 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
		asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
		asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		asm volatile("pxor %xmm12,%xmm10");
		asm volatile("pxor %xmm14,%xmm11");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+32]));
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
		asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}


const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,
	raid6_sse24_xor_syndrome,
	raid6_have_sse2,
	"sse2x4",
	1			/* Has cache hints */
};

#endif /* CONFIG_X86_64 */