// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"
17cc4589ebSDavid Woodhouse 
/*
 * 0x1d is the low byte of the RAID-6 GF(2^8) generator polynomial 0x11d;
 * a full vector of 0x1d bytes is ANDed into the carry mask after each
 * multiply-by-2 step to perform the polynomial reduction.  The structure
 * is 16-byte aligned because it is loaded with movdqa.
 */
static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants  __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};
23cc4589ebSDavid Woodhouse 
raid6_have_sse2(void)24cc4589ebSDavid Woodhouse static int raid6_have_sse2(void)
25cc4589ebSDavid Woodhouse {
26cc4589ebSDavid Woodhouse 	/* Not really boot_cpu but "all_cpus" */
27cc4589ebSDavid Woodhouse 	return boot_cpu_has(X86_FEATURE_MMX) &&
28cc4589ebSDavid Woodhouse 		boot_cpu_has(X86_FEATURE_FXSR) &&
29cc4589ebSDavid Woodhouse 		boot_cpu_has(X86_FEATURE_XMM) &&
30cc4589ebSDavid Woodhouse 		boot_cpu_has(X86_FEATURE_XMM2);
31cc4589ebSDavid Woodhouse }

/*
 * Plain SSE2 implementation
 */
/*
 * Plain SSE2 syndrome generation, one 16-byte vector per iteration.
 *
 * disks: total number of disks (data disks plus P and Q)
 * bytes: stripe length; processed in 16-byte chunks, and movdqa/movntdq
 *        require the buffers to be 16-byte aligned
 * ptrs:  ptrs[0..disks-3] are data pages, ptrs[disks-2] is P (XOR
 *        parity), ptrs[disks-1] is Q (the GF(2^8) Reed-Solomon syndrome)
 *
 * Register use inside the loop: xmm0 = 0x1d reduction mask,
 * xmm2 = P accumulator, xmm4 = Q accumulator, xmm5 = zero/carry temp,
 * xmm6 = data page loaded one iteration ahead (software pipelining).
 */
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			/*
			 * Q = Q * 2 in GF(2^8): pcmpgtb against zero yields
			 * 0xff in bytes whose top bit is set, paddb shifts
			 * each byte left by one, and the 0x1d mask is XORed
			 * in wherever a bit was carried out.
			 */
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");
			/* Fold the previously loaded page (z+1) into P and Q,
			 * then load page z for the next iteration. */
			asm volatile("pxor %xmm6,%xmm2");
			asm volatile("pxor %xmm6,%xmm4");
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		/* Epilogue: final multiply-by-2 and fold of the z == 0 page */
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

		/* Non-temporal stores: don't pollute the cache with P/Q */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

	/* Order the non-temporal stores before returning */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
85cc4589ebSDavid Woodhouse 
86a582564bSMarkus Stockhausen 
/*
 * Incremental P/Q update (read-modify-write path): XOR the contribution
 * of the data disks in the closed range [start, stop] into the existing
 * P and Q pages, 16 bytes per iteration.
 *
 * Disks above `stop` are skipped entirely ("right side" optimization);
 * disks below `start` contribute only a multiply-by-2 of the Q
 * accumulator per skipped disk ("left side" optimization).
 */
static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 16 ) {
		/* xmm4 = Q accumulator seeded with page z0;
		 * xmm2 = existing P XORed with page z0 */
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("pxor %xmm4,%xmm2");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			/* Q = Q * 2 in GF(2^8), then fold in data page z */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm5,%xmm4");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			/* Q = Q * 2 only; no data is folded below `start` */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
		}
		/* Merge with the existing Q page */
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
134a582564bSMarkus Stockhausen 
/* Algorithm descriptor for the plain (unroll-by-1) SSE2 implementation */
const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,	/* full syndrome generation */
	raid6_sse21_xor_syndrome,	/* incremental P/Q update */
	raid6_have_sse2,		/* runtime availability check */
	"sse2x1",
	1			/* Has cache hints */
};

/*
 * Unrolled-by-2 SSE2 implementation
 */
/*
 * Unrolled-by-2 SSE2 syndrome generation: two 16-byte vectors (32 bytes)
 * per loop iteration.  Same contract as raid6_sse21_gen_syndrome.
 *
 * Register use: xmm0 = 0x1d reduction mask; xmm2/xmm3 = P accumulators;
 * xmm4/xmm6 = Q accumulators; xmm5/xmm7 = zero/carry temps (also reused
 * as data-load scratch).
 */
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			/* Q = Q * 2 in GF(2^8) for both lanes */
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			/* Fold data page z into P and Q */
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			/* Re-zero the temps for the next pcmpgtb */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
		}
		/* Non-temporal stores: don't pollute the cache with P/Q */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
197cc4589ebSDavid Woodhouse 
/*
 * Unrolled-by-2 incremental P/Q update (RMW path): XOR the contribution
 * of the data disks in the closed range [start, stop] into the existing
 * P and Q pages, 32 bytes per loop iteration.
 * Same structure as raid6_sse21_xor_syndrome with two lanes.
 */
static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 32 ) {
		/* Seed Q (xmm4/xmm6) with page z0; P (xmm2/xmm3) = old P ^ z0 */
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			/* Q = Q * 2 in GF(2^8), then fold in data page z */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			/* Q = Q * 2 only; no data is folded below `start` */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		/* Merge with the existing Q page */
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
264a582564bSMarkus Stockhausen 
/* Algorithm descriptor for the unrolled-by-2 SSE2 implementation */
const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,	/* full syndrome generation */
	raid6_sse22_xor_syndrome,	/* incremental P/Q update */
	raid6_have_sse2,		/* runtime availability check */
	"sse2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 SSE2 implementation
 */
/*
 * Unrolled-by-4 SSE2 syndrome generation: four 16-byte vectors (64
 * bytes) per loop iteration.  Uses xmm8-xmm15, hence only built under
 * the surrounding CONFIG_X86_64 guard.
 *
 * Register use: xmm0 = 0x1d reduction mask; xmm2/3/10/11 = P
 * accumulators; xmm4/6/12/14 = Q accumulators; xmm5/7/13/15 =
 * zero/carry temps (also reused as data-load scratch).
 */
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
	asm volatile("pxor %xmm4,%xmm4"); 	/* Q[0] */
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm6,%xmm6"); 	/* Q[1] */
	asm volatile("pxor %xmm7,%xmm7"); 	/* Zero temp */
	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
	asm volatile("pxor %xmm12,%xmm12"); 	/* Q[2] */
	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
	asm volatile("pxor %xmm14,%xmm14"); 	/* Q[3] */
	asm volatile("pxor %xmm15,%xmm15"); 	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 64 ) {
		/* Accumulators start at zero, so unlike the x1/x2 variants
		 * the loop starts at z0 itself (multiplying zero by 2 is a
		 * harmless no-op on the first pass). */
		for ( z = z0 ; z >= 0 ; z-- ) {
			/* The second prefetch seems to improve performance... */
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			/* Q = Q * 2 in GF(2^8) for all four lanes */
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			/* Fold data page z into P and Q */
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			/* Re-zero the temps for the next pcmpgtb */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		/* Non-temporal stores; each accumulator is re-zeroed for
		 * the next 64-byte chunk. */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
363cc4589ebSDavid Woodhouse 
/*
 * Unrolled-by-4 incremental P/Q update (RMW path): XOR the contribution
 * of the data disks in the closed range [start, stop] into the existing
 * P and Q pages, 64 bytes per loop iteration.  x86-64 only (xmm8-xmm15).
 */
static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 64 ) {
		/* Seed Q (xmm4/6/12/14) with page z0;
		 * P (xmm2/3/10/11) = old P ^ page z0 */
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
		asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
		asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		asm volatile("pxor %xmm12,%xmm10");
		asm volatile("pxor %xmm14,%xmm11");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			/* Q = Q * 2 in GF(2^8), then fold in data page z */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		/* Pull the old Q page in ahead of the final merge below */
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+32]));
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			/* Q = Q * 2 only; no data is folded below `start` */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		/* 64 bytes per iteration covers a whole cache line, so
		 * non-temporal stores are used here (unlike x1/x2). */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		/* Merge with the existing Q page, then store */
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
		asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
470a582564bSMarkus Stockhausen 
471a582564bSMarkus Stockhausen 
/* Algorithm descriptor for the unrolled-by-4 SSE2 implementation */
const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,	/* full syndrome generation */
	raid6_sse24_xor_syndrome,	/* incremental P/Q update */
	raid6_have_sse2,		/* runtime availability check */
	"sse2x4",
	1			/* Has cache hints */
};

#endif /* CONFIG_X86_64 */