xref: /openbmc/linux/arch/x86/include/asm/xor_64.h (revision 9ac8d3fb)
1 #ifndef _ASM_X86_XOR_64_H
2 #define _ASM_X86_XOR_64_H
3 
4 /*
5  * Optimized RAID-5 checksumming functions for MMX and SSE.
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2, or (at your option)
10  * any later version.
11  *
12  * You should have received a copy of the GNU General Public License
13  * (for example /usr/src/linux/COPYING); if not, write to the Free
14  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15  */
16 
17 
18 /*
19  * Cache avoiding checksumming functions utilizing KNI instructions
20  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
21  */
22 
23 /*
24  * Based on
25  * High-speed RAID5 checksumming functions utilizing SSE instructions.
26  * Copyright (C) 1998 Ingo Molnar.
27  */
28 
29 /*
30  * x86-64 changes / gcc fixes from Andi Kleen.
31  * Copyright 2002 Andi Kleen, SuSE Labs.
32  *
33  * This hasn't been optimized for the hammer yet, but there are likely
34  * no advantages to be gained from x86-64 here anyway.
35  */
36 
/*
 * One 16-byte-aligned save slot built from two unsigned longs.  The
 * xor_sse_* routines declare an array of four of these (xmm_save[4]) and
 * XMMS_SAVE/XMMS_RESTORE move xmm0-xmm3 through it at 0x10-byte strides,
 * so each slot is expected to be 16 bytes wide (true when unsigned long
 * is 8 bytes).
 */
37 typedef struct {
38 	unsigned long a, b;
39 } __attribute__((aligned(16))) xmm_store_t;
40 
41 /* Doesn't use gcc to save the XMM registers, because there is no easy way to
42    tell it to do a clts before the register saving. */
/*
 * XMMS_SAVE - enter the SSE section used by the xor_sse_* routines.
 *
 * Relies on two locals in the invoking function: `cr0`, which receives the
 * current CR0 value, and `xmm_save`, the buffer that receives xmm0-xmm3 at
 * offsets 0x00, 0x10, 0x20 and 0x30.
 *
 * Sequence: disable preemption, read CR0, execute clts (clears the TS flag
 * in CR0), then spill xmm0-xmm3 with unaligned stores.  The "memory"
 * clobber covers the stores into xmm_save.  Every use must be matched by
 * XMMS_RESTORE, which is where preemption is re-enabled.
 */
43 #define XMMS_SAVE				\
44 do {						\
45 	preempt_disable();			\
46 	asm volatile(				\
47 		"movq %%cr0,%0		;\n\t"	\
48 		"clts			;\n\t"	\
49 		"movups %%xmm0,(%1)	;\n\t"	\
50 		"movups %%xmm1,0x10(%1)	;\n\t"	\
51 		"movups %%xmm2,0x20(%1)	;\n\t"	\
52 		"movups %%xmm3,0x30(%1)	;\n\t"	\
53 		: "=&r" (cr0)			\
54 		: "r" (xmm_save) 		\
55 		: "memory");			\
56 } while (0)
57 
/*
 * XMMS_RESTORE - leave the SSE section opened by XMMS_SAVE.
 *
 * Uses the same `cr0` and `xmm_save` locals that XMMS_SAVE filled in.
 * Issues an sfence, reloads xmm0-xmm3 from xmm_save (offsets 0x00-0x30),
 * writes the saved value back into CR0 and finally re-enables preemption.
 *
 * NOTE(review): the sfence is presumably there to order the stores made by
 * the XOR loop before the saved state is put back - confirm.
 */
58 #define XMMS_RESTORE				\
59 do {						\
60 	asm volatile(				\
61 		"sfence			;\n\t"	\
62 		"movups (%1),%%xmm0	;\n\t"	\
63 		"movups 0x10(%1),%%xmm1	;\n\t"	\
64 		"movups 0x20(%1),%%xmm2	;\n\t"	\
65 		"movups 0x30(%1),%%xmm3	;\n\t"	\
66 		"movq 	%0,%%cr0	;\n\t"	\
67 		:				\
68 		: "r" (cr0), "r" (xmm_save)	\
69 		: "memory");			\
70 	preempt_enable();			\
71 } while (0)
72 
/*
 * String-building helpers for the asm templates in the xor_sse_* routines.
 * Each expands to one line of assembly text; `x` is an index in 16-byte
 * units and `y` is the number of the xmm register to use.
 *
 *   OFFS(x)     displacement text "16*(x)"
 *   PF_OFFS(x)  displacement text "256+16*(x)", i.e. 256 bytes past OFFS(x)
 *   PF0..PF5    prefetchnta at PF_OFFS(x) from operand p1..p6 respectively
 *   LD(x, y)    movaps load  from OFFS(x)(p1) into xmm<y>
 *   ST(x, y)    movaps store from xmm<y> to OFFS(x)(p1)
 *   XO1..XO5    xorps of OFFS(x)(p2..p6) into xmm<y>
 *
 * The operand names p1..p6 must exist as named asm operands in the
 * statement that uses the macro.  movaps and xorps take their memory
 * operand at a 16-byte-aligned address, so the buffers behind p1..p6 have
 * to be 16-byte aligned.  PF5 and XO5 are not used by the routines visible
 * in this header.
 */
73 #define OFFS(x)		"16*("#x")"
74 #define PF_OFFS(x)	"256+16*("#x")"
75 #define	PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
76 #define LD(x, y)	"       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
77 #define ST(x, y)	"       movaps %%xmm"#y",   "OFFS(x)"(%[p1])	;\n"
78 #define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
79 #define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
80 #define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
81 #define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
82 #define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%[p6])		;\n"
83 #define XO1(x, y)	"       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
84 #define XO2(x, y)	"       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
85 #define XO3(x, y)	"       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
86 #define XO4(x, y)	"       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
87 #define XO5(x, y)	"       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"	;\n"
88 
89 
/*
 * xor_sse_2 - XOR the buffer at p2 into the buffer at p1 (p1 ^= p2).
 *
 * @bytes: amount of data to process; the loop runs bytes >> 8 times and
 *         handles 256 bytes per pass, so any remainder below 256 bytes is
 *         left untouched.
 * @p1:    destination, also the first source (loaded with LD, written
 *         back with ST).
 * @p2:    second source, XORed in with XO1.
 *
 * Each BLOCK(i) works on four 16-byte units (64 bytes) in xmm0-xmm3; four
 * BLOCKs make up one 256-byte pass.  xmm0-xmm3 are not listed as clobbers
 * because XMMS_SAVE/XMMS_RESTORE save and reload them around the loop.
 *
 * NOTE(review): the counter is only tested after the body ("decl; jnz"),
 * so a call with bytes < 256 would start with a zero counter and not stop
 * after one pass - callers are expected to pass at least 256.
 */
90 static void
91 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
92 {
	/* number of 256-byte passes */
93 	unsigned int lines = bytes >> 8;
	/* cr0 and xmm_save are the locals XMMS_SAVE/XMMS_RESTORE operate on */
94 	unsigned long cr0;
95 	xmm_store_t xmm_save[4];
96 
97 	XMMS_SAVE;
98 
99 	asm volatile(
100 #undef BLOCK
101 #define BLOCK(i) \
102 		LD(i, 0)				\
103 			LD(i + 1, 1)			\
104 		PF1(i)					\
105 				PF1(i + 2)		\
106 				LD(i + 2, 2)		\
107 					LD(i + 3, 3)	\
108 		PF0(i + 4)				\
109 				PF0(i + 6)		\
110 		XO1(i, 0)				\
111 			XO1(i + 1, 1)			\
112 				XO1(i + 2, 2)		\
113 					XO1(i + 3, 3)	\
114 		ST(i, 0)				\
115 			ST(i + 1, 1)			\
116 				ST(i + 2, 2)		\
117 					ST(i + 3, 3)	\
118 
119 
	/* prefetch p1+256 and p1+288 before entering the loop */
120 		PF0(0)
121 				PF0(2)
122 
123 	" .align 32			;\n"
124 	" 1:                            ;\n"
125 
	/* one 256-byte pass: four 64-byte blocks */
126 		BLOCK(0)
127 		BLOCK(4)
128 		BLOCK(8)
129 		BLOCK(12)
130 
	/* advance both pointers by 256 bytes, then loop while cnt != 0 */
131 	"       addq %[inc], %[p1]           ;\n"
132 	"       addq %[inc], %[p2]           ;\n"
133 		"		decl %[cnt] ; jnz 1b"
134 	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
135 	: [inc] "r" (256UL)
136 	: "memory");
137 
138 	XMMS_RESTORE;
139 }
140 
/*
 * xor_sse_3 - XOR the buffers at p2 and p3 into the buffer at p1
 * (p1 ^= p2 ^ p3).
 *
 * @bytes: amount of data to process; the loop runs bytes >> 8 times and
 *         handles 256 bytes per pass, so any remainder below 256 bytes is
 *         left untouched.
 * @p1:    destination, also the first source (LD / ST).
 * @p2:    second source (XO1).
 * @p3:    third source (XO2).
 *
 * Each BLOCK(i) works on four 16-byte units (64 bytes) in xmm0-xmm3; four
 * BLOCKs make up one 256-byte pass.  xmm0-xmm3 are not listed as clobbers
 * because XMMS_SAVE/XMMS_RESTORE save and reload them around the loop.
 *
 * NOTE(review): the counter is only tested after the body ("decl; jnz"),
 * so a call with bytes < 256 would start with a zero counter and not stop
 * after one pass - callers are expected to pass at least 256.
 */
141 static void
142 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
143 	  unsigned long *p3)
144 {
	/* number of 256-byte passes */
145 	unsigned int lines = bytes >> 8;
	/* xmm_save and cr0 are the locals XMMS_SAVE/XMMS_RESTORE operate on */
146 	xmm_store_t xmm_save[4];
147 	unsigned long cr0;
148 
149 	XMMS_SAVE;
150 
151 	asm volatile(
152 #undef BLOCK
153 #define BLOCK(i) \
154 		PF1(i)					\
155 				PF1(i + 2)		\
156 		LD(i, 0)					\
157 			LD(i + 1, 1)			\
158 				LD(i + 2, 2)		\
159 					LD(i + 3, 3)	\
160 		PF2(i)					\
161 				PF2(i + 2)		\
162 		PF0(i + 4)				\
163 				PF0(i + 6)		\
164 		XO1(i, 0)				\
165 			XO1(i + 1, 1)			\
166 				XO1(i + 2, 2)		\
167 					XO1(i + 3, 3)	\
168 		XO2(i, 0)				\
169 			XO2(i + 1, 1)			\
170 				XO2(i + 2, 2)		\
171 					XO2(i + 3, 3)	\
172 		ST(i, 0)				\
173 			ST(i + 1, 1)			\
174 				ST(i + 2, 2)		\
175 					ST(i + 3, 3)	\
176 
177 
	/* prefetch p1+256 and p1+288 before entering the loop */
178 		PF0(0)
179 				PF0(2)
180 
181 	" .align 32			;\n"
182 	" 1:                            ;\n"
183 
	/* one 256-byte pass: four 64-byte blocks */
184 		BLOCK(0)
185 		BLOCK(4)
186 		BLOCK(8)
187 		BLOCK(12)
188 
	/* advance all pointers by 256 bytes, then loop while cnt != 0 */
189 	"       addq %[inc], %[p1]           ;\n"
190 	"       addq %[inc], %[p2]          ;\n"
191 	"       addq %[inc], %[p3]           ;\n"
192 		"		decl %[cnt] ; jnz 1b"
193 	: [cnt] "+r" (lines),
194 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
195 	: [inc] "r" (256UL)
196 	: "memory");
197 	XMMS_RESTORE;
198 }
199 
/*
 * xor_sse_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
 * (p1 ^= p2 ^ p3 ^ p4).
 *
 * @bytes: amount of data to process; the loop runs bytes >> 8 times and
 *         handles 256 bytes per pass, so any remainder below 256 bytes is
 *         left untouched.
 * @p1:    destination, also the first source (LD / ST).
 * @p2:    second source (XO1).
 * @p3:    third source (XO2).
 * @p4:    fourth source (XO3).
 *
 * Each BLOCK(i) works on four 16-byte units (64 bytes) in xmm0-xmm3; four
 * BLOCKs make up one 256-byte pass.  xmm0-xmm3 are not listed as clobbers
 * because XMMS_SAVE/XMMS_RESTORE save and reload them around the loop.
 *
 * NOTE(review): the counter is only tested after the body ("decl; jnz"),
 * so a call with bytes < 256 would start with a zero counter and not stop
 * after one pass - callers are expected to pass at least 256.
 * NOTE(review): cnt uses the "+c" constraint here while xor_sse_2 and
 * xor_sse_3 use "+r"; nothing in the template appears to need a specific
 * register - confirm whether the difference is intentional.
 */
200 static void
201 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
202 	  unsigned long *p3, unsigned long *p4)
203 {
	/* number of 256-byte passes */
204 	unsigned int lines = bytes >> 8;
	/* xmm_save and cr0 are the locals XMMS_SAVE/XMMS_RESTORE operate on */
205 	xmm_store_t xmm_save[4];
206 	unsigned long cr0;
207 
208 	XMMS_SAVE;
209 
210 	asm volatile(
211 #undef BLOCK
212 #define BLOCK(i) \
213 		PF1(i)					\
214 				PF1(i + 2)		\
215 		LD(i, 0)				\
216 			LD(i + 1, 1)			\
217 				LD(i + 2, 2)		\
218 					LD(i + 3, 3)	\
219 		PF2(i)					\
220 				PF2(i + 2)		\
221 		XO1(i, 0)				\
222 			XO1(i + 1, 1)			\
223 				XO1(i + 2, 2)		\
224 					XO1(i + 3, 3)	\
225 		PF3(i)					\
226 				PF3(i + 2)		\
227 		PF0(i + 4)				\
228 				PF0(i + 6)		\
229 		XO2(i, 0)				\
230 			XO2(i + 1, 1)			\
231 				XO2(i + 2, 2)		\
232 					XO2(i + 3, 3)	\
233 		XO3(i, 0)				\
234 			XO3(i + 1, 1)			\
235 				XO3(i + 2, 2)		\
236 					XO3(i + 3, 3)	\
237 		ST(i, 0)				\
238 			ST(i + 1, 1)			\
239 				ST(i + 2, 2)		\
240 					ST(i + 3, 3)	\
241 
242 
	/* prefetch p1+256 and p1+288 before entering the loop */
243 		PF0(0)
244 				PF0(2)
245 
246 	" .align 32			;\n"
247 	" 1:                            ;\n"
248 
	/* one 256-byte pass: four 64-byte blocks */
249 		BLOCK(0)
250 		BLOCK(4)
251 		BLOCK(8)
252 		BLOCK(12)
253 
	/* advance all pointers by 256 bytes, then loop while cnt != 0 */
254 	"       addq %[inc], %[p1]           ;\n"
255 	"       addq %[inc], %[p2]           ;\n"
256 	"       addq %[inc], %[p3]           ;\n"
257 	"       addq %[inc], %[p4]           ;\n"
258 	"	decl %[cnt] ; jnz 1b"
259 	: [cnt] "+c" (lines),
260 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
261 	: [inc] "r" (256UL)
262 	: "memory" );
263 
264 	XMMS_RESTORE;
265 }
266 
/*
 * xor_sse_5 - XOR the buffers at p2, p3, p4 and p5 into the buffer at p1
 * (p1 ^= p2 ^ p3 ^ p4 ^ p5).
 *
 * @bytes: amount of data to process; the loop runs bytes >> 8 times and
 *         handles 256 bytes per pass, so any remainder below 256 bytes is
 *         left untouched.
 * @p1:    destination, also the first source (LD / ST).
 * @p2:    second source (XO1).
 * @p3:    third source (XO2).
 * @p4:    fourth source (XO3).
 * @p5:    fifth source (XO4).
 *
 * Each BLOCK(i) works on four 16-byte units (64 bytes) in xmm0-xmm3; four
 * BLOCKs make up one 256-byte pass.  xmm0-xmm3 are not listed as clobbers
 * because XMMS_SAVE/XMMS_RESTORE save and reload them around the loop.
 *
 * NOTE(review): the counter is only tested after the body ("decl; jnz"),
 * so a call with bytes < 256 would start with a zero counter and not stop
 * after one pass - callers are expected to pass at least 256.
 * NOTE(review): cnt uses the "+c" constraint here while xor_sse_2 and
 * xor_sse_3 use "+r"; nothing in the template appears to need a specific
 * register - confirm whether the difference is intentional.
 */
267 static void
268 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
269 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
270 {
	/* number of 256-byte passes */
271 	unsigned int lines = bytes >> 8;
	/* xmm_save and cr0 are the locals XMMS_SAVE/XMMS_RESTORE operate on */
272 	xmm_store_t xmm_save[4];
273 	unsigned long cr0;
274 
275 	XMMS_SAVE;
276 
277 	asm volatile(
278 #undef BLOCK
279 #define BLOCK(i) \
280 		PF1(i)					\
281 				PF1(i + 2)		\
282 		LD(i, 0)				\
283 			LD(i + 1, 1)			\
284 				LD(i + 2, 2)		\
285 					LD(i + 3, 3)	\
286 		PF2(i)					\
287 				PF2(i + 2)		\
288 		XO1(i, 0)				\
289 			XO1(i + 1, 1)			\
290 				XO1(i + 2, 2)		\
291 					XO1(i + 3, 3)	\
292 		PF3(i)					\
293 				PF3(i + 2)		\
294 		XO2(i, 0)				\
295 			XO2(i + 1, 1)			\
296 				XO2(i + 2, 2)		\
297 					XO2(i + 3, 3)	\
298 		PF4(i)					\
299 				PF4(i + 2)		\
300 		PF0(i + 4)				\
301 				PF0(i + 6)		\
302 		XO3(i, 0)				\
303 			XO3(i + 1, 1)			\
304 				XO3(i + 2, 2)		\
305 					XO3(i + 3, 3)	\
306 		XO4(i, 0)				\
307 			XO4(i + 1, 1)			\
308 				XO4(i + 2, 2)		\
309 					XO4(i + 3, 3)	\
310 		ST(i, 0)				\
311 			ST(i + 1, 1)			\
312 				ST(i + 2, 2)		\
313 					ST(i + 3, 3)	\
314 
315 
	/* prefetch p1+256 and p1+288 before entering the loop */
316 		PF0(0)
317 				PF0(2)
318 
319 	" .align 32			;\n"
320 	" 1:                            ;\n"
321 
	/* one 256-byte pass: four 64-byte blocks */
322 		BLOCK(0)
323 		BLOCK(4)
324 		BLOCK(8)
325 		BLOCK(12)
326 
	/* advance all pointers by 256 bytes, then loop while cnt != 0 */
327 	"       addq %[inc], %[p1]           ;\n"
328 	"       addq %[inc], %[p2]           ;\n"
329 	"       addq %[inc], %[p3]           ;\n"
330 	"       addq %[inc], %[p4]           ;\n"
331 	"       addq %[inc], %[p5]           ;\n"
332 	"	decl %[cnt] ; jnz 1b"
333 	: [cnt] "+c" (lines),
334 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
335 	  [p5] "+r" (p5)
336 	: [inc] "r" (256UL)
337 	: "memory");
338 
339 	XMMS_RESTORE;
340 }
341 
/*
 * Template that bundles the four SSE routines above under the name
 * "generic_sse": do_2..do_5 point at the 2- to 5-buffer variants.
 * struct xor_block_template itself is declared outside this header.
 */
342 static struct xor_block_template xor_block_sse = {
343 	.name = "generic_sse",
344 	.do_2 = xor_sse_2,
345 	.do_3 = xor_sse_3,
346 	.do_4 = xor_sse_4,
347 	.do_5 = xor_sse_5,
348 };
349 
/*
 * Replace any earlier XOR_TRY_TEMPLATES definition with one that hands
 * only the SSE template to xor_speed() (xor_speed is defined outside this
 * header).
 */
350 #undef XOR_TRY_TEMPLATES
351 #define XOR_TRY_TEMPLATES			\
352 do {						\
353 	xor_speed(&xor_block_sse);		\
354 } while (0)
355 
/*
 * XOR_SELECT_TEMPLATE(FASTEST) ignores its FASTEST argument and always
 * evaluates to the address of xor_block_sse; the original rationale
 * follows.
 */
356 /* We force the use of the SSE xor block because it can write around L2.
357    We may also be able to load into the L1 only depending on how the cpu
358    deals with a load to a line that is being prefetched.  */
359 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
360 
361 #endif /* _ASM_X86_XOR_64_H */
362