xref: /openbmc/linux/arch/x86/include/asm/xor.h (revision 05cf4fe738242183f1237f1b3a28b4479348c0a1)
1 #ifndef _ASM_X86_XOR_H
2 #define _ASM_X86_XOR_H
3 
4 /*
5  * Optimized RAID-5 checksumming functions for SSE.
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2, or (at your option)
10  * any later version.
11  *
12  * You should have received a copy of the GNU General Public License
13  * (for example /usr/src/linux/COPYING); if not, write to the Free
14  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15  */
16 
17 /*
18  * Cache avoiding checksumming functions utilizing KNI instructions
19  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
20  */
21 
22 /*
23  * Based on
24  * High-speed RAID5 checksumming functions utilizing SSE instructions.
25  * Copyright (C) 1998 Ingo Molnar.
26  */
27 
28 /*
29  * x86-64 changes / gcc fixes from Andi Kleen.
30  * Copyright 2002 Andi Kleen, SuSE Labs.
31  *
32  * This hasn't been optimized for the hammer yet, but there are likely
33  * no advantages to be gotten from x86-64 here anyways.
34  */
35 
36 #include <asm/fpu/api.h>
37 
/*
 * Inline-asm constraint string applied to the constant pointer increment
 * (the [inc] operand of the loops below).
 */
#if defined(CONFIG_X86_32)
/* "i": immediate operand only, to reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
/* "re": either a general register or a 32-bit signed immediate */
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
44 
/*
 * String fragments from which the inline assembly below is assembled.
 *
 * "x" is an index in units of 16 bytes (one XMM register) and "y" is the
 * XMM register number.  Every fragment expands to one assembler line, so
 * adjacent fragments concatenate into a complete asm template.
 */

/* Byte displacement of 16-byte slot x, and the same slot 256 bytes ahead. */
#define OFFS(x)		"16*(" #x ")"
#define PF_OFFS(x)	"256+16*(" #x ")"

/* Load slot x of p1 into %xmm<y> / store %xmm<y> back to slot x of p1. */
#define LD(x, y)	"	movaps " OFFS(x) "(%[p1]), %%xmm" #y "	;\n"
#define ST(x, y)	"	movaps %%xmm" #y ", " OFFS(x) "(%[p1])	;\n"

/* Non-temporal prefetch of slot x, 256 bytes ahead, from p1 .. p5. */
#define PF0(x)		"	prefetchnta " PF_OFFS(x) "(%[p1])		;\n"
#define PF1(x)		"	prefetchnta " PF_OFFS(x) "(%[p2])		;\n"
#define PF2(x)		"	prefetchnta " PF_OFFS(x) "(%[p3])		;\n"
#define PF3(x)		"	prefetchnta " PF_OFFS(x) "(%[p4])		;\n"
#define PF4(x)		"	prefetchnta " PF_OFFS(x) "(%[p5])		;\n"

/* XOR slot x of p2 .. p5 into %xmm<y>. */
#define XO1(x, y)	"	xorps " OFFS(x) "(%[p2]), %%xmm" #y "	;\n"
#define XO2(x, y)	"	xorps " OFFS(x) "(%[p3]), %%xmm" #y "	;\n"
#define XO3(x, y)	"	xorps " OFFS(x) "(%[p4]), %%xmm" #y "	;\n"
#define XO4(x, y)	"	xorps " OFFS(x) "(%[p5]), %%xmm" #y "	;\n"

/* Placeholder that emits nothing; usable where a prefetch macro is expected. */
#define NOP(x)

/*
 * One 64-byte group: a single pf() for slot i followed by op() applied to
 * slots i .. i + 3 using registers %xmm0 .. %xmm3.
 */
#define BLK64(pf, op, i) \
	pf(i) \
	op(i, 0) op(i + 1, 1) op(i + 2, 2) op(i + 3, 3)
66 
67 static void
68 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
69 {
70 	unsigned long lines = bytes >> 8;
71 
72 	kernel_fpu_begin();
73 
74 	asm volatile(
75 #undef BLOCK
76 #define BLOCK(i)					\
77 		LD(i, 0)				\
78 			LD(i + 1, 1)			\
79 		PF1(i)					\
80 				PF1(i + 2)		\
81 				LD(i + 2, 2)		\
82 					LD(i + 3, 3)	\
83 		PF0(i + 4)				\
84 				PF0(i + 6)		\
85 		XO1(i, 0)				\
86 			XO1(i + 1, 1)			\
87 				XO1(i + 2, 2)		\
88 					XO1(i + 3, 3)	\
89 		ST(i, 0)				\
90 			ST(i + 1, 1)			\
91 				ST(i + 2, 2)		\
92 					ST(i + 3, 3)	\
93 
94 
95 		PF0(0)
96 				PF0(2)
97 
98 	" .align 32			;\n"
99 	" 1:                            ;\n"
100 
101 		BLOCK(0)
102 		BLOCK(4)
103 		BLOCK(8)
104 		BLOCK(12)
105 
106 	"       add %[inc], %[p1]       ;\n"
107 	"       add %[inc], %[p2]       ;\n"
108 	"       dec %[cnt]              ;\n"
109 	"       jnz 1b                  ;\n"
110 	: [cnt] "+r" (lines),
111 	  [p1] "+r" (p1), [p2] "+r" (p2)
112 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
113 	: "memory");
114 
115 	kernel_fpu_end();
116 }
117 
118 static void
119 xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
120 {
121 	unsigned long lines = bytes >> 8;
122 
123 	kernel_fpu_begin();
124 
125 	asm volatile(
126 #undef BLOCK
127 #define BLOCK(i)			\
128 		BLK64(PF0, LD, i)	\
129 		BLK64(PF1, XO1, i)	\
130 		BLK64(NOP, ST, i)	\
131 
132 	" .align 32			;\n"
133 	" 1:                            ;\n"
134 
135 		BLOCK(0)
136 		BLOCK(4)
137 		BLOCK(8)
138 		BLOCK(12)
139 
140 	"       add %[inc], %[p1]       ;\n"
141 	"       add %[inc], %[p2]       ;\n"
142 	"       dec %[cnt]              ;\n"
143 	"       jnz 1b                  ;\n"
144 	: [cnt] "+r" (lines),
145 	  [p1] "+r" (p1), [p2] "+r" (p2)
146 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
147 	: "memory");
148 
149 	kernel_fpu_end();
150 }
151 
152 static void
153 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
154 	  unsigned long *p3)
155 {
156 	unsigned long lines = bytes >> 8;
157 
158 	kernel_fpu_begin();
159 
160 	asm volatile(
161 #undef BLOCK
162 #define BLOCK(i) \
163 		PF1(i)					\
164 				PF1(i + 2)		\
165 		LD(i, 0)				\
166 			LD(i + 1, 1)			\
167 				LD(i + 2, 2)		\
168 					LD(i + 3, 3)	\
169 		PF2(i)					\
170 				PF2(i + 2)		\
171 		PF0(i + 4)				\
172 				PF0(i + 6)		\
173 		XO1(i, 0)				\
174 			XO1(i + 1, 1)			\
175 				XO1(i + 2, 2)		\
176 					XO1(i + 3, 3)	\
177 		XO2(i, 0)				\
178 			XO2(i + 1, 1)			\
179 				XO2(i + 2, 2)		\
180 					XO2(i + 3, 3)	\
181 		ST(i, 0)				\
182 			ST(i + 1, 1)			\
183 				ST(i + 2, 2)		\
184 					ST(i + 3, 3)	\
185 
186 
187 		PF0(0)
188 				PF0(2)
189 
190 	" .align 32			;\n"
191 	" 1:                            ;\n"
192 
193 		BLOCK(0)
194 		BLOCK(4)
195 		BLOCK(8)
196 		BLOCK(12)
197 
198 	"       add %[inc], %[p1]       ;\n"
199 	"       add %[inc], %[p2]       ;\n"
200 	"       add %[inc], %[p3]       ;\n"
201 	"       dec %[cnt]              ;\n"
202 	"       jnz 1b                  ;\n"
203 	: [cnt] "+r" (lines),
204 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
205 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
206 	: "memory");
207 
208 	kernel_fpu_end();
209 }
210 
211 static void
212 xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
213 	       unsigned long *p3)
214 {
215 	unsigned long lines = bytes >> 8;
216 
217 	kernel_fpu_begin();
218 
219 	asm volatile(
220 #undef BLOCK
221 #define BLOCK(i)			\
222 		BLK64(PF0, LD, i)	\
223 		BLK64(PF1, XO1, i)	\
224 		BLK64(PF2, XO2, i)	\
225 		BLK64(NOP, ST, i)	\
226 
227 	" .align 32			;\n"
228 	" 1:                            ;\n"
229 
230 		BLOCK(0)
231 		BLOCK(4)
232 		BLOCK(8)
233 		BLOCK(12)
234 
235 	"       add %[inc], %[p1]       ;\n"
236 	"       add %[inc], %[p2]       ;\n"
237 	"       add %[inc], %[p3]       ;\n"
238 	"       dec %[cnt]              ;\n"
239 	"       jnz 1b                  ;\n"
240 	: [cnt] "+r" (lines),
241 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
242 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
243 	: "memory");
244 
245 	kernel_fpu_end();
246 }
247 
248 static void
249 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
250 	  unsigned long *p3, unsigned long *p4)
251 {
252 	unsigned long lines = bytes >> 8;
253 
254 	kernel_fpu_begin();
255 
256 	asm volatile(
257 #undef BLOCK
258 #define BLOCK(i) \
259 		PF1(i)					\
260 				PF1(i + 2)		\
261 		LD(i, 0)				\
262 			LD(i + 1, 1)			\
263 				LD(i + 2, 2)		\
264 					LD(i + 3, 3)	\
265 		PF2(i)					\
266 				PF2(i + 2)		\
267 		XO1(i, 0)				\
268 			XO1(i + 1, 1)			\
269 				XO1(i + 2, 2)		\
270 					XO1(i + 3, 3)	\
271 		PF3(i)					\
272 				PF3(i + 2)		\
273 		PF0(i + 4)				\
274 				PF0(i + 6)		\
275 		XO2(i, 0)				\
276 			XO2(i + 1, 1)			\
277 				XO2(i + 2, 2)		\
278 					XO2(i + 3, 3)	\
279 		XO3(i, 0)				\
280 			XO3(i + 1, 1)			\
281 				XO3(i + 2, 2)		\
282 					XO3(i + 3, 3)	\
283 		ST(i, 0)				\
284 			ST(i + 1, 1)			\
285 				ST(i + 2, 2)		\
286 					ST(i + 3, 3)	\
287 
288 
289 		PF0(0)
290 				PF0(2)
291 
292 	" .align 32			;\n"
293 	" 1:                            ;\n"
294 
295 		BLOCK(0)
296 		BLOCK(4)
297 		BLOCK(8)
298 		BLOCK(12)
299 
300 	"       add %[inc], %[p1]       ;\n"
301 	"       add %[inc], %[p2]       ;\n"
302 	"       add %[inc], %[p3]       ;\n"
303 	"       add %[inc], %[p4]       ;\n"
304 	"       dec %[cnt]              ;\n"
305 	"       jnz 1b                  ;\n"
306 	: [cnt] "+r" (lines), [p1] "+r" (p1),
307 	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
308 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
309 	: "memory");
310 
311 	kernel_fpu_end();
312 }
313 
314 static void
315 xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
316 	       unsigned long *p3, unsigned long *p4)
317 {
318 	unsigned long lines = bytes >> 8;
319 
320 	kernel_fpu_begin();
321 
322 	asm volatile(
323 #undef BLOCK
324 #define BLOCK(i)			\
325 		BLK64(PF0, LD, i)	\
326 		BLK64(PF1, XO1, i)	\
327 		BLK64(PF2, XO2, i)	\
328 		BLK64(PF3, XO3, i)	\
329 		BLK64(NOP, ST, i)	\
330 
331 	" .align 32			;\n"
332 	" 1:                            ;\n"
333 
334 		BLOCK(0)
335 		BLOCK(4)
336 		BLOCK(8)
337 		BLOCK(12)
338 
339 	"       add %[inc], %[p1]       ;\n"
340 	"       add %[inc], %[p2]       ;\n"
341 	"       add %[inc], %[p3]       ;\n"
342 	"       add %[inc], %[p4]       ;\n"
343 	"       dec %[cnt]              ;\n"
344 	"       jnz 1b                  ;\n"
345 	: [cnt] "+r" (lines), [p1] "+r" (p1),
346 	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
347 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
348 	: "memory");
349 
350 	kernel_fpu_end();
351 }
352 
353 static void
354 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
355 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
356 {
357 	unsigned long lines = bytes >> 8;
358 
359 	kernel_fpu_begin();
360 
361 	asm volatile(
362 #undef BLOCK
363 #define BLOCK(i) \
364 		PF1(i)					\
365 				PF1(i + 2)		\
366 		LD(i, 0)				\
367 			LD(i + 1, 1)			\
368 				LD(i + 2, 2)		\
369 					LD(i + 3, 3)	\
370 		PF2(i)					\
371 				PF2(i + 2)		\
372 		XO1(i, 0)				\
373 			XO1(i + 1, 1)			\
374 				XO1(i + 2, 2)		\
375 					XO1(i + 3, 3)	\
376 		PF3(i)					\
377 				PF3(i + 2)		\
378 		XO2(i, 0)				\
379 			XO2(i + 1, 1)			\
380 				XO2(i + 2, 2)		\
381 					XO2(i + 3, 3)	\
382 		PF4(i)					\
383 				PF4(i + 2)		\
384 		PF0(i + 4)				\
385 				PF0(i + 6)		\
386 		XO3(i, 0)				\
387 			XO3(i + 1, 1)			\
388 				XO3(i + 2, 2)		\
389 					XO3(i + 3, 3)	\
390 		XO4(i, 0)				\
391 			XO4(i + 1, 1)			\
392 				XO4(i + 2, 2)		\
393 					XO4(i + 3, 3)	\
394 		ST(i, 0)				\
395 			ST(i + 1, 1)			\
396 				ST(i + 2, 2)		\
397 					ST(i + 3, 3)	\
398 
399 
400 		PF0(0)
401 				PF0(2)
402 
403 	" .align 32			;\n"
404 	" 1:                            ;\n"
405 
406 		BLOCK(0)
407 		BLOCK(4)
408 		BLOCK(8)
409 		BLOCK(12)
410 
411 	"       add %[inc], %[p1]       ;\n"
412 	"       add %[inc], %[p2]       ;\n"
413 	"       add %[inc], %[p3]       ;\n"
414 	"       add %[inc], %[p4]       ;\n"
415 	"       add %[inc], %[p5]       ;\n"
416 	"       dec %[cnt]              ;\n"
417 	"       jnz 1b                  ;\n"
418 	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
419 	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
420 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
421 	: "memory");
422 
423 	kernel_fpu_end();
424 }
425 
426 static void
427 xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
428 	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
429 {
430 	unsigned long lines = bytes >> 8;
431 
432 	kernel_fpu_begin();
433 
434 	asm volatile(
435 #undef BLOCK
436 #define BLOCK(i)			\
437 		BLK64(PF0, LD, i)	\
438 		BLK64(PF1, XO1, i)	\
439 		BLK64(PF2, XO2, i)	\
440 		BLK64(PF3, XO3, i)	\
441 		BLK64(PF4, XO4, i)	\
442 		BLK64(NOP, ST, i)	\
443 
444 	" .align 32			;\n"
445 	" 1:                            ;\n"
446 
447 		BLOCK(0)
448 		BLOCK(4)
449 		BLOCK(8)
450 		BLOCK(12)
451 
452 	"       add %[inc], %[p1]       ;\n"
453 	"       add %[inc], %[p2]       ;\n"
454 	"       add %[inc], %[p3]       ;\n"
455 	"       add %[inc], %[p4]       ;\n"
456 	"       add %[inc], %[p5]       ;\n"
457 	"       dec %[cnt]              ;\n"
458 	"       jnz 1b                  ;\n"
459 	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
460 	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
461 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
462 	: "memory");
463 
464 	kernel_fpu_end();
465 }
466 
467 static struct xor_block_template xor_block_sse_pf64 = {
468 	.name = "prefetch64-sse",
469 	.do_2 = xor_sse_2_pf64,
470 	.do_3 = xor_sse_3_pf64,
471 	.do_4 = xor_sse_4_pf64,
472 	.do_5 = xor_sse_5_pf64,
473 };
474 
/*
 * Retire the assembly-building helpers before the sub-architecture header
 * is pulled in below.  Note that OFFS, PF_OFFS and PF0 .. PF4 are left
 * defined.
 */
#undef BLOCK
#undef BLK64
#undef NOP

#undef LD
#undef ST

#undef XO1
#undef XO2
#undef XO3
#undef XO4

#undef XOR_CONSTANT_CONSTRAINT
486 
487 #ifdef CONFIG_X86_32
488 # include <asm/xor_32.h>
489 #else
490 # include <asm/xor_64.h>
491 #endif
492 
/*
 * Template selection is forwarded unchanged to AVX_SELECT(), which is not
 * defined in this header (presumably it comes from one of the headers
 * included above -- TODO confirm).
 */
#define XOR_SELECT_TEMPLATE(FASTEST)	AVX_SELECT(FASTEST)
495 
496 #endif /* _ASM_X86_XOR_H */
497