/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 */

/*
 * Cache-avoiding checksumming functions utilizing KNI (Katmai New
 * Instructions, the original name for SSE)
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been tuned for the AMD Hammer (K8) yet, but there are
 * likely no advantages to be gained from x86-64 here anyway.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/*
 * Force the loop increment to be an immediate operand ("i") on 32-bit
 * to reduce register pressure; on 64-bit there are registers to spare,
 * so also allow a register or a sign-extended 32-bit immediate ("re").
 */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/*
 * OFFS(x) is the byte offset of 16-byte chunk x within the current
 * 256-byte block; PF_OFFS(x) points one whole block (256 bytes) ahead
 * of it, which is where the prefetches aim.
 *
 * PFn: non-temporal prefetch from buffer n; LD/ST: 16-byte load/store
 * to/from %xmm<y>; XOn: XOR 16 bytes from buffer n into %xmm<y>.
 */
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

/*
 * BLK64: issue one prefetch for buffer 'pf', then apply 'op' to four
 * consecutive 16-byte chunks (64 bytes) using %xmm0-%xmm3.
 */
#define BLK64(pf, op, i)				\
		pf(i)					\
		op(i, 0)				\
			op(i + 1, 1)			\
				op(i + 2, 2)		\
					op(i + 3, 3)
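
/*
 * As a worked example, BLK64(PF0, LD, 0) expands through PF_OFFS()/OFFS()
 * into (whitespace and ";\n" separators aside):
 *
 *	prefetchnta 256+16*(0)(%[p1])
 *	movaps 16*(0)(%[p1]), %xmm0
 *	movaps 16*(0 + 1)(%[p1]), %xmm1
 *	movaps 16*(0 + 2)(%[p1]), %xmm2
 *	movaps 16*(0 + 3)(%[p1]), %xmm3
 */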

/*
 * xor_sse_2 - compute p1 ^= p2 over 'bytes' bytes, processing 256
 * bytes (four 64-byte chunks in %xmm0-%xmm3) per loop iteration.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* 256-byte blocks */

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
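
/*
 * For reference, a plain-C sketch of what xor_sse_2() computes
 * (illustrative only; assumes 'bytes' is a non-zero multiple of 256,
 * which in-kernel users satisfy by passing page-sized buffers):
 *
 *	unsigned long i;
 *
 *	for (i = 0; i < bytes / sizeof(unsigned long); i++)
 *		p1[i] ^= p2[i];
 */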

/*
 * Same operation as xor_sse_2(), but built from BLK64 so that one
 * prefetchnta is issued per 64-byte chunk (hence "prefetch64-sse");
 * the faster scheme is selected by the boot-time XOR benchmark.
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/* xor_sse_3 - p1 ^= p2 ^ p3, 256 bytes per loop iteration */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/* xor_sse_4 - p1 ^= p2 ^ p3 ^ p4, 256 bytes per loop iteration */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/* xor_sse_5 - p1 ^= p2 ^ p3 ^ p4 ^ p5, 256 bytes per loop iteration */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
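
/*
 * A minimal, hypothetical usage sketch: real callers go through the
 * xor_blocks() machinery in crypto/xor.c rather than invoking a
 * template directly, but each do_N hook expects 16-byte-aligned
 * buffers (movaps faults otherwise) whose length is a multiple of
 * 256 bytes, e.g.
 *
 *	xor_block_sse_pf64.do_2(PAGE_SIZE, dest, src);
 */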

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)
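
/*
 * AVX_SELECT() comes from <asm/xor_avx.h> (pulled in via the per-arch
 * headers above); it is expected to return the AVX template when the
 * CPU supports AVX, and FASTEST -- the winner of the boot-time XOR
 * benchmark -- otherwise.
 */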

#endif /* _ASM_X86_XOR_H */