#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

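/*
 * Helper macros used to build the unrolled MMX loops below: LD loads
 * quadword x of the destination buffer (%1) into MMX register y, ST
 * stores it back, and XO1..XO4 xor in quadword x of source buffers
 * %2..%5.  The x argument is an 8-byte index within the current chunk.
 */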
#define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"

#include <asm/i387.h>

static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;

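	/*
	 * Each loop iteration below handles 128 bytes (16 quadwords) of
	 * every buffer.  kernel_fpu_begin() saves the current FPU/MMX
	 * state so the MMX registers may be clobbered here.
	 */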
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
	ST(i, 0)				\
		XO1(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO1(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO1(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
	ST(i, 0)				\
		XO2(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO2(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO2(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
	ST(i, 0)				\
		XO3(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO3(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO3(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}


static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
		XO3(i + 1, 1)			\
			XO3(i + 2, 2)		\
				XO3(i + 3, 3)	\
	XO4(i, 0)				\
	ST(i, 0)				\
		XO4(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO4(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO4(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       addl $128, %5         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;

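	/*
	 * The p5 variants work on one 64-byte chunk per iteration, with
	 * loads, xors and stores interleaved by hand; the "p5" in the name
	 * refers to the original Pentium pipeline this scheduling targets.
	 */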
	kernel_fpu_begin();

	asm volatile(
	" .align 32	             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor   (%5), %%mm0   ;\n"
	"       pxor  8(%5), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%5), %%mm2   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%5), %%mm3   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 32(%5), %%mm4   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       pxor 40(%5), %%mm5   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%5), %%mm6   ;\n"
	"       pxor 56(%5), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       addl $64, %5         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

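/*
 * Template descriptors that plug the routines above into the generic xor
 * framework.  do_2..do_5 operate on two to five blocks, the first of which
 * receives the result.
 */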
static struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.do_2 = xor_pII_mmx_2,
	.do_3 = xor_pII_mmx_3,
	.do_4 = xor_pII_mmx_4,
	.do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.do_2 = xor_p5_mmx_2,
	.do_3 = xor_p5_mmx_3,
	.do_4 = xor_p5_mmx_4,
	.do_5 = xor_p5_mmx_5,
};

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

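/*
 * The SSE routines below save and restore the %xmm registers by hand:
 * XMMS_SAVE disables preemption, clears CR0.TS via clts() so that SSE
 * instructions do not fault, and spills %xmm0-%xmm3 to a 16-byte-aligned
 * buffer on the stack; XMMS_RESTORE reloads them, restores CR0 and
 * re-enables preemption.  Only the four registers actually used are saved.
 */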
#define XMMS_SAVE				\
do {						\
	preempt_disable();			\
	cr0 = read_cr0();			\
	clts();					\
	asm volatile(				\
		"movups %%xmm0,(%0)	;\n\t"	\
		"movups %%xmm1,0x10(%0)	;\n\t"	\
		"movups %%xmm2,0x20(%0)	;\n\t"	\
		"movups %%xmm3,0x30(%0)	;\n\t"	\
		:				\
		: "r" (xmm_save)		\
		: "memory");			\
} while (0)

#define XMMS_RESTORE				\
do {						\
	asm volatile(				\
		"sfence			;\n\t"	\
		"movups (%0),%%xmm0	;\n\t"	\
		"movups 0x10(%0),%%xmm1	;\n\t"	\
		"movups 0x20(%0),%%xmm2	;\n\t"	\
		"movups 0x30(%0),%%xmm3	;\n\t"	\
		:				\
		: "r" (xmm_save)		\
		: "memory");			\
	write_cr0(cr0);				\
	preempt_enable();			\
} while (0)

#define ALIGN16 __attribute__((aligned(16)))

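/*
 * SSE counterparts of the MMX helpers above: LD/ST/XO* operate on 16-byte
 * chunks with movaps/xorps, and PF0..PF5 use prefetchnta to prefetch each
 * buffer 256 bytes ahead of the current position.
 */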
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%1)		;\n"
#define LD(x, y)	"       movaps   "OFFS(x)"(%1), %%xmm"#y"	;\n"
#define ST(x, y)	"       movaps %%xmm"#y",   "OFFS(x)"(%1)	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%2)		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%3)		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%4)		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%5)		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%6)		;\n"
#define XO1(x, y)	"       xorps   "OFFS(x)"(%2), %%xmm"#y"	;\n"
#define XO2(x, y)	"       xorps   "OFFS(x)"(%3), %%xmm"#y"	;\n"
#define XO3(x, y)	"       xorps   "OFFS(x)"(%4), %%xmm"#y"	;\n"
#define XO4(x, y)	"       xorps   "OFFS(x)"(%5), %%xmm"#y"	;\n"
#define XO5(x, y)	"       xorps   "OFFS(x)"(%6), %%xmm"#y"	;\n"


static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

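	/*
	 * Each iteration of the SSE loop covers 256 bytes (16 xmm-wide
	 * xor operations) per buffer, hence lines = bytes >> 8.
	 */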
	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       addl $256, %4           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;
	char xmm_save[16*4] ALIGN16;
	int cr0;

	XMMS_SAVE;

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       addl $256, %4           ;\n"
	"       addl $256, %5           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	XMMS_RESTORE;
}

static struct xor_block_template xor_block_pIII_sse = {
	.name = "pIII_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

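/*
 * XOR_TRY_TEMPLATES is invoked by the generic xor code when it calibrates
 * the available implementations: xor_speed() benchmarks each candidate and
 * the fastest one is chosen, subject to XOR_SELECT_TEMPLATE below.  The MMX
 * and SSE templates are only tried when the CPU reports the corresponding
 * feature bits.
 */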
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
do {							\
	xor_speed(&xor_block_8regs);			\
	xor_speed(&xor_block_8regs_p);			\
	xor_speed(&xor_block_32regs);			\
	xor_speed(&xor_block_32regs_p);			\
	if (cpu_has_xmm)				\
		xor_speed(&xor_block_pIII_sse);		\
	if (cpu_has_mmx) {				\
		xor_speed(&xor_block_pII_mmx);		\
		xor_speed(&xor_block_p5_mmx);		\
	}						\
} while (0)

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST)			\
	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

#endif /* _ASM_X86_XOR_32_H */