xref: /openbmc/linux/include/asm-generic/xor.h (revision 4f2c0a4acffbec01079c28f839422e64ddeff004)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * include/asm-generic/xor.h
 *
 * Generic optimized RAID-5 checksumming functions.
 */
7 
8 #include <linux/prefetch.h>
9 
/*
 * xor_8regs_2 - XOR the block at p2 into the block at p1, in place.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    source block
 *
 * Data is handled in "lines" of eight unsigned longs, XORed directly in
 * memory.  The do/while body always runs once, so the caller must supply at
 * least one full line; bytes past the last whole line are not processed.
 */
static void
xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1,
	    const unsigned long * __restrict p2)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		p1[0] ^= p2[0];
		p1[1] ^= p2[1];
		p1[2] ^= p2[2];
		p1[3] ^= p2[3];
		p1[4] ^= p2[4];
		p1[5] ^= p2[5];
		p1[6] ^= p2[6];
		p1[7] ^= p2[7];
		p1 += 8;
		p2 += 8;
	} while (--lines > 0);
}
29 
/*
 * xor_8regs_3 - XOR the blocks at p2 and p3 into the block at p1, in place.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 *
 * Data is handled in "lines" of eight unsigned longs, XORed directly in
 * memory.  The do/while body always runs once, so the caller must supply at
 * least one full line; bytes past the last whole line are not processed.
 */
static void
xor_8regs_3(unsigned long bytes, unsigned long * __restrict p1,
	    const unsigned long * __restrict p2,
	    const unsigned long * __restrict p3)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		p1[0] ^= p2[0] ^ p3[0];
		p1[1] ^= p2[1] ^ p3[1];
		p1[2] ^= p2[2] ^ p3[2];
		p1[3] ^= p2[3] ^ p3[3];
		p1[4] ^= p2[4] ^ p3[4];
		p1[5] ^= p2[5] ^ p3[5];
		p1[6] ^= p2[6] ^ p3[6];
		p1[7] ^= p2[7] ^ p3[7];
		p1 += 8;
		p2 += 8;
		p3 += 8;
	} while (--lines > 0);
}
51 
/*
 * xor_8regs_4 - XOR the blocks at p2..p4 into the block at p1, in place.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 * @p4:    fourth operand
 *
 * Data is handled in "lines" of eight unsigned longs, XORed directly in
 * memory.  The do/while body always runs once, so the caller must supply at
 * least one full line; bytes past the last whole line are not processed.
 */
static void
xor_8regs_4(unsigned long bytes, unsigned long * __restrict p1,
	    const unsigned long * __restrict p2,
	    const unsigned long * __restrict p3,
	    const unsigned long * __restrict p4)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
		p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
		p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
		p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
		p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
		p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
		p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
		p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
	} while (--lines > 0);
}
75 
/*
 * xor_8regs_5 - XOR the blocks at p2..p5 into the block at p1, in place.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 * @p4:    fourth operand
 * @p5:    fifth operand
 *
 * Data is handled in "lines" of eight unsigned longs, XORed directly in
 * memory.  The do/while body always runs once, so the caller must supply at
 * least one full line; bytes past the last whole line are not processed.
 */
static void
xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
	    const unsigned long * __restrict p2,
	    const unsigned long * __restrict p3,
	    const unsigned long * __restrict p4,
	    const unsigned long * __restrict p5)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
		p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
		p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
		p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
		p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
		p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
		p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
		p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
		p5 += 8;
	} while (--lines > 0);
}
101 
/*
 * xor_32regs_2 - XOR the block at p2 into the block at p1, in place.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    source block
 *
 * Each "line" of eight unsigned longs is loaded into local temporaries,
 * XORed there, and stored back as a group.  The temporaries are unsigned to
 * match the data they hold.  The do/while body always runs once, so the
 * caller must supply at least one full line; bytes past the last whole line
 * are not processed.
 */
static void
xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register unsigned long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		p1[0] = d0;	/* Store the result (in bursts)		*/
		p1[1] = d1;
		p1[2] = d2;
		p1[3] = d3;
		p1[4] = d4;
		p1[5] = d5;
		p1[6] = d6;
		p1[7] = d7;
		p1 += 8;
		p2 += 8;
	} while (--lines > 0);
}
138 
/*
 * xor_32regs_3 - XOR the blocks at p2 and p3 into the block at p1, in place.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 *
 * Each "line" of eight unsigned longs is loaded into local temporaries,
 * XORed there, and stored back as a group.  The temporaries are unsigned to
 * match the data they hold.  The do/while body always runs once, so the
 * caller must supply at least one full line; bytes past the last whole line
 * are not processed.
 */
static void
xor_32regs_3(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register unsigned long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		p1[0] = d0;	/* Store the result (in bursts)		*/
		p1[1] = d1;
		p1[2] = d2;
		p1[3] = d3;
		p1[4] = d4;
		p1[5] = d5;
		p1[6] = d6;
		p1[7] = d7;
		p1 += 8;
		p2 += 8;
		p3 += 8;
	} while (--lines > 0);
}
185 
/*
 * xor_32regs_4 - XOR the blocks at p2..p4 into the block at p1, in place.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 * @p4:    fourth operand
 *
 * Each "line" of eight unsigned longs is loaded into local temporaries,
 * XORed there, and stored back as a group.  The temporaries are unsigned to
 * match the data they hold.  The do/while body always runs once, so the
 * caller must supply at least one full line; bytes past the last whole line
 * are not processed.
 */
static void
xor_32regs_4(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register unsigned long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		d0 ^= p4[0];
		d1 ^= p4[1];
		d2 ^= p4[2];
		d3 ^= p4[3];
		d4 ^= p4[4];
		d5 ^= p4[5];
		d6 ^= p4[6];
		d7 ^= p4[7];
		p1[0] = d0;	/* Store the result (in bursts)		*/
		p1[1] = d1;
		p1[2] = d2;
		p1[3] = d3;
		p1[4] = d4;
		p1[5] = d5;
		p1[6] = d6;
		p1[7] = d7;
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
	} while (--lines > 0);
}
242 
/*
 * xor_32regs_5 - XOR the blocks at p2..p5 into the block at p1, in place.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 * @p4:    fourth operand
 * @p5:    fifth operand
 *
 * Each "line" of eight unsigned longs is loaded into local temporaries,
 * XORed there, and stored back as a group.  The temporaries are unsigned to
 * match the data they hold.  The do/while body always runs once, so the
 * caller must supply at least one full line; bytes past the last whole line
 * are not processed.
 */
static void
xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4,
	     const unsigned long * __restrict p5)
{
	long lines = bytes / (sizeof (long)) / 8;

	do {
		register unsigned long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		d0 ^= p4[0];
		d1 ^= p4[1];
		d2 ^= p4[2];
		d3 ^= p4[3];
		d4 ^= p4[4];
		d5 ^= p4[5];
		d6 ^= p4[6];
		d7 ^= p4[7];
		d0 ^= p5[0];
		d1 ^= p5[1];
		d2 ^= p5[2];
		d3 ^= p5[3];
		d4 ^= p5[4];
		d5 ^= p5[5];
		d6 ^= p5[6];
		d7 ^= p5[7];
		p1[0] = d0;	/* Store the result (in bursts)		*/
		p1[1] = d1;
		p1[2] = d2;
		p1[3] = d3;
		p1[4] = d4;
		p1[5] = d5;
		p1[6] = d6;
		p1[7] = d7;
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
		p5 += 8;
	} while (--lines > 0);
}
309 
/*
 * xor_8regs_p_2 - XOR the block at p2 into the block at p1, with prefetching.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    source block
 *
 * Same in-memory XOR over "lines" of eight unsigned longs as the
 * non-prefetching variant, but each loop pass first issues prefetchw()/
 * prefetch() for the following line.  'lines' starts one short of the line
 * count: the loop handles all but the last line, and the last one is handled
 * by jumping to once_more, which skips the prefetch of the line after it.
 * With a single line the loop body alone handles it (lines ends at -1, so
 * the jump is not taken).  At least one full line must be supplied.
 */
static void
xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2)
{
	long lines = bytes / (sizeof (long)) / 8 - 1;
	prefetchw(p1);
	prefetch(p2);

	do {
		prefetchw(p1+8);
		prefetch(p2+8);
 once_more:
		p1[0] ^= p2[0];
		p1[1] ^= p2[1];
		p1[2] ^= p2[2];
		p1[3] ^= p2[3];
		p1[4] ^= p2[4];
		p1[5] ^= p2[5];
		p1[6] ^= p2[6];
		p1[7] ^= p2[7];
		p1 += 8;
		p2 += 8;
	} while (--lines > 0);
	/* lines is exactly 0 only on the first exit; afterwards it is -1. */
	if (lines == 0)
		goto once_more;
}
336 
/*
 * xor_8regs_p_3 - XOR the blocks at p2 and p3 into p1, with prefetching.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 *
 * Same in-memory XOR over "lines" of eight unsigned longs as the
 * non-prefetching variant, but each loop pass first issues prefetchw()/
 * prefetch() for the following line.  'lines' starts one short of the line
 * count: the loop handles all but the last line, and the last one is handled
 * by jumping to once_more, which skips the prefetch of the line after it.
 * With a single line the loop body alone handles it (lines ends at -1, so
 * the jump is not taken).  At least one full line must be supplied.
 */
static void
xor_8regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3)
{
	long lines = bytes / (sizeof (long)) / 8 - 1;
	prefetchw(p1);
	prefetch(p2);
	prefetch(p3);

	do {
		prefetchw(p1+8);
		prefetch(p2+8);
		prefetch(p3+8);
 once_more:
		p1[0] ^= p2[0] ^ p3[0];
		p1[1] ^= p2[1] ^ p3[1];
		p1[2] ^= p2[2] ^ p3[2];
		p1[3] ^= p2[3] ^ p3[3];
		p1[4] ^= p2[4] ^ p3[4];
		p1[5] ^= p2[5] ^ p3[5];
		p1[6] ^= p2[6] ^ p3[6];
		p1[7] ^= p2[7] ^ p3[7];
		p1 += 8;
		p2 += 8;
		p3 += 8;
	} while (--lines > 0);
	/* lines is exactly 0 only on the first exit; afterwards it is -1. */
	if (lines == 0)
		goto once_more;
}
367 
/*
 * xor_8regs_p_4 - XOR the blocks at p2..p4 into p1, with prefetching.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 * @p4:    fourth operand
 *
 * Same in-memory XOR over "lines" of eight unsigned longs as the
 * non-prefetching variant, but each loop pass first issues prefetchw()/
 * prefetch() for the following line.  'lines' starts one short of the line
 * count: the loop handles all but the last line, and the last one is handled
 * by jumping to once_more, which skips the prefetch of the line after it.
 * With a single line the loop body alone handles it (lines ends at -1, so
 * the jump is not taken).  At least one full line must be supplied.
 */
static void
xor_8regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3,
	      const unsigned long * __restrict p4)
{
	long lines = bytes / (sizeof (long)) / 8 - 1;

	prefetchw(p1);
	prefetch(p2);
	prefetch(p3);
	prefetch(p4);

	do {
		prefetchw(p1+8);
		prefetch(p2+8);
		prefetch(p3+8);
		prefetch(p4+8);
 once_more:
		p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
		p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
		p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
		p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
		p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
		p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
		p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
		p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
	} while (--lines > 0);
	/* lines is exactly 0 only on the first exit; afterwards it is -1. */
	if (lines == 0)
		goto once_more;
}
403 
/*
 * xor_8regs_p_5 - XOR the blocks at p2..p5 into p1, with prefetching.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 * @p4:    fourth operand
 * @p5:    fifth operand
 *
 * Same in-memory XOR over "lines" of eight unsigned longs as the
 * non-prefetching variant, but each loop pass first issues prefetchw()/
 * prefetch() for the following line.  'lines' starts one short of the line
 * count: the loop handles all but the last line, and the last one is handled
 * by jumping to once_more, which skips the prefetch of the line after it.
 * With a single line the loop body alone handles it (lines ends at -1, so
 * the jump is not taken).  At least one full line must be supplied.
 */
static void
xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3,
	      const unsigned long * __restrict p4,
	      const unsigned long * __restrict p5)
{
	long lines = bytes / (sizeof (long)) / 8 - 1;

	prefetchw(p1);
	prefetch(p2);
	prefetch(p3);
	prefetch(p4);
	prefetch(p5);

	do {
		prefetchw(p1+8);
		prefetch(p2+8);
		prefetch(p3+8);
		prefetch(p4+8);
		prefetch(p5+8);
 once_more:
		p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
		p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
		p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
		p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
		p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
		p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
		p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
		p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
		p5 += 8;
	} while (--lines > 0);
	/* lines is exactly 0 only on the first exit; afterwards it is -1. */
	if (lines == 0)
		goto once_more;
}
443 
/*
 * xor_32regs_p_2 - XOR the block at p2 into the block at p1, with prefetching.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    source block
 *
 * Each "line" of eight unsigned longs is loaded into local temporaries
 * (unsigned, to match the data), XORed there, and stored back as a group.
 * Each loop pass first issues prefetchw()/prefetch() for the following line.
 * 'lines' starts one short of the line count: the loop handles all but the
 * last line, and the last one is handled by jumping to once_more, which
 * skips the prefetch of the line after it.  With a single line the loop body
 * alone handles it (lines ends at -1, so the jump is not taken).  At least
 * one full line must be supplied.
 */
static void
xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2)
{
	long lines = bytes / (sizeof (long)) / 8 - 1;

	prefetchw(p1);
	prefetch(p2);

	do {
		register unsigned long d0, d1, d2, d3, d4, d5, d6, d7;

		prefetchw(p1+8);
		prefetch(p2+8);
 once_more:
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		p1[0] = d0;	/* Store the result (in bursts)		*/
		p1[1] = d1;
		p1[2] = d2;
		p1[3] = d3;
		p1[4] = d4;
		p1[5] = d5;
		p1[6] = d6;
		p1[7] = d7;
		p1 += 8;
		p2 += 8;
	} while (--lines > 0);
	/* lines is exactly 0 only on the first exit; afterwards it is -1. */
	if (lines == 0)
		goto once_more;
}
489 
/*
 * xor_32regs_p_3 - XOR the blocks at p2 and p3 into p1, with prefetching.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 *
 * Each "line" of eight unsigned longs is loaded into local temporaries
 * (unsigned, to match the data), XORed there, and stored back as a group.
 * Each loop pass first issues prefetchw()/prefetch() for the following line.
 * 'lines' starts one short of the line count: the loop handles all but the
 * last line, and the last one is handled by jumping to once_more, which
 * skips the prefetch of the line after it.  With a single line the loop body
 * alone handles it (lines ends at -1, so the jump is not taken).  At least
 * one full line must be supplied.
 */
static void
xor_32regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3)
{
	long lines = bytes / (sizeof (long)) / 8 - 1;

	prefetchw(p1);
	prefetch(p2);
	prefetch(p3);

	do {
		register unsigned long d0, d1, d2, d3, d4, d5, d6, d7;

		prefetchw(p1+8);
		prefetch(p2+8);
		prefetch(p3+8);
 once_more:
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		p1[0] = d0;	/* Store the result (in bursts)		*/
		p1[1] = d1;
		p1[2] = d2;
		p1[3] = d3;
		p1[4] = d4;
		p1[5] = d5;
		p1[6] = d6;
		p1[7] = d7;
		p1 += 8;
		p2 += 8;
		p3 += 8;
	} while (--lines > 0);
	/* lines is exactly 0 only on the first exit; afterwards it is -1. */
	if (lines == 0)
		goto once_more;
}
547 
/*
 * xor_32regs_p_4 - XOR the blocks at p2..p4 into p1, with prefetching.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 * @p4:    fourth operand
 *
 * Each "line" of eight unsigned longs is loaded into local temporaries
 * (unsigned, to match the data), XORed there, and stored back as a group.
 * Each loop pass first issues prefetchw()/prefetch() for the following line.
 * 'lines' starts one short of the line count: the loop handles all but the
 * last line, and the last one is handled by jumping to once_more, which
 * skips the prefetch of the line after it.  With a single line the loop body
 * alone handles it (lines ends at -1, so the jump is not taken).  At least
 * one full line must be supplied.
 */
static void
xor_32regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4)
{
	long lines = bytes / (sizeof (long)) / 8 - 1;

	prefetchw(p1);
	prefetch(p2);
	prefetch(p3);
	prefetch(p4);

	do {
		register unsigned long d0, d1, d2, d3, d4, d5, d6, d7;

		prefetchw(p1+8);
		prefetch(p2+8);
		prefetch(p3+8);
		prefetch(p4+8);
 once_more:
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		d0 ^= p4[0];
		d1 ^= p4[1];
		d2 ^= p4[2];
		d3 ^= p4[3];
		d4 ^= p4[4];
		d5 ^= p4[5];
		d6 ^= p4[6];
		d7 ^= p4[7];
		p1[0] = d0;	/* Store the result (in bursts)		*/
		p1[1] = d1;
		p1[2] = d2;
		p1[3] = d3;
		p1[4] = d4;
		p1[5] = d5;
		p1[6] = d6;
		p1[7] = d7;
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
	} while (--lines > 0);
	/* lines is exactly 0 only on the first exit; afterwards it is -1. */
	if (lines == 0)
		goto once_more;
}
617 
/*
 * xor_32regs_p_5 - XOR the blocks at p2..p5 into p1, with prefetching.
 * @bytes: length of each block in bytes
 * @p1:    destination block; also the first operand
 * @p2:    second operand
 * @p3:    third operand
 * @p4:    fourth operand
 * @p5:    fifth operand
 *
 * Each "line" of eight unsigned longs is loaded into local temporaries
 * (unsigned, to match the data), XORed there, and stored back as a group.
 * Each loop pass first issues prefetchw()/prefetch() for the following line.
 * 'lines' starts one short of the line count: the loop handles all but the
 * last line, and the last one is handled by jumping to once_more, which
 * skips the prefetch of the line after it.  With a single line the loop body
 * alone handles it (lines ends at -1, so the jump is not taken).  At least
 * one full line must be supplied.
 */
static void
xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4,
	       const unsigned long * __restrict p5)
{
	long lines = bytes / (sizeof (long)) / 8 - 1;

	prefetchw(p1);
	prefetch(p2);
	prefetch(p3);
	prefetch(p4);
	prefetch(p5);

	do {
		register unsigned long d0, d1, d2, d3, d4, d5, d6, d7;

		prefetchw(p1+8);
		prefetch(p2+8);
		prefetch(p3+8);
		prefetch(p4+8);
		prefetch(p5+8);
 once_more:
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/*  ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		d0 ^= p3[0];
		d1 ^= p3[1];
		d2 ^= p3[2];
		d3 ^= p3[3];
		d4 ^= p3[4];
		d5 ^= p3[5];
		d6 ^= p3[6];
		d7 ^= p3[7];
		d0 ^= p4[0];
		d1 ^= p4[1];
		d2 ^= p4[2];
		d3 ^= p4[3];
		d4 ^= p4[4];
		d5 ^= p4[5];
		d6 ^= p4[6];
		d7 ^= p4[7];
		d0 ^= p5[0];
		d1 ^= p5[1];
		d2 ^= p5[2];
		d3 ^= p5[3];
		d4 ^= p5[4];
		d5 ^= p5[5];
		d6 ^= p5[6];
		d7 ^= p5[7];
		p1[0] = d0;	/* Store the result (in bursts)		*/
		p1[1] = d1;
		p1[2] = d2;
		p1[3] = d3;
		p1[4] = d4;
		p1[5] = d5;
		p1[6] = d6;
		p1[7] = d7;
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
		p5 += 8;
	} while (--lines > 0);
	/* lines is exactly 0 only on the first exit; afterwards it is -1. */
	if (lines == 0)
		goto once_more;
}
699 
700 static struct xor_block_template xor_block_8regs = {
701 	.name = "8regs",
702 	.do_2 = xor_8regs_2,
703 	.do_3 = xor_8regs_3,
704 	.do_4 = xor_8regs_4,
705 	.do_5 = xor_8regs_5,
706 };
707 
708 static struct xor_block_template xor_block_32regs = {
709 	.name = "32regs",
710 	.do_2 = xor_32regs_2,
711 	.do_3 = xor_32regs_3,
712 	.do_4 = xor_32regs_4,
713 	.do_5 = xor_32regs_5,
714 };
715 
716 static struct xor_block_template xor_block_8regs_p __maybe_unused = {
717 	.name = "8regs_prefetch",
718 	.do_2 = xor_8regs_p_2,
719 	.do_3 = xor_8regs_p_3,
720 	.do_4 = xor_8regs_p_4,
721 	.do_5 = xor_8regs_p_5,
722 };
723 
724 static struct xor_block_template xor_block_32regs_p __maybe_unused = {
725 	.name = "32regs_prefetch",
726 	.do_2 = xor_32regs_p_2,
727 	.do_3 = xor_32regs_p_3,
728 	.do_4 = xor_32regs_p_4,
729 	.do_5 = xor_32regs_p_5,
730 };
731 
/*
 * Hand each of the four generic templates to xor_speed(), which the
 * including file must provide (presumably to benchmark them; confirm
 * against the xor_speed() definition).  Wrapped in do { } while (0) so the
 * macro behaves as a single statement.
 */
#define XOR_TRY_TEMPLATES			\
	do {					\
		xor_speed(&xor_block_8regs);	\
		xor_speed(&xor_block_8regs_p);	\
		xor_speed(&xor_block_32regs);	\
		xor_speed(&xor_block_32regs_p);	\
	} while (0)
739