11da177e4SLinus Torvalds /* 21da177e4SLinus Torvalds * include/asm-generic/xor.h 31da177e4SLinus Torvalds * 41da177e4SLinus Torvalds * Generic optimized RAID-5 checksumming functions. 51da177e4SLinus Torvalds * 61da177e4SLinus Torvalds * This program is free software; you can redistribute it and/or modify 71da177e4SLinus Torvalds * it under the terms of the GNU General Public License as published by 81da177e4SLinus Torvalds * the Free Software Foundation; either version 2, or (at your option) 91da177e4SLinus Torvalds * any later version. 101da177e4SLinus Torvalds * 111da177e4SLinus Torvalds * You should have received a copy of the GNU General Public License 121da177e4SLinus Torvalds * (for example /usr/src/linux/COPYING); if not, write to the Free 131da177e4SLinus Torvalds * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 141da177e4SLinus Torvalds */ 151da177e4SLinus Torvalds 16*268bb0ceSLinus Torvalds #include <linux/prefetch.h> 171da177e4SLinus Torvalds 181da177e4SLinus Torvalds static void 191da177e4SLinus Torvalds xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 201da177e4SLinus Torvalds { 211da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8; 221da177e4SLinus Torvalds 231da177e4SLinus Torvalds do { 241da177e4SLinus Torvalds p1[0] ^= p2[0]; 251da177e4SLinus Torvalds p1[1] ^= p2[1]; 261da177e4SLinus Torvalds p1[2] ^= p2[2]; 271da177e4SLinus Torvalds p1[3] ^= p2[3]; 281da177e4SLinus Torvalds p1[4] ^= p2[4]; 291da177e4SLinus Torvalds p1[5] ^= p2[5]; 301da177e4SLinus Torvalds p1[6] ^= p2[6]; 311da177e4SLinus Torvalds p1[7] ^= p2[7]; 321da177e4SLinus Torvalds p1 += 8; 331da177e4SLinus Torvalds p2 += 8; 341da177e4SLinus Torvalds } while (--lines > 0); 351da177e4SLinus Torvalds } 361da177e4SLinus Torvalds 371da177e4SLinus Torvalds static void 381da177e4SLinus Torvalds xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 391da177e4SLinus Torvalds unsigned long *p3) 401da177e4SLinus Torvalds { 411da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8; 421da177e4SLinus Torvalds 431da177e4SLinus Torvalds do { 441da177e4SLinus Torvalds p1[0] ^= p2[0] ^ p3[0]; 451da177e4SLinus Torvalds p1[1] ^= p2[1] ^ p3[1]; 461da177e4SLinus Torvalds p1[2] ^= p2[2] ^ p3[2]; 471da177e4SLinus Torvalds p1[3] ^= p2[3] ^ p3[3]; 481da177e4SLinus Torvalds p1[4] ^= p2[4] ^ p3[4]; 491da177e4SLinus Torvalds p1[5] ^= p2[5] ^ p3[5]; 501da177e4SLinus Torvalds p1[6] ^= p2[6] ^ p3[6]; 511da177e4SLinus Torvalds p1[7] ^= p2[7] ^ p3[7]; 521da177e4SLinus Torvalds p1 += 8; 531da177e4SLinus Torvalds p2 += 8; 541da177e4SLinus Torvalds p3 += 8; 551da177e4SLinus Torvalds } while (--lines > 0); 561da177e4SLinus Torvalds } 571da177e4SLinus Torvalds 581da177e4SLinus Torvalds static void 591da177e4SLinus Torvalds xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 601da177e4SLinus Torvalds unsigned long *p3, unsigned long *p4) 611da177e4SLinus Torvalds { 621da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8; 631da177e4SLinus Torvalds 641da177e4SLinus Torvalds do { 651da177e4SLinus Torvalds p1[0] ^= p2[0] ^ p3[0] ^ p4[0]; 661da177e4SLinus Torvalds p1[1] ^= p2[1] ^ p3[1] ^ p4[1]; 671da177e4SLinus Torvalds p1[2] ^= p2[2] ^ p3[2] ^ p4[2]; 681da177e4SLinus Torvalds p1[3] ^= p2[3] ^ p3[3] ^ p4[3]; 691da177e4SLinus Torvalds p1[4] ^= p2[4] ^ p3[4] ^ p4[4]; 701da177e4SLinus Torvalds p1[5] ^= p2[5] ^ p3[5] ^ p4[5]; 711da177e4SLinus Torvalds p1[6] ^= p2[6] ^ p3[6] ^ p4[6]; 721da177e4SLinus Torvalds p1[7] ^= p2[7] ^ p3[7] ^ p4[7]; 731da177e4SLinus Torvalds p1 += 8; 741da177e4SLinus Torvalds p2 += 8; 751da177e4SLinus Torvalds p3 += 8; 761da177e4SLinus Torvalds p4 += 8; 771da177e4SLinus Torvalds } while (--lines > 0); 781da177e4SLinus Torvalds } 791da177e4SLinus Torvalds 801da177e4SLinus Torvalds static void 811da177e4SLinus Torvalds xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 821da177e4SLinus Torvalds unsigned long *p3, unsigned long *p4, unsigned long *p5) 831da177e4SLinus Torvalds { 841da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8; 851da177e4SLinus Torvalds 861da177e4SLinus Torvalds do { 871da177e4SLinus Torvalds p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0]; 881da177e4SLinus Torvalds p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1]; 891da177e4SLinus Torvalds p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2]; 901da177e4SLinus Torvalds p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3]; 911da177e4SLinus Torvalds p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4]; 921da177e4SLinus Torvalds p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5]; 931da177e4SLinus Torvalds p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6]; 941da177e4SLinus Torvalds p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7]; 951da177e4SLinus Torvalds p1 += 8; 961da177e4SLinus Torvalds p2 += 8; 971da177e4SLinus Torvalds p3 += 8; 981da177e4SLinus Torvalds p4 += 8; 991da177e4SLinus Torvalds p5 += 8; 1001da177e4SLinus Torvalds } while (--lines > 0); 1011da177e4SLinus Torvalds } 1021da177e4SLinus Torvalds 1031da177e4SLinus Torvalds static void 1041da177e4SLinus Torvalds xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 1051da177e4SLinus Torvalds { 1061da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8; 1071da177e4SLinus Torvalds 1081da177e4SLinus Torvalds do { 1091da177e4SLinus Torvalds register long d0, d1, d2, d3, d4, d5, d6, d7; 1101da177e4SLinus Torvalds d0 = p1[0]; /* Pull the stuff into registers */ 1111da177e4SLinus Torvalds d1 = p1[1]; /* ... in bursts, if possible. */ 1121da177e4SLinus Torvalds d2 = p1[2]; 1131da177e4SLinus Torvalds d3 = p1[3]; 1141da177e4SLinus Torvalds d4 = p1[4]; 1151da177e4SLinus Torvalds d5 = p1[5]; 1161da177e4SLinus Torvalds d6 = p1[6]; 1171da177e4SLinus Torvalds d7 = p1[7]; 1181da177e4SLinus Torvalds d0 ^= p2[0]; 1191da177e4SLinus Torvalds d1 ^= p2[1]; 1201da177e4SLinus Torvalds d2 ^= p2[2]; 1211da177e4SLinus Torvalds d3 ^= p2[3]; 1221da177e4SLinus Torvalds d4 ^= p2[4]; 1231da177e4SLinus Torvalds d5 ^= p2[5]; 1241da177e4SLinus Torvalds d6 ^= p2[6]; 1251da177e4SLinus Torvalds d7 ^= p2[7]; 1261da177e4SLinus Torvalds p1[0] = d0; /* Store the result (in bursts) */ 1271da177e4SLinus Torvalds p1[1] = d1; 1281da177e4SLinus Torvalds p1[2] = d2; 1291da177e4SLinus Torvalds p1[3] = d3; 1301da177e4SLinus Torvalds p1[4] = d4; 1311da177e4SLinus Torvalds p1[5] = d5; 1321da177e4SLinus Torvalds p1[6] = d6; 1331da177e4SLinus Torvalds p1[7] = d7; 1341da177e4SLinus Torvalds p1 += 8; 1351da177e4SLinus Torvalds p2 += 8; 1361da177e4SLinus Torvalds } while (--lines > 0); 1371da177e4SLinus Torvalds } 1381da177e4SLinus Torvalds 1391da177e4SLinus Torvalds static void 1401da177e4SLinus Torvalds xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 1411da177e4SLinus Torvalds unsigned long *p3) 1421da177e4SLinus Torvalds { 1431da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8; 1441da177e4SLinus Torvalds 1451da177e4SLinus Torvalds do { 1461da177e4SLinus Torvalds register long d0, d1, d2, d3, d4, d5, d6, d7; 1471da177e4SLinus Torvalds d0 = p1[0]; /* Pull the stuff into registers */ 1481da177e4SLinus Torvalds d1 = p1[1]; /* ... in bursts, if possible. */ 1491da177e4SLinus Torvalds d2 = p1[2]; 1501da177e4SLinus Torvalds d3 = p1[3]; 1511da177e4SLinus Torvalds d4 = p1[4]; 1521da177e4SLinus Torvalds d5 = p1[5]; 1531da177e4SLinus Torvalds d6 = p1[6]; 1541da177e4SLinus Torvalds d7 = p1[7]; 1551da177e4SLinus Torvalds d0 ^= p2[0]; 1561da177e4SLinus Torvalds d1 ^= p2[1]; 1571da177e4SLinus Torvalds d2 ^= p2[2]; 1581da177e4SLinus Torvalds d3 ^= p2[3]; 1591da177e4SLinus Torvalds d4 ^= p2[4]; 1601da177e4SLinus Torvalds d5 ^= p2[5]; 1611da177e4SLinus Torvalds d6 ^= p2[6]; 1621da177e4SLinus Torvalds d7 ^= p2[7]; 1631da177e4SLinus Torvalds d0 ^= p3[0]; 1641da177e4SLinus Torvalds d1 ^= p3[1]; 1651da177e4SLinus Torvalds d2 ^= p3[2]; 1661da177e4SLinus Torvalds d3 ^= p3[3]; 1671da177e4SLinus Torvalds d4 ^= p3[4]; 1681da177e4SLinus Torvalds d5 ^= p3[5]; 1691da177e4SLinus Torvalds d6 ^= p3[6]; 1701da177e4SLinus Torvalds d7 ^= p3[7]; 1711da177e4SLinus Torvalds p1[0] = d0; /* Store the result (in bursts) */ 1721da177e4SLinus Torvalds p1[1] = d1; 1731da177e4SLinus Torvalds p1[2] = d2; 1741da177e4SLinus Torvalds p1[3] = d3; 1751da177e4SLinus Torvalds p1[4] = d4; 1761da177e4SLinus Torvalds p1[5] = d5; 1771da177e4SLinus Torvalds p1[6] = d6; 1781da177e4SLinus Torvalds p1[7] = d7; 1791da177e4SLinus Torvalds p1 += 8; 1801da177e4SLinus Torvalds p2 += 8; 1811da177e4SLinus Torvalds p3 += 8; 1821da177e4SLinus Torvalds } while (--lines > 0); 1831da177e4SLinus Torvalds } 1841da177e4SLinus Torvalds 1851da177e4SLinus Torvalds static void 1861da177e4SLinus Torvalds xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 1871da177e4SLinus Torvalds unsigned long *p3, unsigned long *p4) 1881da177e4SLinus Torvalds { 1891da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8; 1901da177e4SLinus Torvalds 1911da177e4SLinus Torvalds do { 1921da177e4SLinus Torvalds register long d0, d1, d2, d3, d4, d5, d6, d7; 1931da177e4SLinus Torvalds d0 = p1[0]; /* Pull the stuff into registers */ 1941da177e4SLinus Torvalds d1 = p1[1]; /* ... in bursts, if possible. */ 1951da177e4SLinus Torvalds d2 = p1[2]; 1961da177e4SLinus Torvalds d3 = p1[3]; 1971da177e4SLinus Torvalds d4 = p1[4]; 1981da177e4SLinus Torvalds d5 = p1[5]; 1991da177e4SLinus Torvalds d6 = p1[6]; 2001da177e4SLinus Torvalds d7 = p1[7]; 2011da177e4SLinus Torvalds d0 ^= p2[0]; 2021da177e4SLinus Torvalds d1 ^= p2[1]; 2031da177e4SLinus Torvalds d2 ^= p2[2]; 2041da177e4SLinus Torvalds d3 ^= p2[3]; 2051da177e4SLinus Torvalds d4 ^= p2[4]; 2061da177e4SLinus Torvalds d5 ^= p2[5]; 2071da177e4SLinus Torvalds d6 ^= p2[6]; 2081da177e4SLinus Torvalds d7 ^= p2[7]; 2091da177e4SLinus Torvalds d0 ^= p3[0]; 2101da177e4SLinus Torvalds d1 ^= p3[1]; 2111da177e4SLinus Torvalds d2 ^= p3[2]; 2121da177e4SLinus Torvalds d3 ^= p3[3]; 2131da177e4SLinus Torvalds d4 ^= p3[4]; 2141da177e4SLinus Torvalds d5 ^= p3[5]; 2151da177e4SLinus Torvalds d6 ^= p3[6]; 2161da177e4SLinus Torvalds d7 ^= p3[7]; 2171da177e4SLinus Torvalds d0 ^= p4[0]; 2181da177e4SLinus Torvalds d1 ^= p4[1]; 2191da177e4SLinus Torvalds d2 ^= p4[2]; 2201da177e4SLinus Torvalds d3 ^= p4[3]; 2211da177e4SLinus Torvalds d4 ^= p4[4]; 2221da177e4SLinus Torvalds d5 ^= p4[5]; 2231da177e4SLinus Torvalds d6 ^= p4[6]; 2241da177e4SLinus Torvalds d7 ^= p4[7]; 2251da177e4SLinus Torvalds p1[0] = d0; /* Store the result (in bursts) */ 2261da177e4SLinus Torvalds p1[1] = d1; 2271da177e4SLinus Torvalds p1[2] = d2; 2281da177e4SLinus Torvalds p1[3] = d3; 2291da177e4SLinus Torvalds p1[4] = d4; 2301da177e4SLinus Torvalds p1[5] = d5; 2311da177e4SLinus Torvalds p1[6] = d6; 2321da177e4SLinus Torvalds p1[7] = d7; 2331da177e4SLinus Torvalds p1 += 8; 2341da177e4SLinus Torvalds p2 += 8; 2351da177e4SLinus Torvalds p3 += 8; 2361da177e4SLinus Torvalds p4 += 8; 2371da177e4SLinus Torvalds } while (--lines > 0); 2381da177e4SLinus Torvalds } 2391da177e4SLinus Torvalds 2401da177e4SLinus Torvalds static void 2411da177e4SLinus Torvalds xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 2421da177e4SLinus Torvalds unsigned long *p3, unsigned long *p4, unsigned long *p5) 2431da177e4SLinus Torvalds { 2441da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8; 2451da177e4SLinus Torvalds 2461da177e4SLinus Torvalds do { 2471da177e4SLinus Torvalds register long d0, d1, d2, d3, d4, d5, d6, d7; 2481da177e4SLinus Torvalds d0 = p1[0]; /* Pull the stuff into registers */ 2491da177e4SLinus Torvalds d1 = p1[1]; /* ... in bursts, if possible. */ 2501da177e4SLinus Torvalds d2 = p1[2]; 2511da177e4SLinus Torvalds d3 = p1[3]; 2521da177e4SLinus Torvalds d4 = p1[4]; 2531da177e4SLinus Torvalds d5 = p1[5]; 2541da177e4SLinus Torvalds d6 = p1[6]; 2551da177e4SLinus Torvalds d7 = p1[7]; 2561da177e4SLinus Torvalds d0 ^= p2[0]; 2571da177e4SLinus Torvalds d1 ^= p2[1]; 2581da177e4SLinus Torvalds d2 ^= p2[2]; 2591da177e4SLinus Torvalds d3 ^= p2[3]; 2601da177e4SLinus Torvalds d4 ^= p2[4]; 2611da177e4SLinus Torvalds d5 ^= p2[5]; 2621da177e4SLinus Torvalds d6 ^= p2[6]; 2631da177e4SLinus Torvalds d7 ^= p2[7]; 2641da177e4SLinus Torvalds d0 ^= p3[0]; 2651da177e4SLinus Torvalds d1 ^= p3[1]; 2661da177e4SLinus Torvalds d2 ^= p3[2]; 2671da177e4SLinus Torvalds d3 ^= p3[3]; 2681da177e4SLinus Torvalds d4 ^= p3[4]; 2691da177e4SLinus Torvalds d5 ^= p3[5]; 2701da177e4SLinus Torvalds d6 ^= p3[6]; 2711da177e4SLinus Torvalds d7 ^= p3[7]; 2721da177e4SLinus Torvalds d0 ^= p4[0]; 2731da177e4SLinus Torvalds d1 ^= p4[1]; 2741da177e4SLinus Torvalds d2 ^= p4[2]; 2751da177e4SLinus Torvalds d3 ^= p4[3]; 2761da177e4SLinus Torvalds d4 ^= p4[4]; 2771da177e4SLinus Torvalds d5 ^= p4[5]; 2781da177e4SLinus Torvalds d6 ^= p4[6]; 2791da177e4SLinus Torvalds d7 ^= p4[7]; 2801da177e4SLinus Torvalds d0 ^= p5[0]; 2811da177e4SLinus Torvalds d1 ^= p5[1]; 2821da177e4SLinus Torvalds d2 ^= p5[2]; 2831da177e4SLinus Torvalds d3 ^= p5[3]; 2841da177e4SLinus Torvalds d4 ^= p5[4]; 2851da177e4SLinus Torvalds d5 ^= p5[5]; 2861da177e4SLinus Torvalds d6 ^= p5[6]; 2871da177e4SLinus Torvalds d7 ^= p5[7]; 2881da177e4SLinus Torvalds p1[0] = d0; /* Store the result (in bursts) */ 2891da177e4SLinus Torvalds p1[1] = d1; 2901da177e4SLinus Torvalds p1[2] = d2; 2911da177e4SLinus Torvalds p1[3] = d3; 2921da177e4SLinus Torvalds p1[4] = d4; 2931da177e4SLinus Torvalds p1[5] = d5; 2941da177e4SLinus Torvalds p1[6] = d6; 2951da177e4SLinus Torvalds p1[7] = d7; 2961da177e4SLinus Torvalds p1 += 8; 2971da177e4SLinus Torvalds p2 += 8; 2981da177e4SLinus Torvalds p3 += 8; 2991da177e4SLinus Torvalds p4 += 8; 3001da177e4SLinus Torvalds p5 += 8; 3011da177e4SLinus Torvalds } while (--lines > 0); 3021da177e4SLinus Torvalds } 3031da177e4SLinus Torvalds 3041da177e4SLinus Torvalds static void 3051da177e4SLinus Torvalds xor_8regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 3061da177e4SLinus Torvalds { 3071da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8 - 1; 3081da177e4SLinus Torvalds prefetchw(p1); 3091da177e4SLinus Torvalds prefetch(p2); 3101da177e4SLinus Torvalds 3111da177e4SLinus Torvalds do { 3121da177e4SLinus Torvalds prefetchw(p1+8); 3131da177e4SLinus Torvalds prefetch(p2+8); 3141da177e4SLinus Torvalds once_more: 3151da177e4SLinus Torvalds p1[0] ^= p2[0]; 3161da177e4SLinus Torvalds p1[1] ^= p2[1]; 3171da177e4SLinus Torvalds p1[2] ^= p2[2]; 3181da177e4SLinus Torvalds p1[3] ^= p2[3]; 3191da177e4SLinus Torvalds p1[4] ^= p2[4]; 3201da177e4SLinus Torvalds p1[5] ^= p2[5]; 3211da177e4SLinus Torvalds p1[6] ^= p2[6]; 3221da177e4SLinus Torvalds p1[7] ^= p2[7]; 3231da177e4SLinus Torvalds p1 += 8; 3241da177e4SLinus Torvalds p2 += 8; 3251da177e4SLinus Torvalds } while (--lines > 0); 3261da177e4SLinus Torvalds if (lines == 0) 3271da177e4SLinus Torvalds goto once_more; 3281da177e4SLinus Torvalds } 3291da177e4SLinus Torvalds 3301da177e4SLinus Torvalds static void 3311da177e4SLinus Torvalds xor_8regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 3321da177e4SLinus Torvalds unsigned long *p3) 3331da177e4SLinus Torvalds { 3341da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8 - 1; 3351da177e4SLinus Torvalds prefetchw(p1); 3361da177e4SLinus Torvalds prefetch(p2); 3371da177e4SLinus Torvalds prefetch(p3); 3381da177e4SLinus Torvalds 3391da177e4SLinus Torvalds do { 3401da177e4SLinus Torvalds prefetchw(p1+8); 3411da177e4SLinus Torvalds prefetch(p2+8); 3421da177e4SLinus Torvalds prefetch(p3+8); 3431da177e4SLinus Torvalds once_more: 3441da177e4SLinus Torvalds p1[0] ^= p2[0] ^ p3[0]; 3451da177e4SLinus Torvalds p1[1] ^= p2[1] ^ p3[1]; 3461da177e4SLinus Torvalds p1[2] ^= p2[2] ^ p3[2]; 3471da177e4SLinus Torvalds p1[3] ^= p2[3] ^ p3[3]; 3481da177e4SLinus Torvalds p1[4] ^= p2[4] ^ p3[4]; 3491da177e4SLinus Torvalds p1[5] ^= p2[5] ^ p3[5]; 3501da177e4SLinus Torvalds p1[6] ^= p2[6] ^ p3[6]; 3511da177e4SLinus Torvalds p1[7] ^= p2[7] ^ p3[7]; 3521da177e4SLinus Torvalds p1 += 8; 3531da177e4SLinus Torvalds p2 += 8; 3541da177e4SLinus Torvalds p3 += 8; 3551da177e4SLinus Torvalds } while (--lines > 0); 3561da177e4SLinus Torvalds if (lines == 0) 3571da177e4SLinus Torvalds goto once_more; 3581da177e4SLinus Torvalds } 3591da177e4SLinus Torvalds 3601da177e4SLinus Torvalds static void 3611da177e4SLinus Torvalds xor_8regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 3621da177e4SLinus Torvalds unsigned long *p3, unsigned long *p4) 3631da177e4SLinus Torvalds { 3641da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8 - 1; 3651da177e4SLinus Torvalds 3661da177e4SLinus Torvalds prefetchw(p1); 3671da177e4SLinus Torvalds prefetch(p2); 3681da177e4SLinus Torvalds prefetch(p3); 3691da177e4SLinus Torvalds prefetch(p4); 3701da177e4SLinus Torvalds 3711da177e4SLinus Torvalds do { 3721da177e4SLinus Torvalds prefetchw(p1+8); 3731da177e4SLinus Torvalds prefetch(p2+8); 3741da177e4SLinus Torvalds prefetch(p3+8); 3751da177e4SLinus Torvalds prefetch(p4+8); 3761da177e4SLinus Torvalds once_more: 3771da177e4SLinus Torvalds p1[0] ^= p2[0] ^ p3[0] ^ p4[0]; 3781da177e4SLinus Torvalds p1[1] ^= p2[1] ^ p3[1] ^ p4[1]; 3791da177e4SLinus Torvalds p1[2] ^= p2[2] ^ p3[2] ^ p4[2]; 3801da177e4SLinus Torvalds p1[3] ^= p2[3] ^ p3[3] ^ p4[3]; 3811da177e4SLinus Torvalds p1[4] ^= p2[4] ^ p3[4] ^ p4[4]; 3821da177e4SLinus Torvalds p1[5] ^= p2[5] ^ p3[5] ^ p4[5]; 3831da177e4SLinus Torvalds p1[6] ^= p2[6] ^ p3[6] ^ p4[6]; 3841da177e4SLinus Torvalds p1[7] ^= p2[7] ^ p3[7] ^ p4[7]; 3851da177e4SLinus Torvalds p1 += 8; 3861da177e4SLinus Torvalds p2 += 8; 3871da177e4SLinus Torvalds p3 += 8; 3881da177e4SLinus Torvalds p4 += 8; 3891da177e4SLinus Torvalds } while (--lines > 0); 3901da177e4SLinus Torvalds if (lines == 0) 3911da177e4SLinus Torvalds goto once_more; 3921da177e4SLinus Torvalds } 3931da177e4SLinus Torvalds 3941da177e4SLinus Torvalds static void 3951da177e4SLinus Torvalds xor_8regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 3961da177e4SLinus Torvalds unsigned long *p3, unsigned long *p4, unsigned long *p5) 3971da177e4SLinus Torvalds { 3981da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8 - 1; 3991da177e4SLinus Torvalds 4001da177e4SLinus Torvalds prefetchw(p1); 4011da177e4SLinus Torvalds prefetch(p2); 4021da177e4SLinus Torvalds prefetch(p3); 4031da177e4SLinus Torvalds prefetch(p4); 4041da177e4SLinus Torvalds prefetch(p5); 4051da177e4SLinus Torvalds 4061da177e4SLinus Torvalds do { 4071da177e4SLinus Torvalds prefetchw(p1+8); 4081da177e4SLinus Torvalds prefetch(p2+8); 4091da177e4SLinus Torvalds prefetch(p3+8); 4101da177e4SLinus Torvalds prefetch(p4+8); 4111da177e4SLinus Torvalds prefetch(p5+8); 4121da177e4SLinus Torvalds once_more: 4131da177e4SLinus Torvalds p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0]; 4141da177e4SLinus Torvalds p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1]; 4151da177e4SLinus Torvalds p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2]; 4161da177e4SLinus Torvalds p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3]; 4171da177e4SLinus Torvalds p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4]; 4181da177e4SLinus Torvalds p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5]; 4191da177e4SLinus Torvalds p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6]; 4201da177e4SLinus Torvalds p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7]; 4211da177e4SLinus Torvalds p1 += 8; 4221da177e4SLinus Torvalds p2 += 8; 4231da177e4SLinus Torvalds p3 += 8; 4241da177e4SLinus Torvalds p4 += 8; 4251da177e4SLinus Torvalds p5 += 8; 4261da177e4SLinus Torvalds } while (--lines > 0); 4271da177e4SLinus Torvalds if (lines == 0) 4281da177e4SLinus Torvalds goto once_more; 4291da177e4SLinus Torvalds } 4301da177e4SLinus Torvalds 4311da177e4SLinus Torvalds static void 4321da177e4SLinus Torvalds xor_32regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 4331da177e4SLinus Torvalds { 4341da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8 - 1; 4351da177e4SLinus Torvalds 4361da177e4SLinus Torvalds prefetchw(p1); 4371da177e4SLinus Torvalds prefetch(p2); 4381da177e4SLinus Torvalds 4391da177e4SLinus Torvalds do { 4401da177e4SLinus Torvalds register long d0, d1, d2, d3, d4, d5, d6, d7; 4411da177e4SLinus Torvalds 4421da177e4SLinus Torvalds prefetchw(p1+8); 4431da177e4SLinus Torvalds prefetch(p2+8); 4441da177e4SLinus Torvalds once_more: 4451da177e4SLinus Torvalds d0 = p1[0]; /* Pull the stuff into registers */ 4461da177e4SLinus Torvalds d1 = p1[1]; /* ... in bursts, if possible. */ 4471da177e4SLinus Torvalds d2 = p1[2]; 4481da177e4SLinus Torvalds d3 = p1[3]; 4491da177e4SLinus Torvalds d4 = p1[4]; 4501da177e4SLinus Torvalds d5 = p1[5]; 4511da177e4SLinus Torvalds d6 = p1[6]; 4521da177e4SLinus Torvalds d7 = p1[7]; 4531da177e4SLinus Torvalds d0 ^= p2[0]; 4541da177e4SLinus Torvalds d1 ^= p2[1]; 4551da177e4SLinus Torvalds d2 ^= p2[2]; 4561da177e4SLinus Torvalds d3 ^= p2[3]; 4571da177e4SLinus Torvalds d4 ^= p2[4]; 4581da177e4SLinus Torvalds d5 ^= p2[5]; 4591da177e4SLinus Torvalds d6 ^= p2[6]; 4601da177e4SLinus Torvalds d7 ^= p2[7]; 4611da177e4SLinus Torvalds p1[0] = d0; /* Store the result (in bursts) */ 4621da177e4SLinus Torvalds p1[1] = d1; 4631da177e4SLinus Torvalds p1[2] = d2; 4641da177e4SLinus Torvalds p1[3] = d3; 4651da177e4SLinus Torvalds p1[4] = d4; 4661da177e4SLinus Torvalds p1[5] = d5; 4671da177e4SLinus Torvalds p1[6] = d6; 4681da177e4SLinus Torvalds p1[7] = d7; 4691da177e4SLinus Torvalds p1 += 8; 4701da177e4SLinus Torvalds p2 += 8; 4711da177e4SLinus Torvalds } while (--lines > 0); 4721da177e4SLinus Torvalds if (lines == 0) 4731da177e4SLinus Torvalds goto once_more; 4741da177e4SLinus Torvalds } 4751da177e4SLinus Torvalds 4761da177e4SLinus Torvalds static void 4771da177e4SLinus Torvalds xor_32regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 4781da177e4SLinus Torvalds unsigned long *p3) 4791da177e4SLinus Torvalds { 4801da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8 - 1; 4811da177e4SLinus Torvalds 4821da177e4SLinus Torvalds prefetchw(p1); 4831da177e4SLinus Torvalds prefetch(p2); 4841da177e4SLinus Torvalds prefetch(p3); 4851da177e4SLinus Torvalds 4861da177e4SLinus Torvalds do { 4871da177e4SLinus Torvalds register long d0, d1, d2, d3, d4, d5, d6, d7; 4881da177e4SLinus Torvalds 4891da177e4SLinus Torvalds prefetchw(p1+8); 4901da177e4SLinus Torvalds prefetch(p2+8); 4911da177e4SLinus Torvalds prefetch(p3+8); 4921da177e4SLinus Torvalds once_more: 4931da177e4SLinus Torvalds d0 = p1[0]; /* Pull the stuff into registers */ 4941da177e4SLinus Torvalds d1 = p1[1]; /* ... in bursts, if possible. */ 4951da177e4SLinus Torvalds d2 = p1[2]; 4961da177e4SLinus Torvalds d3 = p1[3]; 4971da177e4SLinus Torvalds d4 = p1[4]; 4981da177e4SLinus Torvalds d5 = p1[5]; 4991da177e4SLinus Torvalds d6 = p1[6]; 5001da177e4SLinus Torvalds d7 = p1[7]; 5011da177e4SLinus Torvalds d0 ^= p2[0]; 5021da177e4SLinus Torvalds d1 ^= p2[1]; 5031da177e4SLinus Torvalds d2 ^= p2[2]; 5041da177e4SLinus Torvalds d3 ^= p2[3]; 5051da177e4SLinus Torvalds d4 ^= p2[4]; 5061da177e4SLinus Torvalds d5 ^= p2[5]; 5071da177e4SLinus Torvalds d6 ^= p2[6]; 5081da177e4SLinus Torvalds d7 ^= p2[7]; 5091da177e4SLinus Torvalds d0 ^= p3[0]; 5101da177e4SLinus Torvalds d1 ^= p3[1]; 5111da177e4SLinus Torvalds d2 ^= p3[2]; 5121da177e4SLinus Torvalds d3 ^= p3[3]; 5131da177e4SLinus Torvalds d4 ^= p3[4]; 5141da177e4SLinus Torvalds d5 ^= p3[5]; 5151da177e4SLinus Torvalds d6 ^= p3[6]; 5161da177e4SLinus Torvalds d7 ^= p3[7]; 5171da177e4SLinus Torvalds p1[0] = d0; /* Store the result (in bursts) */ 5181da177e4SLinus Torvalds p1[1] = d1; 5191da177e4SLinus Torvalds p1[2] = d2; 5201da177e4SLinus Torvalds p1[3] = d3; 5211da177e4SLinus Torvalds p1[4] = d4; 5221da177e4SLinus Torvalds p1[5] = d5; 5231da177e4SLinus Torvalds p1[6] = d6; 5241da177e4SLinus Torvalds p1[7] = d7; 5251da177e4SLinus Torvalds p1 += 8; 5261da177e4SLinus Torvalds p2 += 8; 5271da177e4SLinus Torvalds p3 += 8; 5281da177e4SLinus Torvalds } while (--lines > 0); 5291da177e4SLinus Torvalds if (lines == 0) 5301da177e4SLinus Torvalds goto once_more; 5311da177e4SLinus Torvalds } 5321da177e4SLinus Torvalds 5331da177e4SLinus Torvalds static void 5341da177e4SLinus Torvalds xor_32regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 5351da177e4SLinus Torvalds unsigned long *p3, unsigned long *p4) 5361da177e4SLinus Torvalds { 5371da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8 - 1; 5381da177e4SLinus Torvalds 5391da177e4SLinus Torvalds prefetchw(p1); 5401da177e4SLinus Torvalds prefetch(p2); 5411da177e4SLinus Torvalds prefetch(p3); 5421da177e4SLinus Torvalds prefetch(p4); 5431da177e4SLinus Torvalds 5441da177e4SLinus Torvalds do { 5451da177e4SLinus Torvalds register long d0, d1, d2, d3, d4, d5, d6, d7; 5461da177e4SLinus Torvalds 5471da177e4SLinus Torvalds prefetchw(p1+8); 5481da177e4SLinus Torvalds prefetch(p2+8); 5491da177e4SLinus Torvalds prefetch(p3+8); 5501da177e4SLinus Torvalds prefetch(p4+8); 5511da177e4SLinus Torvalds once_more: 5521da177e4SLinus Torvalds d0 = p1[0]; /* Pull the stuff into registers */ 5531da177e4SLinus Torvalds d1 = p1[1]; /* ... in bursts, if possible. */ 5541da177e4SLinus Torvalds d2 = p1[2]; 5551da177e4SLinus Torvalds d3 = p1[3]; 5561da177e4SLinus Torvalds d4 = p1[4]; 5571da177e4SLinus Torvalds d5 = p1[5]; 5581da177e4SLinus Torvalds d6 = p1[6]; 5591da177e4SLinus Torvalds d7 = p1[7]; 5601da177e4SLinus Torvalds d0 ^= p2[0]; 5611da177e4SLinus Torvalds d1 ^= p2[1]; 5621da177e4SLinus Torvalds d2 ^= p2[2]; 5631da177e4SLinus Torvalds d3 ^= p2[3]; 5641da177e4SLinus Torvalds d4 ^= p2[4]; 5651da177e4SLinus Torvalds d5 ^= p2[5]; 5661da177e4SLinus Torvalds d6 ^= p2[6]; 5671da177e4SLinus Torvalds d7 ^= p2[7]; 5681da177e4SLinus Torvalds d0 ^= p3[0]; 5691da177e4SLinus Torvalds d1 ^= p3[1]; 5701da177e4SLinus Torvalds d2 ^= p3[2]; 5711da177e4SLinus Torvalds d3 ^= p3[3]; 5721da177e4SLinus Torvalds d4 ^= p3[4]; 5731da177e4SLinus Torvalds d5 ^= p3[5]; 5741da177e4SLinus Torvalds d6 ^= p3[6]; 5751da177e4SLinus Torvalds d7 ^= p3[7]; 5761da177e4SLinus Torvalds d0 ^= p4[0]; 5771da177e4SLinus Torvalds d1 ^= p4[1]; 5781da177e4SLinus Torvalds d2 ^= p4[2]; 5791da177e4SLinus Torvalds d3 ^= p4[3]; 5801da177e4SLinus Torvalds d4 ^= p4[4]; 5811da177e4SLinus Torvalds d5 ^= p4[5]; 5821da177e4SLinus Torvalds d6 ^= p4[6]; 5831da177e4SLinus Torvalds d7 ^= p4[7]; 5841da177e4SLinus Torvalds p1[0] = d0; /* Store the result (in bursts) */ 5851da177e4SLinus Torvalds p1[1] = d1; 5861da177e4SLinus Torvalds p1[2] = d2; 5871da177e4SLinus Torvalds p1[3] = d3; 5881da177e4SLinus Torvalds p1[4] = d4; 5891da177e4SLinus Torvalds p1[5] = d5; 5901da177e4SLinus Torvalds p1[6] = d6; 5911da177e4SLinus Torvalds p1[7] = d7; 5921da177e4SLinus Torvalds p1 += 8; 5931da177e4SLinus Torvalds p2 += 8; 5941da177e4SLinus Torvalds p3 += 8; 5951da177e4SLinus Torvalds p4 += 8; 5961da177e4SLinus Torvalds } while (--lines > 0); 5971da177e4SLinus Torvalds if (lines == 0) 5981da177e4SLinus Torvalds goto once_more; 5991da177e4SLinus Torvalds } 6001da177e4SLinus Torvalds 6011da177e4SLinus Torvalds static void 6021da177e4SLinus Torvalds xor_32regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 6031da177e4SLinus Torvalds unsigned long *p3, unsigned long *p4, unsigned long *p5) 6041da177e4SLinus Torvalds { 6051da177e4SLinus Torvalds long lines = bytes / (sizeof (long)) / 8 - 1; 6061da177e4SLinus Torvalds 6071da177e4SLinus Torvalds prefetchw(p1); 6081da177e4SLinus Torvalds prefetch(p2); 6091da177e4SLinus Torvalds prefetch(p3); 6101da177e4SLinus Torvalds prefetch(p4); 6111da177e4SLinus Torvalds prefetch(p5); 6121da177e4SLinus Torvalds 6131da177e4SLinus Torvalds do { 6141da177e4SLinus Torvalds register long d0, d1, d2, d3, d4, d5, d6, d7; 6151da177e4SLinus Torvalds 6161da177e4SLinus Torvalds prefetchw(p1+8); 6171da177e4SLinus Torvalds prefetch(p2+8); 6181da177e4SLinus Torvalds prefetch(p3+8); 6191da177e4SLinus Torvalds prefetch(p4+8); 6201da177e4SLinus Torvalds prefetch(p5+8); 6211da177e4SLinus Torvalds once_more: 6221da177e4SLinus Torvalds d0 = p1[0]; /* Pull the stuff into registers */ 6231da177e4SLinus Torvalds d1 = p1[1]; /* ... in bursts, if possible. */ 6241da177e4SLinus Torvalds d2 = p1[2]; 6251da177e4SLinus Torvalds d3 = p1[3]; 6261da177e4SLinus Torvalds d4 = p1[4]; 6271da177e4SLinus Torvalds d5 = p1[5]; 6281da177e4SLinus Torvalds d6 = p1[6]; 6291da177e4SLinus Torvalds d7 = p1[7]; 6301da177e4SLinus Torvalds d0 ^= p2[0]; 6311da177e4SLinus Torvalds d1 ^= p2[1]; 6321da177e4SLinus Torvalds d2 ^= p2[2]; 6331da177e4SLinus Torvalds d3 ^= p2[3]; 6341da177e4SLinus Torvalds d4 ^= p2[4]; 6351da177e4SLinus Torvalds d5 ^= p2[5]; 6361da177e4SLinus Torvalds d6 ^= p2[6]; 6371da177e4SLinus Torvalds d7 ^= p2[7]; 6381da177e4SLinus Torvalds d0 ^= p3[0]; 6391da177e4SLinus Torvalds d1 ^= p3[1]; 6401da177e4SLinus Torvalds d2 ^= p3[2]; 6411da177e4SLinus Torvalds d3 ^= p3[3]; 6421da177e4SLinus Torvalds d4 ^= p3[4]; 6431da177e4SLinus Torvalds d5 ^= p3[5]; 6441da177e4SLinus Torvalds d6 ^= p3[6]; 6451da177e4SLinus Torvalds d7 ^= p3[7]; 6461da177e4SLinus Torvalds d0 ^= p4[0]; 6471da177e4SLinus Torvalds d1 ^= p4[1]; 6481da177e4SLinus Torvalds d2 ^= p4[2]; 6491da177e4SLinus Torvalds d3 ^= p4[3]; 6501da177e4SLinus Torvalds d4 ^= p4[4]; 6511da177e4SLinus Torvalds d5 ^= p4[5]; 6521da177e4SLinus Torvalds d6 ^= p4[6]; 6531da177e4SLinus Torvalds d7 ^= p4[7]; 6541da177e4SLinus Torvalds d0 ^= p5[0]; 6551da177e4SLinus Torvalds d1 ^= p5[1]; 6561da177e4SLinus Torvalds d2 ^= p5[2]; 6571da177e4SLinus Torvalds d3 ^= p5[3]; 6581da177e4SLinus Torvalds d4 ^= p5[4]; 6591da177e4SLinus Torvalds d5 ^= p5[5]; 6601da177e4SLinus Torvalds d6 ^= p5[6]; 6611da177e4SLinus Torvalds d7 ^= p5[7]; 6621da177e4SLinus Torvalds p1[0] = d0; /* Store the result (in bursts) */ 6631da177e4SLinus Torvalds p1[1] = d1; 6641da177e4SLinus Torvalds p1[2] = d2; 6651da177e4SLinus Torvalds p1[3] = d3; 6661da177e4SLinus Torvalds p1[4] = d4; 6671da177e4SLinus Torvalds p1[5] = d5; 6681da177e4SLinus Torvalds p1[6] = d6; 6691da177e4SLinus Torvalds p1[7] = d7; 6701da177e4SLinus Torvalds p1 += 8; 6711da177e4SLinus Torvalds p2 += 8; 6721da177e4SLinus Torvalds p3 += 8; 6731da177e4SLinus Torvalds p4 += 8; 6741da177e4SLinus Torvalds p5 += 8; 6751da177e4SLinus Torvalds } while (--lines > 0); 6761da177e4SLinus Torvalds if (lines == 0) 6771da177e4SLinus Torvalds goto once_more; 6781da177e4SLinus Torvalds } 6791da177e4SLinus Torvalds 6801da177e4SLinus Torvalds static struct xor_block_template xor_block_8regs = { 6811da177e4SLinus Torvalds .name = "8regs", 6821da177e4SLinus Torvalds .do_2 = xor_8regs_2, 6831da177e4SLinus Torvalds .do_3 = xor_8regs_3, 6841da177e4SLinus Torvalds .do_4 = xor_8regs_4, 6851da177e4SLinus Torvalds .do_5 = xor_8regs_5, 6861da177e4SLinus Torvalds }; 6871da177e4SLinus Torvalds 6881da177e4SLinus Torvalds static struct xor_block_template xor_block_32regs = { 6891da177e4SLinus Torvalds .name = "32regs", 6901da177e4SLinus Torvalds .do_2 = xor_32regs_2, 6911da177e4SLinus Torvalds .do_3 = xor_32regs_3, 6921da177e4SLinus Torvalds .do_4 = xor_32regs_4, 6931da177e4SLinus Torvalds .do_5 = xor_32regs_5, 6941da177e4SLinus Torvalds }; 6951da177e4SLinus Torvalds 6961da177e4SLinus Torvalds static struct xor_block_template xor_block_8regs_p = { 6971da177e4SLinus Torvalds .name = "8regs_prefetch", 6981da177e4SLinus Torvalds .do_2 = xor_8regs_p_2, 6991da177e4SLinus Torvalds .do_3 = xor_8regs_p_3, 7001da177e4SLinus Torvalds .do_4 = xor_8regs_p_4, 7011da177e4SLinus Torvalds .do_5 = xor_8regs_p_5, 7021da177e4SLinus Torvalds }; 7031da177e4SLinus Torvalds 7041da177e4SLinus Torvalds static struct xor_block_template xor_block_32regs_p = { 7051da177e4SLinus Torvalds .name = "32regs_prefetch", 7061da177e4SLinus Torvalds .do_2 = xor_32regs_p_2, 7071da177e4SLinus Torvalds .do_3 = xor_32regs_p_3, 7081da177e4SLinus Torvalds .do_4 = xor_32regs_p_4, 7091da177e4SLinus Torvalds .do_5 = xor_32regs_p_5, 7101da177e4SLinus Torvalds }; 7111da177e4SLinus Torvalds 7121da177e4SLinus Torvalds #define XOR_TRY_TEMPLATES \ 7131da177e4SLinus Torvalds do { \ 7141da177e4SLinus Torvalds xor_speed(&xor_block_8regs); \ 7151da177e4SLinus Torvalds xor_speed(&xor_block_8regs_p); \ 7161da177e4SLinus Torvalds xor_speed(&xor_block_32regs); \ 7171da177e4SLinus Torvalds xor_speed(&xor_block_32regs_p); \ 7181da177e4SLinus Torvalds } while (0) 719