1 #ifndef _ASM_X86_XOR_H 2 #define _ASM_X86_XOR_H 3 4 /* 5 * Optimized RAID-5 checksumming functions for SSE. 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2, or (at your option) 10 * any later version. 11 * 12 * You should have received a copy of the GNU General Public License 13 * (for example /usr/src/linux/COPYING); if not, write to the Free 14 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 15 */ 16 17 /* 18 * Cache avoiding checksumming functions utilizing KNI instructions 19 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) 20 */ 21 22 /* 23 * Based on 24 * High-speed RAID5 checksumming functions utilizing SSE instructions. 25 * Copyright (C) 1998 Ingo Molnar. 26 */ 27 28 /* 29 * x86-64 changes / gcc fixes from Andi Kleen. 30 * Copyright 2002 Andi Kleen, SuSE Labs. 31 * 32 * This hasn't been optimized for the hammer yet, but there are likely 33 * no advantages to be gotten from x86-64 here anyways. 34 */ 35 36 #include <asm/fpu/api.h> 37 38 #ifdef CONFIG_X86_32 39 /* reduce register pressure */ 40 # define XOR_CONSTANT_CONSTRAINT "i" 41 #else 42 # define XOR_CONSTANT_CONSTRAINT "re" 43 #endif 44 45 #define OFFS(x) "16*("#x")" 46 #define PF_OFFS(x) "256+16*("#x")" 47 #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" 48 #define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" 49 #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" 50 #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" 51 #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" 52 #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" 53 #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" 54 #define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" 55 #define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" 56 #define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" 57 #define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" 58 #define NOP(x) 59 60 #define BLK64(pf, op, i) \ 61 pf(i) \ 62 op(i, 0) \ 63 op(i + 1, 1) \ 64 op(i + 2, 2) \ 65 op(i + 3, 3) 66 67 static void 68 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 69 { 70 unsigned long lines = bytes >> 8; 71 72 kernel_fpu_begin(); 73 74 asm volatile( 75 #undef BLOCK 76 #define BLOCK(i) \ 77 LD(i, 0) \ 78 LD(i + 1, 1) \ 79 PF1(i) \ 80 PF1(i + 2) \ 81 LD(i + 2, 2) \ 82 LD(i + 3, 3) \ 83 PF0(i + 4) \ 84 PF0(i + 6) \ 85 XO1(i, 0) \ 86 XO1(i + 1, 1) \ 87 XO1(i + 2, 2) \ 88 XO1(i + 3, 3) \ 89 ST(i, 0) \ 90 ST(i + 1, 1) \ 91 ST(i + 2, 2) \ 92 ST(i + 3, 3) \ 93 94 95 PF0(0) 96 PF0(2) 97 98 " .align 32 ;\n" 99 " 1: ;\n" 100 101 BLOCK(0) 102 BLOCK(4) 103 BLOCK(8) 104 BLOCK(12) 105 106 " add %[inc], %[p1] ;\n" 107 " add %[inc], %[p2] ;\n" 108 " dec %[cnt] ;\n" 109 " jnz 1b ;\n" 110 : [cnt] "+r" (lines), 111 [p1] "+r" (p1), [p2] "+r" (p2) 112 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 113 : "memory"); 114 115 kernel_fpu_end(); 116 } 117 118 static void 119 xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2) 120 { 121 unsigned long lines = bytes >> 8; 122 123 kernel_fpu_begin(); 124 125 asm volatile( 126 #undef BLOCK 127 #define BLOCK(i) \ 128 BLK64(PF0, LD, i) \ 129 BLK64(PF1, XO1, i) \ 130 BLK64(NOP, ST, i) \ 131 132 " .align 32 ;\n" 133 " 1: ;\n" 134 135 BLOCK(0) 136 BLOCK(4) 137 BLOCK(8) 138 BLOCK(12) 139 140 " add %[inc], %[p1] ;\n" 141 " add %[inc], %[p2] ;\n" 142 " dec %[cnt] ;\n" 143 " jnz 1b ;\n" 144 : [cnt] "+r" (lines), 145 [p1] "+r" (p1), [p2] "+r" (p2) 146 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 147 : "memory"); 148 149 kernel_fpu_end(); 150 } 151 152 static void 153 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 154 unsigned long *p3) 155 { 156 unsigned long lines = bytes >> 8; 157 158 kernel_fpu_begin(); 159 160 asm volatile( 161 #undef BLOCK 162 #define BLOCK(i) \ 163 PF1(i) \ 164 PF1(i + 2) \ 165 LD(i, 0) \ 166 LD(i + 1, 1) \ 167 LD(i + 2, 2) \ 168 LD(i + 3, 3) \ 169 PF2(i) \ 170 PF2(i + 2) \ 171 PF0(i + 4) \ 172 PF0(i + 6) \ 173 XO1(i, 0) \ 174 XO1(i + 1, 1) \ 175 XO1(i + 2, 2) \ 176 XO1(i + 3, 3) \ 177 XO2(i, 0) \ 178 XO2(i + 1, 1) \ 179 XO2(i + 2, 2) \ 180 XO2(i + 3, 3) \ 181 ST(i, 0) \ 182 ST(i + 1, 1) \ 183 ST(i + 2, 2) \ 184 ST(i + 3, 3) \ 185 186 187 PF0(0) 188 PF0(2) 189 190 " .align 32 ;\n" 191 " 1: ;\n" 192 193 BLOCK(0) 194 BLOCK(4) 195 BLOCK(8) 196 BLOCK(12) 197 198 " add %[inc], %[p1] ;\n" 199 " add %[inc], %[p2] ;\n" 200 " add %[inc], %[p3] ;\n" 201 " dec %[cnt] ;\n" 202 " jnz 1b ;\n" 203 : [cnt] "+r" (lines), 204 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 205 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 206 : "memory"); 207 208 kernel_fpu_end(); 209 } 210 211 static void 212 xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, 213 unsigned long *p3) 214 { 215 unsigned long lines = bytes >> 8; 216 217 kernel_fpu_begin(); 218 219 asm volatile( 220 #undef BLOCK 221 #define BLOCK(i) \ 222 BLK64(PF0, LD, i) \ 223 BLK64(PF1, XO1, i) \ 224 BLK64(PF2, XO2, i) \ 225 BLK64(NOP, ST, i) \ 226 227 " .align 32 ;\n" 228 " 1: ;\n" 229 230 BLOCK(0) 231 BLOCK(4) 232 BLOCK(8) 233 BLOCK(12) 234 235 " add %[inc], %[p1] ;\n" 236 " add %[inc], %[p2] ;\n" 237 " add %[inc], %[p3] ;\n" 238 " dec %[cnt] ;\n" 239 " jnz 1b ;\n" 240 : [cnt] "+r" (lines), 241 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 242 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 243 : "memory"); 244 245 kernel_fpu_end(); 246 } 247 248 static void 249 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 250 unsigned long *p3, unsigned long *p4) 251 { 252 unsigned long lines = bytes >> 8; 253 254 kernel_fpu_begin(); 255 256 asm volatile( 257 #undef BLOCK 258 #define BLOCK(i) \ 259 PF1(i) \ 260 PF1(i + 2) \ 261 LD(i, 0) \ 262 LD(i + 1, 1) \ 263 LD(i + 2, 2) \ 264 LD(i + 3, 3) \ 265 PF2(i) \ 266 PF2(i + 2) \ 267 XO1(i, 0) \ 268 XO1(i + 1, 1) \ 269 XO1(i + 2, 2) \ 270 XO1(i + 3, 3) \ 271 PF3(i) \ 272 PF3(i + 2) \ 273 PF0(i + 4) \ 274 PF0(i + 6) \ 275 XO2(i, 0) \ 276 XO2(i + 1, 1) \ 277 XO2(i + 2, 2) \ 278 XO2(i + 3, 3) \ 279 XO3(i, 0) \ 280 XO3(i + 1, 1) \ 281 XO3(i + 2, 2) \ 282 XO3(i + 3, 3) \ 283 ST(i, 0) \ 284 ST(i + 1, 1) \ 285 ST(i + 2, 2) \ 286 ST(i + 3, 3) \ 287 288 289 PF0(0) 290 PF0(2) 291 292 " .align 32 ;\n" 293 " 1: ;\n" 294 295 BLOCK(0) 296 BLOCK(4) 297 BLOCK(8) 298 BLOCK(12) 299 300 " add %[inc], %[p1] ;\n" 301 " add %[inc], %[p2] ;\n" 302 " add %[inc], %[p3] ;\n" 303 " add %[inc], %[p4] ;\n" 304 " dec %[cnt] ;\n" 305 " jnz 1b ;\n" 306 : [cnt] "+r" (lines), [p1] "+r" (p1), 307 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 308 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 309 : "memory"); 310 311 kernel_fpu_end(); 312 } 313 314 static void 315 xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, 316 unsigned long *p3, unsigned long *p4) 317 { 318 unsigned long lines = bytes >> 8; 319 320 kernel_fpu_begin(); 321 322 asm volatile( 323 #undef BLOCK 324 #define BLOCK(i) \ 325 BLK64(PF0, LD, i) \ 326 BLK64(PF1, XO1, i) \ 327 BLK64(PF2, XO2, i) \ 328 BLK64(PF3, XO3, i) \ 329 BLK64(NOP, ST, i) \ 330 331 " .align 32 ;\n" 332 " 1: ;\n" 333 334 BLOCK(0) 335 BLOCK(4) 336 BLOCK(8) 337 BLOCK(12) 338 339 " add %[inc], %[p1] ;\n" 340 " add %[inc], %[p2] ;\n" 341 " add %[inc], %[p3] ;\n" 342 " add %[inc], %[p4] ;\n" 343 " dec %[cnt] ;\n" 344 " jnz 1b ;\n" 345 : [cnt] "+r" (lines), [p1] "+r" (p1), 346 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 347 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 348 : "memory"); 349 350 kernel_fpu_end(); 351 } 352 353 static void 354 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 355 unsigned long *p3, unsigned long *p4, unsigned long *p5) 356 { 357 unsigned long lines = bytes >> 8; 358 359 kernel_fpu_begin(); 360 361 asm volatile( 362 #undef BLOCK 363 #define BLOCK(i) \ 364 PF1(i) \ 365 PF1(i + 2) \ 366 LD(i, 0) \ 367 LD(i + 1, 1) \ 368 LD(i + 2, 2) \ 369 LD(i + 3, 3) \ 370 PF2(i) \ 371 PF2(i + 2) \ 372 XO1(i, 0) \ 373 XO1(i + 1, 1) \ 374 XO1(i + 2, 2) \ 375 XO1(i + 3, 3) \ 376 PF3(i) \ 377 PF3(i + 2) \ 378 XO2(i, 0) \ 379 XO2(i + 1, 1) \ 380 XO2(i + 2, 2) \ 381 XO2(i + 3, 3) \ 382 PF4(i) \ 383 PF4(i + 2) \ 384 PF0(i + 4) \ 385 PF0(i + 6) \ 386 XO3(i, 0) \ 387 XO3(i + 1, 1) \ 388 XO3(i + 2, 2) \ 389 XO3(i + 3, 3) \ 390 XO4(i, 0) \ 391 XO4(i + 1, 1) \ 392 XO4(i + 2, 2) \ 393 XO4(i + 3, 3) \ 394 ST(i, 0) \ 395 ST(i + 1, 1) \ 396 ST(i + 2, 2) \ 397 ST(i + 3, 3) \ 398 399 400 PF0(0) 401 PF0(2) 402 403 " .align 32 ;\n" 404 " 1: ;\n" 405 406 BLOCK(0) 407 BLOCK(4) 408 BLOCK(8) 409 BLOCK(12) 410 411 " add %[inc], %[p1] ;\n" 412 " add %[inc], %[p2] ;\n" 413 " add %[inc], %[p3] ;\n" 414 " add %[inc], %[p4] ;\n" 415 " add %[inc], %[p5] ;\n" 416 " dec %[cnt] ;\n" 417 " jnz 1b ;\n" 418 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 419 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 420 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 421 : "memory"); 422 423 kernel_fpu_end(); 424 } 425 426 static void 427 xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, 428 unsigned long *p3, unsigned long *p4, unsigned long *p5) 429 { 430 unsigned long lines = bytes >> 8; 431 432 kernel_fpu_begin(); 433 434 asm volatile( 435 #undef BLOCK 436 #define BLOCK(i) \ 437 BLK64(PF0, LD, i) \ 438 BLK64(PF1, XO1, i) \ 439 BLK64(PF2, XO2, i) \ 440 BLK64(PF3, XO3, i) \ 441 BLK64(PF4, XO4, i) \ 442 BLK64(NOP, ST, i) \ 443 444 " .align 32 ;\n" 445 " 1: ;\n" 446 447 BLOCK(0) 448 BLOCK(4) 449 BLOCK(8) 450 BLOCK(12) 451 452 " add %[inc], %[p1] ;\n" 453 " add %[inc], %[p2] ;\n" 454 " add %[inc], %[p3] ;\n" 455 " add %[inc], %[p4] ;\n" 456 " add %[inc], %[p5] ;\n" 457 " dec %[cnt] ;\n" 458 " jnz 1b ;\n" 459 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 460 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 461 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 462 : "memory"); 463 464 kernel_fpu_end(); 465 } 466 467 static struct xor_block_template xor_block_sse_pf64 = { 468 .name = "prefetch64-sse", 469 .do_2 = xor_sse_2_pf64, 470 .do_3 = xor_sse_3_pf64, 471 .do_4 = xor_sse_4_pf64, 472 .do_5 = xor_sse_5_pf64, 473 }; 474 475 #undef LD 476 #undef XO1 477 #undef XO2 478 #undef XO3 479 #undef XO4 480 #undef ST 481 #undef NOP 482 #undef BLK64 483 #undef BLOCK 484 485 #undef XOR_CONSTANT_CONSTRAINT 486 487 #ifdef CONFIG_X86_32 488 # include <asm/xor_32.h> 489 #else 490 # include <asm/xor_64.h> 491 #endif 492 493 #define XOR_SELECT_TEMPLATE(FASTEST) \ 494 AVX_SELECT(FASTEST) 495 496 #endif /* _ASM_X86_XOR_H */ 497