#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
#elif !defined(_ASM_X86_XOR_H)
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache-avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer yet, but there are likely
 * no advantages to be had from x86-64 here anyway.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/*
 * Building blocks for the unrolled asm loops below: LD/ST move 16
 * bytes between an %xmm register and the destination buffer p1,
 * XO1..XO4 XOR 16 bytes from p2..p5 into an %xmm register, and
 * PF0..PF4 issue non-temporal prefetches 256 bytes ahead in p1..p5.
 */
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

#define BLK64(pf, op, i)	\
	pf(i)			\
	op(i, 0)		\
	op(i + 1, 1)		\
	op(i + 2, 2)		\
	op(i + 3, 3)

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
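
/*
 * Editor's note (illustration, not part of the original header): the
 * _pf64 variants below restructure each 256-byte line into BLK64
 * groups so that every 64-byte chunk of each buffer gets exactly one
 * prefetchnta, issued 256 bytes ahead; the store group passes NOP
 * because p1 was already prefetched by its load group.  For example,
 * BLK64(PF0, LD, 0) expands (inside the asm string) to:
 *
 *	prefetchnta 256+16*(0)(%[p1])
 *	movaps 16*(0)(%[p1]), %xmm0
 *	movaps 16*(0 + 1)(%[p1]), %xmm1
 *	movaps 16*(0 + 2)(%[p1]), %xmm2
 *	movaps 16*(0 + 3)(%[p1]), %xmm3
 */
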
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(NOP, ST, i)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(NOP, ST, i)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(NOP, ST, i)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(PF4, XO4, i)	\
	BLK64(NOP, ST, i)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
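
/*
 * Editor's sketch (not part of the original header): every do_N hook
 * above implements the same contract, equivalent to the plain-C loop
 * below.  'bytes' is assumed to be a non-zero multiple of 256 (the
 * SSE versions consume 'bytes >> 8' whole 256-byte lines); the helper
 * name xor_ref_2 is hypothetical and exists only as documentation.
 */
static inline void
xor_ref_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long i;

	/* XOR the second buffer into the first, one word at a time. */
	for (i = 0; i < bytes / sizeof(*p1); i++)
		p1[i] ^= p2[i];
}
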
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */
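
/*
 * Editor's note (sketch of how these templates are consumed; the
 * details live outside this header and may vary by kernel version):
 * crypto/xor.c benchmarks each template offered through
 * XOR_TRY_TEMPLATES (defined by the xor_32.h/xor_64.h headers included
 * above) and picks the fastest, while XOR_SELECT_TEMPLATE() gives the
 * architecture the final word; here it defers to AVX_SELECT(), so an
 * AVX implementation is preferred whenever the CPU supports it.
 */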