/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

#define LD(x, y)	" movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x, y)	" movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x, y)	" pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x, y)	" pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x, y)	" pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x, y)	" pxor 8*("#x")(%5), %%mm"#y" ;\n"

#include <asm/fpu/api.h>
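
/*
 * For reference only: every routine in this file computes p1 ^= p2 [^ p3
 * ...], i.e. it XORs the source buffers into the first one.  The helper
 * below is an illustrative plain-C sketch of the two-source case (its name
 * is made up for this comment and is not part of the kernel's xor API);
 * the MMX/SSE routines that follow produce the same result, just with
 * several 8-byte words in flight per loop iteration.
 */
static inline void
xor_32_c_reference_2(unsigned long bytes, unsigned long *p1,
		     const unsigned long *p2)
{
	unsigned long i;

	/* unsigned long is 4 bytes on 32-bit x86, so step word by word. */
	for (i = 0; i < bytes / sizeof(unsigned long); i++)
		p1[i] ^= p2[i];
}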

static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	XO1(i, 0)		\
	ST(i, 0)		\
	XO1(i + 1, 1)		\
	ST(i + 1, 1)		\
	XO1(i + 2, 2)		\
	ST(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	ST(i, 0)		\
	XO2(i + 1, 1)		\
	ST(i + 1, 1)		\
	XO2(i + 2, 2)		\
	ST(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	ST(i, 0)		\
	XO3(i + 1, 1)		\
	ST(i + 1, 1)		\
	XO3(i + 2, 2)		\
	ST(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	ST(i, 0)		\
	XO4(i + 1, 1)		\
	ST(i + 1, 1)		\
	XO4(i + 2, 2)		\
	ST(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" addl $128, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
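
/*
 * The xor_p5_mmx_* variants below compute the same XOR as the pII routines
 * above, but over 64-byte chunks per loop iteration (bytes >> 6 instead of
 * bytes >> 7), with the loads, pxors and stores interleaved by hand rather
 * than generated by the BLOCK() macros.
 */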

static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor (%5), %%mm0 ;\n"
	" pxor 8(%5), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%5), %%mm2 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%5), %%mm3 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 32(%5), %%mm4 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" pxor 40(%5), %%mm5 ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%5), %%mm6 ;\n"
	" pxor 56(%5), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" addl $64, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.do_2 = xor_pII_mmx_2,
	.do_3 = xor_pII_mmx_3,
	.do_4 = xor_pII_mmx_4,
	.do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.do_2 = xor_p5_mmx_2,
	.do_3 = xor_p5_mmx_3,
	.do_4 = xor_p5_mmx_4,
	.do_5 = xor_p5_mmx_5,
};

static struct xor_block_template xor_block_pIII_sse = {
	.name = "pIII_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};
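
/*
 * Illustration only: these templates are benchmarked at boot by the
 * XOR_TRY_TEMPLATES/xor_speed() machinery below, and the winner is then
 * called through its do_2..do_5 function pointers by the generic xor
 * code, roughly like this (the 'fastest' variable is hypothetical):
 *
 *	struct xor_block_template *fastest = &xor_block_pII_mmx;
 *
 *	fastest->do_2(bytes, p1, p2);	// e.g. ends up in xor_pII_mmx_2()
 */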

/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines. */
#include <asm-generic/xor.h>

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
do {							\
	AVX_XOR_SPEED;					\
	if (boot_cpu_has(X86_FEATURE_XMM)) {		\
		xor_speed(&xor_block_pIII_sse);		\
		xor_speed(&xor_block_sse_pf64);		\
	} else if (boot_cpu_has(X86_FEATURE_MMX)) {	\
		xor_speed(&xor_block_pII_mmx);		\
		xor_speed(&xor_block_p5_mmx);		\
	} else {					\
		xor_speed(&xor_block_8regs);		\
		xor_speed(&xor_block_8regs_p);		\
		xor_speed(&xor_block_32regs);		\
		xor_speed(&xor_block_32regs_p);		\
	}						\
} while (0)

#endif /* _ASM_X86_XOR_32_H */