/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

/*
 * BLK64() prefetches and then applies 'op' to one 64-byte sub-block:
 * four consecutive 16-byte (XMM-sized) words starting at offset 16*i.
 */
#define BLK64(pf, op, i)			\
	pf(i)					\
	op(i, 0)				\
	op(i + 1, 1)				\
	op(i + 2, 2)				\
	op(i + 3, 3)

/*
 * XOR p2 into p1.  'bytes' is expected to be a multiple of 256 and the
 * buffers 16-byte aligned (movaps).  Each loop iteration processes one
 * 256-byte chunk with four XMM registers, prefetching 256 bytes ahead
 * with non-temporal hints to avoid polluting the cache.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
	LD(i + 1, 1)				\
	PF1(i)					\
	PF1(i + 2)				\
	LD(i + 2, 2)				\
	LD(i + 3, 3)				\
	PF0(i + 4)				\
	PF0(i + 6)				\
	XO1(i, 0)				\
	XO1(i + 1, 1)				\
	XO1(i + 2, 2)				\
	XO1(i + 3, 3)				\
	ST(i, 0)				\
	ST(i + 1, 1)				\
	ST(i + 2, 2)				\
	ST(i + 3, 3)				\


	PF0(0)
	PF0(2)

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/*
 * Like xor_sse_2(), but issue one prefetchnta per 64-byte sub-block of
 * each operand (one prefetch per BLK64 group) rather than every 32 bytes;
 * this is the "prefetch64-sse" variant.
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(NOP, ST, i)		\

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	PF1(i)					\
	PF1(i + 2)				\
	LD(i, 0)				\
	LD(i + 1, 1)				\
	LD(i + 2, 2)				\
	LD(i + 3, 3)				\
	PF2(i)					\
	PF2(i + 2)				\
	PF0(i + 4)				\
	PF0(i + 6)				\
	XO1(i, 0)				\
	XO1(i + 1, 1)				\
	XO1(i + 2, 2)				\
	XO1(i + 3, 3)				\
	XO2(i, 0)				\
	XO2(i + 1, 1)				\
	XO2(i + 2, 2)				\
	XO2(i + 3, 3)				\
	ST(i, 0)				\
	ST(i + 1, 1)				\
	ST(i + 2, 2)				\
	ST(i + 3, 3)				\


	PF0(0)
	PF0(2)

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(PF2, XO2, i)		\
	BLK64(NOP, ST, i)		\

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	PF1(i)					\
	PF1(i + 2)				\
	LD(i, 0)				\
	LD(i + 1, 1)				\
	LD(i + 2, 2)				\
	LD(i + 3, 3)				\
	PF2(i)					\
	PF2(i + 2)				\
	XO1(i, 0)				\
	XO1(i + 1, 1)				\
	XO1(i + 2, 2)				\
	XO1(i + 3, 3)				\
	PF3(i)					\
	PF3(i + 2)				\
	PF0(i + 4)				\
	PF0(i + 6)				\
	XO2(i, 0)				\
	XO2(i + 1, 1)				\
	XO2(i + 2, 2)				\
	XO2(i + 3, 3)				\
	XO3(i, 0)				\
	XO3(i + 1, 1)				\
	XO3(i + 2, 2)				\
	XO3(i + 3, 3)				\
	ST(i, 0)				\
	ST(i + 1, 1)				\
	ST(i + 2, 2)				\
	ST(i + 3, 3)				\


	PF0(0)
	PF0(2)

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(PF2, XO2, i)		\
	BLK64(PF3, XO3, i)		\
	BLK64(NOP, ST, i)		\

	"	.align 32		;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
"memory"); 341 342 kernel_fpu_end(); 343 } 344 345 static void 346 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 347 unsigned long *p3, unsigned long *p4, unsigned long *p5) 348 { 349 unsigned long lines = bytes >> 8; 350 351 kernel_fpu_begin(); 352 353 asm volatile( 354 #undef BLOCK 355 #define BLOCK(i) \ 356 PF1(i) \ 357 PF1(i + 2) \ 358 LD(i, 0) \ 359 LD(i + 1, 1) \ 360 LD(i + 2, 2) \ 361 LD(i + 3, 3) \ 362 PF2(i) \ 363 PF2(i + 2) \ 364 XO1(i, 0) \ 365 XO1(i + 1, 1) \ 366 XO1(i + 2, 2) \ 367 XO1(i + 3, 3) \ 368 PF3(i) \ 369 PF3(i + 2) \ 370 XO2(i, 0) \ 371 XO2(i + 1, 1) \ 372 XO2(i + 2, 2) \ 373 XO2(i + 3, 3) \ 374 PF4(i) \ 375 PF4(i + 2) \ 376 PF0(i + 4) \ 377 PF0(i + 6) \ 378 XO3(i, 0) \ 379 XO3(i + 1, 1) \ 380 XO3(i + 2, 2) \ 381 XO3(i + 3, 3) \ 382 XO4(i, 0) \ 383 XO4(i + 1, 1) \ 384 XO4(i + 2, 2) \ 385 XO4(i + 3, 3) \ 386 ST(i, 0) \ 387 ST(i + 1, 1) \ 388 ST(i + 2, 2) \ 389 ST(i + 3, 3) \ 390 391 392 PF0(0) 393 PF0(2) 394 395 " .align 32 ;\n" 396 " 1: ;\n" 397 398 BLOCK(0) 399 BLOCK(4) 400 BLOCK(8) 401 BLOCK(12) 402 403 " add %[inc], %[p1] ;\n" 404 " add %[inc], %[p2] ;\n" 405 " add %[inc], %[p3] ;\n" 406 " add %[inc], %[p4] ;\n" 407 " add %[inc], %[p5] ;\n" 408 " dec %[cnt] ;\n" 409 " jnz 1b ;\n" 410 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 411 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 412 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 413 : "memory"); 414 415 kernel_fpu_end(); 416 } 417 418 static void 419 xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, 420 unsigned long *p3, unsigned long *p4, unsigned long *p5) 421 { 422 unsigned long lines = bytes >> 8; 423 424 kernel_fpu_begin(); 425 426 asm volatile( 427 #undef BLOCK 428 #define BLOCK(i) \ 429 BLK64(PF0, LD, i) \ 430 BLK64(PF1, XO1, i) \ 431 BLK64(PF2, XO2, i) \ 432 BLK64(PF3, XO3, i) \ 433 BLK64(PF4, XO4, i) \ 434 BLK64(NOP, ST, i) \ 435 436 " .align 32 ;\n" 437 " 1: ;\n" 438 439 BLOCK(0) 440 BLOCK(4) 441 BLOCK(8) 442 BLOCK(12) 443 444 " add %[inc], %[p1] ;\n" 445 " add %[inc], %[p2] ;\n" 446 " add %[inc], %[p3] ;\n" 447 " add %[inc], %[p4] ;\n" 448 " add %[inc], %[p5] ;\n" 449 " dec %[cnt] ;\n" 450 " jnz 1b ;\n" 451 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 452 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 453 : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 454 : "memory"); 455 456 kernel_fpu_end(); 457 } 458 459 static struct xor_block_template xor_block_sse_pf64 = { 460 .name = "prefetch64-sse", 461 .do_2 = xor_sse_2_pf64, 462 .do_3 = xor_sse_3_pf64, 463 .do_4 = xor_sse_4_pf64, 464 .do_5 = xor_sse_5_pf64, 465 }; 466 467 #undef LD 468 #undef XO1 469 #undef XO2 470 #undef XO3 471 #undef XO4 472 #undef ST 473 #undef NOP 474 #undef BLK64 475 #undef BLOCK 476 477 #undef XOR_CONSTANT_CONSTRAINT 478 479 #ifdef CONFIG_X86_32 480 # include <asm/xor_32.h> 481 #else 482 # include <asm/xor_64.h> 483 #endif 484 485 #define XOR_SELECT_TEMPLATE(FASTEST) \ 486 AVX_SELECT(FASTEST) 487 488 #endif /* _ASM_X86_XOR_H */ 489