/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 */

#ifdef CONFIG_AS_AVX2

#include <linux/raid/pq.h>
#include "x86.h"

/*
 * RAID-6 Q arithmetic is done in GF(2^8) with generator polynomial
 * 0x11d.  Multiplying a byte by 2 is a left shift plus a conditional
 * XOR with 0x1d for every byte whose top bit was set; the vpcmpgtb
 * against a zero register below builds the 0xff/0x00 mask for those
 * bytes, and vpand/vpxor apply the reduction.
 */
static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");	/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
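/*
 * Incremental P/Q update: XOR the contribution of data disks
 * start..stop into the existing P and Q blocks.  Disks above 'stop'
 * are skipped entirely ("right side"), and disks below 'start'
 * contribute no data, only the remaining multiply-by-2 steps for Q
 * ("left side").
 */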
static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	1			/* Has cache hints */
};

/*
 * Unrolled-by-2 AVX2 implementation
 */
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));	/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));	/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4");	/* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6");	/* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
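/*
 * Same incremental update as raid6_avx21_xor_syndrome, unrolled by two:
 * each pass covers 64 bytes using two independent accumulator pairs so
 * the dependent multiply-by-2 chains can overlap.
 */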
static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64
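/*
 * The unrolled-by-4 variant needs ymm8-ymm15, which are only available
 * in 64-bit mode, hence the CONFIG_X86_64 guard.
 */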
/*
 * Unrolled-by-4 AVX2 implementation
 */
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
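/*
 * Incremental P/Q update, unrolled by four (128 bytes per pass).  The
 * source data and the q stripe are prefetched with prefetchnta to
 * limit cache pollution.
 */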
static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
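/*
 * Algorithm descriptor.  Like the x1/x2 descriptors above, this is
 * picked up by the generic RAID-6 code, which benchmarks the available
 * implementations at initialization and selects the fastest one (see
 * lib/raid6/algos.c).
 */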
const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	1			/* Has cache hints */
};
#endif /* CONFIG_X86_64 */

#endif /* CONFIG_AS_AVX2 */