/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

/*
 * Each BLOCK(i, reg) XORs 32 bytes through one ymm register, so BLOCK16()
 * covers a full 512-byte line per loop iteration.
 */
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long lines = bytes >> 9; /* 512 bytes (16 ymm regs * 32) per line */

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

/*
 * AVX is only usable when the CPU advertises it and the OS has enabled
 * extended state saving (OSXSAVE), so both feature bits are checked.
 */
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif