/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/fpu/api.h>

/*
 * BLOCK(offset, reg) is (re)defined by each routine below. BLOCK16() expands
 * to sixteen such blocks, covering 512 bytes per expansion as sixteen 32-byte
 * chunks cycled through %ymm0-%ymm3.
 */
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	/* One "line" is the 512 bytes handled by a single BLOCK16() expansion. */
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
/* Load 32 bytes of p1, XOR in the matching bytes of p0, store back to p0. */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#else

#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif