#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/i387.h>

/*
 * One "line" is 512 bytes: BLOCK16() expands to sixteen BLOCK() invocations,
 * each covering a 32-byte (one ymm register) chunk and cycling through
 * ymm0-ymm3. This matches the "bytes >> 9" line count and the 512-byte
 * pointer advances below.
 */
#define BLOCK4(i) \
	BLOCK(32 * i, 0) \
	BLOCK(32 * (i + 1), 1) \
	BLOCK(32 * (i + 2), 2) \
	BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
	BLOCK4(0) \
	BLOCK4(4) \
	BLOCK4(8) \
	BLOCK4(12)

/*
 * xor_avx_N() XORs N buffers together, accumulating the result in p0,
 * one 512-byte line per loop iteration, with the ymm state protected by
 * kernel_fpu_begin()/kernel_fpu_end().
 */
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
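/*
 * Five-source variant: each BLOCK loads 32 bytes of p4 into a ymm register,
 * XORs in the matching 32 bytes of p3, p2, p1 and p0, and stores the
 * accumulated result back to p0.
 */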
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

#define AVX_XOR_SPEED \
do { \
	if (cpu_has_avx && cpu_has_osxsave) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST)

#else

#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif
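/*
 * Note on how these hooks are consumed (a sketch based on the generic
 * kernel xor framework, not on anything defined in this header): the
 * per-arch <asm/xor.h> is expected to expand AVX_XOR_SPEED from its
 * XOR_TRY_TEMPLATES list so that crypto/xor.c can benchmark xor_block_avx
 * against the other templates at calibration time, and to wrap its final
 * choice in AVX_SELECT() so the AVX routines are preferred whenever the
 * CPU advertises AVX and OSXSAVE. The selected template's do_2 .. do_5
 * methods are what xor_blocks() ultimately calls when computing RAID-5
 * parity.
 */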