xref: /openbmc/linux/arch/x86/include/asm/xor_avx.h (revision 3dc4b6fb)
1 /* SPDX-License-Identifier: GPL-2.0-only */
2 #ifndef _ASM_X86_XOR_AVX_H
3 #define _ASM_X86_XOR_AVX_H
4 
5 /*
6  * Optimized RAID-5 checksumming functions for AVX
7  *
8  * Copyright (C) 2012 Intel Corporation
9  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
10  *
11  * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
12  */
13 
14 #ifdef CONFIG_AS_AVX
15 
16 #include <linux/compiler.h>
17 #include <asm/fpu/api.h>
18 
/*
 * BLOCK4(i) - expand BLOCK() for four consecutive slots starting at slot @i.
 *
 * Slot i + k is handed to BLOCK() as the value 32 * (i + k) together with
 * register number k (0..3).  BLOCK() must be defined by the user before this
 * macro is expanded.  The four invocations are chained without separators, so
 * every BLOCK() expansion has to be a complete statement on its own.
 *
 * @i is parenthesised at every use so that an expression argument such as
 * "a + b" is scaled as a whole instead of only its last term.
 */
#define BLOCK4(i) \
		BLOCK(32 * (i), 0) \
		BLOCK(32 * ((i) + 1), 1) \
		BLOCK(32 * ((i) + 2), 2) \
		BLOCK(32 * ((i) + 3), 3)
24 
/*
 * BLOCK16() - expand BLOCK() for sixteen consecutive slots (indices 0..15),
 * built from four BLOCK4() groups starting at slots 0, 4, 8 and 12.
 */
#define BLOCK16() \
		BLOCK4(0) BLOCK4(4) \
		BLOCK4(8) BLOCK4(12)
30 
31 static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
32 {
33 	unsigned long lines = bytes >> 9;
34 
35 	kernel_fpu_begin();
36 
37 	while (lines--) {
38 #undef BLOCK
39 #define BLOCK(i, reg) \
40 do { \
41 	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
42 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
43 		"m" (p0[i / sizeof(*p0)])); \
44 	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
45 		"=m" (p0[i / sizeof(*p0)])); \
46 } while (0);
47 
48 		BLOCK16()
49 
50 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
51 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
52 	}
53 
54 	kernel_fpu_end();
55 }
56 
57 static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
58 	unsigned long *p2)
59 {
60 	unsigned long lines = bytes >> 9;
61 
62 	kernel_fpu_begin();
63 
64 	while (lines--) {
65 #undef BLOCK
66 #define BLOCK(i, reg) \
67 do { \
68 	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
69 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
70 		"m" (p1[i / sizeof(*p1)])); \
71 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
72 		"m" (p0[i / sizeof(*p0)])); \
73 	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
74 		"=m" (p0[i / sizeof(*p0)])); \
75 } while (0);
76 
77 		BLOCK16()
78 
79 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
80 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
81 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
82 	}
83 
84 	kernel_fpu_end();
85 }
86 
87 static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
88 	unsigned long *p2, unsigned long *p3)
89 {
90 	unsigned long lines = bytes >> 9;
91 
92 	kernel_fpu_begin();
93 
94 	while (lines--) {
95 #undef BLOCK
96 #define BLOCK(i, reg) \
97 do { \
98 	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
99 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
100 		"m" (p2[i / sizeof(*p2)])); \
101 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
102 		"m" (p1[i / sizeof(*p1)])); \
103 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
104 		"m" (p0[i / sizeof(*p0)])); \
105 	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
106 		"=m" (p0[i / sizeof(*p0)])); \
107 } while (0);
108 
109 		BLOCK16();
110 
111 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
112 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
113 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
114 		p3 = (unsigned long *)((uintptr_t)p3 + 512);
115 	}
116 
117 	kernel_fpu_end();
118 }
119 
120 static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
121 	unsigned long *p2, unsigned long *p3, unsigned long *p4)
122 {
123 	unsigned long lines = bytes >> 9;
124 
125 	kernel_fpu_begin();
126 
127 	while (lines--) {
128 #undef BLOCK
129 #define BLOCK(i, reg) \
130 do { \
131 	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
132 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
133 		"m" (p3[i / sizeof(*p3)])); \
134 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
135 		"m" (p2[i / sizeof(*p2)])); \
136 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
137 		"m" (p1[i / sizeof(*p1)])); \
138 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
139 		"m" (p0[i / sizeof(*p0)])); \
140 	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
141 		"=m" (p0[i / sizeof(*p0)])); \
142 } while (0);
143 
144 		BLOCK16()
145 
146 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
147 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
148 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
149 		p3 = (unsigned long *)((uintptr_t)p3 + 512);
150 		p4 = (unsigned long *)((uintptr_t)p4 + 512);
151 	}
152 
153 	kernel_fpu_end();
154 }
155 
156 static struct xor_block_template xor_block_avx = {
157 	.name = "avx",
158 	.do_2 = xor_avx_2,
159 	.do_3 = xor_avx_3,
160 	.do_4 = xor_avx_4,
161 	.do_5 = xor_avx_5,
162 };
163 
/*
 * AVX_XOR_SPEED - hand the AVX template to xor_speed(), but only when the
 * boot CPU reports both X86_FEATURE_AVX and X86_FEATURE_OSXSAVE (checked in
 * that order).  Expands to a single statement; the caller supplies the
 * terminating semicolon.
 */
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX)) { \
		if (boot_cpu_has(X86_FEATURE_OSXSAVE)) \
			xor_speed(&xor_block_avx); \
	} \
} while (0)
169 
/*
 * AVX_SELECT(FASTEST) - evaluate to &xor_block_avx when the boot CPU reports
 * both X86_FEATURE_AVX and X86_FEATURE_OSXSAVE, otherwise to FASTEST.
 *
 * FASTEST is parenthesised, as in the definition used when CONFIG_AS_AVX is
 * not set, so that an argument containing a low-precedence operator (for
 * example an assignment) is still taken as one operand of the conditional.
 */
#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? \
	 &xor_block_avx : (FASTEST))
172 
173 #else
174 
/*
 * Definitions used when CONFIG_AS_AVX is not set: AVX_XOR_SPEED does nothing
 * and AVX_SELECT() simply yields its argument.
 *
 * The no-op is written as do { } while (0) rather than {} so that
 * "AVX_XOR_SPEED;" is exactly one statement, matching the AVX-enabled
 * definition, and stays valid as the body of an unbraced if/else.
 */
#define AVX_XOR_SPEED do { } while (0)

#define AVX_SELECT(FASTEST) (FASTEST)
178 
179 #endif
180 #endif
181