xref: /openbmc/linux/arch/x86/include/asm/xor_avx.h (revision f97cee494dc92395a668445bcd24d34c89f4ff8c)
1 /* SPDX-License-Identifier: GPL-2.0-only */
2 #ifndef _ASM_X86_XOR_AVX_H
3 #define _ASM_X86_XOR_AVX_H
4 
5 /*
6  * Optimized RAID-5 checksumming functions for AVX
7  *
8  * Copyright (C) 2012 Intel Corporation
9  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
10  *
11  * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
12  */
13 
14 #include <linux/compiler.h>
15 #include <asm/fpu/api.h>
16 
/*
 * BLOCK4(i) expands to four back-to-back BLOCK() invocations. They receive
 * the byte offsets of the 32-byte chunks i, i + 1, i + 2 and i + 3, paired
 * with register indices 0, 1, 2 and 3.
 *
 * BLOCK() is not defined here; each user defines it before expanding
 * BLOCK4()/BLOCK16(). The invocations are chained without separators, so
 * BLOCK() has to expand to complete statement(s) on its own.
 *
 * The argument is parenthesized so that an expression such as "n + 1" is
 * scaled as a whole instead of only its first operand.
 */
#define BLOCK4(i) \
		BLOCK(32 * (i), 0) \
		BLOCK(32 * ((i) + 1), 1) \
		BLOCK(32 * ((i) + 2), 2) \
		BLOCK(32 * ((i) + 3), 3)

/*
 * BLOCK16() expands to sixteen BLOCK() invocations covering byte offsets
 * 0, 32, ..., 480 (512 bytes in total), cycling through register indices
 * 0..3 within every group of four.
 */
#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)
28 
29 static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
30 {
31 	unsigned long lines = bytes >> 9;
32 
33 	kernel_fpu_begin();
34 
35 	while (lines--) {
36 #undef BLOCK
37 #define BLOCK(i, reg) \
38 do { \
39 	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
40 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
41 		"m" (p0[i / sizeof(*p0)])); \
42 	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
43 		"=m" (p0[i / sizeof(*p0)])); \
44 } while (0);
45 
46 		BLOCK16()
47 
48 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
49 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
50 	}
51 
52 	kernel_fpu_end();
53 }
54 
55 static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
56 	unsigned long *p2)
57 {
58 	unsigned long lines = bytes >> 9;
59 
60 	kernel_fpu_begin();
61 
62 	while (lines--) {
63 #undef BLOCK
64 #define BLOCK(i, reg) \
65 do { \
66 	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
67 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
68 		"m" (p1[i / sizeof(*p1)])); \
69 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
70 		"m" (p0[i / sizeof(*p0)])); \
71 	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
72 		"=m" (p0[i / sizeof(*p0)])); \
73 } while (0);
74 
75 		BLOCK16()
76 
77 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
78 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
79 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
80 	}
81 
82 	kernel_fpu_end();
83 }
84 
85 static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
86 	unsigned long *p2, unsigned long *p3)
87 {
88 	unsigned long lines = bytes >> 9;
89 
90 	kernel_fpu_begin();
91 
92 	while (lines--) {
93 #undef BLOCK
94 #define BLOCK(i, reg) \
95 do { \
96 	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
97 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
98 		"m" (p2[i / sizeof(*p2)])); \
99 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
100 		"m" (p1[i / sizeof(*p1)])); \
101 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
102 		"m" (p0[i / sizeof(*p0)])); \
103 	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
104 		"=m" (p0[i / sizeof(*p0)])); \
105 } while (0);
106 
107 		BLOCK16();
108 
109 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
110 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
111 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
112 		p3 = (unsigned long *)((uintptr_t)p3 + 512);
113 	}
114 
115 	kernel_fpu_end();
116 }
117 
118 static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
119 	unsigned long *p2, unsigned long *p3, unsigned long *p4)
120 {
121 	unsigned long lines = bytes >> 9;
122 
123 	kernel_fpu_begin();
124 
125 	while (lines--) {
126 #undef BLOCK
127 #define BLOCK(i, reg) \
128 do { \
129 	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
130 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
131 		"m" (p3[i / sizeof(*p3)])); \
132 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
133 		"m" (p2[i / sizeof(*p2)])); \
134 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
135 		"m" (p1[i / sizeof(*p1)])); \
136 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
137 		"m" (p0[i / sizeof(*p0)])); \
138 	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
139 		"=m" (p0[i / sizeof(*p0)])); \
140 } while (0);
141 
142 		BLOCK16()
143 
144 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
145 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
146 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
147 		p3 = (unsigned long *)((uintptr_t)p3 + 512);
148 		p4 = (unsigned long *)((uintptr_t)p4 + 512);
149 	}
150 
151 	kernel_fpu_end();
152 }
153 
154 static struct xor_block_template xor_block_avx = {
155 	.name = "avx",
156 	.do_2 = xor_avx_2,
157 	.do_3 = xor_avx_3,
158 	.do_4 = xor_avx_4,
159 	.do_5 = xor_avx_5,
160 };
161 
/*
 * AVX_XOR_SPEED - statement that hands the AVX template to xor_speed(), but
 * only when the boot CPU reports both X86_FEATURE_AVX and
 * X86_FEATURE_OSXSAVE.
 * NOTE(review): OSXSAVE is presumably required because the OS must have
 * enabled extended state handling before YMM registers are usable - confirm.
 */
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

/*
 * AVX_SELECT(FASTEST) - expression that yields &xor_block_avx when the boot
 * CPU reports both X86_FEATURE_AVX and X86_FEATURE_OSXSAVE, and FASTEST
 * otherwise.
 *
 * FASTEST is parenthesized so that any expression (for example an
 * assignment) can be passed; it is evaluated only on the fallback path.
 */
#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? \
	 &xor_block_avx : (FASTEST))
170 
171 #endif
172