xref: /openbmc/qemu/util/bufferiszero.c (revision 88ca8e80defa4ec92c90054f151212cd32deb359)
1*88ca8e80SRichard Henderson /*
2*88ca8e80SRichard Henderson  * Simple C functions to supplement the C library
3*88ca8e80SRichard Henderson  *
4*88ca8e80SRichard Henderson  * Copyright (c) 2006 Fabrice Bellard
5*88ca8e80SRichard Henderson  *
6*88ca8e80SRichard Henderson  * Permission is hereby granted, free of charge, to any person obtaining a copy
7*88ca8e80SRichard Henderson  * of this software and associated documentation files (the "Software"), to deal
8*88ca8e80SRichard Henderson  * in the Software without restriction, including without limitation the rights
9*88ca8e80SRichard Henderson  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10*88ca8e80SRichard Henderson  * copies of the Software, and to permit persons to whom the Software is
11*88ca8e80SRichard Henderson  * furnished to do so, subject to the following conditions:
12*88ca8e80SRichard Henderson  *
13*88ca8e80SRichard Henderson  * The above copyright notice and this permission notice shall be included in
14*88ca8e80SRichard Henderson  * all copies or substantial portions of the Software.
15*88ca8e80SRichard Henderson  *
16*88ca8e80SRichard Henderson  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17*88ca8e80SRichard Henderson  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18*88ca8e80SRichard Henderson  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19*88ca8e80SRichard Henderson  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20*88ca8e80SRichard Henderson  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21*88ca8e80SRichard Henderson  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22*88ca8e80SRichard Henderson  * THE SOFTWARE.
23*88ca8e80SRichard Henderson  */
24*88ca8e80SRichard Henderson #include "qemu/osdep.h"
25*88ca8e80SRichard Henderson #include "qemu-common.h"
26*88ca8e80SRichard Henderson #include "qemu/cutils.h"
27*88ca8e80SRichard Henderson 
28*88ca8e80SRichard Henderson 
29*88ca8e80SRichard Henderson /* vector definitions */
30*88ca8e80SRichard Henderson #ifdef __ALTIVEC__
31*88ca8e80SRichard Henderson #include <altivec.h>
32*88ca8e80SRichard Henderson /* The altivec.h header says we're allowed to undef these for
33*88ca8e80SRichard Henderson  * C++ compatibility.  Here we don't care about C++, but we
34*88ca8e80SRichard Henderson  * undef them anyway to avoid namespace pollution.
35*88ca8e80SRichard Henderson  */
36*88ca8e80SRichard Henderson #undef vector
37*88ca8e80SRichard Henderson #undef pixel
38*88ca8e80SRichard Henderson #undef bool
39*88ca8e80SRichard Henderson #define VECTYPE        __vector unsigned char
40*88ca8e80SRichard Henderson #define SPLAT(p)       vec_splat(vec_ld(0, p), 0)
41*88ca8e80SRichard Henderson #define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
42*88ca8e80SRichard Henderson #define VEC_OR(v1, v2) ((v1) | (v2))
43*88ca8e80SRichard Henderson /* altivec.h may redefine the bool macro as vector type.
44*88ca8e80SRichard Henderson  * Reset it to POSIX semantics. */
45*88ca8e80SRichard Henderson #define bool _Bool
46*88ca8e80SRichard Henderson #elif defined __SSE2__
47*88ca8e80SRichard Henderson #include <emmintrin.h>
48*88ca8e80SRichard Henderson #define VECTYPE        __m128i
49*88ca8e80SRichard Henderson #define SPLAT(p)       _mm_set1_epi8(*(p))
50*88ca8e80SRichard Henderson #define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
51*88ca8e80SRichard Henderson #define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
52*88ca8e80SRichard Henderson #elif defined(__aarch64__)
53*88ca8e80SRichard Henderson #include "arm_neon.h"
54*88ca8e80SRichard Henderson #define VECTYPE        uint64x2_t
55*88ca8e80SRichard Henderson #define ALL_EQ(v1, v2) \
56*88ca8e80SRichard Henderson         ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \
57*88ca8e80SRichard Henderson          (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1)))
58*88ca8e80SRichard Henderson #define VEC_OR(v1, v2) ((v1) | (v2))
59*88ca8e80SRichard Henderson #else
60*88ca8e80SRichard Henderson #define VECTYPE        unsigned long
61*88ca8e80SRichard Henderson #define SPLAT(p)       (*(p) * (~0UL / 255))
62*88ca8e80SRichard Henderson #define ALL_EQ(v1, v2) ((v1) == (v2))
63*88ca8e80SRichard Henderson #define VEC_OR(v1, v2) ((v1) | (v2))
64*88ca8e80SRichard Henderson #endif
65*88ca8e80SRichard Henderson 
66*88ca8e80SRichard Henderson #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
67*88ca8e80SRichard Henderson 
68*88ca8e80SRichard Henderson static bool
69*88ca8e80SRichard Henderson can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
70*88ca8e80SRichard Henderson {
71*88ca8e80SRichard Henderson     return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
72*88ca8e80SRichard Henderson                    * sizeof(VECTYPE)) == 0
73*88ca8e80SRichard Henderson             && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
74*88ca8e80SRichard Henderson }
75*88ca8e80SRichard Henderson 
76*88ca8e80SRichard Henderson /*
77*88ca8e80SRichard Henderson  * Searches for an area with non-zero content in a buffer
78*88ca8e80SRichard Henderson  *
79*88ca8e80SRichard Henderson  * Attention! The len must be a multiple of
80*88ca8e80SRichard Henderson  * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
81*88ca8e80SRichard Henderson  * and addr must be a multiple of sizeof(VECTYPE) due to
82*88ca8e80SRichard Henderson  * restriction of optimizations in this function.
83*88ca8e80SRichard Henderson  *
84*88ca8e80SRichard Henderson  * can_use_buffer_find_nonzero_offset_inner() can be used to
85*88ca8e80SRichard Henderson  * check these requirements.
86*88ca8e80SRichard Henderson  *
87*88ca8e80SRichard Henderson  * The return value is the offset of the non-zero area rounded
88*88ca8e80SRichard Henderson  * down to a multiple of sizeof(VECTYPE) for the first
89*88ca8e80SRichard Henderson  * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
90*88ca8e80SRichard Henderson  * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
91*88ca8e80SRichard Henderson  * afterwards.
92*88ca8e80SRichard Henderson  *
93*88ca8e80SRichard Henderson  * If the buffer is all zero the return value is equal to len.
94*88ca8e80SRichard Henderson  */
95*88ca8e80SRichard Henderson 
96*88ca8e80SRichard Henderson static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
97*88ca8e80SRichard Henderson {
98*88ca8e80SRichard Henderson     const VECTYPE *p = buf;
99*88ca8e80SRichard Henderson     const VECTYPE zero = (VECTYPE){0};
100*88ca8e80SRichard Henderson     size_t i;
101*88ca8e80SRichard Henderson 
102*88ca8e80SRichard Henderson     assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
103*88ca8e80SRichard Henderson 
104*88ca8e80SRichard Henderson     if (!len) {
105*88ca8e80SRichard Henderson         return 0;
106*88ca8e80SRichard Henderson     }
107*88ca8e80SRichard Henderson 
108*88ca8e80SRichard Henderson     for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
109*88ca8e80SRichard Henderson         if (!ALL_EQ(p[i], zero)) {
110*88ca8e80SRichard Henderson             return i * sizeof(VECTYPE);
111*88ca8e80SRichard Henderson         }
112*88ca8e80SRichard Henderson     }
113*88ca8e80SRichard Henderson 
114*88ca8e80SRichard Henderson     for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
115*88ca8e80SRichard Henderson          i < len / sizeof(VECTYPE);
116*88ca8e80SRichard Henderson          i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
117*88ca8e80SRichard Henderson         VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
118*88ca8e80SRichard Henderson         VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
119*88ca8e80SRichard Henderson         VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
120*88ca8e80SRichard Henderson         VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
121*88ca8e80SRichard Henderson         VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
122*88ca8e80SRichard Henderson         VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
123*88ca8e80SRichard Henderson         if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
124*88ca8e80SRichard Henderson             break;
125*88ca8e80SRichard Henderson         }
126*88ca8e80SRichard Henderson     }
127*88ca8e80SRichard Henderson 
128*88ca8e80SRichard Henderson     return i * sizeof(VECTYPE);
129*88ca8e80SRichard Henderson }
130*88ca8e80SRichard Henderson 
131*88ca8e80SRichard Henderson #if defined CONFIG_AVX2_OPT
132*88ca8e80SRichard Henderson #pragma GCC push_options
133*88ca8e80SRichard Henderson #pragma GCC target("avx2")
134*88ca8e80SRichard Henderson #include <cpuid.h>
135*88ca8e80SRichard Henderson #include <immintrin.h>
136*88ca8e80SRichard Henderson 
137*88ca8e80SRichard Henderson #define AVX2_VECTYPE        __m256i
138*88ca8e80SRichard Henderson #define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
139*88ca8e80SRichard Henderson #define AVX2_ALL_EQ(v1, v2) \
140*88ca8e80SRichard Henderson     (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
141*88ca8e80SRichard Henderson #define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
142*88ca8e80SRichard Henderson 
143*88ca8e80SRichard Henderson static bool
144*88ca8e80SRichard Henderson can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
145*88ca8e80SRichard Henderson {
146*88ca8e80SRichard Henderson     return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
147*88ca8e80SRichard Henderson                    * sizeof(AVX2_VECTYPE)) == 0
148*88ca8e80SRichard Henderson             && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
149*88ca8e80SRichard Henderson }
150*88ca8e80SRichard Henderson 
151*88ca8e80SRichard Henderson static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
152*88ca8e80SRichard Henderson {
153*88ca8e80SRichard Henderson     const AVX2_VECTYPE *p = buf;
154*88ca8e80SRichard Henderson     const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
155*88ca8e80SRichard Henderson     size_t i;
156*88ca8e80SRichard Henderson 
157*88ca8e80SRichard Henderson     assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
158*88ca8e80SRichard Henderson 
159*88ca8e80SRichard Henderson     if (!len) {
160*88ca8e80SRichard Henderson         return 0;
161*88ca8e80SRichard Henderson     }
162*88ca8e80SRichard Henderson 
163*88ca8e80SRichard Henderson     for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
164*88ca8e80SRichard Henderson         if (!AVX2_ALL_EQ(p[i], zero)) {
165*88ca8e80SRichard Henderson             return i * sizeof(AVX2_VECTYPE);
166*88ca8e80SRichard Henderson         }
167*88ca8e80SRichard Henderson     }
168*88ca8e80SRichard Henderson 
169*88ca8e80SRichard Henderson     for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
170*88ca8e80SRichard Henderson          i < len / sizeof(AVX2_VECTYPE);
171*88ca8e80SRichard Henderson          i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
172*88ca8e80SRichard Henderson         AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
173*88ca8e80SRichard Henderson         AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
174*88ca8e80SRichard Henderson         AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
175*88ca8e80SRichard Henderson         AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
176*88ca8e80SRichard Henderson         AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
177*88ca8e80SRichard Henderson         AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
178*88ca8e80SRichard Henderson         if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
179*88ca8e80SRichard Henderson             break;
180*88ca8e80SRichard Henderson         }
181*88ca8e80SRichard Henderson     }
182*88ca8e80SRichard Henderson 
183*88ca8e80SRichard Henderson     return i * sizeof(AVX2_VECTYPE);
184*88ca8e80SRichard Henderson }
185*88ca8e80SRichard Henderson 
186*88ca8e80SRichard Henderson static bool avx2_support(void)
187*88ca8e80SRichard Henderson {
188*88ca8e80SRichard Henderson     int a, b, c, d;
189*88ca8e80SRichard Henderson 
190*88ca8e80SRichard Henderson     if (__get_cpuid_max(0, NULL) < 7) {
191*88ca8e80SRichard Henderson         return false;
192*88ca8e80SRichard Henderson     }
193*88ca8e80SRichard Henderson 
194*88ca8e80SRichard Henderson     __cpuid_count(7, 0, a, b, c, d);
195*88ca8e80SRichard Henderson 
196*88ca8e80SRichard Henderson     return b & bit_AVX2;
197*88ca8e80SRichard Henderson }
198*88ca8e80SRichard Henderson 
199*88ca8e80SRichard Henderson bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
200*88ca8e80SRichard Henderson          __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
201*88ca8e80SRichard Henderson size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
202*88ca8e80SRichard Henderson          __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
203*88ca8e80SRichard Henderson 
204*88ca8e80SRichard Henderson static void *buffer_find_nonzero_offset_ifunc(void)
205*88ca8e80SRichard Henderson {
206*88ca8e80SRichard Henderson     typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
207*88ca8e80SRichard Henderson         buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
208*88ca8e80SRichard Henderson 
209*88ca8e80SRichard Henderson     return func;
210*88ca8e80SRichard Henderson }
211*88ca8e80SRichard Henderson 
212*88ca8e80SRichard Henderson static void *can_use_buffer_find_nonzero_offset_ifunc(void)
213*88ca8e80SRichard Henderson {
214*88ca8e80SRichard Henderson     typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
215*88ca8e80SRichard Henderson         can_use_buffer_find_nonzero_offset_avx2 :
216*88ca8e80SRichard Henderson         can_use_buffer_find_nonzero_offset_inner;
217*88ca8e80SRichard Henderson 
218*88ca8e80SRichard Henderson     return func;
219*88ca8e80SRichard Henderson }
220*88ca8e80SRichard Henderson #pragma GCC pop_options
221*88ca8e80SRichard Henderson #else
222*88ca8e80SRichard Henderson bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
223*88ca8e80SRichard Henderson {
224*88ca8e80SRichard Henderson     return can_use_buffer_find_nonzero_offset_inner(buf, len);
225*88ca8e80SRichard Henderson }
226*88ca8e80SRichard Henderson 
227*88ca8e80SRichard Henderson size_t buffer_find_nonzero_offset(const void *buf, size_t len)
228*88ca8e80SRichard Henderson {
229*88ca8e80SRichard Henderson     return buffer_find_nonzero_offset_inner(buf, len);
230*88ca8e80SRichard Henderson }
231*88ca8e80SRichard Henderson #endif
232*88ca8e80SRichard Henderson 
233*88ca8e80SRichard Henderson /*
234*88ca8e80SRichard Henderson  * Checks if a buffer is all zeroes
235*88ca8e80SRichard Henderson  *
236*88ca8e80SRichard Henderson  * Attention! The len must be a multiple of 4 * sizeof(long) due to
237*88ca8e80SRichard Henderson  * restriction of optimizations in this function.
238*88ca8e80SRichard Henderson  */
239*88ca8e80SRichard Henderson bool buffer_is_zero(const void *buf, size_t len)
240*88ca8e80SRichard Henderson {
241*88ca8e80SRichard Henderson     /*
242*88ca8e80SRichard Henderson      * Use long as the biggest available internal data type that fits into the
243*88ca8e80SRichard Henderson      * CPU register and unroll the loop to smooth out the effect of memory
244*88ca8e80SRichard Henderson      * latency.
245*88ca8e80SRichard Henderson      */
246*88ca8e80SRichard Henderson 
247*88ca8e80SRichard Henderson     size_t i;
248*88ca8e80SRichard Henderson     long d0, d1, d2, d3;
249*88ca8e80SRichard Henderson     const long * const data = buf;
250*88ca8e80SRichard Henderson 
251*88ca8e80SRichard Henderson     /* use vector optimized zero check if possible */
252*88ca8e80SRichard Henderson     if (can_use_buffer_find_nonzero_offset(buf, len)) {
253*88ca8e80SRichard Henderson         return buffer_find_nonzero_offset(buf, len) == len;
254*88ca8e80SRichard Henderson     }
255*88ca8e80SRichard Henderson 
256*88ca8e80SRichard Henderson     assert(len % (4 * sizeof(long)) == 0);
257*88ca8e80SRichard Henderson     len /= sizeof(long);
258*88ca8e80SRichard Henderson 
259*88ca8e80SRichard Henderson     for (i = 0; i < len; i += 4) {
260*88ca8e80SRichard Henderson         d0 = data[i + 0];
261*88ca8e80SRichard Henderson         d1 = data[i + 1];
262*88ca8e80SRichard Henderson         d2 = data[i + 2];
263*88ca8e80SRichard Henderson         d3 = data[i + 3];
264*88ca8e80SRichard Henderson 
265*88ca8e80SRichard Henderson         if (d0 || d1 || d2 || d3) {
266*88ca8e80SRichard Henderson             return false;
267*88ca8e80SRichard Henderson         }
268*88ca8e80SRichard Henderson     }
269*88ca8e80SRichard Henderson 
270*88ca8e80SRichard Henderson     return true;
271*88ca8e80SRichard Henderson }
272*88ca8e80SRichard Henderson 
273