/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "host/cpuinfo.h"

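/*
 * Accelerated predicate for "buffer is entirely zero".  The pointer is
 * set at startup to the best implementation for the host CPU and is
 * only ever called with len >= 256.
 */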
typedef bool (*biz_accel_fn)(const void *, size_t);
static biz_accel_fn buffer_is_zero_accel;

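/* Portable integer version for 4 <= len < 256 bytes. */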
static bool buffer_is_zero_int_lt256(const void *buf, size_t len)
{
    uint64_t t;
    const uint64_t *p, *e;

    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    if (unlikely(len <= 8)) {
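        /*
         * 4 <= len <= 8: two 4-byte loads, which may overlap,
         * cover every byte of the buffer.
         */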
        return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0;
    }

    t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Read 0 to 31 aligned words from the middle. */
    while (p < e) {
        t |= *p++;
    }
    return t == 0;
}

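/* Portable integer version for len >= 256 bytes, checked in 64-byte blocks. */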
static bool buffer_is_zero_int_ge256(const void *buf, size_t len)
{
    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Collect a partial block at the tail end. */
    t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1];

    /*
     * Loop over 64-byte blocks.
     * With the head and tail removed, e - p >= 30,
     * so the loop must iterate at least 3 times.
     */
    do {
        if (t) {
            return false;
        }
        t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        p += 8;
    } while (p < e - 7);

    return t == 0;
}

#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>

/* Helper for preventing the compiler from reassociating
   chains of binary vector operations.  */
#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
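/*
 * Without the barrier the compiler may fold the independent vec0 and
 * vec1 accumulator chains into a single dependency chain, losing the
 * instruction-level parallelism they are meant to provide.
 */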

/* Note that these vectorized functions may assume len >= 256.  */

static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail.  */
    __m128i v = *(__m128i_u *)(buf);
    __m128i w = *(__m128i_u *)(buf + len - 16);
    /* Align head/tail to 16-byte boundaries.  */
    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
    __m128i zero = { 0 };

    /* Collect a partial block at tail end.  */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
    do {
        v = _mm_cmpeq_epi8(v, zero);
        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
        p += 8;
    } while (p < e - 7);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}

#ifdef CONFIG_AVX2_OPT
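/*
 * Same structure as the SSE2 variant, with 32-byte vectors and
 * 256 bytes checked per loop iteration.
 */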
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail.  */
    __m256i v = *(__m256i_u *)(buf);
    __m256i w = *(__m256i_u *)(buf + len - 32);
    /* Align head/tail to 32-byte boundaries.  */
    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
    __m256i zero = { 0 };

    /* Collect a partial block at tail end.  */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /* Loop over complete 256-byte blocks.  */
    for (; p < e - 7; p += 8) {
        /* PTEST is not profitable here.  */
        v = _mm256_cmpeq_epi8(v, zero);
        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
    }

    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
#endif /* CONFIG_AVX2_OPT */

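/*
 * Install the first implementation whose cpuinfo bit is set in @info,
 * returning that bit, or zero if none matched.
 */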
static unsigned __attribute__((noinline))
select_accel_cpuinfo(unsigned info)
{
    /* Array is sorted in order of algorithm preference. */
    static const struct {
        unsigned bit;
        biz_accel_fn fn;
    } all[] = {
#ifdef CONFIG_AVX2_OPT
        { CPUINFO_AVX2,    buffer_zero_avx2 },
#endif
        { CPUINFO_SSE2,    buffer_zero_sse2 },
        { CPUINFO_ALWAYS,  buffer_is_zero_int_ge256 },
    };

    for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
        if (info & all[i].bit) {
            buffer_is_zero_accel = all[i].fn;
            return all[i].bit;
        }
    }
    return 0;
}

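/* Accelerator bits already selected; lets the test hook below cycle through the rest. */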
static unsigned used_accel;

static void __attribute__((constructor)) init_accel(void)
{
    used_accel = select_accel_cpuinfo(cpuinfo_init());
}

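/*
 * init_accel() runs as a constructor during startup, so in practice
 * buffer_is_zero_accel is replaced before ordinary callers can reach
 * it and the static initializer can be NULL.
 */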
#define INIT_ACCEL NULL

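/*
 * Select the next accelerator that has not been tried yet; intended
 * for the unit tests.  Returns false once every implementation the
 * host supports has been selected.
 */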
bool test_buffer_is_zero_next_accel(void)
{
    /*
     * Accumulate the accelerators that we've already tested, and
     * remove them from the set to test this round.  We'll get back
     * a zero from select_accel_cpuinfo when there are no more.
     */
    unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel);
    used_accel |= used;
    return used;
}
#else
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}

#define INIT_ACCEL buffer_is_zero_int_ge256
#endif

static biz_accel_fn buffer_is_zero_accel = INIT_ACCEL;

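/*
 * Out-of-line path for the buffer_is_zero() inline wrapper declared in
 * qemu/cutils.h.  Illustrative use of the public API:
 *
 *     if (buffer_is_zero(sector, 512)) {
 *         ...the whole sector is zero, e.g. skip writing it...
 *     }
 */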
bool buffer_is_zero_ool(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }
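    /*
     * buffer_is_zero_sample3() (from qemu/cutils.h) tests three sample
     * bytes spread across the buffer; most nonzero buffers are rejected
     * by this cheap probe before any full scan.
     */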
    if (!buffer_is_zero_sample3(buf, len)) {
        return false;
    }
    /* All bytes are covered for any len <= 3.  */
    if (unlikely(len <= 3)) {
        return true;
    }

    if (likely(len >= 256)) {
        return buffer_is_zero_accel(buf, len);
    }
    return buffer_is_zero_int_lt256(buf, len);
}

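/* Fast path for callers that guarantee len >= 256. */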
bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    return buffer_is_zero_accel(buf, len);
}