/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"

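/*
 * Scalar fallback: accumulate the buffer 8 bytes at a time with an
 * unrolled, prefetching inner loop.  Also serves as the fallback for
 * buffers shorter than the selected accelerator's minimum length.
 */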
static bool
buffer_zero_int(const void *buf, size_t len)
{
    if (unlikely(len < 8)) {
        /* For a very small buffer, simply accumulate all the bytes.  */
        const unsigned char *p = buf;
        const unsigned char *e = buf + len;
        unsigned char t = 0;

        do {
            t |= *p++;
        } while (p < e);

        return t == 0;
    } else {
        /* Otherwise, use the unaligned memory access functions to
           handle the beginning and end of the buffer, with a couple
           of loops handling the middle aligned section.  */
        uint64_t t = ldq_he_p(buf);
        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
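        /*
         * For example, with buf == 0x1003 and len == 32: the head
         * load covers 0x1003..0x100a, p rounds up to 0x1008, e rounds
         * down to 0x1020, and the tail load covers 0x101b..0x1022, so
         * every byte is examined at least once.
         */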

        for (; p + 8 <= e; p += 8) {
            __builtin_prefetch(p + 8);
            if (t) {
                return false;
            }
            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        }
        while (p < e) {
            t |= *p++;
        }
        t |= ldq_he_p(buf + len - 8);

        return t == 0;
    }
}

#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
/* Do not use push_options pragmas unnecessarily, because clang
 * does not support them.
 */
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
#pragma GCC push_options
#pragma GCC target("sse2")
#endif
#include <emmintrin.h>

/* Note that each of these vectorized functions requires len >= 64.  */

static bool
buffer_zero_sse2(const void *buf, size_t len)
{
    __m128i t = _mm_loadu_si128(buf);
    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
    __m128i zero = _mm_setzero_si128();
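
    /* p starts at the first 16-byte boundary past the first 64 bytes, so
     * the four vectors p[-4]..p[-1] read by each iteration begin no later
     * than buf + 16, leaving no gap after the unaligned head load.  */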

    /* Loop over 16-byte aligned blocks of 64.  */
    while (likely(p <= e)) {
        __builtin_prefetch(p);
        t = _mm_cmpeq_epi8(t, zero);
        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail.  */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail.  */
    t |= _mm_loadu_si128(buf + len - 16);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
}
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
#pragma GCC pop_options
#endif

#ifdef CONFIG_AVX2_OPT
/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,
 * the includes have to be within the corresponding push_options region, and
 * therefore the regions themselves have to be ordered with increasing ISA.
 */
#pragma GCC push_options
#pragma GCC target("sse4")
#include <smmintrin.h>

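/*
 * Same head/aligned-middle/tail structure as buffer_zero_sse2, but the
 * accumulator is tested with PTEST (_mm_testz_si128) rather than
 * PCMPEQB plus PMOVMSKB.
 */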
static bool
buffer_zero_sse4(const void *buf, size_t len)
{
    __m128i t = _mm_loadu_si128(buf);
    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);

    /* Loop over 16-byte aligned blocks of 64.  */
    while (likely(p <= e)) {
        __builtin_prefetch(p);
        if (unlikely(!_mm_testz_si128(t, t))) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail.  */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail.  */
    t |= _mm_loadu_si128(buf + len - 16);

    return _mm_testz_si128(t, t);
}

#pragma GCC pop_options
#pragma GCC push_options
#pragma GCC target("avx2")
#include <immintrin.h>

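/*
 * As above, but with 32-byte vectors: the head load covers 32 bytes
 * and each loop iteration accumulates a 128-byte block.
 */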
static bool
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 32 bytes.  */
    __m256i t = _mm256_loadu_si256(buf);
    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);

    /* Loop over 32-byte aligned blocks of 128.  */
    while (p <= e) {
        __builtin_prefetch(p);
        if (unlikely(!_mm256_testz_si256(t, t))) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the last block of 128 unaligned.  */
    t |= _mm256_loadu_si256(buf + len - 4 * 32);
    t |= _mm256_loadu_si256(buf + len - 3 * 32);
    t |= _mm256_loadu_si256(buf + len - 2 * 32);
    t |= _mm256_loadu_si256(buf + len - 1 * 32);

    return _mm256_testz_si256(t, t);
}
#pragma GCC pop_options
#endif /* CONFIG_AVX2_OPT */

#ifdef CONFIG_AVX512F_OPT
#pragma GCC push_options
#pragma GCC target("avx512f")
#include <immintrin.h>

static bool
buffer_zero_avx512(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 64 bytes.  */
    __m512i t = _mm512_loadu_si512(buf);
    __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
    __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64);

    /* Loop over 64-byte aligned blocks of 256.  */
    while (p <= e) {
        __builtin_prefetch(p);
        if (unlikely(_mm512_test_epi64_mask(t, t))) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

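    /* Finish the last block of 256 unaligned.  */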
    t |= _mm512_loadu_si512(buf + len - 4 * 64);
    t |= _mm512_loadu_si512(buf + len - 3 * 64);
    t |= _mm512_loadu_si512(buf + len - 2 * 64);
    t |= _mm512_loadu_si512(buf + len - 1 * 64);

    return !_mm512_test_epi64_mask(t, t);
}
#pragma GCC pop_options
#endif

/* Note that for test_buffer_is_zero_next_accel, the most preferred
 * ISA must have the least significant bit.
 */
#define CACHE_AVX512F 1
#define CACHE_AVX2    2
#define CACHE_SSE4    4
#define CACHE_SSE2    8
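
/* Example: with AVX2 the best usable ISA, cpuid_cache is
 * CACHE_AVX2 | CACHE_SSE4 | CACHE_SSE2 == 0x0e; clearing the lowest
 * set bit steps down to SSE4 and then SSE2.
 */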

/* Make sure that these variables are appropriately initialized when
 * SSE2 is enabled on the compiler command-line, but the compiler is
 * too old to support CONFIG_AVX2_OPT.
 */
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
# define INIT_CACHE 0
# define INIT_ACCEL buffer_zero_int
#else
# ifndef __SSE2__
#  error "ISA selection confusion"
# endif
# define INIT_CACHE CACHE_SSE2
# define INIT_ACCEL buffer_zero_sse2
#endif

static unsigned cpuid_cache = INIT_CACHE;
static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
static int length_to_accel = 64;

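/*
 * Record the best accelerator among the ISAs flagged in CACHE, along
 * with the minimum buffer length at which it may be used.
 */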
static void init_accel(unsigned cache)
{
    bool (*fn)(const void *, size_t) = buffer_zero_int;
    if (cache & CACHE_SSE2) {
        fn = buffer_zero_sse2;
        length_to_accel = 64;
    }
#ifdef CONFIG_AVX2_OPT
    if (cache & CACHE_SSE4) {
        fn = buffer_zero_sse4;
        length_to_accel = 64;
    }
    if (cache & CACHE_AVX2) {
        fn = buffer_zero_avx2;
        length_to_accel = 128;
    }
#endif
#ifdef CONFIG_AVX512F_OPT
    if (cache & CACHE_AVX512F) {
        fn = buffer_zero_avx512;
        length_to_accel = 256;
    }
#endif
    buffer_accel = fn;
}

#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
#include "qemu/cpuid.h"

static void __attribute__((constructor)) init_cpuid_cache(void)
{
    int max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    unsigned cache = 0;

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        if (d & bit_SSE2) {
            cache |= CACHE_SSE2;
        }
        if (c & bit_SSE4_1) {
            cache |= CACHE_SSE4;
        }

        /* We must check that AVX is not just available, but usable.  */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            __cpuid_count(7, 0, a, b, c, d);
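            /* 0x6:
             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
             */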
            if ((bv & 0x6) == 0x6 && (b & bit_AVX2)) {
                cache |= CACHE_AVX2;
            }
            /* 0xe6:
             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
             *                    and ZMM16-ZMM31 state are enabled by OS)
             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
             */
            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512F)) {
                cache |= CACHE_AVX512F;
            }
        }
    }
    cpuid_cache = cache;
    init_accel(cache);
}
#endif /* CONFIG_AVX512F_OPT || CONFIG_AVX2_OPT */

bool test_buffer_is_zero_next_accel(void)
{
    /* If no bits set, we just tested buffer_zero_int, and there
       are no more acceleration options to test.  */
    if (cpuid_cache == 0) {
        return false;
    }
    /* Disable the accelerator we used before and select a new one.  */
    cpuid_cache &= cpuid_cache - 1;
    init_accel(cpuid_cache);
    return true;
}

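/* Dispatch to the selected accelerator only when the buffer is long
 * enough for it; otherwise fall back to the integer routine.  */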
static bool select_accel_fn(const void *buf, size_t len)
{
    if (likely(len >= length_to_accel)) {
        return buffer_accel(buf, len);
    }
    return buffer_zero_int(buf, len);
}

#else
#define select_accel_fn  buffer_zero_int
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}
#endif

/*
 * Checks if a buffer is all zeroes
 */
bool buffer_is_zero(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }

    /* Fetch the beginning of the buffer while we select the accelerator.  */
    __builtin_prefetch(buf);

    /* Use an optimized zero check if possible.  Note that this also
       includes a check for an unrolled loop over 64-bit integers.  */
    return select_accel_fn(buf, len);
}