/* xref: /openbmc/qemu/util/bufferiszero.c (revision 083d012a388e7e2a8bfd9144c2c9bcceb29a78fc) */
/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"

/* vector definitions */

/*
 * No definition of link_error() appears in this file.  It is only called on
 * the branch of ACCEL_BUFFER_ZERO taken when SIZE is neither 4 nor 8 vectors
 * wide, so an unsupported SIZE is presumably meant to show up as an
 * unresolved symbol at link time once the compiler has dropped the dead
 * branches for valid sizes.
 */
extern void link_error(void);

/*
 * ACCEL_BUFFER_ZERO(NAME, SIZE, VECTYPE, NONZERO)
 *
 * Expands to "static bool NAME(const void *buf, size_t len)", which walks
 * the buffer in SIZE-byte chunks.  Each chunk is read as 4 or 8 VECTYPE
 * values (SIZE must equal 4 or 8 times sizeof(VECTYPE)), the values are
 * OR-ed together, and NONZERO(t) decides whether the chunk contains a set
 * bit.  The function returns false at the first such chunk and true once
 * the whole range has been scanned.
 *
 * Requirements visible from the code:
 *  - the loop is a do/while, so one full chunk is always read: len must be
 *    non-zero;
 *  - the loop only stops once buf >= end, always reading whole chunks, so
 *    len must be a multiple of SIZE;
 *  - buf is dereferenced as "const VECTYPE *", so it must be suitably
 *    aligned for VECTYPE.
 * select_accel_fn() checks the length and alignment before dispatching to
 * the generated functions.
 *
 * The next chunk is prefetched before the current one is loaded; barrier()
 * is defined outside this file and presumably keeps the compiler from
 * hoisting the loads above the prefetch (confirm against its definition).
 * Pointer arithmetic on "void *" relies on the GNU C extension.
 */
#define ACCEL_BUFFER_ZERO(NAME, SIZE, VECTYPE, NONZERO)         \
static bool NAME(const void *buf, size_t len)                   \
{                                                               \
    const void *end = buf + len;                                \
    do {                                                        \
        const VECTYPE *p = buf;                                 \
        VECTYPE t;                                              \
        __builtin_prefetch(buf + SIZE);                         \
        barrier();                                              \
        if (SIZE == sizeof(VECTYPE) * 4) {                      \
            t = (p[0] | p[1]) | (p[2] | p[3]);                  \
        } else if (SIZE == sizeof(VECTYPE) * 8) {               \
            t  = p[0] | p[1];                                   \
            t |= p[2] | p[3];                                   \
            t |= p[4] | p[5];                                   \
            t |= p[6] | p[7];                                   \
        } else {                                                \
            link_error();                                       \
        }                                                       \
        if (unlikely(NONZERO(t))) {                             \
            return false;                                       \
        }                                                       \
        buf += SIZE;                                            \
    } while (buf < end);                                        \
    return true;                                                \
}

/*
 * Portable zero check using plain integer loads.
 *
 * @buf: start of the region to test
 * @len: number of bytes to test
 *
 * Returns true when every byte in [buf, buf + len) is zero.
 *
 * Buffers shorter than 8 bytes are OR-ed together byte by byte (an empty
 * buffer therefore reports true without touching memory).  Longer buffers
 * are covered by three overlapping pieces:
 *   - an unaligned 8-byte load of the first 8 bytes,
 *   - aligned 64-bit loads over the 8-byte-aligned middle section,
 *   - an unaligned 8-byte load of the last 8 bytes.
 * Overlap between the pieces is harmless because the results are only
 * OR-ed together.
 */
static bool
buffer_zero_int(const void *buf, size_t len)
{
    if (unlikely(len < 8)) {
        /* For a very small buffer, simply accumulate all the bytes.  */
        const unsigned char *p = buf;
        const unsigned char *e = p + len;
        unsigned char t = 0;

        while (p < e) {
            t |= *p++;
        }

        return t == 0;
    } else {
        /* Otherwise, use the unaligned memory access functions to
           handle the beginning and end of the buffer, with a couple
           of loops handling the middle aligned section.  */
        uint64_t t = ldq_he_p(buf);
        /* First 8-byte boundary strictly above buf.  */
        const uint64_t *p = (const uint64_t *)(((uintptr_t)buf + 8) & -8);
        /* Last 8-byte boundary at or below the end of the buffer.  */
        const uint64_t *e = (const uint64_t *)(((uintptr_t)buf + len) & -8);

        /*
         * Unrolled loop over 64-byte groups.  The accumulator is tested
         * one step late: the check at the top of an iteration covers the
         * head word or the previous group, after which t is overwritten
         * with the current group.
         */
        for (; p + 8 <= e; p += 8) {
            __builtin_prefetch(p + 8);
            if (t) {
                return false;
            }
            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        }
        /* Remaining aligned words (fewer than 8).  */
        while (p < e) {
            t |= *p++;
        }
        /* Final, possibly unaligned and overlapping, 8 bytes.  */
        t |= ldq_he_p(buf + len - 8);

        return t == 0;
    }
}

#if defined(CONFIG_AVX2_OPT) || (defined(CONFIG_CPUID_H) && defined(__SSE2__))
#include <cpuid.h>

1025e33a872SRichard Henderson /* Do not use push_options pragmas unnecessarily, because clang
1035e33a872SRichard Henderson  * does not support them.
1045e33a872SRichard Henderson  */
1055e33a872SRichard Henderson #ifndef __SSE2__
1065e33a872SRichard Henderson #pragma GCC push_options
1075e33a872SRichard Henderson #pragma GCC target("sse2")
1085e33a872SRichard Henderson #endif
10988ca8e80SRichard Henderson #include <emmintrin.h>
1105e33a872SRichard Henderson #define SSE2_NONZERO(X) \
1115e33a872SRichard Henderson     (_mm_movemask_epi8(_mm_cmpeq_epi8((X), _mm_setzero_si128())) != 0xFFFF)
1125e33a872SRichard Henderson ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO)
1135e33a872SRichard Henderson #ifndef __SSE2__
1145e33a872SRichard Henderson #pragma GCC pop_options
11588ca8e80SRichard Henderson #endif
11688ca8e80SRichard Henderson 
#ifdef CONFIG_AVX2_OPT
/*
 * The SSE4 and AVX2 variants are each compiled under their own target
 * pragma, so the rest of the file is still built for the baseline ISA.
 */
#pragma GCC push_options
#pragma GCC target("sse4")
#include <smmintrin.h>
/* True when any bit of the 128-bit vector X is set (X AND X != 0).  */
#define SSE4_NONZERO(X)  (!_mm_testz_si128((X), (X)))
/* buffer_zero_sse4(): 64 bytes per iteration as four __m128i loads.  */
ACCEL_BUFFER_ZERO(buffer_zero_sse4, 64, __m128i, SSE4_NONZERO)
#pragma GCC pop_options

#pragma GCC push_options
#pragma GCC target("avx2")
#include <immintrin.h>
/* True when any bit of the 256-bit vector X is set (X AND X != 0).  */
#define AVX2_NONZERO(X)  (!_mm256_testz_si256((X), (X)))
/* buffer_zero_avx2(): 128 bytes per iteration as four __m256i loads.  */
ACCEL_BUFFER_ZERO(buffer_zero_avx2, 128, __m256i, AVX2_NONZERO)
#pragma GCC pop_options
#endif

/*
 * Feature bits recorded in cpuid_cache.
 *
 * The numeric order matters: test_buffer_is_zero_next_accel() clears the
 * lowest set bit, and select_accel_fn() tries AVX2, then SSE4, then SSE2,
 * so lower values must belong to the more capable accelerators.
 * CACHE_AVX1 is recorded but no routine in this file is selected by it.
 */
#define CACHE_AVX2    2
#define CACHE_AVX1    4
#define CACHE_SSE4    8
#define CACHE_SSE2    16

/* Accelerators that may currently be used; filled in before main().  */
static unsigned cpuid_cache;

/*
 * Probe the CPU once, at load time (constructor), and store the usable
 * vector extensions in cpuid_cache.
 *
 * SSE2 is always probed.  SSE4.1, AVX and AVX2 are only probed when the
 * build defines CONFIG_AVX2_OPT.
 */
static void __attribute__((constructor)) init_cpuid_cache(void)
{
    /* <cpuid.h> reports the maximum leaf and registers as unsigned.  */
    unsigned max = __get_cpuid_max(0, NULL);
    unsigned a, b, c, d;
    unsigned cache = 0;

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        if (d & bit_SSE2) {
            cache |= CACHE_SSE2;
        }
#ifdef CONFIG_AVX2_OPT
        if (c & bit_SSE4_1) {
            cache |= CACHE_SSE4;
        }

        /* We must check that AVX is not just available, but usable.  */
        if ((c & bit_OSXSAVE) && (c & bit_AVX)) {
            /*
             * Read extended control register 0 (ECX = 0) and require
             * bits 1 and 2, i.e. the OS has enabled both the SSE and
             * the AVX register state.
             */
            __asm("xgetbv" : "=a"(a), "=d"(d) : "c"(0));
            if ((a & 6) == 6) {
                cache |= CACHE_AVX1;
                if (max >= 7) {
                    __cpuid_count(7, 0, a, b, c, d);
                    if (b & bit_AVX2) {
                        cache |= CACHE_AVX2;
                    }
                }
            }
        }
#endif
    }
    cpuid_cache = cache;
}

174efad6682SRichard Henderson #define HAVE_NEXT_ACCEL
175efad6682SRichard Henderson bool test_buffer_is_zero_next_accel(void)
176efad6682SRichard Henderson {
177efad6682SRichard Henderson     /* If no bits set, we just tested buffer_zero_int, and there
178efad6682SRichard Henderson        are no more acceleration options to test.  */
179efad6682SRichard Henderson     if (cpuid_cache == 0) {
180efad6682SRichard Henderson         return false;
181efad6682SRichard Henderson     }
182efad6682SRichard Henderson     /* Disable the accelerator we used before and select a new one.  */
183efad6682SRichard Henderson     cpuid_cache &= cpuid_cache - 1;
184efad6682SRichard Henderson     return true;
185efad6682SRichard Henderson }
186efad6682SRichard Henderson 
1875e33a872SRichard Henderson static bool select_accel_fn(const void *buf, size_t len)
1885e33a872SRichard Henderson {
1895e33a872SRichard Henderson     uintptr_t ibuf = (uintptr_t)buf;
1905e33a872SRichard Henderson #ifdef CONFIG_AVX2_OPT
1915e33a872SRichard Henderson     if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) {
1925e33a872SRichard Henderson         return buffer_zero_avx2(buf, len);
1935e33a872SRichard Henderson     }
19486444f08SPaolo Bonzini     if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE4)) {
19586444f08SPaolo Bonzini         return buffer_zero_sse4(buf, len);
19686444f08SPaolo Bonzini     }
1975e33a872SRichard Henderson #endif
1985e33a872SRichard Henderson     if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) {
1995e33a872SRichard Henderson         return buffer_zero_sse2(buf, len);
2005e33a872SRichard Henderson     }
2015e33a872SRichard Henderson     return buffer_zero_int(buf, len);
2025e33a872SRichard Henderson }
2035e33a872SRichard Henderson 
#else
#define select_accel_fn  buffer_zero_int
#endif

#ifndef HAVE_NEXT_ACCEL
/*
 * Test hook for builds without any accelerated routine: there is nothing
 * to step down to, so always report that no further option exists.
 */
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}
#endif

/*
 * Checks if a buffer is all zeroes
 *
 * @buf: start of the region to test
 * @len: number of bytes to test
 *
 * Returns true when all @len bytes are zero.  An empty buffer is treated
 * as zero and is answered here, before any memory is touched.
 */
bool buffer_is_zero(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }

    /* Fetch the beginning of the buffer while we select the accelerator.  */
    __builtin_prefetch(buf);

    /* Use an optimized zero check if possible.  Note that this also
       includes a check for an unrolled loop over 64-bit integers.  */
    return select_accel_fn(buf, len);
}