/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
2388ca8e80SRichard Henderson */ 2488ca8e80SRichard Henderson #include "qemu/osdep.h" 2588ca8e80SRichard Henderson #include "qemu/cutils.h" 265e33a872SRichard Henderson #include "qemu/bswap.h" 2751f4d916SRichard Henderson #include "host/cpuinfo.h" 2888ca8e80SRichard Henderson 29*0100ce2bSRichard Henderson typedef bool (*biz_accel_fn)(const void *, size_t); 30*0100ce2bSRichard Henderson static biz_accel_fn buffer_is_zero_accel; 31cbe3d526SAlexander Monakov 327ae6399aSRichard Henderson static bool buffer_is_zero_int_lt256(const void *buf, size_t len) 335e33a872SRichard Henderson { 347ae6399aSRichard Henderson uint64_t t; 357ae6399aSRichard Henderson const uint64_t *p, *e; 365e33a872SRichard Henderson 377ae6399aSRichard Henderson /* 387ae6399aSRichard Henderson * Use unaligned memory access functions to handle 397ae6399aSRichard Henderson * the beginning and end of the buffer. 407ae6399aSRichard Henderson */ 417ae6399aSRichard Henderson if (unlikely(len <= 8)) { 427ae6399aSRichard Henderson return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0; 437ae6399aSRichard Henderson } 447ae6399aSRichard Henderson 457ae6399aSRichard Henderson t = ldq_he_p(buf) | ldq_he_p(buf + len - 8); 467ae6399aSRichard Henderson p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8); 477ae6399aSRichard Henderson e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8); 487ae6399aSRichard Henderson 497ae6399aSRichard Henderson /* Read 0 to 31 aligned words from the middle. */ 507ae6399aSRichard Henderson while (p < e) { 515e33a872SRichard Henderson t |= *p++; 527ae6399aSRichard Henderson } 535e33a872SRichard Henderson return t == 0; 547ae6399aSRichard Henderson } 555e33a872SRichard Henderson 567ae6399aSRichard Henderson static bool buffer_is_zero_int_ge256(const void *buf, size_t len) 577ae6399aSRichard Henderson { 587ae6399aSRichard Henderson /* 597ae6399aSRichard Henderson * Use unaligned memory access functions to handle 607ae6399aSRichard Henderson * the beginning and end of the buffer. 
617ae6399aSRichard Henderson */ 627ae6399aSRichard Henderson uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8); 637ae6399aSRichard Henderson const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8); 647ae6399aSRichard Henderson const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8); 657ae6399aSRichard Henderson 667ae6399aSRichard Henderson /* Collect a partial block at the tail end. */ 677ae6399aSRichard Henderson t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1]; 687ae6399aSRichard Henderson 697ae6399aSRichard Henderson /* 707ae6399aSRichard Henderson * Loop over 64 byte blocks. 717ae6399aSRichard Henderson * With the head and tail removed, e - p >= 30, 727ae6399aSRichard Henderson * so the loop must iterate at least 3 times. 737ae6399aSRichard Henderson */ 747ae6399aSRichard Henderson do { 755e33a872SRichard Henderson if (t) { 765e33a872SRichard Henderson return false; 775e33a872SRichard Henderson } 785e33a872SRichard Henderson t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7]; 797ae6399aSRichard Henderson p += 8; 807ae6399aSRichard Henderson } while (p < e - 7); 815e33a872SRichard Henderson 825e33a872SRichard Henderson return t == 0; 835e33a872SRichard Henderson } 845e33a872SRichard Henderson 85d018425cSAlexander Monakov #if defined(CONFIG_AVX2_OPT) || defined(__SSE2__) 86701ea587SRichard Henderson #include <immintrin.h> 87d9911d14SRichard Henderson 88f28e0bbeSAlexander Monakov /* Helper for preventing the compiler from reassociating 89f28e0bbeSAlexander Monakov chains of binary vector operations. */ 90f28e0bbeSAlexander Monakov #define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1)) 91f28e0bbeSAlexander Monakov 92f28e0bbeSAlexander Monakov /* Note that these vectorized functions may assume len >= 256. 
*/ 93d9911d14SRichard Henderson 94701ea587SRichard Henderson static bool __attribute__((target("sse2"))) 95d9911d14SRichard Henderson buffer_zero_sse2(const void *buf, size_t len) 96d9911d14SRichard Henderson { 97f28e0bbeSAlexander Monakov /* Unaligned loads at head/tail. */ 98f28e0bbeSAlexander Monakov __m128i v = *(__m128i_u *)(buf); 99f28e0bbeSAlexander Monakov __m128i w = *(__m128i_u *)(buf + len - 16); 100f28e0bbeSAlexander Monakov /* Align head/tail to 16-byte boundaries. */ 101f28e0bbeSAlexander Monakov const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16); 102f28e0bbeSAlexander Monakov const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16); 103f28e0bbeSAlexander Monakov __m128i zero = { 0 }; 104d9911d14SRichard Henderson 105f28e0bbeSAlexander Monakov /* Collect a partial block at tail end. */ 106f28e0bbeSAlexander Monakov v |= e[-1]; w |= e[-2]; 107f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 108f28e0bbeSAlexander Monakov v |= e[-3]; w |= e[-4]; 109f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 110f28e0bbeSAlexander Monakov v |= e[-5]; w |= e[-6]; 111f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 112f28e0bbeSAlexander Monakov v |= e[-7]; v |= w; 113f28e0bbeSAlexander Monakov 114f28e0bbeSAlexander Monakov /* 115f28e0bbeSAlexander Monakov * Loop over complete 128-byte blocks. 116f28e0bbeSAlexander Monakov * With the head and tail removed, e - p >= 14, so the loop 117f28e0bbeSAlexander Monakov * must iterate at least once. 
118f28e0bbeSAlexander Monakov */ 119f28e0bbeSAlexander Monakov do { 120f28e0bbeSAlexander Monakov v = _mm_cmpeq_epi8(v, zero); 121f28e0bbeSAlexander Monakov if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) { 122d9911d14SRichard Henderson return false; 123d9911d14SRichard Henderson } 124f28e0bbeSAlexander Monakov v = p[0]; w = p[1]; 125f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 126f28e0bbeSAlexander Monakov v |= p[2]; w |= p[3]; 127f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 128f28e0bbeSAlexander Monakov v |= p[4]; w |= p[5]; 129f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 130f28e0bbeSAlexander Monakov v |= p[6]; w |= p[7]; 131f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 132f28e0bbeSAlexander Monakov v |= w; 133f28e0bbeSAlexander Monakov p += 8; 134f28e0bbeSAlexander Monakov } while (p < e - 7); 135d9911d14SRichard Henderson 136f28e0bbeSAlexander Monakov return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF; 137d9911d14SRichard Henderson } 13888ca8e80SRichard Henderson 1395e33a872SRichard Henderson #ifdef CONFIG_AVX2_OPT 140701ea587SRichard Henderson static bool __attribute__((target("avx2"))) 141d9911d14SRichard Henderson buffer_zero_avx2(const void *buf, size_t len) 142d9911d14SRichard Henderson { 143f28e0bbeSAlexander Monakov /* Unaligned loads at head/tail. */ 144f28e0bbeSAlexander Monakov __m256i v = *(__m256i_u *)(buf); 145f28e0bbeSAlexander Monakov __m256i w = *(__m256i_u *)(buf + len - 32); 146f28e0bbeSAlexander Monakov /* Align head/tail to 32-byte boundaries. */ 147f28e0bbeSAlexander Monakov const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32); 148f28e0bbeSAlexander Monakov const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32); 149f28e0bbeSAlexander Monakov __m256i zero = { 0 }; 150d9911d14SRichard Henderson 151f28e0bbeSAlexander Monakov /* Collect a partial block at tail end. 
*/ 152f28e0bbeSAlexander Monakov v |= e[-1]; w |= e[-2]; 153f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 154f28e0bbeSAlexander Monakov v |= e[-3]; w |= e[-4]; 155f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 156f28e0bbeSAlexander Monakov v |= e[-5]; w |= e[-6]; 157f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 158f28e0bbeSAlexander Monakov v |= e[-7]; v |= w; 159f28e0bbeSAlexander Monakov 160f28e0bbeSAlexander Monakov /* Loop over complete 256-byte blocks. */ 161f28e0bbeSAlexander Monakov for (; p < e - 7; p += 8) { 162f28e0bbeSAlexander Monakov /* PTEST is not profitable here. */ 163f28e0bbeSAlexander Monakov v = _mm256_cmpeq_epi8(v, zero); 164f28e0bbeSAlexander Monakov if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) { 165d9911d14SRichard Henderson return false; 166d9911d14SRichard Henderson } 167f28e0bbeSAlexander Monakov v = p[0]; w = p[1]; 168f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 169f28e0bbeSAlexander Monakov v |= p[2]; w |= p[3]; 170f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 171f28e0bbeSAlexander Monakov v |= p[4]; w |= p[5]; 172f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 173f28e0bbeSAlexander Monakov v |= p[6]; w |= p[7]; 174f28e0bbeSAlexander Monakov SSE_REASSOC_BARRIER(v, w); 175f28e0bbeSAlexander Monakov v |= w; 176f28e0bbeSAlexander Monakov } 177d9911d14SRichard Henderson 178f28e0bbeSAlexander Monakov return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF; 179d9911d14SRichard Henderson } 180d9911d14SRichard Henderson #endif /* CONFIG_AVX2_OPT */ 181d9911d14SRichard Henderson 18251f4d916SRichard Henderson static unsigned __attribute__((noinline)) 18351f4d916SRichard Henderson select_accel_cpuinfo(unsigned info) 184d9911d14SRichard Henderson { 18551f4d916SRichard Henderson /* Array is sorted in order of algorithm preference. 
*/ 18651f4d916SRichard Henderson static const struct { 18751f4d916SRichard Henderson unsigned bit; 188*0100ce2bSRichard Henderson biz_accel_fn fn; 18951f4d916SRichard Henderson } all[] = { 19051f4d916SRichard Henderson #ifdef CONFIG_AVX2_OPT 191cbe3d526SAlexander Monakov { CPUINFO_AVX2, buffer_zero_avx2 }, 19251f4d916SRichard Henderson #endif 193cbe3d526SAlexander Monakov { CPUINFO_SSE2, buffer_zero_sse2 }, 1947ae6399aSRichard Henderson { CPUINFO_ALWAYS, buffer_is_zero_int_ge256 }, 19551f4d916SRichard Henderson }; 19651f4d916SRichard Henderson 19751f4d916SRichard Henderson for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) { 19851f4d916SRichard Henderson if (info & all[i].bit) { 199cbe3d526SAlexander Monakov buffer_is_zero_accel = all[i].fn; 20051f4d916SRichard Henderson return all[i].bit; 20151f4d916SRichard Henderson } 20251f4d916SRichard Henderson } 20351f4d916SRichard Henderson return 0; 204d9911d14SRichard Henderson } 2055e33a872SRichard Henderson 206cbe3d526SAlexander Monakov static unsigned used_accel; 207cbe3d526SAlexander Monakov 20851f4d916SRichard Henderson static void __attribute__((constructor)) init_accel(void) 20988ca8e80SRichard Henderson { 21051f4d916SRichard Henderson used_accel = select_accel_cpuinfo(cpuinfo_init()); 2115e33a872SRichard Henderson } 212cbe3d526SAlexander Monakov 213cbe3d526SAlexander Monakov #define INIT_ACCEL NULL 2145e33a872SRichard Henderson 215efad6682SRichard Henderson bool test_buffer_is_zero_next_accel(void) 216efad6682SRichard Henderson { 21751f4d916SRichard Henderson /* 21851f4d916SRichard Henderson * Accumulate the accelerators that we've already tested, and 21951f4d916SRichard Henderson * remove them from the set to test this round. We'll get back 22051f4d916SRichard Henderson * a zero from select_accel_cpuinfo when there are no more. 
22151f4d916SRichard Henderson */ 22251f4d916SRichard Henderson unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel); 22351f4d916SRichard Henderson used_accel |= used; 22451f4d916SRichard Henderson return used; 225efad6682SRichard Henderson } 2265e33a872SRichard Henderson #else 227efad6682SRichard Henderson bool test_buffer_is_zero_next_accel(void) 228efad6682SRichard Henderson { 229efad6682SRichard Henderson return false; 230efad6682SRichard Henderson } 231cbe3d526SAlexander Monakov 2327ae6399aSRichard Henderson #define INIT_ACCEL buffer_is_zero_int_ge256 233efad6682SRichard Henderson #endif 234efad6682SRichard Henderson 235*0100ce2bSRichard Henderson static biz_accel_fn buffer_is_zero_accel = INIT_ACCEL; 236cbe3d526SAlexander Monakov 237cbe3d526SAlexander Monakov bool buffer_is_zero_ool(const void *buf, size_t len) 23888ca8e80SRichard Henderson { 2395e33a872SRichard Henderson if (unlikely(len == 0)) { 24088ca8e80SRichard Henderson return true; 24188ca8e80SRichard Henderson } 242cbe3d526SAlexander Monakov if (!buffer_is_zero_sample3(buf, len)) { 243cbe3d526SAlexander Monakov return false; 244cbe3d526SAlexander Monakov } 245cbe3d526SAlexander Monakov /* All bytes are covered for any len <= 3. */ 246cbe3d526SAlexander Monakov if (unlikely(len <= 3)) { 247cbe3d526SAlexander Monakov return true; 248cbe3d526SAlexander Monakov } 24988ca8e80SRichard Henderson 250cbe3d526SAlexander Monakov if (likely(len >= 256)) { 251cbe3d526SAlexander Monakov return buffer_is_zero_accel(buf, len); 252cbe3d526SAlexander Monakov } 2537ae6399aSRichard Henderson return buffer_is_zero_int_lt256(buf, len); 254cbe3d526SAlexander Monakov } 255083d012aSRichard Henderson 256cbe3d526SAlexander Monakov bool buffer_is_zero_ge256(const void *buf, size_t len) 257cbe3d526SAlexander Monakov { 258cbe3d526SAlexander Monakov return buffer_is_zero_accel(buf, len); 2595e33a872SRichard Henderson } 260