/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "host/cpuinfo.h"

static bool (*buffer_is_zero_accel)(const void *, size_t);

static bool buffer_is_zero_integer(const void *buf, size_t len)
{
    if (unlikely(len < 8)) {
        /* For a very small buffer, simply accumulate all the bytes.  */
        const unsigned char *p = buf;
        const unsigned char *e = buf + len;
        unsigned char t = 0;

        do {
            t |= *p++;
        } while (p < e);

        return t == 0;
    } else {
        /* Otherwise, use the unaligned memory access functions to
           handle the beginning and end of the buffer, with a couple
           of loops handling the middle aligned section.  */
        uint64_t t = ldq_he_p(buf);
        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);

        for (; p + 8 <= e; p += 8) {
            if (t) {
                return false;
            }
            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        }
        while (p < e) {
            t |= *p++;
        }
        t |= ldq_he_p(buf + len - 8);

        return t == 0;
    }
}

#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>

/* Note that each of these vectorized functions requires len >= 64.  */

static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 16 bytes.  */
    __m128i t = _mm_loadu_si128(buf);
    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
    __m128i zero = _mm_setzero_si128();

    /* Loop over 16-byte aligned blocks of 64.  */
    while (likely(p <= e)) {
        t = _mm_cmpeq_epi8(t, zero);
        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail.  */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail.  */
    t |= _mm_loadu_si128(buf + len - 16);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
}
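
/*
 * Both vector variants use the same scheme: an unaligned vector load
 * covers the head of the buffer, aligned loads cover the middle, and
 * a final unaligned load covers the last bytes, so every byte is
 * examined without touching memory outside the buffer.  Note that the
 * accumulator t holds the OR of the *previous* block at the point it
 * is tested, which overlaps the zero test with the loads of the next
 * block instead of serializing on every load.
 */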

#ifdef CONFIG_AVX2_OPT
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 32 bytes.  */
    __m256i t = _mm256_loadu_si256(buf);
    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);

    /* Loop over 32-byte aligned blocks of 128.  */
    while (p <= e) {
        if (unlikely(!_mm256_testz_si256(t, t))) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the last block of 128 unaligned.  */
    t |= _mm256_loadu_si256(buf + len - 4 * 32);
    t |= _mm256_loadu_si256(buf + len - 3 * 32);
    t |= _mm256_loadu_si256(buf + len - 2 * 32);
    t |= _mm256_loadu_si256(buf + len - 1 * 32);

    return _mm256_testz_si256(t, t);
}
#endif /* CONFIG_AVX2_OPT */

static unsigned __attribute__((noinline))
select_accel_cpuinfo(unsigned info)
{
    /* Array is sorted in order of algorithm preference.  */
    static const struct {
        unsigned bit;
        bool (*fn)(const void *, size_t);
    } all[] = {
#ifdef CONFIG_AVX2_OPT
        { CPUINFO_AVX2,   buffer_zero_avx2 },
#endif
        { CPUINFO_SSE2,   buffer_zero_sse2 },
        { CPUINFO_ALWAYS, buffer_is_zero_integer },
    };

    for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
        if (info & all[i].bit) {
            buffer_is_zero_accel = all[i].fn;
            return all[i].bit;
        }
    }
    return 0;
}

static unsigned used_accel;

static void __attribute__((constructor)) init_accel(void)
{
    used_accel = select_accel_cpuinfo(cpuinfo_init());
}

#define INIT_ACCEL NULL

bool test_buffer_is_zero_next_accel(void)
{
    /*
     * Accumulate the accelerators that we've already tested, and
     * remove them from the set to test this round.  We'll get back
     * a zero from select_accel_cpuinfo when there are no more.
     */
    unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel);
    used_accel |= used;
    return used;
}
#else
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}

#define INIT_ACCEL buffer_is_zero_integer
#endif

static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL;

bool buffer_is_zero_ool(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }
    if (!buffer_is_zero_sample3(buf, len)) {
        return false;
    }
    /* All bytes are covered for any len <= 3.  */
    if (unlikely(len <= 3)) {
        return true;
    }

    if (likely(len >= 256)) {
        return buffer_is_zero_accel(buf, len);
    }
    return buffer_is_zero_integer(buf, len);
}

bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    return buffer_is_zero_accel(buf, len);
}
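
/*
 * Illustrative usage (a sketch, not part of this file): callers are
 * expected to go through the buffer_is_zero() wrapper declared in
 * "qemu/cutils.h", which funnels into buffer_is_zero_ool() or, for
 * buffers known to be at least 256 bytes, into the sample3 test
 * followed by buffer_is_zero_ge256() above:
 *
 *     char sector[512] = { 0 };
 *
 *     if (buffer_is_zero(sector, sizeof(sector))) {
 *         ... the write of an all-zero block can be skipped ...
 *     }
 *
 * A unit test can call test_buffer_is_zero_next_accel() in a loop to
 * exercise every accelerator supported by the build and the host.
 */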