/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
23*88ca8e80SRichard Henderson */ 24*88ca8e80SRichard Henderson #include "qemu/osdep.h" 25*88ca8e80SRichard Henderson #include "qemu-common.h" 26*88ca8e80SRichard Henderson #include "qemu/cutils.h" 27*88ca8e80SRichard Henderson 28*88ca8e80SRichard Henderson 29*88ca8e80SRichard Henderson /* vector definitions */ 30*88ca8e80SRichard Henderson #ifdef __ALTIVEC__ 31*88ca8e80SRichard Henderson #include <altivec.h> 32*88ca8e80SRichard Henderson /* The altivec.h header says we're allowed to undef these for 33*88ca8e80SRichard Henderson * C++ compatibility. Here we don't care about C++, but we 34*88ca8e80SRichard Henderson * undef them anyway to avoid namespace pollution. 35*88ca8e80SRichard Henderson */ 36*88ca8e80SRichard Henderson #undef vector 37*88ca8e80SRichard Henderson #undef pixel 38*88ca8e80SRichard Henderson #undef bool 39*88ca8e80SRichard Henderson #define VECTYPE __vector unsigned char 40*88ca8e80SRichard Henderson #define SPLAT(p) vec_splat(vec_ld(0, p), 0) 41*88ca8e80SRichard Henderson #define ALL_EQ(v1, v2) vec_all_eq(v1, v2) 42*88ca8e80SRichard Henderson #define VEC_OR(v1, v2) ((v1) | (v2)) 43*88ca8e80SRichard Henderson /* altivec.h may redefine the bool macro as vector type. 44*88ca8e80SRichard Henderson * Reset it to POSIX semantics. 
*/ 45*88ca8e80SRichard Henderson #define bool _Bool 46*88ca8e80SRichard Henderson #elif defined __SSE2__ 47*88ca8e80SRichard Henderson #include <emmintrin.h> 48*88ca8e80SRichard Henderson #define VECTYPE __m128i 49*88ca8e80SRichard Henderson #define SPLAT(p) _mm_set1_epi8(*(p)) 50*88ca8e80SRichard Henderson #define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF) 51*88ca8e80SRichard Henderson #define VEC_OR(v1, v2) (_mm_or_si128(v1, v2)) 52*88ca8e80SRichard Henderson #elif defined(__aarch64__) 53*88ca8e80SRichard Henderson #include "arm_neon.h" 54*88ca8e80SRichard Henderson #define VECTYPE uint64x2_t 55*88ca8e80SRichard Henderson #define ALL_EQ(v1, v2) \ 56*88ca8e80SRichard Henderson ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \ 57*88ca8e80SRichard Henderson (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1))) 58*88ca8e80SRichard Henderson #define VEC_OR(v1, v2) ((v1) | (v2)) 59*88ca8e80SRichard Henderson #else 60*88ca8e80SRichard Henderson #define VECTYPE unsigned long 61*88ca8e80SRichard Henderson #define SPLAT(p) (*(p) * (~0UL / 255)) 62*88ca8e80SRichard Henderson #define ALL_EQ(v1, v2) ((v1) == (v2)) 63*88ca8e80SRichard Henderson #define VEC_OR(v1, v2) ((v1) | (v2)) 64*88ca8e80SRichard Henderson #endif 65*88ca8e80SRichard Henderson 66*88ca8e80SRichard Henderson #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 67*88ca8e80SRichard Henderson 68*88ca8e80SRichard Henderson static bool 69*88ca8e80SRichard Henderson can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) 70*88ca8e80SRichard Henderson { 71*88ca8e80SRichard Henderson return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 72*88ca8e80SRichard Henderson * sizeof(VECTYPE)) == 0 73*88ca8e80SRichard Henderson && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); 74*88ca8e80SRichard Henderson } 75*88ca8e80SRichard Henderson 76*88ca8e80SRichard Henderson /* 77*88ca8e80SRichard Henderson * Searches for an area with non-zero content in a buffer 78*88ca8e80SRichard Henderson * 
79*88ca8e80SRichard Henderson * Attention! The len must be a multiple of 80*88ca8e80SRichard Henderson * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE) 81*88ca8e80SRichard Henderson * and addr must be a multiple of sizeof(VECTYPE) due to 82*88ca8e80SRichard Henderson * restriction of optimizations in this function. 83*88ca8e80SRichard Henderson * 84*88ca8e80SRichard Henderson * can_use_buffer_find_nonzero_offset_inner() can be used to 85*88ca8e80SRichard Henderson * check these requirements. 86*88ca8e80SRichard Henderson * 87*88ca8e80SRichard Henderson * The return value is the offset of the non-zero area rounded 88*88ca8e80SRichard Henderson * down to a multiple of sizeof(VECTYPE) for the first 89*88ca8e80SRichard Henderson * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to 90*88ca8e80SRichard Henderson * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE) 91*88ca8e80SRichard Henderson * afterwards. 92*88ca8e80SRichard Henderson * 93*88ca8e80SRichard Henderson * If the buffer is all zero the return value is equal to len. 
94*88ca8e80SRichard Henderson */ 95*88ca8e80SRichard Henderson 96*88ca8e80SRichard Henderson static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len) 97*88ca8e80SRichard Henderson { 98*88ca8e80SRichard Henderson const VECTYPE *p = buf; 99*88ca8e80SRichard Henderson const VECTYPE zero = (VECTYPE){0}; 100*88ca8e80SRichard Henderson size_t i; 101*88ca8e80SRichard Henderson 102*88ca8e80SRichard Henderson assert(can_use_buffer_find_nonzero_offset_inner(buf, len)); 103*88ca8e80SRichard Henderson 104*88ca8e80SRichard Henderson if (!len) { 105*88ca8e80SRichard Henderson return 0; 106*88ca8e80SRichard Henderson } 107*88ca8e80SRichard Henderson 108*88ca8e80SRichard Henderson for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { 109*88ca8e80SRichard Henderson if (!ALL_EQ(p[i], zero)) { 110*88ca8e80SRichard Henderson return i * sizeof(VECTYPE); 111*88ca8e80SRichard Henderson } 112*88ca8e80SRichard Henderson } 113*88ca8e80SRichard Henderson 114*88ca8e80SRichard Henderson for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; 115*88ca8e80SRichard Henderson i < len / sizeof(VECTYPE); 116*88ca8e80SRichard Henderson i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { 117*88ca8e80SRichard Henderson VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]); 118*88ca8e80SRichard Henderson VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]); 119*88ca8e80SRichard Henderson VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]); 120*88ca8e80SRichard Henderson VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]); 121*88ca8e80SRichard Henderson VECTYPE tmp01 = VEC_OR(tmp0, tmp1); 122*88ca8e80SRichard Henderson VECTYPE tmp23 = VEC_OR(tmp2, tmp3); 123*88ca8e80SRichard Henderson if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) { 124*88ca8e80SRichard Henderson break; 125*88ca8e80SRichard Henderson } 126*88ca8e80SRichard Henderson } 127*88ca8e80SRichard Henderson 128*88ca8e80SRichard Henderson return i * sizeof(VECTYPE); 129*88ca8e80SRichard Henderson } 130*88ca8e80SRichard Henderson 131*88ca8e80SRichard Henderson #if defined 
CONFIG_AVX2_OPT 132*88ca8e80SRichard Henderson #pragma GCC push_options 133*88ca8e80SRichard Henderson #pragma GCC target("avx2") 134*88ca8e80SRichard Henderson #include <cpuid.h> 135*88ca8e80SRichard Henderson #include <immintrin.h> 136*88ca8e80SRichard Henderson 137*88ca8e80SRichard Henderson #define AVX2_VECTYPE __m256i 138*88ca8e80SRichard Henderson #define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) 139*88ca8e80SRichard Henderson #define AVX2_ALL_EQ(v1, v2) \ 140*88ca8e80SRichard Henderson (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF) 141*88ca8e80SRichard Henderson #define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) 142*88ca8e80SRichard Henderson 143*88ca8e80SRichard Henderson static bool 144*88ca8e80SRichard Henderson can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) 145*88ca8e80SRichard Henderson { 146*88ca8e80SRichard Henderson return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 147*88ca8e80SRichard Henderson * sizeof(AVX2_VECTYPE)) == 0 148*88ca8e80SRichard Henderson && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); 149*88ca8e80SRichard Henderson } 150*88ca8e80SRichard Henderson 151*88ca8e80SRichard Henderson static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) 152*88ca8e80SRichard Henderson { 153*88ca8e80SRichard Henderson const AVX2_VECTYPE *p = buf; 154*88ca8e80SRichard Henderson const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; 155*88ca8e80SRichard Henderson size_t i; 156*88ca8e80SRichard Henderson 157*88ca8e80SRichard Henderson assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); 158*88ca8e80SRichard Henderson 159*88ca8e80SRichard Henderson if (!len) { 160*88ca8e80SRichard Henderson return 0; 161*88ca8e80SRichard Henderson } 162*88ca8e80SRichard Henderson 163*88ca8e80SRichard Henderson for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { 164*88ca8e80SRichard Henderson if (!AVX2_ALL_EQ(p[i], zero)) { 165*88ca8e80SRichard Henderson return i * sizeof(AVX2_VECTYPE); 
166*88ca8e80SRichard Henderson } 167*88ca8e80SRichard Henderson } 168*88ca8e80SRichard Henderson 169*88ca8e80SRichard Henderson for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; 170*88ca8e80SRichard Henderson i < len / sizeof(AVX2_VECTYPE); 171*88ca8e80SRichard Henderson i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { 172*88ca8e80SRichard Henderson AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); 173*88ca8e80SRichard Henderson AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); 174*88ca8e80SRichard Henderson AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); 175*88ca8e80SRichard Henderson AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); 176*88ca8e80SRichard Henderson AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); 177*88ca8e80SRichard Henderson AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3); 178*88ca8e80SRichard Henderson if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) { 179*88ca8e80SRichard Henderson break; 180*88ca8e80SRichard Henderson } 181*88ca8e80SRichard Henderson } 182*88ca8e80SRichard Henderson 183*88ca8e80SRichard Henderson return i * sizeof(AVX2_VECTYPE); 184*88ca8e80SRichard Henderson } 185*88ca8e80SRichard Henderson 186*88ca8e80SRichard Henderson static bool avx2_support(void) 187*88ca8e80SRichard Henderson { 188*88ca8e80SRichard Henderson int a, b, c, d; 189*88ca8e80SRichard Henderson 190*88ca8e80SRichard Henderson if (__get_cpuid_max(0, NULL) < 7) { 191*88ca8e80SRichard Henderson return false; 192*88ca8e80SRichard Henderson } 193*88ca8e80SRichard Henderson 194*88ca8e80SRichard Henderson __cpuid_count(7, 0, a, b, c, d); 195*88ca8e80SRichard Henderson 196*88ca8e80SRichard Henderson return b & bit_AVX2; 197*88ca8e80SRichard Henderson } 198*88ca8e80SRichard Henderson 199*88ca8e80SRichard Henderson bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \ 200*88ca8e80SRichard Henderson __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc"))); 201*88ca8e80SRichard Henderson size_t buffer_find_nonzero_offset(const void 
*buf, size_t len) \ 202*88ca8e80SRichard Henderson __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc"))); 203*88ca8e80SRichard Henderson 204*88ca8e80SRichard Henderson static void *buffer_find_nonzero_offset_ifunc(void) 205*88ca8e80SRichard Henderson { 206*88ca8e80SRichard Henderson typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ? 207*88ca8e80SRichard Henderson buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner; 208*88ca8e80SRichard Henderson 209*88ca8e80SRichard Henderson return func; 210*88ca8e80SRichard Henderson } 211*88ca8e80SRichard Henderson 212*88ca8e80SRichard Henderson static void *can_use_buffer_find_nonzero_offset_ifunc(void) 213*88ca8e80SRichard Henderson { 214*88ca8e80SRichard Henderson typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ? 215*88ca8e80SRichard Henderson can_use_buffer_find_nonzero_offset_avx2 : 216*88ca8e80SRichard Henderson can_use_buffer_find_nonzero_offset_inner; 217*88ca8e80SRichard Henderson 218*88ca8e80SRichard Henderson return func; 219*88ca8e80SRichard Henderson } 220*88ca8e80SRichard Henderson #pragma GCC pop_options 221*88ca8e80SRichard Henderson #else 222*88ca8e80SRichard Henderson bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) 223*88ca8e80SRichard Henderson { 224*88ca8e80SRichard Henderson return can_use_buffer_find_nonzero_offset_inner(buf, len); 225*88ca8e80SRichard Henderson } 226*88ca8e80SRichard Henderson 227*88ca8e80SRichard Henderson size_t buffer_find_nonzero_offset(const void *buf, size_t len) 228*88ca8e80SRichard Henderson { 229*88ca8e80SRichard Henderson return buffer_find_nonzero_offset_inner(buf, len); 230*88ca8e80SRichard Henderson } 231*88ca8e80SRichard Henderson #endif 232*88ca8e80SRichard Henderson 233*88ca8e80SRichard Henderson /* 234*88ca8e80SRichard Henderson * Checks if a buffer is all zeroes 235*88ca8e80SRichard Henderson * 236*88ca8e80SRichard Henderson * Attention! 
The len must be a multiple of 4 * sizeof(long) due to 237*88ca8e80SRichard Henderson * restriction of optimizations in this function. 238*88ca8e80SRichard Henderson */ 239*88ca8e80SRichard Henderson bool buffer_is_zero(const void *buf, size_t len) 240*88ca8e80SRichard Henderson { 241*88ca8e80SRichard Henderson /* 242*88ca8e80SRichard Henderson * Use long as the biggest available internal data type that fits into the 243*88ca8e80SRichard Henderson * CPU register and unroll the loop to smooth out the effect of memory 244*88ca8e80SRichard Henderson * latency. 245*88ca8e80SRichard Henderson */ 246*88ca8e80SRichard Henderson 247*88ca8e80SRichard Henderson size_t i; 248*88ca8e80SRichard Henderson long d0, d1, d2, d3; 249*88ca8e80SRichard Henderson const long * const data = buf; 250*88ca8e80SRichard Henderson 251*88ca8e80SRichard Henderson /* use vector optimized zero check if possible */ 252*88ca8e80SRichard Henderson if (can_use_buffer_find_nonzero_offset(buf, len)) { 253*88ca8e80SRichard Henderson return buffer_find_nonzero_offset(buf, len) == len; 254*88ca8e80SRichard Henderson } 255*88ca8e80SRichard Henderson 256*88ca8e80SRichard Henderson assert(len % (4 * sizeof(long)) == 0); 257*88ca8e80SRichard Henderson len /= sizeof(long); 258*88ca8e80SRichard Henderson 259*88ca8e80SRichard Henderson for (i = 0; i < len; i += 4) { 260*88ca8e80SRichard Henderson d0 = data[i + 0]; 261*88ca8e80SRichard Henderson d1 = data[i + 1]; 262*88ca8e80SRichard Henderson d2 = data[i + 2]; 263*88ca8e80SRichard Henderson d3 = data[i + 3]; 264*88ca8e80SRichard Henderson 265*88ca8e80SRichard Henderson if (d0 || d1 || d2 || d3) { 266*88ca8e80SRichard Henderson return false; 267*88ca8e80SRichard Henderson } 268*88ca8e80SRichard Henderson } 269*88ca8e80SRichard Henderson 270*88ca8e80SRichard Henderson return true; 271*88ca8e80SRichard Henderson } 272*88ca8e80SRichard Henderson 273