/*
 * SPDX-License-Identifier: GPL-2.0-or-later
 * buffer_is_zero acceleration, aarch64 version.
 */

#ifdef __ARM_NEON
#include <arm_neon.h>

/*
 * Helper for preventing the compiler from reassociating
 * chains of binary vector operations.
 */
#define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1))

static bool buffer_is_zero_simd(const void *buf, size_t len)
{
    uint32x4_t t0, t1, t2, t3;

    /* Align head/tail to 16-byte boundaries. */
    const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);

    /* Unaligned loads at head/tail. */
    t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16);

    /* Collect a partial block at tail end. */
    t1 = e[-7] | e[-6];
    t2 = e[-5] | e[-4];
    t3 = e[-3] | e[-2];
    t0 |= e[-1];
    REASSOC_BARRIER(t0, t1);
    REASSOC_BARRIER(t2, t3);
    t0 |= t1;
    t2 |= t3;
    REASSOC_BARRIER(t0, t2);
    t0 |= t2;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
    do {
        /*
         * Reduce via UMAXV. Whatever the actual result,
         * it will only be zero if all input bytes are zero.
         */
        if (unlikely(vmaxvq_u32(t0) != 0)) {
            return false;
        }

        t0 = p[0] | p[1];
        t1 = p[2] | p[3];
        t2 = p[4] | p[5];
        t3 = p[6] | p[7];
        REASSOC_BARRIER(t0, t1);
        REASSOC_BARRIER(t2, t3);
        t0 |= t1;
        t2 |= t3;
        REASSOC_BARRIER(t0, t2);
        t0 |= t2;
        p += 8;
    } while (p < e - 7);

    return vmaxvq_u32(t0) == 0;
}

static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
    buffer_is_zero_simd,
};

#define best_accel() 1
#else
# include "host/include/generic/host/bufferiszero.c.inc"
#endif
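
/*
 * Illustrative harness (not part of QEMU): a minimal standalone way to
 * compile and exercise buffer_is_zero_simd on an aarch64 host.  QEMU
 * normally provides QEMU_ALIGN_PTR_DOWN, unlikely(), biz_accel_fn and
 * buffer_is_zero_int_ge256 via osdep.h and util/bufferiszero.c; the
 * stand-ins below are assumptions made only so this sketch is
 * self-contained.  Note that the helper assumes len >= 256 (hence the
 * "e - p >= 14" comment above), which the generic dispatcher guarantees.
 *
 *   #include <stdbool.h>
 *   #include <stddef.h>
 *   #include <stdint.h>
 *   #include <stdio.h>
 *
 *   // Stand-ins for QEMU infrastructure (hypothetical, sketch only).
 *   #define QEMU_ALIGN_PTR_DOWN(p, n) \
 *       ((__typeof__(p))((uintptr_t)(p) & ~((uintptr_t)(n) - 1)))
 *   #define unlikely(x) __builtin_expect(!!(x), 0)
 *   typedef bool (*biz_accel_fn)(const void *, size_t);
 *
 *   // Trivial scalar fallback standing in for QEMU's real one.
 *   static bool buffer_is_zero_int_ge256(const void *buf, size_t len)
 *   {
 *       const uint8_t *b = buf;
 *       while (len--) {
 *           if (b[len]) {
 *               return false;
 *           }
 *       }
 *       return true;
 *   }
 *
 *   #include "bufferiszero.c.inc"  // this file; adjust the path as needed
 *
 *   int main(void)
 *   {
 *       static uint8_t buf[4096];                         // zero-initialized
 *       printf("%d\n", buffer_is_zero_simd(buf, sizeof(buf)));  // prints 1
 *       buf[sizeof(buf) - 1] = 1;
 *       printf("%d\n", buffer_is_zero_simd(buf, sizeof(buf)));  // prints 0
 *       return 0;
 *   }
 */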