/*
 * ARM AdvSIMD / SVE Vector Helpers
 *
 * Copyright (c) 2020 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#ifndef TARGET_ARM_VEC_INTERNAL_H
#define TARGET_ARM_VEC_INTERNAL_H

#include "fpu/softfloat.h"

/*
 * Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that needs a host-endian fixup.
 *
 * The H<N> macros are used when indexing an array of elements of size N.
 *
 * The H1_<N> macros are used when performing byte arithmetic and then
 * casting the final pointer to a type of size N.
 */
#if HOST_BIG_ENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif
/*
 * Access to 64-bit elements isn't host-endian dependent; we provide H8
 * and H1_8 so that when a function is being generated from a macro we
 * can pass these rather than an empty macro argument, for clarity.
 */
#define H8(x)   (x)
#define H1_8(x) (x)
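
/*
 * For example (an illustrative sketch, not a helper defined here): to
 * read the @i'th 16-bit element of a vector fragment @vn in
 * architectural order regardless of host endianness:
 *
 *     uint16_t *n = vn;
 *     uint16_t elt = n[H2(i)];
 *
 * or, with byte arithmetic on the base pointer:
 *
 *     uint16_t elt = *(uint16_t *)((char *)vn + H1_2(i * 2));
 */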

/*
 * Expand active predicate bits to bytes, for byte elements.
 */
extern const uint64_t expand_pred_b_data[256];
static inline uint64_t expand_pred_b(uint8_t byte)
{
    return expand_pred_b_data[byte];
}

/* Similarly for half-word elements. */
extern const uint64_t expand_pred_h_data[0x55 + 1];
static inline uint64_t expand_pred_h(uint8_t byte)
{
    return expand_pred_h_data[byte & 0x55];
}
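
/*
 * Worked example (illustrative): for predicate bits 0b101,
 * expand_pred_b(0x05) yields 0x0000000000ff00ff -- each active bit
 * becomes an all-ones byte. Half-word elements use only the even
 * predicate bits, hence the mask with 0x55: expand_pred_h(0x05)
 * yields 0x00000000ffffffff.
 */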

/*
 * Zero the bytes of @vd from @opr_sz up to @max_sz, in 64-bit units;
 * both sizes are in bytes and assumed to be multiples of 8.
 */
static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}
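
/*
 * Typical use at the end of a gvec helper (a sketch; HELPER() and the
 * simd_oprsz()/simd_maxsz() descriptor accessors come from elsewhere
 * in QEMU, and gvec_foo is a made-up name):
 *
 *     void HELPER(gvec_foo)(void *vd, void *vn, uint32_t desc)
 *     {
 *         intptr_t i, opr_sz = simd_oprsz(desc);
 *         for (i = 0; i < opr_sz; i += 8) {
 *             ... operate on each 64-bit chunk ...
 *         }
 *         clear_tail(vd, opr_sz, simd_maxsz(desc));
 *     }
 */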

/*
 * Signed saturating rounding shift left, for element widths of
 * 8, 16 or 32 bits (@bits). A negative @shift is a right shift;
 * @round selects rounding; if @sat is non-NULL, the result
 * saturates and *sat is set on overflow.
 */
static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,
                                    bool round, uint32_t *sat)
{
    if (shift <= -bits) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        return src >> 31;
    } else if (shift < 0) {
        if (round) {
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < bits) {
        int32_t val = src << shift;
        if (bits == 32) {
            if (!sat || val >> shift == src) {
                return val;
            }
        } else {
            int32_t extval = sextract32(val, 0, bits);
            if (!sat || val == extval) {
                return extval;
            }
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return (1u << (bits - 1)) - (src >= 0);
}
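
/*
 * Worked examples (illustrative): with 8-bit elements,
 * do_sqrshl_bhs(0x40, 1, 8, false, &sat) overflows, returning 0x7f
 * and setting *sat, while do_sqrshl_bhs(3, -1, 8, true, &sat) is a
 * rounding right shift by 1: (3 + 1) >> 1 == 2.
 */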

/*
 * Unsigned saturating rounding shift left; as do_sqrshl_bhs,
 * but for unsigned elements.
 */
static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    if (shift <= -(bits + round)) {
        return 0;
    } else if (shift < 0) {
        if (round) {
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < bits) {
        uint32_t val = src << shift;
        if (bits == 32) {
            if (!sat || val >> shift == src) {
                return val;
            }
        } else {
            uint32_t extval = extract32(val, 0, bits);
            if (!sat || val == extval) {
                return extval;
            }
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return MAKE_64BIT_MASK(0, bits);
}

/*
 * Signed input, unsigned saturating rounding shift left;
 * a negative input saturates to 0.
 */
static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    if (sat && src < 0) {
        *sat = 1;
        return 0;
    }
    return do_uqrshl_bhs(src, shift, bits, round, sat);
}
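
/*
 * Similarly for the unsigned forms (illustrative): with 16-bit
 * elements, do_uqrshl_bhs(0x8000, 1, 16, false, &sat) saturates to
 * 0xffff, and do_suqrshl_bhs(-1, 0, 16, false, &sat) saturates to 0
 * because the signed input is negative.
 */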

/* As do_sqrshl_bhs, for 64-bit elements. */
static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,
                                  bool round, uint32_t *sat)
{
    if (shift <= -64) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        return src >> 63;
    } else if (shift < 0) {
        if (round) {
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < 64) {
        int64_t val = src << shift;
        if (!sat || val >> shift == src) {
            return val;
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return src < 0 ? INT64_MIN : INT64_MAX;
}

/* As do_uqrshl_bhs, for 64-bit elements. */
static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    if (shift <= -(64 + round)) {
        return 0;
    } else if (shift < 0) {
        if (round) {
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < 64) {
        uint64_t val = src << shift;
        if (!sat || val >> shift == src) {
            return val;
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return UINT64_MAX;
}

/* As do_suqrshl_bhs, for 64-bit elements. */
static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    if (sat && src < 0) {
        *sat = 1;
        return 0;
    }
    return do_uqrshl_d(src, shift, round, sat);
}
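
/*
 * Illustrative: do_uqrshl_d(1, 64, false, &sat) saturates to
 * UINT64_MAX and sets *sat, since any non-zero value shifted left by
 * 64 or more overflows; the signed form saturates to INT64_MIN or
 * INT64_MAX according to the sign of the input.
 */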

/*
 * Signed saturating (rounding) doubling multiply-accumulate helpers,
 * as used for SQDMULH, SQRDMULH, SQRDMLAH and SQRDMLSH; the two bool
 * arguments select negation of the product and rounding, respectively.
 */
int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool);
int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);

/**
 * bfdotadd:
 * @sum: addend
 * @e1, @e2: multiplicand vectors
 * @fpst: floating-point status to use
 *
 * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum.
 * The @e1 and @e2 operands correspond to the 32-bit source vector
 * slots and contain two BFloat16 values each.
 *
 * Corresponds to the ARM pseudocode function BFDotAdd, specialized
 * for the FPCR.EBF == 0 case.
 */
float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst);

/**
 * bfdotadd_ebf:
 * @sum: addend
 * @e1, @e2: multiplicand vectors
 * @fpst: floating-point status to use
 * @fpst_odd: floating-point status to use for round-to-odd operations
 *
 * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum.
 * The @e1 and @e2 operands correspond to the 32-bit source vector
 * slots and contain two BFloat16 values each.
 *
 * Corresponds to the ARM pseudocode function BFDotAdd, specialized
 * for the FPCR.EBF == 1 case.
 */
float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
                     float_status *fpst, float_status *fpst_odd);

/**
 * is_ebf:
 * @env: CPU state
 * @statusp: pointer to floating point status to fill in
 * @oddstatusp: pointer to floating point status to fill in for round-to-odd
 *
 * Determine whether a BFDotAdd operation should use FPCR.EBF = 0
 * or FPCR.EBF = 1 semantics. On return, it has initialized *statusp
 * and *oddstatusp to suitable float_status arguments to use with either
 * bfdotadd() or bfdotadd_ebf().
 * Returns true for EBF = 1, false for EBF = 0. (The caller should use this
 * to decide whether to call bfdotadd() or bfdotadd_ebf().)
 */
bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp);
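
/*
 * A minimal sketch of the intended call pattern:
 *
 *     float_status fpst, fpst_odd;
 *     if (is_ebf(env, &fpst, &fpst_odd)) {
 *         sum = bfdotadd_ebf(sum, e1, e2, &fpst, &fpst_odd);
 *     } else {
 *         sum = bfdotadd(sum, e1, e2, &fpst);
 *     }
 */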

/*
 * Negate as for FPCR.AH = 1 -- do not negate NaNs.
 */
static inline float16 float16_ah_chs(float16 a)
{
    return float16_is_any_nan(a) ? a : float16_chs(a);
}

static inline float32 float32_ah_chs(float32 a)
{
    return float32_is_any_nan(a) ? a : float32_chs(a);
}

static inline float64 float64_ah_chs(float64 a)
{
    return float64_is_any_nan(a) ? a : float64_chs(a);
}

/* As above if @fpcr_ah is set; otherwise negate unconditionally. */
static inline float16 float16_maybe_ah_chs(float16 a, bool fpcr_ah)
{
    return fpcr_ah && float16_is_any_nan(a) ? a : float16_chs(a);
}

static inline float32 float32_maybe_ah_chs(float32 a, bool fpcr_ah)
{
    return fpcr_ah && float32_is_any_nan(a) ? a : float32_chs(a);
}

static inline float64 float64_maybe_ah_chs(float64 a, bool fpcr_ah)
{
    return fpcr_ah && float64_is_any_nan(a) ? a : float64_chs(a);
}
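
/*
 * Illustrative use (a sketch, not a helper defined here): negating
 * one multiplicand of a fused multiply-add for an FMLS-style
 * operation, honouring FPCR.AH:
 *
 *     op1 = float32_maybe_ah_chs(op1, fpcr_ah);
 *     result = float32_muladd(op1, op2, dest, 0, fpst);
 */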

#endif /* TARGET_ARM_VEC_INTERNAL_H */