xref: /openbmc/qemu/target/arm/tcg/vfp_helper.c (revision d74169e09e1d424aaca138966f460520a0d4dd0d)
1 /*
2  * ARM VFP floating-point operations
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "cpu-features.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/log.h"
26 
27 #define HELPER_H "tcg/helper.h"
28 #include "exec/helper-proto.h.inc"
29 
30 /*
31  * Set the float_status behaviour to match the Arm defaults:
32  *  * tininess-before-rounding
33  *  * 2-input NaN propagation prefers SNaN over QNaN, and then
34  *    operand A over operand B (see FPProcessNaNs() pseudocode)
35  *  * 3-input NaN propagation prefers SNaN over QNaN, and then
36  *    operand C over A over B (see FPProcessNaNs3() pseudocode,
37  *    but note that for QEMU muladd is a * b + c, whereas for
38  *    the pseudocode function the arguments are in the order c, a, b.
39  *  * 0 * Inf + NaN returns the default NaN if the input NaN is quiet,
40  *    and the input NaN if it is signalling
41  *  * Default NaN has sign bit clear, msb frac bit set
42  */
43 void arm_set_default_fp_behaviours(float_status *s)
44 {
45     set_float_detect_tininess(float_tininess_before_rounding, s);
46     set_float_ftz_detection(float_ftz_before_rounding, s);
47     set_float_2nan_prop_rule(float_2nan_prop_s_ab, s);
48     set_float_3nan_prop_rule(float_3nan_prop_s_cab, s);
49     set_float_infzeronan_rule(float_infzeronan_dnan_if_qnan, s);
50     set_float_default_nan_pattern(0b01000000, s);
51 }
52 
53 /*
54  * Set the float_status behaviour to match the FEAT_AFP
55  * FPCR.AH=1 requirements:
56  *  * tininess-after-rounding
57  *  * 2-input NaN propagation prefers the first NaN
58  *  * 3-input NaN propagation prefers a over b over c
59  *  * 0 * Inf + NaN always returns the input NaN and doesn't
60  *    set Invalid for a QNaN
61  *  * default NaN has sign bit set, msb frac bit set
62  */
63 void arm_set_ah_fp_behaviours(float_status *s)
64 {
65     set_float_detect_tininess(float_tininess_after_rounding, s);
66     set_float_ftz_detection(float_ftz_after_rounding, s);
67     set_float_2nan_prop_rule(float_2nan_prop_ab, s);
68     set_float_3nan_prop_rule(float_3nan_prop_abc, s);
69     set_float_infzeronan_rule(float_infzeronan_dnan_never |
70                               float_infzeronan_suppress_invalid, s);
71     set_float_default_nan_pattern(0b11000000, s);
72 }
73 
74 /* Convert host exception flags to vfp form.  */
75 static inline uint32_t vfp_exceptbits_from_host(int host_bits, bool ah)
76 {
77     uint32_t target_bits = 0;
78 
79     if (host_bits & float_flag_invalid) {
80         target_bits |= FPSR_IOC;
81     }
82     if (host_bits & float_flag_divbyzero) {
83         target_bits |= FPSR_DZC;
84     }
85     if (host_bits & float_flag_overflow) {
86         target_bits |= FPSR_OFC;
87     }
88     if (host_bits & (float_flag_underflow | float_flag_output_denormal_flushed)) {
89         target_bits |= FPSR_UFC;
90     }
91     if (host_bits & float_flag_inexact) {
92         target_bits |= FPSR_IXC;
93     }
94     if (host_bits & float_flag_input_denormal_flushed) {
95         target_bits |= FPSR_IDC;
96     }
97     /*
98      * With FPCR.AH, IDC is set when an input denormal is used,
99      * and flushing an output denormal to zero sets both IXC and UFC.
100      */
101     if (ah && (host_bits & float_flag_input_denormal_used)) {
102         target_bits |= FPSR_IDC;
103     }
104     if (ah && (host_bits & float_flag_output_denormal_flushed)) {
105         target_bits |= FPSR_IXC;
106     }
107     return target_bits;
108 }
109 
110 uint32_t vfp_get_fpsr_from_host(CPUARMState *env)
111 {
112     uint32_t a32_flags = 0, a64_flags = 0;
113 
114     a32_flags |= get_float_exception_flags(&env->vfp.fp_status[FPST_A32]);
115     a32_flags |= get_float_exception_flags(&env->vfp.fp_status[FPST_STD]);
116     /* FZ16 does not generate an input denormal exception.  */
117     a32_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_A32_F16])
118           & ~float_flag_input_denormal_flushed);
119     a32_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_STD_F16])
120           & ~float_flag_input_denormal_flushed);
121 
122     a64_flags |= get_float_exception_flags(&env->vfp.fp_status[FPST_A64]);
123     a64_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_A64_F16])
124           & ~(float_flag_input_denormal_flushed | float_flag_input_denormal_used));
125     /*
126      * We do not merge in flags from FPST_AH or FPST_AH_F16, because
127      * they are used for insns that must not set the cumulative exception bits.
128      */
129 
130     /*
131      * Flushing an input denormal *only* because FPCR.FIZ == 1 does
132      * not set FPSR.IDC; if FPCR.FZ is also set then this takes
133      * precedence and IDC is set (see the FPUnpackBase pseudocode).
134      * So squash it unless (FPCR.AH == 0 && FPCR.FZ == 1).
135      * We only do this for the a64 flags because FIZ has no effect
136      * on AArch32 even if it is set.
137      */
138     if ((env->vfp.fpcr & (FPCR_FZ | FPCR_AH)) != FPCR_FZ) {
139         a64_flags &= ~float_flag_input_denormal_flushed;
140     }
141     return vfp_exceptbits_from_host(a64_flags, env->vfp.fpcr & FPCR_AH) |
142         vfp_exceptbits_from_host(a32_flags, false);
143 }
144 
145 void vfp_clear_float_status_exc_flags(CPUARMState *env)
146 {
147     /*
148      * Clear out all the exception-flag information in the float_status
149      * values. The caller should have arranged for env->vfp.fpsr to
150      * be the architecturally up-to-date exception flag information first.
151      */
152     set_float_exception_flags(0, &env->vfp.fp_status[FPST_A32]);
153     set_float_exception_flags(0, &env->vfp.fp_status[FPST_A64]);
154     set_float_exception_flags(0, &env->vfp.fp_status[FPST_A32_F16]);
155     set_float_exception_flags(0, &env->vfp.fp_status[FPST_A64_F16]);
156     set_float_exception_flags(0, &env->vfp.fp_status[FPST_STD]);
157     set_float_exception_flags(0, &env->vfp.fp_status[FPST_STD_F16]);
158     set_float_exception_flags(0, &env->vfp.fp_status[FPST_AH]);
159     set_float_exception_flags(0, &env->vfp.fp_status[FPST_AH_F16]);
160 }
161 
162 static void vfp_sync_and_clear_float_status_exc_flags(CPUARMState *env)
163 {
164     /*
165      * Synchronize any pending exception-flag information in the
166      * float_status values into env->vfp.fpsr, and then clear out
167      * the float_status data.
168      */
169     env->vfp.fpsr |= vfp_get_fpsr_from_host(env);
170     vfp_clear_float_status_exc_flags(env);
171 }
172 
173 void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask)
174 {
175     uint64_t changed = env->vfp.fpcr;
176 
177     changed ^= val;
178     changed &= mask;
179     if (changed & (3 << 22)) {
180         int i = (val >> 22) & 3;
181         switch (i) {
182         case FPROUNDING_TIEEVEN:
183             i = float_round_nearest_even;
184             break;
185         case FPROUNDING_POSINF:
186             i = float_round_up;
187             break;
188         case FPROUNDING_NEGINF:
189             i = float_round_down;
190             break;
191         case FPROUNDING_ZERO:
192             i = float_round_to_zero;
193             break;
194         }
195         set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A32]);
196         set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64]);
197         set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A32_F16]);
198         set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64_F16]);
199     }
200     if (changed & FPCR_FZ16) {
201         bool ftz_enabled = val & FPCR_FZ16;
202         set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32_F16]);
203         set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]);
204         set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]);
205         set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]);
206         set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32_F16]);
207         set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]);
208         set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]);
209         set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]);
210     }
211     if (changed & FPCR_FZ) {
212         bool ftz_enabled = val & FPCR_FZ;
213         set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]);
214         set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64]);
215         /* FIZ is A64 only so FZ always makes A32 code flush inputs to zero */
216         set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]);
217     }
218     if (changed & (FPCR_FZ | FPCR_AH | FPCR_FIZ)) {
219         /*
220          * A64: Flush denormalized inputs to zero if FPCR.FIZ = 1, or
221          * both FPCR.AH = 0 and FPCR.FZ = 1.
222          */
223         bool fitz_enabled = (val & FPCR_FIZ) ||
224             (val & (FPCR_FZ | FPCR_AH)) == FPCR_FZ;
225         set_flush_inputs_to_zero(fitz_enabled, &env->vfp.fp_status[FPST_A64]);
226     }
227     if (changed & FPCR_DN) {
228         bool dnan_enabled = val & FPCR_DN;
229         set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A32]);
230         set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A64]);
231         set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A32_F16]);
232         set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A64_F16]);
233         set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_AH]);
234         set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_AH_F16]);
235     }
236     if (changed & FPCR_AH) {
237         bool ah_enabled = val & FPCR_AH;
238 
239         if (ah_enabled) {
240             /* Change behaviours for A64 FP operations */
241             arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64]);
242             arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]);
243         } else {
244             arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64]);
245             arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]);
246         }
247     }
248     /*
249      * If any bits changed that we look at in vfp_get_fpsr_from_host(),
250      * we must sync the float_status flags into vfp.fpsr now (under the
251      * old regime) before we update vfp.fpcr.
252      */
253     if (changed & (FPCR_FZ | FPCR_AH | FPCR_FIZ)) {
254         vfp_sync_and_clear_float_status_exc_flags(env);
255     }
256 }
257 
258 /*
259  * VFP support.  We follow the convention used for VFP instructions:
260  * Single precision routines have a "s" suffix, double precision a
261  * "d" suffix.
262  */
263 
264 #define VFP_HELPER(name, p) HELPER(glue(glue(vfp_,name),p))
265 
266 #define VFP_BINOP(name) \
267 dh_ctype_f16 VFP_HELPER(name, h)(dh_ctype_f16 a, dh_ctype_f16 b, float_status *fpst) \
268 { \
269     return float16_ ## name(a, b, fpst); \
270 } \
271 float32 VFP_HELPER(name, s)(float32 a, float32 b, float_status *fpst) \
272 { \
273     return float32_ ## name(a, b, fpst); \
274 } \
275 float64 VFP_HELPER(name, d)(float64 a, float64 b, float_status *fpst) \
276 { \
277     return float64_ ## name(a, b, fpst); \
278 }
279 VFP_BINOP(add)
280 VFP_BINOP(sub)
281 VFP_BINOP(mul)
282 VFP_BINOP(div)
283 VFP_BINOP(min)
284 VFP_BINOP(max)
285 VFP_BINOP(minnum)
286 VFP_BINOP(maxnum)
287 #undef VFP_BINOP
288 
289 dh_ctype_f16 VFP_HELPER(sqrt, h)(dh_ctype_f16 a, float_status *fpst)
290 {
291     return float16_sqrt(a, fpst);
292 }
293 
294 float32 VFP_HELPER(sqrt, s)(float32 a, float_status *fpst)
295 {
296     return float32_sqrt(a, fpst);
297 }
298 
299 float64 VFP_HELPER(sqrt, d)(float64 a, float_status *fpst)
300 {
301     return float64_sqrt(a, fpst);
302 }
303 
304 static void softfloat_to_vfp_compare(CPUARMState *env, FloatRelation cmp)
305 {
306     uint32_t flags;
307     switch (cmp) {
308     case float_relation_equal:
309         flags = 0x6;
310         break;
311     case float_relation_less:
312         flags = 0x8;
313         break;
314     case float_relation_greater:
315         flags = 0x2;
316         break;
317     case float_relation_unordered:
318         flags = 0x3;
319         break;
320     default:
321         g_assert_not_reached();
322     }
323     env->vfp.fpsr = deposit64(env->vfp.fpsr, 28, 4, flags); /* NZCV */
324 }
325 
326 /* XXX: check quiet/signaling case */
327 #define DO_VFP_cmp(P, FLOATTYPE, ARGTYPE, FPST) \
328 void VFP_HELPER(cmp, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env)  \
329 { \
330     softfloat_to_vfp_compare(env, \
331         FLOATTYPE ## _compare_quiet(a, b, &env->vfp.fp_status[FPST])); \
332 } \
333 void VFP_HELPER(cmpe, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env) \
334 { \
335     softfloat_to_vfp_compare(env, \
336         FLOATTYPE ## _compare(a, b, &env->vfp.fp_status[FPST])); \
337 }
338 DO_VFP_cmp(h, float16, dh_ctype_f16, FPST_A32_F16)
339 DO_VFP_cmp(s, float32, float32, FPST_A32)
340 DO_VFP_cmp(d, float64, float64, FPST_A32)
341 #undef DO_VFP_cmp
342 
343 /* Integer to float and float to integer conversions */
344 
345 #define CONV_ITOF(name, ftype, fsz, sign)                           \
346 ftype HELPER(name)(uint32_t x, float_status *fpst)                  \
347 {                                                                   \
348     return sign##int32_to_##float##fsz((sign##int32_t)x, fpst);     \
349 }
350 
351 #define CONV_FTOI(name, ftype, fsz, sign, round)                \
352 sign##int32_t HELPER(name)(ftype x, float_status *fpst)         \
353 {                                                               \
354     if (float##fsz##_is_any_nan(x)) {                           \
355         float_raise(float_flag_invalid, fpst);                  \
356         return 0;                                               \
357     }                                                           \
358     return float##fsz##_to_##sign##int32##round(x, fpst);       \
359 }
360 
361 #define FLOAT_CONVS(name, p, ftype, fsz, sign)            \
362     CONV_ITOF(vfp_##name##to##p, ftype, fsz, sign)        \
363     CONV_FTOI(vfp_to##name##p, ftype, fsz, sign, )        \
364     CONV_FTOI(vfp_to##name##z##p, ftype, fsz, sign, _round_to_zero)
365 
366 FLOAT_CONVS(si, h, uint32_t, 16, )
367 FLOAT_CONVS(si, s, float32, 32, )
368 FLOAT_CONVS(si, d, float64, 64, )
369 FLOAT_CONVS(ui, h, uint32_t, 16, u)
370 FLOAT_CONVS(ui, s, float32, 32, u)
371 FLOAT_CONVS(ui, d, float64, 64, u)
372 
373 #undef CONV_ITOF
374 #undef CONV_FTOI
375 #undef FLOAT_CONVS
376 
377 /* floating point conversion */
378 float64 VFP_HELPER(fcvtd, s)(float32 x, float_status *status)
379 {
380     return float32_to_float64(x, status);
381 }
382 
383 float32 VFP_HELPER(fcvts, d)(float64 x, float_status *status)
384 {
385     return float64_to_float32(x, status);
386 }
387 
388 uint32_t HELPER(bfcvt)(float32 x, float_status *status)
389 {
390     return float32_to_bfloat16(x, status);
391 }
392 
393 uint32_t HELPER(bfcvt_pair)(uint64_t pair, float_status *status)
394 {
395     bfloat16 lo = float32_to_bfloat16(extract64(pair, 0, 32), status);
396     bfloat16 hi = float32_to_bfloat16(extract64(pair, 32, 32), status);
397     return deposit32(lo, 16, 16, hi);
398 }
399 
400 /*
401  * VFP3 fixed point conversion. The AArch32 versions of fix-to-float
402  * must always round-to-nearest; the AArch64 ones honour the FPSCR
403  * rounding mode. (For AArch32 Neon the standard-FPSCR is set to
404  * round-to-nearest so either helper will work.) AArch32 float-to-fix
405  * must round-to-zero.
406  */
407 #define VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype)            \
408 ftype HELPER(vfp_##name##to##p)(uint##isz##_t  x, uint32_t shift,      \
409                                 float_status *fpst)                    \
410 { return itype##_to_##float##fsz##_scalbn(x, -shift, fpst); }
411 
412 #define VFP_CONV_FIX_FLOAT_ROUND(name, p, fsz, ftype, isz, itype)      \
413     ftype HELPER(vfp_##name##to##p##_round_to_nearest)(uint##isz##_t  x, \
414                                                      uint32_t shift,   \
415                                                      float_status *fpst) \
416     {                                                                  \
417         ftype ret;                                                     \
418         FloatRoundMode oldmode = fpst->float_rounding_mode;            \
419         fpst->float_rounding_mode = float_round_nearest_even;          \
420         ret = itype##_to_##float##fsz##_scalbn(x, -shift, fpst);       \
421         fpst->float_rounding_mode = oldmode;                           \
422         return ret;                                                    \
423     }
424 
425 #define VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, ROUND, suff) \
426 uint##isz##_t HELPER(vfp_to##name##p##suff)(ftype x, uint32_t shift,      \
427                                             float_status *fpst)           \
428 {                                                                         \
429     if (unlikely(float##fsz##_is_any_nan(x))) {                           \
430         float_raise(float_flag_invalid, fpst);                            \
431         return 0;                                                         \
432     }                                                                     \
433     return float##fsz##_to_##itype##_scalbn(x, ROUND, shift, fpst);       \
434 }
435 
436 #define VFP_CONV_FIX(name, p, fsz, ftype, isz, itype)            \
437 VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype)              \
438 VFP_CONV_FIX_FLOAT_ROUND(name, p, fsz, ftype, isz, itype)        \
439 VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype,        \
440                          float_round_to_zero, _round_to_zero)    \
441 VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype,        \
442                          get_float_rounding_mode(fpst), )
443 
444 #define VFP_CONV_FIX_A64(name, p, fsz, ftype, isz, itype)        \
445 VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype)              \
446 VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype,        \
447                          get_float_rounding_mode(fpst), )
448 
449 VFP_CONV_FIX(sh, d, 64, float64, 64, int16)
450 VFP_CONV_FIX(sl, d, 64, float64, 64, int32)
451 VFP_CONV_FIX_A64(sq, d, 64, float64, 64, int64)
452 VFP_CONV_FIX(uh, d, 64, float64, 64, uint16)
453 VFP_CONV_FIX(ul, d, 64, float64, 64, uint32)
454 VFP_CONV_FIX_A64(uq, d, 64, float64, 64, uint64)
455 VFP_CONV_FIX(sh, s, 32, float32, 32, int16)
456 VFP_CONV_FIX(sl, s, 32, float32, 32, int32)
457 VFP_CONV_FIX_A64(sq, s, 32, float32, 64, int64)
458 VFP_CONV_FIX(uh, s, 32, float32, 32, uint16)
459 VFP_CONV_FIX(ul, s, 32, float32, 32, uint32)
460 VFP_CONV_FIX_A64(uq, s, 32, float32, 64, uint64)
461 VFP_CONV_FIX(sh, h, 16, dh_ctype_f16, 32, int16)
462 VFP_CONV_FIX(sl, h, 16, dh_ctype_f16, 32, int32)
463 VFP_CONV_FIX_A64(sq, h, 16, dh_ctype_f16, 64, int64)
464 VFP_CONV_FIX(uh, h, 16, dh_ctype_f16, 32, uint16)
465 VFP_CONV_FIX(ul, h, 16, dh_ctype_f16, 32, uint32)
466 VFP_CONV_FIX_A64(uq, h, 16, dh_ctype_f16, 64, uint64)
467 VFP_CONV_FLOAT_FIX_ROUND(sq, d, 64, float64, 64, int64,
468                          float_round_to_zero, _round_to_zero)
469 VFP_CONV_FLOAT_FIX_ROUND(uq, d, 64, float64, 64, uint64,
470                          float_round_to_zero, _round_to_zero)
471 
472 #undef VFP_CONV_FIX
473 #undef VFP_CONV_FIX_FLOAT
474 #undef VFP_CONV_FLOAT_FIX_ROUND
475 #undef VFP_CONV_FIX_A64
476 
477 /* Set the current fp rounding mode and return the old one.
478  * The argument is a softfloat float_round_ value.
479  */
480 uint32_t HELPER(set_rmode)(uint32_t rmode, float_status *fp_status)
481 {
482     uint32_t prev_rmode = get_float_rounding_mode(fp_status);
483     set_float_rounding_mode(rmode, fp_status);
484 
485     return prev_rmode;
486 }
487 
488 /* Half precision conversions.  */
489 float32 HELPER(vfp_fcvt_f16_to_f32)(uint32_t a, float_status *fpst,
490                                     uint32_t ahp_mode)
491 {
492     /* Squash FZ16 to 0 for the duration of conversion.  In this case,
493      * it would affect flushing input denormals.
494      */
495     bool save = get_flush_inputs_to_zero(fpst);
496     set_flush_inputs_to_zero(false, fpst);
497     float32 r = float16_to_float32(a, !ahp_mode, fpst);
498     set_flush_inputs_to_zero(save, fpst);
499     return r;
500 }
501 
502 uint32_t HELPER(vfp_fcvt_f32_to_f16)(float32 a, float_status *fpst,
503                                      uint32_t ahp_mode)
504 {
505     /* Squash FZ16 to 0 for the duration of conversion.  In this case,
506      * it would affect flushing output denormals.
507      */
508     bool save = get_flush_to_zero(fpst);
509     set_flush_to_zero(false, fpst);
510     float16 r = float32_to_float16(a, !ahp_mode, fpst);
511     set_flush_to_zero(save, fpst);
512     return r;
513 }
514 
515 float64 HELPER(vfp_fcvt_f16_to_f64)(uint32_t a, float_status *fpst,
516                                     uint32_t ahp_mode)
517 {
518     /* Squash FZ16 to 0 for the duration of conversion.  In this case,
519      * it would affect flushing input denormals.
520      */
521     bool save = get_flush_inputs_to_zero(fpst);
522     set_flush_inputs_to_zero(false, fpst);
523     float64 r = float16_to_float64(a, !ahp_mode, fpst);
524     set_flush_inputs_to_zero(save, fpst);
525     return r;
526 }
527 
528 uint32_t HELPER(vfp_fcvt_f64_to_f16)(float64 a, float_status *fpst,
529                                      uint32_t ahp_mode)
530 {
531     /* Squash FZ16 to 0 for the duration of conversion.  In this case,
532      * it would affect flushing output denormals.
533      */
534     bool save = get_flush_to_zero(fpst);
535     set_flush_to_zero(false, fpst);
536     float16 r = float64_to_float16(a, !ahp_mode, fpst);
537     set_flush_to_zero(save, fpst);
538     return r;
539 }
540 
/* NEON helpers.  */

/*
 * Maximum-normal constants for each precision, written as raw bit
 * patterns.
 */
#define float16_maxnorm make_float16(0x7bff)
#define float32_maxnorm make_float32(0x7f7fffff)
#define float64_maxnorm make_float64(0x7fefffffffffffffLL)

/*
 * Constants 256 and 512 are used in some helpers; spelling them as bit
 * patterns avoids relying on int->float conversions at run-time.
 */
#define float64_256 make_float64(0x4070000000000000LL)
#define float64_512 make_float64(0x4080000000000000LL)
550 
/* Reciprocal functions
 *
 * The algorithm that must be used to calculate the estimate
 * is specified by the ARM ARM, see FPRecipEstimate()/RecipEstimate
 */

/*
 * See RecipEstimate()
 *
 * @input is a 9 bit fixed point number in the range 256 .. 511,
 * standing for a value 0.5 <= x < 1.0.
 * The result is in the range 256 .. 511, standing for a value
 * from 1.0 to 511/256.
 */
static int recip_estimate(int input)
{
    int divisor, twice_quotient, estimate;

    assert(input >= 256 && input <= 511);

    divisor = 2 * input + 1;
    /* Divide with one extra bit so that "add one and halve" rounds. */
    twice_quotient = (1 << 19) / divisor;
    estimate = (twice_quotient + 1) >> 1;

    assert(estimate >= 256 && estimate <= 511);
    return estimate;
}
574 
/*
 * Increased precision version of recip_estimate():
 * @input is a 13 bit fixed point number in the range 2048 .. 4095,
 * standing for a value 0.5 <= x < 1.0.
 * The result is in the range 4096 .. 8191, standing for a value
 * from 1.0 to 2.0.
 */
static int recip_estimate_incprec(int input)
{
    int divisor, twice_quotient, estimate;

    assert(input >= 2048 && input <= 4095);

    divisor = 2 * input + 1;
    /*
     * The pseudocode expresses this as an operation on infinite
     * precision reals where it calculates 2^25 / a and then looks
     * at the error between that and the rounded-down-to-integer
     * value to see if it should instead round up. Here we take the
     * same approach as the pseudocode for the 8-bit precision
     * version: calculate (2 * (2^25 / a)) as an integer and then
     * "add one and halve" to round it. So the 1 << 26 is correct.
     */
    twice_quotient = (1 << 26) / divisor;
    estimate = (twice_quotient + 1) >> 1;

    assert(estimate >= 4096 && estimate <= 8191);
    return estimate;
}
601 
/*
 * Common wrapper to call recip_estimate / recip_estimate_incprec
 *
 * @exp: in: exponent of the input; out: exponent of the result
 * @exp_off: value the input exponent is subtracted from to give the
 *           result exponent
 * @frac: 64 bit fraction (without implicit bit) where the binary
 *        point is nominally at bit 52
 * @increasedprecision: selects the 12-bit estimate over the 8-bit one
 *
 * Returns the fraction of a float64, which the caller can then cut
 * down to the appropriate size.
 */

static uint64_t call_recip_estimate(int *exp, int exp_off, uint64_t frac,
                                    bool increasedprecision)
{
    uint32_t mantissa, est;
    uint64_t out_frac;
    int out_exp;

    /* Handle sub-normals */
    if (*exp == 0) {
        if (extract64(frac, 51, 1) != 0) {
            frac <<= 1;
        } else {
            *exp = -1;
            frac <<= 2;
        }
    }

    out_exp = exp_off - *exp;

    if (increasedprecision) {
        /* scaled = UInt('1':fraction<51:41>) */
        mantissa = deposit32(1 << 11, 0, 11, extract64(frac, 41, 11));
        est = recip_estimate_incprec(mantissa);
        out_frac = deposit64(0, 40, 12, est);
    } else {
        /* scaled = UInt('1':fraction<51:44>) */
        mantissa = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
        est = recip_estimate(mantissa);
        out_frac = deposit64(0, 44, 8, est);
    }

    /* Result exponents 0 and -1 both end up with an exponent field of 0 */
    switch (out_exp) {
    case 0:
        out_frac = deposit64(out_frac >> 1, 51, 1, 1);
        break;
    case -1:
        out_frac = deposit64(out_frac >> 2, 50, 2, 1);
        out_exp = 0;
        break;
    default:
        break;
    }

    *exp = out_exp;
    return out_frac;
}
655 
656 static bool round_to_inf(float_status *fpst, bool sign_bit)
657 {
658     switch (fpst->float_rounding_mode) {
659     case float_round_nearest_even: /* Round to Nearest */
660         return true;
661     case float_round_up: /* Round to +Inf */
662         return !sign_bit;
663     case float_round_down: /* Round to -Inf */
664         return sign_bit;
665     case float_round_to_zero: /* Round to Zero */
666         return false;
667     default:
668         g_assert_not_reached();
669     }
670 }
671 
672 uint32_t HELPER(recpe_f16)(uint32_t input, float_status *fpst)
673 {
674     float16 f16 = float16_squash_input_denormal(input, fpst);
675     uint32_t f16_val = float16_val(f16);
676     uint32_t f16_sign = float16_is_neg(f16);
677     int f16_exp = extract32(f16_val, 10, 5);
678     uint32_t f16_frac = extract32(f16_val, 0, 10);
679     uint64_t f64_frac;
680 
681     if (float16_is_any_nan(f16)) {
682         float16 nan = f16;
683         if (float16_is_signaling_nan(f16, fpst)) {
684             float_raise(float_flag_invalid, fpst);
685             if (!fpst->default_nan_mode) {
686                 nan = float16_silence_nan(f16, fpst);
687             }
688         }
689         if (fpst->default_nan_mode) {
690             nan =  float16_default_nan(fpst);
691         }
692         return nan;
693     } else if (float16_is_infinity(f16)) {
694         return float16_set_sign(float16_zero, float16_is_neg(f16));
695     } else if (float16_is_zero(f16)) {
696         float_raise(float_flag_divbyzero, fpst);
697         return float16_set_sign(float16_infinity, float16_is_neg(f16));
698     } else if (float16_abs(f16) < (1 << 8)) {
699         /* Abs(value) < 2.0^-16 */
700         float_raise(float_flag_overflow | float_flag_inexact, fpst);
701         if (round_to_inf(fpst, f16_sign)) {
702             return float16_set_sign(float16_infinity, f16_sign);
703         } else {
704             return float16_set_sign(float16_maxnorm, f16_sign);
705         }
706     } else if (f16_exp >= 29 && fpst->flush_to_zero) {
707         float_raise(float_flag_underflow, fpst);
708         return float16_set_sign(float16_zero, float16_is_neg(f16));
709     }
710 
711     f64_frac = call_recip_estimate(&f16_exp, 29,
712                                    ((uint64_t) f16_frac) << (52 - 10), false);
713 
714     /* result = sign : result_exp<4:0> : fraction<51:42> */
715     f16_val = deposit32(0, 15, 1, f16_sign);
716     f16_val = deposit32(f16_val, 10, 5, f16_exp);
717     f16_val = deposit32(f16_val, 0, 10, extract64(f64_frac, 52 - 10, 10));
718     return make_float16(f16_val);
719 }
720 
721 /*
722  * FEAT_RPRES means the f32 FRECPE has an "increased precision" variant
723  * which is used when FPCR.AH == 1.
724  */
725 static float32 do_recpe_f32(float32 input, float_status *fpst, bool rpres)
726 {
727     float32 f32 = float32_squash_input_denormal(input, fpst);
728     uint32_t f32_val = float32_val(f32);
729     bool f32_sign = float32_is_neg(f32);
730     int f32_exp = extract32(f32_val, 23, 8);
731     uint32_t f32_frac = extract32(f32_val, 0, 23);
732     uint64_t f64_frac;
733 
734     if (float32_is_any_nan(f32)) {
735         float32 nan = f32;
736         if (float32_is_signaling_nan(f32, fpst)) {
737             float_raise(float_flag_invalid, fpst);
738             if (!fpst->default_nan_mode) {
739                 nan = float32_silence_nan(f32, fpst);
740             }
741         }
742         if (fpst->default_nan_mode) {
743             nan =  float32_default_nan(fpst);
744         }
745         return nan;
746     } else if (float32_is_infinity(f32)) {
747         return float32_set_sign(float32_zero, float32_is_neg(f32));
748     } else if (float32_is_zero(f32)) {
749         float_raise(float_flag_divbyzero, fpst);
750         return float32_set_sign(float32_infinity, float32_is_neg(f32));
751     } else if (float32_abs(f32) < (1ULL << 21)) {
752         /* Abs(value) < 2.0^-128 */
753         float_raise(float_flag_overflow | float_flag_inexact, fpst);
754         if (round_to_inf(fpst, f32_sign)) {
755             return float32_set_sign(float32_infinity, f32_sign);
756         } else {
757             return float32_set_sign(float32_maxnorm, f32_sign);
758         }
759     } else if (f32_exp >= 253 && fpst->flush_to_zero) {
760         float_raise(float_flag_underflow, fpst);
761         return float32_set_sign(float32_zero, float32_is_neg(f32));
762     }
763 
764     f64_frac = call_recip_estimate(&f32_exp, 253,
765                                    ((uint64_t) f32_frac) << (52 - 23), rpres);
766 
767     /* result = sign : result_exp<7:0> : fraction<51:29> */
768     f32_val = deposit32(0, 31, 1, f32_sign);
769     f32_val = deposit32(f32_val, 23, 8, f32_exp);
770     f32_val = deposit32(f32_val, 0, 23, extract64(f64_frac, 52 - 23, 23));
771     return make_float32(f32_val);
772 }
773 
774 float32 HELPER(recpe_f32)(float32 input, float_status *fpst)
775 {
776     return do_recpe_f32(input, fpst, false);
777 }
778 
779 float32 HELPER(recpe_rpres_f32)(float32 input, float_status *fpst)
780 {
781     return do_recpe_f32(input, fpst, true);
782 }
783 
784 float64 HELPER(recpe_f64)(float64 input, float_status *fpst)
785 {
786     float64 f64 = float64_squash_input_denormal(input, fpst);
787     uint64_t f64_val = float64_val(f64);
788     bool f64_sign = float64_is_neg(f64);
789     int f64_exp = extract64(f64_val, 52, 11);
790     uint64_t f64_frac = extract64(f64_val, 0, 52);
791 
792     /* Deal with any special cases */
793     if (float64_is_any_nan(f64)) {
794         float64 nan = f64;
795         if (float64_is_signaling_nan(f64, fpst)) {
796             float_raise(float_flag_invalid, fpst);
797             if (!fpst->default_nan_mode) {
798                 nan = float64_silence_nan(f64, fpst);
799             }
800         }
801         if (fpst->default_nan_mode) {
802             nan =  float64_default_nan(fpst);
803         }
804         return nan;
805     } else if (float64_is_infinity(f64)) {
806         return float64_set_sign(float64_zero, float64_is_neg(f64));
807     } else if (float64_is_zero(f64)) {
808         float_raise(float_flag_divbyzero, fpst);
809         return float64_set_sign(float64_infinity, float64_is_neg(f64));
810     } else if ((f64_val & ~(1ULL << 63)) < (1ULL << 50)) {
811         /* Abs(value) < 2.0^-1024 */
812         float_raise(float_flag_overflow | float_flag_inexact, fpst);
813         if (round_to_inf(fpst, f64_sign)) {
814             return float64_set_sign(float64_infinity, f64_sign);
815         } else {
816             return float64_set_sign(float64_maxnorm, f64_sign);
817         }
818     } else if (f64_exp >= 2045 && fpst->flush_to_zero) {
819         float_raise(float_flag_underflow, fpst);
820         return float64_set_sign(float64_zero, float64_is_neg(f64));
821     }
822 
823     f64_frac = call_recip_estimate(&f64_exp, 2045, f64_frac, false);
824 
825     /* result = sign : result_exp<10:0> : fraction<51:0>; */
826     f64_val = deposit64(0, 63, 1, f64_sign);
827     f64_val = deposit64(f64_val, 52, 11, f64_exp);
828     f64_val = deposit64(f64_val, 0, 52, f64_frac);
829     return make_float64(f64_val);
830 }
831 
/*
 * Integer core of the reciprocal square root estimate.  The algorithm
 * that must be used to calculate the estimate is specified by the ARM ARM.
 *
 * @a: scaled operand; must satisfy 128 <= a < 512.
 *
 * Returns the estimate, which is always in the range [256, 512).
 */
static int do_recip_sqrt_estimate(int a)
{
    const int limit = 1 << 28;
    int scaled, root, estimate;

    assert(128 <= a && a < 512);

    if (a >= 256) {
        /* Clear bit 0, add one, then double. */
        scaled = ((a & ~1) + 1) * 2;
    } else {
        scaled = a * 2 + 1;
    }

    /* Smallest root >= 512 with scaled * (root + 1)^2 >= 2^28. */
    for (root = 512; scaled * (root + 1) * (root + 1) < limit; root++) {
        /* nothing: the search is done by the loop header */
    }

    estimate = (root + 1) / 2;
    assert(256 <= estimate && estimate < 512);

    return estimate;
}
856 
/*
 * Increased-precision counterpart of do_recip_sqrt_estimate().
 *
 * The Arm ARM describes the 12-bit precision version of RecipSqrtEstimate
 * in terms of an infinite-precision floating point calculation of a
 * square root.  This uses the same kind of pure integer search as the
 * 8-bit mantissa variant, to get the same bit-for-bit result.
 *
 * @a: scaled operand; must satisfy 1024 <= a < 4096.
 *
 * Returns the estimate, which is always in the range [4096, 8192).
 */
static int do_recip_sqrt_estimate_incprec(int a)
{
    int64_t root, estimate;

    assert(1024 <= a && a < 4096);

    if (a >= 2048) {
        /* Clear bit 0, add one, then double. */
        a = ((a & ~1) + 1) * 2;
    } else {
        a = a * 2 + 1;
    }

    /* 64-bit arithmetic: the products here do not fit in 32 bits. */
    for (root = 8192; a * (root + 1) * (root + 1) < (1ULL << 39); root++) {
        /* nothing: the search is done by the loop header */
    }

    estimate = (root + 1) / 2;
    assert(4096 <= estimate && estimate < 8192);

    return estimate;
}
884 
885 static uint64_t recip_sqrt_estimate(int *exp , int exp_off, uint64_t frac,
886                                     bool increasedprecision)
887 {
888     int estimate;
889     uint32_t scaled;
890 
891     if (*exp == 0) {
892         while (extract64(frac, 51, 1) == 0) {
893             frac = frac << 1;
894             *exp -= 1;
895         }
896         frac = extract64(frac, 0, 51) << 1;
897     }
898 
899     if (increasedprecision) {
900         if (*exp & 1) {
901             /* scaled = UInt('01':fraction<51:42>) */
902             scaled = deposit32(1 << 10, 0, 10, extract64(frac, 42, 10));
903         } else {
904             /* scaled = UInt('1':fraction<51:41>) */
905             scaled = deposit32(1 << 11, 0, 11, extract64(frac, 41, 11));
906         }
907         estimate = do_recip_sqrt_estimate_incprec(scaled);
908     } else {
909         if (*exp & 1) {
910             /* scaled = UInt('01':fraction<51:45>) */
911             scaled = deposit32(1 << 7, 0, 7, extract64(frac, 45, 7));
912         } else {
913             /* scaled = UInt('1':fraction<51:44>) */
914             scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
915         }
916         estimate = do_recip_sqrt_estimate(scaled);
917     }
918 
919     *exp = (exp_off - *exp) / 2;
920     if (increasedprecision) {
921         return extract64(estimate, 0, 12) << 40;
922     } else {
923         return extract64(estimate, 0, 8) << 44;
924     }
925 }
926 
927 uint32_t HELPER(rsqrte_f16)(uint32_t input, float_status *s)
928 {
929     float16 f16 = float16_squash_input_denormal(input, s);
930     uint16_t val = float16_val(f16);
931     bool f16_sign = float16_is_neg(f16);
932     int f16_exp = extract32(val, 10, 5);
933     uint16_t f16_frac = extract32(val, 0, 10);
934     uint64_t f64_frac;
935 
936     if (float16_is_any_nan(f16)) {
937         float16 nan = f16;
938         if (float16_is_signaling_nan(f16, s)) {
939             float_raise(float_flag_invalid, s);
940             if (!s->default_nan_mode) {
941                 nan = float16_silence_nan(f16, s);
942             }
943         }
944         if (s->default_nan_mode) {
945             nan =  float16_default_nan(s);
946         }
947         return nan;
948     } else if (float16_is_zero(f16)) {
949         float_raise(float_flag_divbyzero, s);
950         return float16_set_sign(float16_infinity, f16_sign);
951     } else if (f16_sign) {
952         float_raise(float_flag_invalid, s);
953         return float16_default_nan(s);
954     } else if (float16_is_infinity(f16)) {
955         return float16_zero;
956     }
957 
958     /* Scale and normalize to a double-precision value between 0.25 and 1.0,
959      * preserving the parity of the exponent.  */
960 
961     f64_frac = ((uint64_t) f16_frac) << (52 - 10);
962 
963     f64_frac = recip_sqrt_estimate(&f16_exp, 44, f64_frac, false);
964 
965     /* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(2) */
966     val = deposit32(0, 15, 1, f16_sign);
967     val = deposit32(val, 10, 5, f16_exp);
968     val = deposit32(val, 2, 8, extract64(f64_frac, 52 - 8, 8));
969     return make_float16(val);
970 }
971 
972 /*
973  * FEAT_RPRES means the f32 FRSQRTE has an "increased precision" variant
974  * which is used when FPCR.AH == 1.
975  */
976 static float32 do_rsqrte_f32(float32 input, float_status *s, bool rpres)
977 {
978     float32 f32 = float32_squash_input_denormal(input, s);
979     uint32_t val = float32_val(f32);
980     uint32_t f32_sign = float32_is_neg(f32);
981     int f32_exp = extract32(val, 23, 8);
982     uint32_t f32_frac = extract32(val, 0, 23);
983     uint64_t f64_frac;
984 
985     if (float32_is_any_nan(f32)) {
986         float32 nan = f32;
987         if (float32_is_signaling_nan(f32, s)) {
988             float_raise(float_flag_invalid, s);
989             if (!s->default_nan_mode) {
990                 nan = float32_silence_nan(f32, s);
991             }
992         }
993         if (s->default_nan_mode) {
994             nan =  float32_default_nan(s);
995         }
996         return nan;
997     } else if (float32_is_zero(f32)) {
998         float_raise(float_flag_divbyzero, s);
999         return float32_set_sign(float32_infinity, float32_is_neg(f32));
1000     } else if (float32_is_neg(f32)) {
1001         float_raise(float_flag_invalid, s);
1002         return float32_default_nan(s);
1003     } else if (float32_is_infinity(f32)) {
1004         return float32_zero;
1005     }
1006 
1007     /* Scale and normalize to a double-precision value between 0.25 and 1.0,
1008      * preserving the parity of the exponent.  */
1009 
1010     f64_frac = ((uint64_t) f32_frac) << 29;
1011 
1012     f64_frac = recip_sqrt_estimate(&f32_exp, 380, f64_frac, rpres);
1013 
1014     /*
1015      * result = sign : result_exp<7:0> : estimate<7:0> : Zeros(15)
1016      * or for increased precision
1017      * result = sign : result_exp<7:0> : estimate<11:0> : Zeros(11)
1018      */
1019     val = deposit32(0, 31, 1, f32_sign);
1020     val = deposit32(val, 23, 8, f32_exp);
1021     if (rpres) {
1022         val = deposit32(val, 11, 12, extract64(f64_frac, 52 - 12, 12));
1023     } else {
1024         val = deposit32(val, 15, 8, extract64(f64_frac, 52 - 8, 8));
1025     }
1026     return make_float32(val);
1027 }
1028 
1029 float32 HELPER(rsqrte_f32)(float32 input, float_status *s)
1030 {
1031     return do_rsqrte_f32(input, s, false);
1032 }
1033 
1034 float32 HELPER(rsqrte_rpres_f32)(float32 input, float_status *s)
1035 {
1036     return do_rsqrte_f32(input, s, true);
1037 }
1038 
1039 float64 HELPER(rsqrte_f64)(float64 input, float_status *s)
1040 {
1041     float64 f64 = float64_squash_input_denormal(input, s);
1042     uint64_t val = float64_val(f64);
1043     bool f64_sign = float64_is_neg(f64);
1044     int f64_exp = extract64(val, 52, 11);
1045     uint64_t f64_frac = extract64(val, 0, 52);
1046 
1047     if (float64_is_any_nan(f64)) {
1048         float64 nan = f64;
1049         if (float64_is_signaling_nan(f64, s)) {
1050             float_raise(float_flag_invalid, s);
1051             if (!s->default_nan_mode) {
1052                 nan = float64_silence_nan(f64, s);
1053             }
1054         }
1055         if (s->default_nan_mode) {
1056             nan =  float64_default_nan(s);
1057         }
1058         return nan;
1059     } else if (float64_is_zero(f64)) {
1060         float_raise(float_flag_divbyzero, s);
1061         return float64_set_sign(float64_infinity, float64_is_neg(f64));
1062     } else if (float64_is_neg(f64)) {
1063         float_raise(float_flag_invalid, s);
1064         return float64_default_nan(s);
1065     } else if (float64_is_infinity(f64)) {
1066         return float64_zero;
1067     }
1068 
1069     f64_frac = recip_sqrt_estimate(&f64_exp, 3068, f64_frac, false);
1070 
1071     /* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(44) */
1072     val = deposit64(0, 61, 1, f64_sign);
1073     val = deposit64(val, 52, 11, f64_exp);
1074     val = deposit64(val, 44, 8, extract64(f64_frac, 52 - 8, 8));
1075     return make_float64(val);
1076 }
1077 
1078 uint32_t HELPER(recpe_u32)(uint32_t a)
1079 {
1080     int input, estimate;
1081 
1082     if ((a & 0x80000000) == 0) {
1083         return 0xffffffff;
1084     }
1085 
1086     input = extract32(a, 23, 9);
1087     estimate = recip_estimate(input);
1088 
1089     return deposit32(0, (32 - 9), 9, estimate);
1090 }
1091 
1092 uint32_t HELPER(rsqrte_u32)(uint32_t a)
1093 {
1094     int estimate;
1095 
1096     if ((a & 0xc0000000) == 0) {
1097         return 0xffffffff;
1098     }
1099 
1100     estimate = do_recip_sqrt_estimate(extract32(a, 23, 9));
1101 
1102     return deposit32(0, 23, 9, estimate);
1103 }
1104 
1105 /* VFPv4 fused multiply-accumulate */
1106 dh_ctype_f16 VFP_HELPER(muladd, h)(dh_ctype_f16 a, dh_ctype_f16 b,
1107                                    dh_ctype_f16 c, float_status *fpst)
1108 {
1109     return float16_muladd(a, b, c, 0, fpst);
1110 }
1111 
1112 float32 VFP_HELPER(muladd, s)(float32 a, float32 b, float32 c,
1113                               float_status *fpst)
1114 {
1115     return float32_muladd(a, b, c, 0, fpst);
1116 }
1117 
1118 float64 VFP_HELPER(muladd, d)(float64 a, float64 b, float64 c,
1119                               float_status *fpst)
1120 {
1121     return float64_muladd(a, b, c, 0, fpst);
1122 }
1123 
1124 /* ARMv8 round to integral */
1125 dh_ctype_f16 HELPER(rinth_exact)(dh_ctype_f16 x, float_status *fp_status)
1126 {
1127     return float16_round_to_int(x, fp_status);
1128 }
1129 
1130 float32 HELPER(rints_exact)(float32 x, float_status *fp_status)
1131 {
1132     return float32_round_to_int(x, fp_status);
1133 }
1134 
1135 float64 HELPER(rintd_exact)(float64 x, float_status *fp_status)
1136 {
1137     return float64_round_to_int(x, fp_status);
1138 }
1139 
1140 dh_ctype_f16 HELPER(rinth)(dh_ctype_f16 x, float_status *fp_status)
1141 {
1142     int old_flags = get_float_exception_flags(fp_status), new_flags;
1143     float16 ret;
1144 
1145     ret = float16_round_to_int(x, fp_status);
1146 
1147     /* Suppress any inexact exceptions the conversion produced */
1148     if (!(old_flags & float_flag_inexact)) {
1149         new_flags = get_float_exception_flags(fp_status);
1150         set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
1151     }
1152 
1153     return ret;
1154 }
1155 
1156 float32 HELPER(rints)(float32 x, float_status *fp_status)
1157 {
1158     int old_flags = get_float_exception_flags(fp_status), new_flags;
1159     float32 ret;
1160 
1161     ret = float32_round_to_int(x, fp_status);
1162 
1163     /* Suppress any inexact exceptions the conversion produced */
1164     if (!(old_flags & float_flag_inexact)) {
1165         new_flags = get_float_exception_flags(fp_status);
1166         set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
1167     }
1168 
1169     return ret;
1170 }
1171 
1172 float64 HELPER(rintd)(float64 x, float_status *fp_status)
1173 {
1174     int old_flags = get_float_exception_flags(fp_status), new_flags;
1175     float64 ret;
1176 
1177     ret = float64_round_to_int(x, fp_status);
1178 
1179     /* Suppress any inexact exceptions the conversion produced */
1180     if (!(old_flags & float_flag_inexact)) {
1181         new_flags = get_float_exception_flags(fp_status);
1182         set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
1183     }
1184 
1185     return ret;
1186 }
1187 
1188 /* Convert ARM rounding mode to softfloat */
1189 const FloatRoundMode arm_rmode_to_sf_map[] = {
1190     [FPROUNDING_TIEEVEN] = float_round_nearest_even,
1191     [FPROUNDING_POSINF] = float_round_up,
1192     [FPROUNDING_NEGINF] = float_round_down,
1193     [FPROUNDING_ZERO] = float_round_to_zero,
1194     [FPROUNDING_TIEAWAY] = float_round_ties_away,
1195     [FPROUNDING_ODD] = float_round_to_odd,
1196 };
1197 
1198 /*
1199  * Implement float64 to int32_t conversion without saturation;
1200  * the result is supplied modulo 2^32.
1201  */
1202 uint64_t HELPER(fjcvtzs)(float64 value, float_status *status)
1203 {
1204     uint32_t frac, e_old, e_new;
1205     bool inexact;
1206 
1207     e_old = get_float_exception_flags(status);
1208     set_float_exception_flags(0, status);
1209     frac = float64_to_int32_modulo(value, float_round_to_zero, status);
1210     e_new = get_float_exception_flags(status);
1211     set_float_exception_flags(e_old | e_new, status);
1212 
1213     /* Normal inexact, denormal with flush-to-zero, or overflow or NaN */
1214     inexact = e_new & (float_flag_inexact |
1215                        float_flag_input_denormal_flushed |
1216                        float_flag_invalid);
1217 
1218     /* While not inexact for IEEE FP, -0.0 is inexact for JavaScript. */
1219     inexact |= value == float64_chs(float64_zero);
1220 
1221     /* Pack the result and the env->ZF representation of Z together.  */
1222     return deposit64(frac, 32, 32, inexact);
1223 }
1224 
1225 uint32_t HELPER(vjcvt)(float64 value, CPUARMState *env)
1226 {
1227     uint64_t pair = HELPER(fjcvtzs)(value, &env->vfp.fp_status[FPST_A32]);
1228     uint32_t result = pair;
1229     uint32_t z = (pair >> 32) == 0;
1230 
1231     /* Store Z, clear NCV, in FPSCR.NZCV.  */
1232     env->vfp.fpsr = (env->vfp.fpsr & ~FPSR_NZCV_MASK) | (z * FPSR_Z);
1233 
1234     return result;
1235 }
1236 
1237 /* Round a float32 to an integer that fits in int32_t or int64_t.  */
1238 static float32 frint_s(float32 f, float_status *fpst, int intsize)
1239 {
1240     int old_flags = get_float_exception_flags(fpst);
1241     uint32_t exp = extract32(f, 23, 8);
1242 
1243     if (unlikely(exp == 0xff)) {
1244         /* NaN or Inf.  */
1245         goto overflow;
1246     }
1247 
1248     /* Round and re-extract the exponent.  */
1249     f = float32_round_to_int(f, fpst);
1250     exp = extract32(f, 23, 8);
1251 
1252     /* Validate the range of the result.  */
1253     if (exp < 126 + intsize) {
1254         /* abs(F) <= INT{N}_MAX */
1255         return f;
1256     }
1257     if (exp == 126 + intsize) {
1258         uint32_t sign = extract32(f, 31, 1);
1259         uint32_t frac = extract32(f, 0, 23);
1260         if (sign && frac == 0) {
1261             /* F == INT{N}_MIN */
1262             return f;
1263         }
1264     }
1265 
1266  overflow:
1267     /*
1268      * Raise Invalid and return INT{N}_MIN as a float.  Revert any
1269      * inexact exception float32_round_to_int may have raised.
1270      */
1271     set_float_exception_flags(old_flags | float_flag_invalid, fpst);
1272     return (0x100u + 126u + intsize) << 23;
1273 }
1274 
1275 float32 HELPER(frint32_s)(float32 f, float_status *fpst)
1276 {
1277     return frint_s(f, fpst, 32);
1278 }
1279 
1280 float32 HELPER(frint64_s)(float32 f, float_status *fpst)
1281 {
1282     return frint_s(f, fpst, 64);
1283 }
1284 
1285 /* Round a float64 to an integer that fits in int32_t or int64_t.  */
1286 static float64 frint_d(float64 f, float_status *fpst, int intsize)
1287 {
1288     int old_flags = get_float_exception_flags(fpst);
1289     uint32_t exp = extract64(f, 52, 11);
1290 
1291     if (unlikely(exp == 0x7ff)) {
1292         /* NaN or Inf.  */
1293         goto overflow;
1294     }
1295 
1296     /* Round and re-extract the exponent.  */
1297     f = float64_round_to_int(f, fpst);
1298     exp = extract64(f, 52, 11);
1299 
1300     /* Validate the range of the result.  */
1301     if (exp < 1022 + intsize) {
1302         /* abs(F) <= INT{N}_MAX */
1303         return f;
1304     }
1305     if (exp == 1022 + intsize) {
1306         uint64_t sign = extract64(f, 63, 1);
1307         uint64_t frac = extract64(f, 0, 52);
1308         if (sign && frac == 0) {
1309             /* F == INT{N}_MIN */
1310             return f;
1311         }
1312     }
1313 
1314  overflow:
1315     /*
1316      * Raise Invalid and return INT{N}_MIN as a float.  Revert any
1317      * inexact exception float64_round_to_int may have raised.
1318      */
1319     set_float_exception_flags(old_flags | float_flag_invalid, fpst);
1320     return (uint64_t)(0x800 + 1022 + intsize) << 52;
1321 }
1322 
1323 float64 HELPER(frint32_d)(float64 f, float_status *fpst)
1324 {
1325     return frint_d(f, fpst, 32);
1326 }
1327 
1328 float64 HELPER(frint64_d)(float64 f, float_status *fpst)
1329 {
1330     return frint_d(f, fpst, 64);
1331 }
1332 
1333 void HELPER(check_hcr_el2_trap)(CPUARMState *env, uint32_t rt, uint32_t reg)
1334 {
1335     uint32_t syndrome;
1336 
1337     switch (reg) {
1338     case ARM_VFP_MVFR0:
1339     case ARM_VFP_MVFR1:
1340     case ARM_VFP_MVFR2:
1341         if (!(arm_hcr_el2_eff(env) & HCR_TID3)) {
1342             return;
1343         }
1344         break;
1345     case ARM_VFP_FPSID:
1346         if (!(arm_hcr_el2_eff(env) & HCR_TID0)) {
1347             return;
1348         }
1349         break;
1350     default:
1351         g_assert_not_reached();
1352     }
1353 
1354     syndrome = ((EC_FPIDTRAP << ARM_EL_EC_SHIFT)
1355                 | ARM_EL_IL
1356                 | (1 << 24) | (0xe << 20) | (7 << 14)
1357                 | (reg << 10) | (rt << 5) | 1);
1358 
1359     raise_exception(env, EXCP_HYP_TRAP, syndrome, 2);
1360 }
1361 
1362 uint32_t HELPER(vfp_get_fpscr)(CPUARMState *env)
1363 {
1364     return vfp_get_fpscr(env);
1365 }
1366 
1367 void HELPER(vfp_set_fpscr)(CPUARMState *env, uint32_t val)
1368 {
1369     vfp_set_fpscr(env, val);
1370 }
1371