xref: /openbmc/qemu/fpu/softfloat.c (revision 7c45bad8)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
/*
 * Flush a denormal input operand to a zero of the same sign, raising the
 * input-denormal exception flag.  No check of s->flush_inputs_to_zero is
 * made here; the GEN_INPUT_FLUSH{1,2,3} wrappers below perform that check.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK
142 
/* Flush one input denormal to zero, but only if flush_inputs_to_zero is set. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1
155 
/* As GEN_INPUT_FLUSH1, but for two input operands. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2
169 
/* As GEN_INPUT_FLUSH1, but for three input operands. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
184 
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
/*
 * On x86_64 only the double-precision checks use the native fpclassify
 * path.  NOTE(review): these look benchmark-driven (see fp-bench) --
 * confirm with measurements before changing any of them.
 */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
/* Conservative default for all other hosts: softfloat classifiers only. */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   1
#else
# define QEMU_HARDFLOAT_USE_ISINF   0
#endif
217 
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
/* With hardfloat disabled there is no slow path worth keeping out of line. */
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
/*
 * Keep the softfloat fallback out of line so the hardfloat fast path
 * stays small; QEMU_FLATTEN inlines the softfloat callees into it.
 */
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* View the same bits either as the softfloat type or the host FP type. */
typedef union {
    float32 s;  /* softfloat (bit-pattern) view */
    float h;    /* host hardware view */
} union_float32;

typedef union {
    float64 s;  /* softfloat (bit-pattern) view */
    double h;   /* host hardware view */
} union_float64;

/* Predicates over one/two operands, used as pre/post checks below. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat and host implementations of a 2-operand operation. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
/* 2-input is-zero-or-normal: true iff both operands are zero or normal. */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}
288 
static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        /* See f32_is_zon2 for why no temp variable is used here. */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}
298 
/* 3-input is-zero-or-normal: true iff all three operands are zero or normal. */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        /* See f32_is_zon2 for why no temp variable is used here. */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}
312 
static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        /* See f32_is_zon2 for why no temp variable is used here. */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
/*
 * Generic 2-operand float32 wrapper: run the operation on the host FPU
 * when it is safe to do so, otherwise fall back to softfloat.
 *
 * @hard/@soft: host and softfloat implementations of the operation
 * @pre:  predicate the (flushed) inputs must satisfy for the host path
 * @post: consulted when the host result is tiny; if true, redo in softfloat
 *        so underflow flags and rounding are computed exactly
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /*
         * An infinite result is reported as overflow.  NOTE(review): this
         * assumes @pre rejected infinite inputs -- confirm at call sites.
         */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /*
         * |result| at or below the smallest normal: underflow handling may
         * need softfloat's exact flags, so recompute there when @post says.
         */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
372 
/* Double-precision counterpart of float32_gen2; see that function. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        /* Infinite result => overflow (pre() is assumed to reject inf). */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /* Tiny result: defer to softfloat for exact underflow semantics. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
403 
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410     return float32_val(a) & 0x007FFFFF;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 static inline int extractFloat32Exp(float32 a)
418 {
419     return (float32_val(a) >> 23) & 0xFF;
420 }
421 
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline bool extractFloat32Sign(float32 a)
427 {
428     return float32_val(a) >> 31;
429 }
430 
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434 
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439 
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443 
444 static inline int extractFloat64Exp(float64 a)
445 {
446     return (float64_val(a) >> 52) & 0x7FF;
447 }
448 
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452 
453 static inline bool extractFloat64Sign(float64 a)
454 {
455     return float64_val(a) >> 63;
456 }
457 
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,   /* not yet run through *_canonicalize */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
471 
/* Turn a FloatClass into a one-hot mask, so classes can be tested in sets. */
#define float_cmask(bit)  (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    /* Common combinations. */
    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
484 
485 
/* Simple helpers for checking if, or what kind of, NaN we have */

/* Any NaN: relies on qnan/snan being the last two FloatClass values. */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}
491 
/* Signaling NaN only. */
static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}
496 
/* Quiet NaN only. */
static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}
501 
/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;        /* unbiased exponent */
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;
527 
/* 128-bit-fraction variant of FloatParts64; frac_hi is the most significant. */
typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;        /* unbiased exponent */
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;
535 
/*
 * These apply to the most significant word of each FloatPartsN: the
 * canonical fraction keeps the implicit integer bit at bit 63.
 */
#define DECOMPOSED_BINARY_POINT    63
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
539 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;
565 
/* Expand fields based on the size of exponent and fraction */
/*
 * (-F - 1) & 63 equals 63 - F for F < 64; the mod-64 wrap keeps the
 * expression well-defined for float128's F == 112 as well, where the
 * shift applies to the high fraction word.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = (-F - 1) & 63,                                 \
    .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
577 
/* Per-format parameter tables: FLOAT_PARAMS(exponent bits, fraction bits). */

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* ARM Alternative Half Precision: as float16 but no inf/NaN encodings. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* bfloat16: float32 exponent range with a truncated 7-bit fraction. */
static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};
602 
603 /* Unpack a float to parts, but do not canonicalize.  */
604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
605 {
606     const int f_size = fmt->frac_size;
607     const int e_size = fmt->exp_size;
608 
609     *r = (FloatParts64) {
610         .cls = float_class_unclassified,
611         .sign = extract64(raw, f_size + e_size, 1),
612         .exp = extract64(raw, f_size, e_size),
613         .frac = extract64(raw, 0, f_size)
614     };
615 }
616 
/* Format-specific wrappers around unpack_raw64. */

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}
636 
637 static void float128_unpack_raw(FloatParts128 *p, float128 f)
638 {
639     const int f_size = float128_params.frac_size - 64;
640     const int e_size = float128_params.exp_size;
641 
642     *p = (FloatParts128) {
643         .cls = float_class_unclassified,
644         .sign = extract64(f.high, f_size + e_size, 1),
645         .exp = extract64(f.high, f_size, e_size),
646         .frac_hi = extract64(f.high, 0, f_size),
647         .frac_lo = f.low,
648     };
649 }
650 
651 /* Pack a float from parts, but do not canonicalize.  */
652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
653 {
654     const int f_size = fmt->frac_size;
655     const int e_size = fmt->exp_size;
656     uint64_t ret;
657 
658     ret = (uint64_t)p->sign << (f_size + e_size);
659     ret = deposit64(ret, f_size, e_size, p->exp);
660     ret = deposit64(ret, 0, f_size, p->frac);
661     return ret;
662 }
663 
/* Format-specific wrappers around pack_raw64. */

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}
683 
/* Pack a float128 from parts without canonicalizing; mirrors pack_raw64
 * for the high word, with frac_lo passed through as the low word. */
static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}
695 
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine:  (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output.  These details are target-
702 | specific.
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
705 
/*
 * Dispatch by parts size: picks parts128_NAME when P is FloatParts128 *,
 * else parts64_NAME, via QEMU_GENERIC.
 */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)

/* Defined later in this file; declared here for the dispatch macro. */
static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)
716 
/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

/* Left-shift the 128-bit fraction of @a by @c bits, in place. */
static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C)             frac128_shl(A, C)
727 
/* Right-shift the 128-bit fraction of @a by @c bits, in place. */
static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C)             frac128_shr(A, C)
734 
/* Canonicalize EXP and FRAC, setting CLS.
 *
 * On return the exponent is unbiased and, for normals, the fraction has
 * the implicit bit at DECOMPOSED_BINARY_POINT.  Raises input-denormal if
 * a denormal is flushed per status->flush_inputs_to_zero.
 */
static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
                                  float_status *status)
{
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        /* Maximum exponent: inf (frac == 0) or NaN.  ARM AltHP has no
         * inf/NaN encodings, so this branch is skipped for it. */
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            /* Align the NaN payload so parts_is_snan_frac can test it. */
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        /* Zero exponent: true zero, or a denormal. */
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            /* Normalize the denormal, adjusting the exponent to match. */
            int shift = clz64(part.frac);
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        /* Normal number: unbias exponent, restore the implicit bit. */
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}
767 
768 /* Round and uncanonicalize a floating-point number by parts. There
769  * are FRAC_SHIFT bits that may require rounding at the bottom of the
770  * fraction; these bits will be removed. The exponent will be biased
771  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
772  */
773 
static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Choose INC, the value added to the fraction to implement the
         * rounding mode, and OVERFLOW_NORM, whether overflow saturates
         * to the largest finite value instead of going to infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /* Add half an ulp, except on an exact tie whose lsb is
             * already even (round bits equal half, lsb clear). */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            /* Adding round_mask carries into the lsb iff any round
             * bits are set, forcing an inexact result to an odd lsb. */
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                /* Carry out of the 64-bit add means the fraction
                 * overflowed: halve it, restoring the implicit bit,
                 * and bump the exponent. */
                if (uadd64_overflow(frac, inc, &frac)) {
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Saturate to the largest finite value. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Subnormal (possibly underflowing to zero) result. */
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                uint64_t discard;
                /* Tininess after rounding: still tiny unless rounding
                 * would carry the fraction up into the next binade. */
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have carried up to the smallest normal. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        /* Shift the NaN payload down into field position. */
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
921 
922 static FloatParts64 pick_nan(FloatParts64 a, FloatParts64 b, float_status *s)
923 {
924     if (is_snan(a.cls) || is_snan(b.cls)) {
925         float_raise(float_flag_invalid, s);
926     }
927 
928     if (s->default_nan_mode) {
929         parts_default_nan(&a, s);
930     } else {
931         if (pickNaN(a.cls, b.cls,
932                     a.frac > b.frac ||
933                     (a.frac == b.frac && a.sign < b.sign), s)) {
934             a = b;
935         }
936         if (is_snan(a.cls)) {
937             parts_silence_nan(&a, s);
938         }
939     }
940     return a;
941 }
942 
943 static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
944                                   bool inf_zero, float_status *s)
945 {
946     int which;
947 
948     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
949         float_raise(float_flag_invalid, s);
950     }
951 
952     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
953 
954     if (s->default_nan_mode) {
955         /* Note that this check is after pickNaNMulAdd so that function
956          * has an opportunity to set the Invalid flag.
957          */
958         which = 3;
959     }
960 
961     switch (which) {
962     case 0:
963         break;
964     case 1:
965         a = b;
966         break;
967     case 2:
968         a = c;
969         break;
970     case 3:
971         parts_default_nan(&a, s);
972         break;
973     default:
974         g_assert_not_reached();
975     }
976 
977     if (is_snan(a.cls)) {
978         parts_silence_nan(&a, s);
979     }
980     return a;
981 }
982 
/*
 * Instantiate the size-generic parts_* helpers from
 * softfloat-parts.c.inc twice: once with the 64-bit FloatParts64
 * layout and once with FloatParts128.
 */
#define partsN(NAME)   parts64_##NAME
#define FloatPartsN    FloatParts64

#include "softfloat-parts.c.inc"

#undef  partsN
#undef  FloatPartsN
#define partsN(NAME)   parts128_##NAME
#define FloatPartsN    FloatParts128

#include "softfloat-parts.c.inc"

#undef  partsN
#undef  FloatPartsN
997 
998 /*
999  * Pack/unpack routines with a specific FloatFmt.
1000  */
1001 
1002 static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
1003                                       float_status *s, const FloatFmt *params)
1004 {
1005     float16_unpack_raw(p, f);
1006     *p = sf_canonicalize(*p, params, s);
1007 }
1008 
1009 static void float16_unpack_canonical(FloatParts64 *p, float16 f,
1010                                      float_status *s)
1011 {
1012     float16a_unpack_canonical(p, f, s, &float16_params);
1013 }
1014 
1015 static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
1016                                       float_status *s)
1017 {
1018     bfloat16_unpack_raw(p, f);
1019     *p = sf_canonicalize(*p, &bfloat16_params, s);
1020 }
1021 
1022 static float16 float16a_round_pack_canonical(FloatParts64 *p,
1023                                              float_status *s,
1024                                              const FloatFmt *params)
1025 {
1026     *p = round_canonical(*p, s, params);
1027     return float16_pack_raw(p);
1028 }
1029 
1030 static float16 float16_round_pack_canonical(FloatParts64 *p,
1031                                             float_status *s)
1032 {
1033     return float16a_round_pack_canonical(p, s, &float16_params);
1034 }
1035 
1036 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
1037                                               float_status *s)
1038 {
1039     *p = round_canonical(*p, s, &bfloat16_params);
1040     return bfloat16_pack_raw(p);
1041 }
1042 
1043 static void float32_unpack_canonical(FloatParts64 *p, float32 f,
1044                                      float_status *s)
1045 {
1046     float32_unpack_raw(p, f);
1047     *p = sf_canonicalize(*p, &float32_params, s);
1048 }
1049 
1050 static float32 float32_round_pack_canonical(FloatParts64 *p,
1051                                             float_status *s)
1052 {
1053     *p = round_canonical(*p, s, &float32_params);
1054     return float32_pack_raw(p);
1055 }
1056 
1057 static void float64_unpack_canonical(FloatParts64 *p, float64 f,
1058                                      float_status *s)
1059 {
1060     float64_unpack_raw(p, f);
1061     *p = sf_canonicalize(*p, &float64_params, s);
1062 }
1063 
1064 static float64 float64_round_pack_canonical(FloatParts64 *p,
1065                                             float_status *s)
1066 {
1067     *p = round_canonical(*p, s, &float64_params);
1068     return float64_pack_raw(p);
1069 }
1070 
1071 /*
1072  * Returns the result of adding or subtracting the values of the
1073  * floating-point values `a' and `b'. The operation is performed
1074  * according to the IEC/IEEE Standard for Binary Floating-Point
1075  * Arithmetic.
1076  */
1077 
static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    /* Fold the subtraction into b's effective sign. */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Subtract the smaller magnitude from the larger, jamming
             * the alignment shift's lost bits into the sticky bit. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                a.cls = float_class_zero;
                /* An exact zero difference is -0 only in round-down. */
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize: shift the msb back to the top bit. */
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* inf - inf is invalid; produce the default NaN. */
                float_raise(float_flag_invalid, s);
                parts_default_nan(&a, s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with its effective (subtraction-folded) sign. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the smaller exponent up to the larger, jamming
             * lost bits into the sticky bit. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /* Carry out of the add: renormalize by one bit. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}
1160 
1161 /*
1162  * Returns the result of adding or subtracting the floating-point
1163  * values `a' and `b'. The operation is performed according to the
1164  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1165  */
1166 
1167 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1168 {
1169     FloatParts64 pa, pb, pr;
1170 
1171     float16_unpack_canonical(&pa, a, status);
1172     float16_unpack_canonical(&pb, b, status);
1173     pr = addsub_floats(pa, pb, false, status);
1174 
1175     return float16_round_pack_canonical(&pr, status);
1176 }
1177 
1178 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1179 {
1180     FloatParts64 pa, pb, pr;
1181 
1182     float16_unpack_canonical(&pa, a, status);
1183     float16_unpack_canonical(&pb, b, status);
1184     pr = addsub_floats(pa, pb, true, status);
1185 
1186     return float16_round_pack_canonical(&pr, status);
1187 }
1188 
1189 static float32 QEMU_SOFTFLOAT_ATTR
1190 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1191 {
1192     FloatParts64 pa, pb, pr;
1193 
1194     float32_unpack_canonical(&pa, a, status);
1195     float32_unpack_canonical(&pb, b, status);
1196     pr = addsub_floats(pa, pb, subtract, status);
1197 
1198     return float32_round_pack_canonical(&pr, status);
1199 }
1200 
1201 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1202 {
1203     return soft_f32_addsub(a, b, false, status);
1204 }
1205 
1206 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1207 {
1208     return soft_f32_addsub(a, b, true, status);
1209 }
1210 
1211 static float64 QEMU_SOFTFLOAT_ATTR
1212 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1213 {
1214     FloatParts64 pa, pb, pr;
1215 
1216     float64_unpack_canonical(&pa, a, status);
1217     float64_unpack_canonical(&pb, b, status);
1218     pr = addsub_floats(pa, pb, subtract, status);
1219 
1220     return float64_round_pack_canonical(&pr, status);
1221 }
1222 
1223 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1224 {
1225     return soft_f64_addsub(a, b, false, status);
1226 }
1227 
1228 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1229 {
1230     return soft_f64_addsub(a, b, true, status);
1231 }
1232 
/* Host-FPU single-precision addition for the hardfloat fast path. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1237 
/* Host-FPU single-precision subtraction for the hardfloat fast path. */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;

    return diff;
}
1242 
/* Host-FPU double-precision addition for the hardfloat fast path. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1247 
/* Host-FPU double-precision subtraction for the hardfloat fast path. */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;

    return diff;
}
1252 
1253 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1254 {
1255     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1256         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1257     }
1258     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1259 }
1260 
1261 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1262 {
1263     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1264         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1265     } else {
1266         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1267     }
1268 }
1269 
1270 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1271                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1272 {
1273     return float32_gen2(a, b, s, hard, soft,
1274                         f32_is_zon2, f32_addsubmul_post);
1275 }
1276 
1277 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1278                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1279 {
1280     return float64_gen2(a, b, s, hard, soft,
1281                         f64_is_zon2, f64_addsubmul_post);
1282 }
1283 
1284 float32 QEMU_FLATTEN
1285 float32_add(float32 a, float32 b, float_status *s)
1286 {
1287     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1288 }
1289 
1290 float32 QEMU_FLATTEN
1291 float32_sub(float32 a, float32 b, float_status *s)
1292 {
1293     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1294 }
1295 
1296 float64 QEMU_FLATTEN
1297 float64_add(float64 a, float64 b, float_status *s)
1298 {
1299     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1300 }
1301 
1302 float64 QEMU_FLATTEN
1303 float64_sub(float64 a, float64 b, float_status *s)
1304 {
1305     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1306 }
1307 
1308 /*
1309  * Returns the result of adding or subtracting the bfloat16
1310  * values `a' and `b'.
1311  */
1312 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1313 {
1314     FloatParts64 pa, pb, pr;
1315 
1316     bfloat16_unpack_canonical(&pa, a, status);
1317     bfloat16_unpack_canonical(&pb, b, status);
1318     pr = addsub_floats(pa, pb, false, status);
1319 
1320     return bfloat16_round_pack_canonical(&pr, status);
1321 }
1322 
1323 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1324 {
1325     FloatParts64 pa, pb, pr;
1326 
1327     bfloat16_unpack_canonical(&pa, a, status);
1328     bfloat16_unpack_canonical(&pb, b, status);
1329     pr = addsub_floats(pa, pb, true, status);
1330 
1331     return bfloat16_round_pack_canonical(&pr, status);
1332 }
1333 
1334 /*
1335  * Returns the result of multiplying the floating-point values `a' and
1336  * `b'. The operation is performed according to the IEC/IEEE Standard
1337  * for Binary Floating-Point Arithmetic.
1338  */
1339 
static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* Full 128-bit product of the fractions; renormalize by one
         * bit if the top did not land on the implicit-bit position. */
        mul64To128(a.frac, b.frac, &hi, &lo);
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Fold the low 64 bits into the sticky bit. */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}
1384 
1385 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1386 {
1387     FloatParts64 pa, pb, pr;
1388 
1389     float16_unpack_canonical(&pa, a, status);
1390     float16_unpack_canonical(&pb, b, status);
1391     pr = mul_floats(pa, pb, status);
1392 
1393     return float16_round_pack_canonical(&pr, status);
1394 }
1395 
1396 static float32 QEMU_SOFTFLOAT_ATTR
1397 soft_f32_mul(float32 a, float32 b, float_status *status)
1398 {
1399     FloatParts64 pa, pb, pr;
1400 
1401     float32_unpack_canonical(&pa, a, status);
1402     float32_unpack_canonical(&pb, b, status);
1403     pr = mul_floats(pa, pb, status);
1404 
1405     return float32_round_pack_canonical(&pr, status);
1406 }
1407 
1408 static float64 QEMU_SOFTFLOAT_ATTR
1409 soft_f64_mul(float64 a, float64 b, float_status *status)
1410 {
1411     FloatParts64 pa, pb, pr;
1412 
1413     float64_unpack_canonical(&pa, a, status);
1414     float64_unpack_canonical(&pb, b, status);
1415     pr = mul_floats(pa, pb, status);
1416 
1417     return float64_round_pack_canonical(&pr, status);
1418 }
1419 
/* Host-FPU single-precision multiplication for the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    float prod = a * b;

    return prod;
}
1424 
/* Host-FPU double-precision multiplication for the hardfloat fast path. */
static double hard_f64_mul(double a, double b)
{
    double prod = a * b;

    return prod;
}
1429 
1430 float32 QEMU_FLATTEN
1431 float32_mul(float32 a, float32 b, float_status *s)
1432 {
1433     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1434                         f32_is_zon2, f32_addsubmul_post);
1435 }
1436 
1437 float64 QEMU_FLATTEN
1438 float64_mul(float64 a, float64 b, float_status *s)
1439 {
1440     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1441                         f64_is_zon2, f64_addsubmul_post);
1442 }
1443 
1444 /*
1445  * Returns the result of multiplying the bfloat16
1446  * values `a' and `b'.
1447  */
1448 
1449 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1450 {
1451     FloatParts64 pa, pb, pr;
1452 
1453     bfloat16_unpack_canonical(&pa, a, status);
1454     bfloat16_unpack_canonical(&pb, b, status);
1455     pr = mul_floats(pa, pb, status);
1456 
1457     return bfloat16_round_pack_canonical(&pr, status);
1458 }
1459 
1460 /*
1461  * Returns the result of multiplying the floating-point values `a' and
1462  * `b' then adding 'c', with no intermediate rounding step after the
1463  * multiplication. The operation is performed according to the
1464  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1465  * The flags argument allows the caller to select negation of the
1466  * addend, the intermediate product, or the final result. (The
1467  * difference between this and having the caller do a separate
1468  * negation is that negating externally will flip the sign bit on
1469  * NaNs.)
1470  */
1471 
static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    /* Bitmasks of the operand classes, for cheap multi-way tests. */
    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    if (inf_zero) {
        /* inf * 0 with no NaN operand: invalid, default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* inf - inf: invalid, default NaN. */
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                /* Exact zero sum of opposite signs: -0 only in
                 * round-down. */
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Exact 128-bit product of the fractions. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend dominates: align the product down to it. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product dominates: 128-bit add with carry out. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero result, -0 only in
                 * round-down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the low word into the sticky bit. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1645 
1646 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1647                                                 int flags, float_status *status)
1648 {
1649     FloatParts64 pa, pb, pc, pr;
1650 
1651     float16_unpack_canonical(&pa, a, status);
1652     float16_unpack_canonical(&pb, b, status);
1653     float16_unpack_canonical(&pc, c, status);
1654     pr = muladd_floats(pa, pb, pc, flags, status);
1655 
1656     return float16_round_pack_canonical(&pr, status);
1657 }
1658 
1659 static float32 QEMU_SOFTFLOAT_ATTR
1660 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1661                 float_status *status)
1662 {
1663     FloatParts64 pa, pb, pc, pr;
1664 
1665     float32_unpack_canonical(&pa, a, status);
1666     float32_unpack_canonical(&pb, b, status);
1667     float32_unpack_canonical(&pc, c, status);
1668     pr = muladd_floats(pa, pb, pc, flags, status);
1669 
1670     return float32_round_pack_canonical(&pr, status);
1671 }
1672 
1673 static float64 QEMU_SOFTFLOAT_ATTR
1674 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1675                 float_status *status)
1676 {
1677     FloatParts64 pa, pb, pc, pr;
1678 
1679     float64_unpack_canonical(&pa, a, status);
1680     float64_unpack_canonical(&pb, b, status);
1681     float64_unpack_canonical(&pc, c, status);
1682     pr = muladd_floats(pa, pb, pc, flags, status);
1683 
1684     return float64_round_pack_canonical(&pr, status);
1685 }
1686 
/* When set, the muladd fast paths below always defer to softfloat. */
static bool force_soft_fma;
1688 
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* The host-FPU fast path needs compatible global FP state. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* halve_result has no host-FPU equivalent. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* Inputs must pass the zero-or-normal check (f32_is_zon3). */
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build the signed-zero product and add it on the host. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Possible underflow: redo in softfloat with the original
             * operands for correct rounding and flags. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1759 
float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* The host-FPU fast path needs compatible global FP state. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* halve_result has no host-FPU equivalent. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* Inputs must pass the zero-or-normal check (f64_is_zon3). */
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        /* Build the signed-zero product and add it on the host. */
        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): the bound here is FLT_MIN, not DBL_MIN —
             * conservative (extra cases fall back to softfloat);
             * confirm this is intentional. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}
1830 
1831 /*
1832  * Returns the result of multiplying the bfloat16 values `a'
1833  * and `b' then adding 'c', with no intermediate rounding step after the
1834  * multiplication.
1835  */
1836 
1837 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1838                                       int flags, float_status *status)
1839 {
1840     FloatParts64 pa, pb, pc, pr;
1841 
1842     bfloat16_unpack_canonical(&pa, a, status);
1843     bfloat16_unpack_canonical(&pb, b, status);
1844     bfloat16_unpack_canonical(&pc, c, status);
1845     pr = muladd_floats(pa, pb, pc, flags, status);
1846 
1847     return bfloat16_round_pack_canonical(&pr, status);
1848 }
1849 
1850 /*
1851  * Returns the result of dividing the floating-point value `a' by the
1852  * corresponding value `b'. The operation is performed according to
1853  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1854  */
1855 
static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    /* The quotient sign is the XOR of the operand signs; this also holds
     * for all the zero/infinity special cases handled below. */
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    /* At this point b can only be normal; a is neither NaN, Inf nor zero,
     * so a is normal too -- but both-normal was handled first.  Hence
     * this is unreachable. */
    g_assert_not_reached();
}
1921 
1922 float16 float16_div(float16 a, float16 b, float_status *status)
1923 {
1924     FloatParts64 pa, pb, pr;
1925 
1926     float16_unpack_canonical(&pa, a, status);
1927     float16_unpack_canonical(&pb, b, status);
1928     pr = div_floats(pa, pb, status);
1929 
1930     return float16_round_pack_canonical(&pr, status);
1931 }
1932 
1933 static float32 QEMU_SOFTFLOAT_ATTR
1934 soft_f32_div(float32 a, float32 b, float_status *status)
1935 {
1936     FloatParts64 pa, pb, pr;
1937 
1938     float32_unpack_canonical(&pa, a, status);
1939     float32_unpack_canonical(&pb, b, status);
1940     pr = div_floats(pa, pb, status);
1941 
1942     return float32_round_pack_canonical(&pr, status);
1943 }
1944 
1945 static float64 QEMU_SOFTFLOAT_ATTR
1946 soft_f64_div(float64 a, float64 b, float_status *status)
1947 {
1948     FloatParts64 pa, pb, pr;
1949 
1950     float64_unpack_canonical(&pa, a, status);
1951     float64_unpack_canonical(&pb, b, status);
1952     pr = div_floats(pa, pb, status);
1953 
1954     return float64_round_pack_canonical(&pr, status);
1955 }
1956 
/* Host-FPU single-precision division for the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    float quotient = a / b;

    return quotient;
}
1961 
/* Host-FPU double-precision division for the hardfloat fast path. */
static double hard_f64_div(double a, double b)
{
    double quotient = a / b;

    return quotient;
}
1966 
1967 static bool f32_div_pre(union_float32 a, union_float32 b)
1968 {
1969     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1970         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1971                fpclassify(b.h) == FP_NORMAL;
1972     }
1973     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1974 }
1975 
1976 static bool f64_div_pre(union_float64 a, union_float64 b)
1977 {
1978     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1979         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1980                fpclassify(b.h) == FP_NORMAL;
1981     }
1982     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1983 }
1984 
1985 static bool f32_div_post(union_float32 a, union_float32 b)
1986 {
1987     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1988         return fpclassify(a.h) != FP_ZERO;
1989     }
1990     return !float32_is_zero(a.s);
1991 }
1992 
1993 static bool f64_div_post(union_float64 a, union_float64 b)
1994 {
1995     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1996         return fpclassify(a.h) != FP_ZERO;
1997     }
1998     return !float64_is_zero(a.s);
1999 }
2000 
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    /* Dispatch through the generic two-operand hardfloat wrapper:
     * hard_f32_div is the fast path, soft_f32_div the fallback, and the
     * pre/post predicates decide whether the fast result may be used. */
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}
2007 
float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    /* Dispatch through the generic two-operand hardfloat wrapper:
     * hard_f64_div is the fast path, soft_f64_div the fallback, and the
     * pre/post predicates decide whether the fast result may be used. */
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}
2014 
2015 /*
2016  * Returns the result of dividing the bfloat16
2017  * value `a' by the corresponding value `b'.
2018  */
2019 
2020 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
2021 {
2022     FloatParts64 pa, pb, pr;
2023 
2024     bfloat16_unpack_canonical(&pa, a, status);
2025     bfloat16_unpack_canonical(&pb, b, status);
2026     pr = div_floats(pa, pb, status);
2027 
2028     return bfloat16_round_pack_canonical(&pr, status);
2029 }
2030 
2031 /*
2032  * Float to Float conversions
2033  *
2034  * Returns the result of converting one float format to another. The
2035  * conversion is performed according to the IEC/IEEE Standard for
2036  * Binary Floating-Point Arithmetic.
2037  *
2038  * The float_to_float helper only needs to take care of raising
2039  * invalid exceptions and handling the conversion on NaNs.
2040  */
2041 
static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                 float_status *s)
{
    /*
     * Destination formats with ARM alternative half-precision have no
     * encodings for NaN or Inf, so those classes need special handling;
     * for every other destination only NaN inputs need processing here.
     */
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            /* Zero and normal classes pass through unchanged. */
            break;
        }
    } else if (is_nan(a.cls)) {
        /* NOTE(review): presumably quiets SNaNs / applies the status'
         * default-NaN policy -- see parts_return_nan. */
        parts_return_nan(&a, s);
    }
    return a;
}
2076 
2077 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
2078 {
2079     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2080     FloatParts64 pa, pr;
2081 
2082     float16a_unpack_canonical(&pa, a, s, fmt16);
2083     pr = float_to_float(pa, &float32_params, s);
2084     return float32_round_pack_canonical(&pr, s);
2085 }
2086 
2087 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2088 {
2089     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2090     FloatParts64 pa, pr;
2091 
2092     float16a_unpack_canonical(&pa, a, s, fmt16);
2093     pr = float_to_float(pa, &float64_params, s);
2094     return float64_round_pack_canonical(&pr, s);
2095 }
2096 
2097 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2098 {
2099     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2100     FloatParts64 pa, pr;
2101 
2102     float32_unpack_canonical(&pa, a, s);
2103     pr = float_to_float(pa, fmt16, s);
2104     return float16a_round_pack_canonical(&pr, s, fmt16);
2105 }
2106 
2107 static float64 QEMU_SOFTFLOAT_ATTR
2108 soft_float32_to_float64(float32 a, float_status *s)
2109 {
2110     FloatParts64 pa, pr;
2111 
2112     float32_unpack_canonical(&pa, a, s);
2113     pr = float_to_float(pa, &float64_params, s);
2114     return float64_round_pack_canonical(&pr, s);
2115 }
2116 
2117 float64 float32_to_float64(float32 a, float_status *s)
2118 {
2119     if (likely(float32_is_normal(a))) {
2120         /* Widening conversion can never produce inexact results.  */
2121         union_float32 uf;
2122         union_float64 ud;
2123         uf.s = a;
2124         ud.h = uf.h;
2125         return ud.s;
2126     } else if (float32_is_zero(a)) {
2127         return float64_set_sign(float64_zero, float32_is_neg(a));
2128     } else {
2129         return soft_float32_to_float64(a, s);
2130     }
2131 }
2132 
2133 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2134 {
2135     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2136     FloatParts64 pa, pr;
2137 
2138     float64_unpack_canonical(&pa, a, s);
2139     pr = float_to_float(pa, fmt16, s);
2140     return float16a_round_pack_canonical(&pr, s, fmt16);
2141 }
2142 
2143 float32 float64_to_float32(float64 a, float_status *s)
2144 {
2145     FloatParts64 pa, pr;
2146 
2147     float64_unpack_canonical(&pa, a, s);
2148     pr = float_to_float(pa, &float32_params, s);
2149     return float32_round_pack_canonical(&pr, s);
2150 }
2151 
2152 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2153 {
2154     FloatParts64 pa, pr;
2155 
2156     bfloat16_unpack_canonical(&pa, a, s);
2157     pr = float_to_float(pa, &float32_params, s);
2158     return float32_round_pack_canonical(&pr, s);
2159 }
2160 
2161 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2162 {
2163     FloatParts64 pa, pr;
2164 
2165     bfloat16_unpack_canonical(&pa, a, s);
2166     pr = float_to_float(pa, &float64_params, s);
2167     return float64_round_pack_canonical(&pr, s);
2168 }
2169 
2170 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2171 {
2172     FloatParts64 pa, pr;
2173 
2174     float32_unpack_canonical(&pa, a, s);
2175     pr = float_to_float(pa, &bfloat16_params, s);
2176     return bfloat16_round_pack_canonical(&pr, s);
2177 }
2178 
2179 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2180 {
2181     FloatParts64 pa, pr;
2182 
2183     float64_unpack_canonical(&pa, a, s);
2184     pr = float_to_float(pa, &bfloat16_params, s);
2185     return bfloat16_round_pack_canonical(&pr, s);
2186 }
2187 
2188 /*
2189  * Rounds the floating-point value `a' to an integer, and returns the
2190  * result as a floating-point value. The operation is performed
2191  * according to the IEC/IEEE Standard for Binary Floating-Point
2192  * Arithmetic.
2193  */
2194 
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                               int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp the scale so that a.exp += scale cannot overflow. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional */
            float_raise(float_flag_inexact, s);
            /* |a| < 1: the result is either 0 or 1 (with a's sign). */
            switch (rmode) {
            case float_round_nearest_even:
                /* Round to 1 only when strictly above one half; exactly
                 * one half ties to the even value, 0. */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* Mixed integer/fraction.  frac_lsb is the least significant
             * integral bit of the significand; everything below it is
             * the fractional part to be rounded away. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* An exact tie (only the half bit set below the lsb)
                 * gets no increment, leaving the even result. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Carry out of the significand: renormalize by one. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                /* Clear the now-rounded-away fractional bits. */
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2296 
2297 float16 float16_round_to_int(float16 a, float_status *s)
2298 {
2299     FloatParts64 pa, pr;
2300 
2301     float16_unpack_canonical(&pa, a, s);
2302     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2303     return float16_round_pack_canonical(&pr, s);
2304 }
2305 
2306 float32 float32_round_to_int(float32 a, float_status *s)
2307 {
2308     FloatParts64 pa, pr;
2309 
2310     float32_unpack_canonical(&pa, a, s);
2311     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2312     return float32_round_pack_canonical(&pr, s);
2313 }
2314 
2315 float64 float64_round_to_int(float64 a, float_status *s)
2316 {
2317     FloatParts64 pa, pr;
2318 
2319     float64_unpack_canonical(&pa, a, s);
2320     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2321     return float64_round_pack_canonical(&pr, s);
2322 }
2323 
2324 /*
2325  * Rounds the bfloat16 value `a' to an integer, and returns the
2326  * result as a bfloat16 value.
2327  */
2328 
2329 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2330 {
2331     FloatParts64 pa, pr;
2332 
2333     bfloat16_unpack_canonical(&pa, a, s);
2334     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2335     return bfloat16_round_pack_canonical(&pr, s);
2336 }
2337 
2338 /*
2339  * Returns the result of converting the floating-point value `a' to
2340  * the two's complement integer format. The conversion is performed
2341  * according to the IEC/IEEE Standard for Binary Floating-Point
2342  * Arithmetic---which means in particular that the conversion is
2343  * rounded according to the current rounding mode. If `a' is a NaN,
2344  * the largest positive integer is returned. Otherwise, if the
2345  * conversion overflows, the largest integer with the same sign as `a'
2346  * is returned.
2347 */
2348 
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* Save the incoming flags: an invalid conversion reports only
     * 'invalid', discarding any inexact raised by round_to_int below. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude too large for any 64-bit result; force
             * saturation below. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* Negate 'min' in unsigned arithmetic so the comparison is
             * well-defined even for min == INT64_MIN. */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}
2392 
2393 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2394                               float_status *s)
2395 {
2396     FloatParts64 p;
2397 
2398     float16_unpack_canonical(&p, a, s);
2399     return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
2400 }
2401 
2402 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2403                                 float_status *s)
2404 {
2405     FloatParts64 p;
2406 
2407     float16_unpack_canonical(&p, a, s);
2408     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2409 }
2410 
2411 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2412                                 float_status *s)
2413 {
2414     FloatParts64 p;
2415 
2416     float16_unpack_canonical(&p, a, s);
2417     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2418 }
2419 
2420 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2421                                 float_status *s)
2422 {
2423     FloatParts64 p;
2424 
2425     float16_unpack_canonical(&p, a, s);
2426     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2427 }
2428 
2429 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2430                                 float_status *s)
2431 {
2432     FloatParts64 p;
2433 
2434     float32_unpack_canonical(&p, a, s);
2435     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2436 }
2437 
2438 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2439                                 float_status *s)
2440 {
2441     FloatParts64 p;
2442 
2443     float32_unpack_canonical(&p, a, s);
2444     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2445 }
2446 
2447 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2448                                 float_status *s)
2449 {
2450     FloatParts64 p;
2451 
2452     float32_unpack_canonical(&p, a, s);
2453     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2454 }
2455 
2456 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2457                                 float_status *s)
2458 {
2459     FloatParts64 p;
2460 
2461     float64_unpack_canonical(&p, a, s);
2462     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2463 }
2464 
2465 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2466                                 float_status *s)
2467 {
2468     FloatParts64 p;
2469 
2470     float64_unpack_canonical(&p, a, s);
2471     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2472 }
2473 
2474 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2475                                 float_status *s)
2476 {
2477     FloatParts64 p;
2478 
2479     float64_unpack_canonical(&p, a, s);
2480     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2481 }
2482 
/* Signed conversions using the rounding mode from float_status. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
2532 
/* Truncating (round-toward-zero) signed conversions, independent of the
 * current rounding mode. */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2577 
2578 /*
2579  * Returns the result of converting the floating-point value `a' to
2580  * the two's complement integer format.
2581  */
2582 
2583 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2584                                  float_status *s)
2585 {
2586     FloatParts64 p;
2587 
2588     bfloat16_unpack_canonical(&p, a, s);
2589     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2590 }
2591 
2592 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2593                                  float_status *s)
2594 {
2595     FloatParts64 p;
2596 
2597     bfloat16_unpack_canonical(&p, a, s);
2598     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2599 }
2600 
2601 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2602                                  float_status *s)
2603 {
2604     FloatParts64 p;
2605 
2606     bfloat16_unpack_canonical(&p, a, s);
2607     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2608 }
2609 
/* bfloat16 signed conversions: first with the current rounding mode,
 * then the truncating (round-toward-zero) variants. */
int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2639 
2640 /*
2641  *  Returns the result of converting the floating-point value `a' to
2642  *  the unsigned integer format. The conversion is performed according
2643  *  to the IEC/IEEE Standard for Binary Floating-Point
2644  *  Arithmetic---which means in particular that the conversion is
2645  *  rounded according to the current rounding mode. If `a' is a NaN,
2646  *  the largest unsigned integer is returned. Otherwise, if the
2647  *  conversion overflows, the largest unsigned integer is returned. If
2648  *  the 'a' is negative, the result is rounded and zero is returned;
2649  *  values that do not round to zero will raise the inexact exception
2650  *  flag.
2651  */
2652 
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Save the incoming flags: an invalid conversion reports only
     * 'invalid', discarding any inexact raised by round_to_int below. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        /* A negative value that did not round to zero cannot be
         * represented in an unsigned result: invalid, return 0. */
        if (p.sign) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2697 
2698 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2699                                 float_status *s)
2700 {
2701     FloatParts64 p;
2702 
2703     float16_unpack_canonical(&p, a, s);
2704     return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
2705 }
2706 
2707 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2708                                   float_status *s)
2709 {
2710     FloatParts64 p;
2711 
2712     float16_unpack_canonical(&p, a, s);
2713     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2714 }
2715 
2716 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2717                                   float_status *s)
2718 {
2719     FloatParts64 p;
2720 
2721     float16_unpack_canonical(&p, a, s);
2722     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2723 }
2724 
2725 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2726                                   float_status *s)
2727 {
2728     FloatParts64 p;
2729 
2730     float16_unpack_canonical(&p, a, s);
2731     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2732 }
2733 
2734 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2735                                   float_status *s)
2736 {
2737     FloatParts64 p;
2738 
2739     float32_unpack_canonical(&p, a, s);
2740     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2741 }
2742 
2743 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2744                                   float_status *s)
2745 {
2746     FloatParts64 p;
2747 
2748     float32_unpack_canonical(&p, a, s);
2749     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2750 }
2751 
2752 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2753                                   float_status *s)
2754 {
2755     FloatParts64 p;
2756 
2757     float32_unpack_canonical(&p, a, s);
2758     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2759 }
2760 
2761 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2762                                   float_status *s)
2763 {
2764     FloatParts64 p;
2765 
2766     float64_unpack_canonical(&p, a, s);
2767     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2768 }
2769 
2770 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2771                                   float_status *s)
2772 {
2773     FloatParts64 p;
2774 
2775     float64_unpack_canonical(&p, a, s);
2776     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2777 }
2778 
2779 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2780                                   float_status *s)
2781 {
2782     FloatParts64 p;
2783 
2784     float64_unpack_canonical(&p, a, s);
2785     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2786 }
2787 
/* Float-to-unsigned conversions using the rounding mode from *s. */
uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}
2837 
/* Float-to-unsigned conversions truncating toward zero, ignoring the
 * rounding mode currently set in *s. */
uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2882 
2883 /*
2884  *  Returns the result of converting the bfloat16 value `a' to
2885  *  the unsigned integer format.
2886  */
2887 
2888 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2889                                    int scale, float_status *s)
2890 {
2891     FloatParts64 p;
2892 
2893     bfloat16_unpack_canonical(&p, a, s);
2894     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2895 }
2896 
2897 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2898                                    int scale, float_status *s)
2899 {
2900     FloatParts64 p;
2901 
2902     bfloat16_unpack_canonical(&p, a, s);
2903     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2904 }
2905 
2906 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2907                                    int scale, float_status *s)
2908 {
2909     FloatParts64 p;
2910 
2911     bfloat16_unpack_canonical(&p, a, s);
2912     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2913 }
2914 
/* bfloat16-to-unsigned conversions: first group uses the rounding mode
 * from *s, second group truncates toward zero. */
uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2944 
2945 /*
2946  * Integer to float conversions
2947  *
2948  * Returns the result of converting the two's complement integer `a'
2949  * to the floating-point format. The conversion is performed according
2950  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2951  */
2952 
2953 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2954 {
2955     FloatParts64 r = { .sign = false };
2956 
2957     if (a == 0) {
2958         r.cls = float_class_zero;
2959     } else {
2960         uint64_t f = a;
2961         int shift;
2962 
2963         r.cls = float_class_normal;
2964         if (a < 0) {
2965             f = -f;
2966             r.sign = true;
2967         }
2968         shift = clz64(f);
2969         scale = MIN(MAX(scale, -0x10000), 0x10000);
2970 
2971         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2972         r.frac = f << shift;
2973     }
2974 
2975     return r;
2976 }
2977 
/* Signed-int-to-float16 conversions.  Narrower ints widen losslessly
 * to int64_t, so every variant funnels through int64_to_float16_scalbn. */
float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}
3013 
/* Signed-int-to-float32 conversions, all via int64_to_float32_scalbn. */
float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}
3044 
/* Signed-int-to-float64 conversions, all via int64_to_float64_scalbn. */
float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}
3075 
3076 /*
3077  * Returns the result of converting the two's complement integer `a'
3078  * to the bfloat16 format.
3079  */
3080 
/* Signed-int-to-bfloat16 conversions, all via int64_to_bfloat16_scalbn. */
bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
3111 
3112 /*
3113  * Unsigned Integer to float conversions
3114  *
3115  * Returns the result of converting the unsigned integer `a' to the
3116  * floating-point format. The conversion is performed according to the
3117  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3118  */
3119 
3120 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3121 {
3122     FloatParts64 r = { .sign = false };
3123     int shift;
3124 
3125     if (a == 0) {
3126         r.cls = float_class_zero;
3127     } else {
3128         scale = MIN(MAX(scale, -0x10000), 0x10000);
3129         shift = clz64(a);
3130         r.cls = float_class_normal;
3131         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3132         r.frac = a << shift;
3133     }
3134 
3135     return r;
3136 }
3137 
/* Unsigned-int-to-float16 conversions, all via uint64_to_float16_scalbn. */
float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}
3173 
/* Unsigned-int-to-float32 conversions, all via uint64_to_float32_scalbn. */
float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}
3204 
/* Unsigned-int-to-float64 conversions, all via uint64_to_float64_scalbn. */
float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}
3235 
3236 /*
3237  * Returns the result of converting the unsigned integer `a' to the
3238  * bfloat16 format.
3239  */
3240 
/* Unsigned-int-to-bfloat16 conversions, all via uint64_to_bfloat16_scalbn. */
bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3271 
3272 /* Float Min/Max */
3273 /* min() and max() functions. These can't be implemented as
3274  * 'compare and pick one input' because that would mishandle
3275  * NaNs and +0 vs -0.
3276  *
3277  * minnum() and maxnum() functions. These are similar to the min()
3278  * and max() functions but if one of the arguments is a QNaN and
3279  * the other is numerical then the numerical argument is returned.
3280  * SNaNs will get quietened before being returned.
3281  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3282  * and maxNum() operations. min() and max() are the typical min/max
3283  * semantics provided by many CPUs which predate that specification.
3284  *
3285  * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3287  */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return pick_nan(a, b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                /* minNum/maxNum: a quiet NaN loses to a number. */
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        return pick_nan(a, b, s);
    } else {
        int a_exp, b_exp;

        /* Map zero and infinity onto extreme exponent values so the
         * (exp, frac) comparisons below order magnitudes directly. */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* minNumMag/maxNumMag: compare magnitudes ignoring sign, unless
         * the magnitudes are equal, in which case fall through to the
         * signed comparison below. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            /* Both negative: the larger magnitude is the smaller value,
             * hence the extra a.sign term. */
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Differing signs: the negative operand is the smaller one;
             * this also yields min(-0,+0) == -0 and max(-0,+0) == +0. */
            return a.sign ^ ismin ? b : a;
        }
    }
}
3358 
3359 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3360 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3361                                      float_status *s)                   \
3362 {                                                                       \
3363     FloatParts64 pa, pb, pr;                                            \
3364     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3365     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3366     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3367     return float ## sz ## _round_pack_canonical(&pr, s);                \
3368 }
3369 
3370 MINMAX(16, min, true, false, false)
3371 MINMAX(16, minnum, true, true, false)
3372 MINMAX(16, minnummag, true, true, true)
3373 MINMAX(16, max, false, false, false)
3374 MINMAX(16, maxnum, false, true, false)
3375 MINMAX(16, maxnummag, false, true, true)
3376 
3377 MINMAX(32, min, true, false, false)
3378 MINMAX(32, minnum, true, true, false)
3379 MINMAX(32, minnummag, true, true, true)
3380 MINMAX(32, max, false, false, false)
3381 MINMAX(32, maxnum, false, true, false)
3382 MINMAX(32, maxnummag, false, true, true)
3383 
3384 MINMAX(64, min, true, false, false)
3385 MINMAX(64, minnum, true, true, false)
3386 MINMAX(64, minnummag, true, true, true)
3387 MINMAX(64, max, false, false, false)
3388 MINMAX(64, maxnum, false, true, false)
3389 MINMAX(64, maxnummag, false, true, true)
3390 
3391 #undef MINMAX
3392 
3393 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3394 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3395 {                                                                       \
3396     FloatParts64 pa, pb, pr;                                            \
3397     bfloat16_unpack_canonical(&pa, a, s);                               \
3398     bfloat16_unpack_canonical(&pb, b, s);                               \
3399     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3400     return bfloat16_round_pack_canonical(&pr, s);                       \
3401 }
3402 
3403 BF16_MINMAX(min, true, false, false)
3404 BF16_MINMAX(minnum, true, true, false)
3405 BF16_MINMAX(minnummag, true, true, true)
3406 BF16_MINMAX(max, false, false, false)
3407 BF16_MINMAX(maxnum, false, true, false)
3408 BF16_MINMAX(maxnummag, false, true, true)
3409 
3410 #undef BF16_MINMAX
3411 
3412 /* Floating point compare */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        /* Signaling compares (is_quiet == false) raise invalid for any
         * NaN operand; quiet compares raise it only for signaling NaNs. */
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeroes compare equal regardless of sign: -0 == +0. */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same-sign normals: compare exponent first, then fraction; when
     * both operands are negative the sense of the result is inverted. */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
3469 
/* Instantiate per-width softfloat compare helpers: unpack both operands
 * to FloatParts64 and defer to compare_floats. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3485 
/* Signaling compare: invalid is raised for any NaN operand. */
FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

/* Quiet compare: invalid is raised only for signaling NaNs. */
FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}
3495 
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    /* Hardfloat fast path: the C99 comparison macros used below are
     * "quiet" and do not trap on NaN operands. */
    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    /* Note the soft path operates on ua.s/ub.s, which may have been
     * modified by float32_input_flush2 above. */
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3524 
/* Signaling compare: invalid is raised for any NaN operand. */
FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

/* Quiet compare: invalid is raised only for signaling NaNs. */
FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}
3534 
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    /* Hardfloat fast path: the C99 comparison macros used below are
     * "quiet" and do not trap on NaN operands. */
    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    /* Note the soft path operates on ua.s/ub.s, which may have been
     * modified by float64_input_flush2 above. */
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3563 
/* Signaling compare: invalid is raised for any NaN operand. */
FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

/* Quiet compare: invalid is raised only for signaling NaNs. */
FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}
3573 
3574 static FloatRelation QEMU_FLATTEN
3575 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3576 {
3577     FloatParts64 pa, pb;
3578 
3579     bfloat16_unpack_canonical(&pa, a, s);
3580     bfloat16_unpack_canonical(&pb, b, s);
3581     return compare_floats(pa, pb, is_quiet, s);
3582 }
3583 
/* Signaling compare: invalid is raised for any NaN operand. */
FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

/* Quiet compare: invalid is raised only for signaling NaNs. */
FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}
3593 
3594 /* Multiply A by 2 raised to the power N.  */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    /* NaN inputs are canonicalized in place and then returned below;
     * NOTE(review): parts_return_nan presumably quiets an SNaN and
     * raises invalid — confirm against its definition. */
    if (unlikely(is_nan(a.cls))) {
        parts_return_nan(&a, s);
    }
    /* Zero and infinity pass through unchanged; only normals scale. */
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}
3611 
3612 float16 float16_scalbn(float16 a, int n, float_status *status)
3613 {
3614     FloatParts64 pa, pr;
3615 
3616     float16_unpack_canonical(&pa, a, status);
3617     pr = scalbn_decomposed(pa, n, status);
3618     return float16_round_pack_canonical(&pr, status);
3619 }
3620 
3621 float32 float32_scalbn(float32 a, int n, float_status *status)
3622 {
3623     FloatParts64 pa, pr;
3624 
3625     float32_unpack_canonical(&pa, a, status);
3626     pr = scalbn_decomposed(pa, n, status);
3627     return float32_round_pack_canonical(&pr, status);
3628 }
3629 
3630 float64 float64_scalbn(float64 a, int n, float_status *status)
3631 {
3632     FloatParts64 pa, pr;
3633 
3634     float64_unpack_canonical(&pa, a, status);
3635     pr = scalbn_decomposed(pa, n, status);
3636     return float64_round_pack_canonical(&pr, status);
3637 }
3638 
3639 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3640 {
3641     FloatParts64 pa, pr;
3642 
3643     bfloat16_unpack_canonical(&pa, a, status);
3644     pr = scalbn_decomposed(pa, n, status);
3645     return bfloat16_round_pack_canonical(&pr, status);
3646 }
3647 
3648 /*
3649  * Square Root
3650  *
3651  * The old softfloat code did an approximation step before zeroing in
3652  * on the final result. However for simpleness we just compute the
3653  * square root by iterating down from the implicit bit to enough extra
3654  * bits to ensure we get a correctly rounded result.
3655  *
3656  * This does mean however the calculation is slower than before,
3657  * especially for 64 bit floats.
3658  */
3659 
/* Compute the square root of the decomposed value A, producing enough
 * result bits (plus a sticky bit) for format P to round correctly. */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    /* Negative non-zero input: invalid operation, return default NaN.
     * (Checked after zero so that sqrt(-0) stays -0.) */
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top. Adding room for that is a
     * right shift. If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift. Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        /* Standard restoring shift-and-subtract step: accept bit 'q'
         * of the root whenever the trial subtrahend fits. */
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3719 
3720 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3721 {
3722     FloatParts64 pa, pr;
3723 
3724     float16_unpack_canonical(&pa, a, status);
3725     pr = sqrt_float(pa, status, &float16_params);
3726     return float16_round_pack_canonical(&pr, status);
3727 }
3728 
3729 static float32 QEMU_SOFTFLOAT_ATTR
3730 soft_f32_sqrt(float32 a, float_status *status)
3731 {
3732     FloatParts64 pa, pr;
3733 
3734     float32_unpack_canonical(&pa, a, status);
3735     pr = sqrt_float(pa, status, &float32_params);
3736     return float32_round_pack_canonical(&pr, status);
3737 }
3738 
3739 static float64 QEMU_SOFTFLOAT_ATTR
3740 soft_f64_sqrt(float64 a, float_status *status)
3741 {
3742     FloatParts64 pa, pr;
3743 
3744     float64_unpack_canonical(&pa, a, status);
3745     pr = sqrt_float(pa, status, &float64_params);
3746     return float64_round_pack_canonical(&pr, status);
3747 }
3748 
/*
 * float32 sqrt with a hardfloat fast path: when the host FPU state is
 * usable and the input is a non-negative zero or normal, defer to the
 * host's sqrtf(); otherwise fall back to the softfloat implementation.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Classify via the host FPU: accept only normal/zero values
         * with the sign bit clear; everything else takes the soft path. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        /* Same filter using integer inspection of the bit pattern. */
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
3775 
/*
 * float64 sqrt with a hardfloat fast path: when the host FPU state is
 * usable and the input is a non-negative zero or normal, defer to the
 * host's sqrt(); otherwise fall back to the softfloat implementation.
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Classify via the host FPU: accept only normal/zero values
         * with the sign bit clear; everything else takes the soft path. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        /* Same filter using integer inspection of the bit pattern. */
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3802 
3803 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3804 {
3805     FloatParts64 pa, pr;
3806 
3807     bfloat16_unpack_canonical(&pa, a, status);
3808     pr = sqrt_float(pa, status, &bfloat16_params);
3809     return bfloat16_round_pack_canonical(&pr, status);
3810 }
3811 
3812 /*----------------------------------------------------------------------------
3813 | The pattern for a default generated NaN.
3814 *----------------------------------------------------------------------------*/
3815 
3816 float16 float16_default_nan(float_status *status)
3817 {
3818     FloatParts64 p;
3819 
3820     parts_default_nan(&p, status);
3821     p.frac >>= float16_params.frac_shift;
3822     return float16_pack_raw(&p);
3823 }
3824 
3825 float32 float32_default_nan(float_status *status)
3826 {
3827     FloatParts64 p;
3828 
3829     parts_default_nan(&p, status);
3830     p.frac >>= float32_params.frac_shift;
3831     return float32_pack_raw(&p);
3832 }
3833 
3834 float64 float64_default_nan(float_status *status)
3835 {
3836     FloatParts64 p;
3837 
3838     parts_default_nan(&p, status);
3839     p.frac >>= float64_params.frac_shift;
3840     return float64_pack_raw(&p);
3841 }
3842 
3843 float128 float128_default_nan(float_status *status)
3844 {
3845     FloatParts128 p;
3846 
3847     parts_default_nan(&p, status);
3848     frac_shr(&p, float128_params.frac_shift);
3849     return float128_pack_raw(&p);
3850 }
3851 
3852 bfloat16 bfloat16_default_nan(float_status *status)
3853 {
3854     FloatParts64 p;
3855 
3856     parts_default_nan(&p, status);
3857     p.frac >>= bfloat16_params.frac_shift;
3858     return bfloat16_pack_raw(&p);
3859 }
3860 
3861 /*----------------------------------------------------------------------------
3862 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3863 *----------------------------------------------------------------------------*/
3864 
3865 float16 float16_silence_nan(float16 a, float_status *status)
3866 {
3867     FloatParts64 p;
3868 
3869     float16_unpack_raw(&p, a);
3870     p.frac <<= float16_params.frac_shift;
3871     parts_silence_nan(&p, status);
3872     p.frac >>= float16_params.frac_shift;
3873     return float16_pack_raw(&p);
3874 }
3875 
3876 float32 float32_silence_nan(float32 a, float_status *status)
3877 {
3878     FloatParts64 p;
3879 
3880     float32_unpack_raw(&p, a);
3881     p.frac <<= float32_params.frac_shift;
3882     parts_silence_nan(&p, status);
3883     p.frac >>= float32_params.frac_shift;
3884     return float32_pack_raw(&p);
3885 }
3886 
3887 float64 float64_silence_nan(float64 a, float_status *status)
3888 {
3889     FloatParts64 p;
3890 
3891     float64_unpack_raw(&p, a);
3892     p.frac <<= float64_params.frac_shift;
3893     parts_silence_nan(&p, status);
3894     p.frac >>= float64_params.frac_shift;
3895     return float64_pack_raw(&p);
3896 }
3897 
3898 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3899 {
3900     FloatParts64 p;
3901 
3902     bfloat16_unpack_raw(&p, a);
3903     p.frac <<= bfloat16_params.frac_shift;
3904     parts_silence_nan(&p, status);
3905     p.frac >>= bfloat16_params.frac_shift;
3906     return bfloat16_pack_raw(&p);
3907 }
3908 
3909 float128 float128_silence_nan(float128 a, float_status *status)
3910 {
3911     FloatParts128 p;
3912 
3913     float128_unpack_raw(&p, a);
3914     frac_shl(&p, float128_params.frac_shift);
3915     parts_silence_nan(&p, status);
3916     frac_shr(&p, float128_params.frac_shift);
3917     return float128_pack_raw(&p);
3918 }
3919 
3920 /*----------------------------------------------------------------------------
3921 | If `a' is denormal and we are in flush-to-zero mode then set the
3922 | input-denormal exception and return zero. Otherwise just return the value.
3923 *----------------------------------------------------------------------------*/
3924 
3925 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3926 {
3927     if (p.exp == 0 && p.frac != 0) {
3928         float_raise(float_flag_input_denormal, status);
3929         return true;
3930     }
3931 
3932     return false;
3933 }
3934 
3935 float16 float16_squash_input_denormal(float16 a, float_status *status)
3936 {
3937     if (status->flush_inputs_to_zero) {
3938         FloatParts64 p;
3939 
3940         float16_unpack_raw(&p, a);
3941         if (parts_squash_denormal(p, status)) {
3942             return float16_set_sign(float16_zero, p.sign);
3943         }
3944     }
3945     return a;
3946 }
3947 
3948 float32 float32_squash_input_denormal(float32 a, float_status *status)
3949 {
3950     if (status->flush_inputs_to_zero) {
3951         FloatParts64 p;
3952 
3953         float32_unpack_raw(&p, a);
3954         if (parts_squash_denormal(p, status)) {
3955             return float32_set_sign(float32_zero, p.sign);
3956         }
3957     }
3958     return a;
3959 }
3960 
3961 float64 float64_squash_input_denormal(float64 a, float_status *status)
3962 {
3963     if (status->flush_inputs_to_zero) {
3964         FloatParts64 p;
3965 
3966         float64_unpack_raw(&p, a);
3967         if (parts_squash_denormal(p, status)) {
3968             return float64_set_sign(float64_zero, p.sign);
3969         }
3970     }
3971     return a;
3972 }
3973 
3974 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3975 {
3976     if (status->flush_inputs_to_zero) {
3977         FloatParts64 p;
3978 
3979         bfloat16_unpack_raw(&p, a);
3980         if (parts_squash_denormal(p, status)) {
3981             return bfloat16_set_sign(bfloat16_zero, p.sign);
3982         }
3983     }
3984     return a;
3985 }
3986 
3987 /*----------------------------------------------------------------------------
3988 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3989 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3990 | input.  If `zSign' is 1, the input is negated before being converted to an
3991 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3992 | is simply rounded to an integer, with the inexact exception raised if the
3993 | input cannot be represented exactly as an integer.  However, if the fixed-
3994 | point input is too large, the invalid exception is raised and the largest
3995 | positive or negative integer is returned.
3996 *----------------------------------------------------------------------------*/
3997 
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The 7 bits below the binary point are rounding bits; pick the
     * increment whose addition before the >>7 realizes each mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away from zero only if that leaves the low bit set. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the low bit to break it. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if the magnitude needs more than 32 bits, or if the
     * sign of the truncated result disagrees with the requested sign. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
4045 
4046 /*----------------------------------------------------------------------------
4047 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4048 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4049 | and returns the properly rounded 64-bit integer corresponding to the input.
4050 | If `zSign' is 1, the input is negated before being converted to an integer.
4051 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4052 | the inexact exception raised if the input cannot be represented exactly as
4053 | an integer.  However, if the fixed-point input is too large, the invalid
4054 | exception is raised and the largest positive or negative integer is
4055 | returned.
4056 *----------------------------------------------------------------------------*/
4057 
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                               float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* absZ1 holds everything below the binary point; decide whether
     * the integer part absZ0 must be incremented for this mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Increment when the fraction is >= 1/2 (msb of absZ1 set). */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        /* Increment wrapped to zero: magnitude exceeds 64 bits. */
        if ( absZ0 == 0 ) goto overflow;
        /* Exact tie under nearest-even: clear the low bit. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Sign of the truncated result must agree with the request. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
4107 
4108 /*----------------------------------------------------------------------------
4109 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4110 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4111 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4112 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
4113 | with the inexact exception raised if the input cannot be represented exactly
4114 | as an integer.  However, if the fixed-point input is too large, the invalid
4115 | exception is raised and the largest unsigned integer is returned.
4116 *----------------------------------------------------------------------------*/
4117 
/* NOTE(review): the declared return type is int64_t but the value is
 * logically the unsigned 64-bit result (see the block comment above);
 * callers are expected to reinterpret it — confirm against callers. */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* absZ1 holds everything below the binary point; decide whether
     * the integer part absZ0 must be incremented for this mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Increment when the fraction is >= 1/2 (msb of absZ1 set). */
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Increment wrapped to zero: magnitude exceeds 64 bits. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exact tie under nearest-even: clear the low bit. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* Negative non-zero values cannot be represented unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
4167 
4168 /*----------------------------------------------------------------------------
4169 | Normalizes the subnormal single-precision floating-point value represented
4170 | by the denormalized significand `aSig'.  The normalized exponent and
4171 | significand are stored at the locations pointed to by `zExpPtr' and
4172 | `zSigPtr', respectively.
4173 *----------------------------------------------------------------------------*/
4174 
4175 static void
4176  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
4177 {
4178     int8_t shiftCount;
4179 
4180     shiftCount = clz32(aSig) - 8;
4181     *zSigPtr = aSig<<shiftCount;
4182     *zExpPtr = 1 - shiftCount;
4183 
4184 }
4185 
4186 /*----------------------------------------------------------------------------
4187 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4188 | and significand `zSig', and returns the proper single-precision floating-
4189 | point value corresponding to the abstract input.  Ordinarily, the abstract
4190 | value is simply rounded and packed into the single-precision format, with
4191 | the inexact exception raised if the abstract input cannot be represented
4192 | exactly.  However, if the abstract value is too large, the overflow and
4193 | inexact exceptions are raised and an infinity or maximal finite value is
4194 | returned.  If the abstract value is too small, the input value is rounded to
4195 | a subnormal number, and the underflow and inexact exceptions are raised if
4196 | the abstract input cannot be represented exactly as a subnormal single-
4197 | precision floating-point number.
4198 |     The input significand `zSig' has its binary point between bits 30
4199 | and 29, which is 7 bits to the left of the usual location.  This shifted
4200 | significand must be normalized or smaller.  If `zSig' is not normalized,
4201 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4202 | and it must not require rounding.  In the usual case that `zSig' is
4203 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4204 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4205 | Binary Floating-Point Arithmetic.
4206 *----------------------------------------------------------------------------*/
4207 
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The 7 bits below the fraction are rounding bits; pick the value
     * whose addition before the >>7 realizes the requested mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away from zero only if that leaves the low bit set. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* One unsigned compare catches both overflow (zExp >= 0xFD) and
     * underflow (zExp < 0 wraps to a large uint16_t value). */
    if ( 0xFD <= (uint16_t) zExp ) {
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Modes that never round away from zero (and round-to-odd)
             * deliver the maximal finite value instead of infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess detected either before rounding, or after when
             * rounding would not carry up into the normal range. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the low bit to break it. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4284 
4285 /*----------------------------------------------------------------------------
4286 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4287 | and significand `zSig', and returns the proper single-precision floating-
4288 | point value corresponding to the abstract input.  This routine is just like
4289 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4290 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4291 | floating-point exponent.
4292 *----------------------------------------------------------------------------*/
4293 
4294 static float32
4295  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4296                               float_status *status)
4297 {
4298     int8_t shiftCount;
4299 
4300     shiftCount = clz32(zSig) - 1;
4301     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4302                                status);
4303 
4304 }
4305 
4306 /*----------------------------------------------------------------------------
4307 | Normalizes the subnormal double-precision floating-point value represented
4308 | by the denormalized significand `aSig'.  The normalized exponent and
4309 | significand are stored at the locations pointed to by `zExpPtr' and
4310 | `zSigPtr', respectively.
4311 *----------------------------------------------------------------------------*/
4312 
4313 static void
4314  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4315 {
4316     int8_t shiftCount;
4317 
4318     shiftCount = clz64(aSig) - 11;
4319     *zSigPtr = aSig<<shiftCount;
4320     *zExpPtr = 1 - shiftCount;
4321 
4322 }
4323 
4324 /*----------------------------------------------------------------------------
4325 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4326 | double-precision floating-point value, returning the result.  After being
4327 | shifted into the proper positions, the three fields are simply added
4328 | together to form the result.  This means that any integer portion of `zSig'
4329 | will be added into the exponent.  Since a properly normalized significand
4330 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4331 | than the desired result exponent whenever `zSig' is a complete, normalized
4332 | significand.
4333 *----------------------------------------------------------------------------*/
4334 
4335 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4336 {
4337 
4338     return make_float64(
4339         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4340 
4341 }
4342 
4343 /*----------------------------------------------------------------------------
4344 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4345 | and significand `zSig', and returns the proper double-precision floating-
4346 | point value corresponding to the abstract input.  Ordinarily, the abstract
4347 | value is simply rounded and packed into the double-precision format, with
4348 | the inexact exception raised if the abstract input cannot be represented
4349 | exactly.  However, if the abstract value is too large, the overflow and
4350 | inexact exceptions are raised and an infinity or maximal finite value is
4351 | returned.  If the abstract value is too small, the input value is rounded to
4352 | a subnormal number, and the underflow and inexact exceptions are raised if
4353 | the abstract input cannot be represented exactly as a subnormal double-
4354 | precision floating-point number.
4355 |     The input significand `zSig' has its binary point between bits 62
4356 | and 61, which is 10 bits to the left of the usual location.  This shifted
4357 | significand must be normalized or smaller.  If `zSig' is not normalized,
4358 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4359 | and it must not require rounding.  In the usual case that `zSig' is
4360 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4361 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4362 | Binary Floating-Point Arithmetic.
4363 *----------------------------------------------------------------------------*/
4364 
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The 10 bits below the fraction are rounding bits; pick the value
     * whose addition before the >>10 realizes the requested mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round away from zero only if that leaves the low bit set. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* One unsigned compare catches both overflow (zExp >= 0x7FD) and
     * underflow (zExp < 0 wraps to a large uint16_t value). */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Modes that never round away from zero (and round-to-odd)
             * deliver the maximal finite value instead of infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess detected either before rounding, or after when
             * rounding would not carry up into the normal range. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie under nearest-even: clear the low bit to break it. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
4440 
4441 /*----------------------------------------------------------------------------
4442 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4443 | and significand `zSig', and returns the proper double-precision floating-
4444 | point value corresponding to the abstract input.  This routine is just like
4445 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4446 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4447 | floating-point exponent.
4448 *----------------------------------------------------------------------------*/
4449 
4450 static float64
4451  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4452                               float_status *status)
4453 {
4454     int8_t shiftCount;
4455 
4456     shiftCount = clz64(zSig) - 1;
4457     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4458                                status);
4459 
4460 }
4461 
4462 /*----------------------------------------------------------------------------
4463 | Normalizes the subnormal extended double-precision floating-point value
4464 | represented by the denormalized significand `aSig'.  The normalized exponent
4465 | and significand are stored at the locations pointed to by `zExpPtr' and
4466 | `zSigPtr', respectively.
4467 *----------------------------------------------------------------------------*/
4468 
4469 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
4470                                 uint64_t *zSigPtr)
4471 {
4472     int8_t shiftCount;
4473 
4474     shiftCount = clz64(aSig);
4475     *zSigPtr = aSig<<shiftCount;
4476     *zExpPtr = 1 - shiftCount;
4477 }
4478 
4479 /*----------------------------------------------------------------------------
4480 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4481 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4482 | and returns the proper extended double-precision floating-point value
4483 | corresponding to the abstract input.  Ordinarily, the abstract value is
4484 | rounded and packed into the extended double-precision format, with the
4485 | inexact exception raised if the abstract input cannot be represented
4486 | exactly.  However, if the abstract value is too large, the overflow and
4487 | inexact exceptions are raised and an infinity or maximal finite value is
4488 | returned.  If the abstract value is too small, the input value is rounded to
4489 | a subnormal number, and the underflow and inexact exceptions are raised if
4490 | the abstract input cannot be represented exactly as a subnormal extended
4491 | double-precision floating-point number.
4492 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4493 | number of bits as single or double precision, respectively.  Otherwise, the
4494 | result is rounded to the full precision of the extended double-precision
4495 | format.
4496 |     The input significand must be normalized or smaller.  If the input
4497 | significand is not normalized, `zExp' must be 0; in that case, the result
4498 | returned is a subnormal number, and it must not require rounding.  The
4499 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4500 | Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4502 
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Select round/sticky masks for the requested precision; anything
     * other than 32 or 64 is handled by the full 80-bit path below.
     */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold any bits in zSig1 into the sticky (lowest) bit of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Exponent outside the normal range: overflow or possible underflow. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny before rounding, or after rounding when the increment
             * does not carry the significand out of the subnormal range.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding carried into the integer bit: the result became
             * the smallest normal number (exponent 1).
             */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even: also clear the LSB of the kept bits on an
             * exact halfway case.
             */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of the 64-bit significand: renormalize. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 80-bit precision: zSig1 holds the round/sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Directed rounding away from infinity produces the largest
             * finite number instead of infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Re-evaluate the increment: the right shift above changed
             * the round/sticky word zSig1.
             */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Ties-to-even: clear the LSB on an exact halfway case. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        /* Carry out of the significand: renormalize. */
        if ( zSig0 == 0 ) {
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4690 
4691 /*----------------------------------------------------------------------------
4692 | Takes an abstract floating-point value having sign `zSign', exponent
4693 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4694 | and returns the proper extended double-precision floating-point value
4695 | corresponding to the abstract input.  This routine is just like
4696 | `roundAndPackFloatx80' except that the input significand does not have to be
4697 | normalized.
4698 *----------------------------------------------------------------------------*/
4699 
4700 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4701                                        bool zSign, int32_t zExp,
4702                                        uint64_t zSig0, uint64_t zSig1,
4703                                        float_status *status)
4704 {
4705     int8_t shiftCount;
4706 
4707     if ( zSig0 == 0 ) {
4708         zSig0 = zSig1;
4709         zSig1 = 0;
4710         zExp -= 64;
4711     }
4712     shiftCount = clz64(zSig0);
4713     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4714     zExp -= shiftCount;
4715     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4716                                 zSig0, zSig1, status);
4717 
4718 }
4719 
4720 /*----------------------------------------------------------------------------
4721 | Returns the least-significant 64 fraction bits of the quadruple-precision
4722 | floating-point value `a'.
4723 *----------------------------------------------------------------------------*/
4724 
4725 static inline uint64_t extractFloat128Frac1( float128 a )
4726 {
4727 
4728     return a.low;
4729 
4730 }
4731 
4732 /*----------------------------------------------------------------------------
4733 | Returns the most-significant 48 fraction bits of the quadruple-precision
4734 | floating-point value `a'.
4735 *----------------------------------------------------------------------------*/
4736 
4737 static inline uint64_t extractFloat128Frac0( float128 a )
4738 {
4739 
4740     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4741 
4742 }
4743 
4744 /*----------------------------------------------------------------------------
4745 | Returns the exponent bits of the quadruple-precision floating-point value
4746 | `a'.
4747 *----------------------------------------------------------------------------*/
4748 
4749 static inline int32_t extractFloat128Exp( float128 a )
4750 {
4751 
4752     return ( a.high>>48 ) & 0x7FFF;
4753 
4754 }
4755 
4756 /*----------------------------------------------------------------------------
4757 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4758 *----------------------------------------------------------------------------*/
4759 
4760 static inline bool extractFloat128Sign(float128 a)
4761 {
4762     return a.high >> 63;
4763 }
4764 
4765 /*----------------------------------------------------------------------------
4766 | Normalizes the subnormal quadruple-precision floating-point value
4767 | represented by the denormalized significand formed by the concatenation of
4768 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4769 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4770 | significand are stored at the location pointed to by `zSig0Ptr', and the
4771 | least significant 64 bits of the normalized significand are stored at the
4772 | location pointed to by `zSig1Ptr'.
4773 *----------------------------------------------------------------------------*/
4774 
static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr, uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t shiftCount;

    if (aSig0 != 0) {
        /* The high word already contains significand bits: shift so the
         * leading bit lands in the integer-bit position (bit 48).
         */
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shiftCount;
    } else {
        /* All significand bits are in the low word. */
        shiftCount = clz64(aSig1) - 15;
        if (shiftCount < 0) {
            /* The leading bit is above bit 48 of the low word: the
             * significand straddles both output words.
             */
            *zSig0Ptr = aSig1 >> (-shiftCount);
            *zSig1Ptr = aSig1 << (shiftCount & 63);
        } else {
            *zSig0Ptr = aSig1 << shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = -shiftCount - 63;
    }
}
4805 
4806 /*----------------------------------------------------------------------------
4807 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4808 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4809 | floating-point value, returning the result.  After being shifted into the
4810 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4811 | added together to form the most significant 32 bits of the result.  This
4812 | means that any integer portion of `zSig0' will be added into the exponent.
4813 | Since a properly normalized significand will have an integer portion equal
4814 | to 1, the `zExp' input should be 1 less than the desired result exponent
4815 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4816 | significand.
4817 *----------------------------------------------------------------------------*/
4818 
4819 static inline float128
4820 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4821 {
4822     float128 z;
4823 
4824     z.low = zSig1;
4825     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4826     return z;
4827 }
4828 
4829 /*----------------------------------------------------------------------------
4830 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4831 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4832 | and `zSig2', and returns the proper quadruple-precision floating-point value
4833 | corresponding to the abstract input.  Ordinarily, the abstract value is
4834 | simply rounded and packed into the quadruple-precision format, with the
4835 | inexact exception raised if the abstract input cannot be represented
4836 | exactly.  However, if the abstract value is too large, the overflow and
4837 | inexact exceptions are raised and an infinity or maximal finite value is
4838 | returned.  If the abstract value is too small, the input value is rounded to
4839 | a subnormal number, and the underflow and inexact exceptions are raised if
4840 | the abstract input cannot be represented exactly as a subnormal quadruple-
4841 | precision floating-point number.
4842 |     The input significand must be normalized or smaller.  If the input
4843 | significand is not normalized, `zExp' must be 0; in that case, the result
4844 | returned is a subnormal number, and it must not require rounding.  In the
4845 | usual case that the input significand is normalized, `zExp' must be 1 less
4846 | than the ``true'' floating-point exponent.  The handling of underflow and
4847 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4848 *----------------------------------------------------------------------------*/
4849 
static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Decide whether the round/sticky word zSig2 forces an increment of
     * the 128-bit significand under the current rounding mode.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Exponent at or beyond the representable range: overflow or
     * possible underflow handling.
     */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Directed rounding away from infinity (and round-to-odd)
             * produces the largest finite number instead of infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny before rounding, or after rounding when the increment
             * cannot carry the value up to the smallest normal number.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Re-evaluate the increment: the right shift above changed
             * the round/sticky word zSig2.
             */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Ties-to-even: clear the LSB on an exact halfway case. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4961 
4962 /*----------------------------------------------------------------------------
4963 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4964 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4965 | returns the proper quadruple-precision floating-point value corresponding
4966 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4967 | except that the input significand has fewer bits and does not have to be
4968 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4969 | point exponent.
4970 *----------------------------------------------------------------------------*/
4971 
4972 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4973                                               uint64_t zSig0, uint64_t zSig1,
4974                                               float_status *status)
4975 {
4976     int8_t shiftCount;
4977     uint64_t zSig2;
4978 
4979     if ( zSig0 == 0 ) {
4980         zSig0 = zSig1;
4981         zSig1 = 0;
4982         zExp -= 64;
4983     }
4984     shiftCount = clz64(zSig0) - 15;
4985     if ( 0 <= shiftCount ) {
4986         zSig2 = 0;
4987         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4988     }
4989     else {
4990         shift128ExtraRightJamming(
4991             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4992     }
4993     zExp -= shiftCount;
4994     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4995 
4996 }
4997 
4998 
4999 /*----------------------------------------------------------------------------
5000 | Returns the result of converting the 32-bit two's complement integer `a'
5001 | to the extended double-precision floating-point format.  The conversion
5002 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5003 | Arithmetic.
5004 *----------------------------------------------------------------------------*/
5005 
5006 floatx80 int32_to_floatx80(int32_t a, float_status *status)
5007 {
5008     bool zSign;
5009     uint32_t absA;
5010     int8_t shiftCount;
5011     uint64_t zSig;
5012 
5013     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5014     zSign = ( a < 0 );
5015     absA = zSign ? - a : a;
5016     shiftCount = clz32(absA) + 32;
5017     zSig = absA;
5018     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
5019 
5020 }
5021 
5022 /*----------------------------------------------------------------------------
5023 | Returns the result of converting the 32-bit two's complement integer `a' to
5024 | the quadruple-precision floating-point format.  The conversion is performed
5025 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5026 *----------------------------------------------------------------------------*/
5027 
5028 float128 int32_to_float128(int32_t a, float_status *status)
5029 {
5030     bool zSign;
5031     uint32_t absA;
5032     int8_t shiftCount;
5033     uint64_t zSig0;
5034 
5035     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5036     zSign = ( a < 0 );
5037     absA = zSign ? - a : a;
5038     shiftCount = clz32(absA) + 17;
5039     zSig0 = absA;
5040     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
5041 
5042 }
5043 
5044 /*----------------------------------------------------------------------------
5045 | Returns the result of converting the 64-bit two's complement integer `a'
5046 | to the extended double-precision floating-point format.  The conversion
5047 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5048 | Arithmetic.
5049 *----------------------------------------------------------------------------*/
5050 
5051 floatx80 int64_to_floatx80(int64_t a, float_status *status)
5052 {
5053     bool zSign;
5054     uint64_t absA;
5055     int8_t shiftCount;
5056 
5057     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5058     zSign = ( a < 0 );
5059     absA = zSign ? - a : a;
5060     shiftCount = clz64(absA);
5061     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
5062 
5063 }
5064 
5065 /*----------------------------------------------------------------------------
5066 | Returns the result of converting the 64-bit two's complement integer `a' to
5067 | the quadruple-precision floating-point format.  The conversion is performed
5068 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5069 *----------------------------------------------------------------------------*/
5070 
5071 float128 int64_to_float128(int64_t a, float_status *status)
5072 {
5073     bool zSign;
5074     uint64_t absA;
5075     int8_t shiftCount;
5076     int32_t zExp;
5077     uint64_t zSig0, zSig1;
5078 
5079     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5080     zSign = ( a < 0 );
5081     absA = zSign ? - a : a;
5082     shiftCount = clz64(absA) + 49;
5083     zExp = 0x406E - shiftCount;
5084     if ( 64 <= shiftCount ) {
5085         zSig1 = 0;
5086         zSig0 = absA;
5087         shiftCount -= 64;
5088     }
5089     else {
5090         zSig1 = absA;
5091         zSig0 = 0;
5092     }
5093     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5094     return packFloat128( zSign, zExp, zSig0, zSig1 );
5095 
5096 }
5097 
5098 /*----------------------------------------------------------------------------
5099 | Returns the result of converting the 64-bit unsigned integer `a'
5100 | to the quadruple-precision floating-point format.  The conversion is performed
5101 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5102 *----------------------------------------------------------------------------*/
5103 
5104 float128 uint64_to_float128(uint64_t a, float_status *status)
5105 {
5106     if (a == 0) {
5107         return float128_zero;
5108     }
5109     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5110 }
5111 
5112 /*----------------------------------------------------------------------------
5113 | Returns the result of converting the single-precision floating-point value
5114 | `a' to the extended double-precision floating-point format.  The conversion
5115 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5116 | Arithmetic.
5117 *----------------------------------------------------------------------------*/
5118 
5119 floatx80 float32_to_floatx80(float32 a, float_status *status)
5120 {
5121     bool aSign;
5122     int aExp;
5123     uint32_t aSig;
5124 
5125     a = float32_squash_input_denormal(a, status);
5126     aSig = extractFloat32Frac( a );
5127     aExp = extractFloat32Exp( a );
5128     aSign = extractFloat32Sign( a );
5129     if ( aExp == 0xFF ) {
5130         if (aSig) {
5131             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
5132                                                status);
5133             return floatx80_silence_nan(res, status);
5134         }
5135         return packFloatx80(aSign,
5136                             floatx80_infinity_high,
5137                             floatx80_infinity_low);
5138     }
5139     if ( aExp == 0 ) {
5140         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5141         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5142     }
5143     aSig |= 0x00800000;
5144     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
5145 
5146 }
5147 
5148 /*----------------------------------------------------------------------------
5149 | Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
5151 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5152 | Arithmetic.
5153 *----------------------------------------------------------------------------*/
5154 
5155 float128 float32_to_float128(float32 a, float_status *status)
5156 {
5157     bool aSign;
5158     int aExp;
5159     uint32_t aSig;
5160 
5161     a = float32_squash_input_denormal(a, status);
5162     aSig = extractFloat32Frac( a );
5163     aExp = extractFloat32Exp( a );
5164     aSign = extractFloat32Sign( a );
5165     if ( aExp == 0xFF ) {
5166         if (aSig) {
5167             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
5168         }
5169         return packFloat128( aSign, 0x7FFF, 0, 0 );
5170     }
5171     if ( aExp == 0 ) {
5172         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5173         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5174         --aExp;
5175     }
5176     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
5177 
5178 }
5179 
5180 /*----------------------------------------------------------------------------
5181 | Returns the remainder of the single-precision floating-point value `a'
5182 | with respect to the corresponding value `b'.  The operation is performed
5183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5184 *----------------------------------------------------------------------------*/
5185 
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* rem(Inf, y) is invalid; NaN operands propagate. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* rem(x, Inf) returns x unchanged; rem(x, NaN) propagates. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    /* rem(x, 0) is invalid; subnormal operands are normalized first. */
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit in both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: a single division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* Exponent of a is at least 2 below b's: a is the result. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce iteratively, 62 quotient
         * bits per step, using an estimated 128/64-bit division.
         */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Underestimate the quotient so the partial remainder never
             * goes negative.
             */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past the true quotient, then choose between the last two
     * candidate remainders: the one with the smaller magnitude wins,
     * with exact ties broken toward even q.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5281 
5282 
5283 
5284 /*----------------------------------------------------------------------------
5285 | Returns the binary exponential of the single-precision floating-point value
5286 | `a'. The operation is performed according to the IEC/IEEE Standard for
5287 | Binary Floating-Point Arithmetic.
5288 |
5289 | Uses the following identities:
5290 |
5291 | 1. -------------------------------------------------------------------------
5292 |      x    x*ln(2)
5293 |     2  = e
5294 |
5295 | 2. -------------------------------------------------------------------------
5296 |                      2     3     4     5           n
5297 |      x        x     x     x     x     x           x
5298 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5299 |               1!    2!    3!    4!    5!          n!
5300 *----------------------------------------------------------------------------*/
5301 
/* Taylor coefficients 1/n! for n = 1..15, as IEEE double bit patterns. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1/1!  */
    const_float64( 0x3fe0000000000000ll ), /* 1/2!  */
    const_float64( 0x3fc5555555555555ll ), /* 1/3!  */
    const_float64( 0x3fa5555555555555ll ), /* 1/4!  */
    const_float64( 0x3f81111111111111ll ), /* 1/5!  */
    const_float64( 0x3f56c16c16c16c17ll ), /* 1/6!  */
    const_float64( 0x3f2a01a01a01a01all ), /* 1/7!  */
    const_float64( 0x3efa01a01a01a01all ), /* 1/8!  */
    const_float64( 0x3ec71de3a556c734ll ), /* 1/9!  */
    const_float64( 0x3e927e4fb7789f5cll ), /* 1/10! */
    const_float64( 0x3e5ae64567f544e4ll ), /* 1/11! */
    const_float64( 0x3e21eed8eff8d898ll ), /* 1/12! */
    const_float64( 0x3de6124613a86d09ll ), /* 1/13! */
    const_float64( 0x3da93974a8c07c9dll ), /* 1/14! */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 1/15! */
};
5320 
5321 float32 float32_exp2(float32 a, float_status *status)
5322 {
5323     bool aSign;
5324     int aExp;
5325     uint32_t aSig;
5326     float64 r, x, xn;
5327     int i;
5328     a = float32_squash_input_denormal(a, status);
5329 
5330     aSig = extractFloat32Frac( a );
5331     aExp = extractFloat32Exp( a );
5332     aSign = extractFloat32Sign( a );
5333 
5334     if ( aExp == 0xFF) {
5335         if (aSig) {
5336             return propagateFloat32NaN(a, float32_zero, status);
5337         }
5338         return (aSign) ? float32_zero : a;
5339     }
5340     if (aExp == 0) {
5341         if (aSig == 0) return float32_one;
5342     }
5343 
5344     float_raise(float_flag_inexact, status);
5345 
5346     /* ******************************* */
5347     /* using float64 for approximation */
5348     /* ******************************* */
5349     x = float32_to_float64(a, status);
5350     x = float64_mul(x, float64_ln2, status);
5351 
5352     xn = x;
5353     r = float64_one;
5354     for (i = 0 ; i < 15 ; i++) {
5355         float64 f;
5356 
5357         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5358         r = float64_add(r, f, status);
5359 
5360         xn = float64_mul(xn, x, status);
5361     }
5362 
5363     return float64_to_float32(r, status);
5364 }
5365 
5366 /*----------------------------------------------------------------------------
5367 | Returns the binary log of the single-precision floating-point value `a'.
5368 | The operation is performed according to the IEC/IEEE Standard for Binary
5369 | Floating-Point Arithmetic.
5370 *----------------------------------------------------------------------------*/
5371 float32 float32_log2(float32 a, float_status *status)
5372 {
5373     bool aSign, zSign;
5374     int aExp;
5375     uint32_t aSig, zSig, i;
5376 
5377     a = float32_squash_input_denormal(a, status);
5378     aSig = extractFloat32Frac( a );
5379     aExp = extractFloat32Exp( a );
5380     aSign = extractFloat32Sign( a );
5381 
5382     if ( aExp == 0 ) {
5383         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5384         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5385     }
5386     if ( aSign ) {
5387         float_raise(float_flag_invalid, status);
5388         return float32_default_nan(status);
5389     }
5390     if ( aExp == 0xFF ) {
5391         if (aSig) {
5392             return propagateFloat32NaN(a, float32_zero, status);
5393         }
5394         return a;
5395     }
5396 
5397     aExp -= 0x7F;
5398     aSig |= 0x00800000;
5399     zSign = aExp < 0;
5400     zSig = aExp << 23;
5401 
5402     for (i = 1 << 22; i > 0; i >>= 1) {
5403         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5404         if ( aSig & 0x01000000 ) {
5405             aSig >>= 1;
5406             zSig |= i;
5407         }
5408     }
5409 
5410     if ( zSign )
5411         zSig = -zSig;
5412 
5413     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5414 }
5415 
5416 /*----------------------------------------------------------------------------
5417 | Returns the result of converting the double-precision floating-point value
5418 | `a' to the extended double-precision floating-point format.  The conversion
5419 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5420 | Arithmetic.
5421 *----------------------------------------------------------------------------*/
5422 
5423 floatx80 float64_to_floatx80(float64 a, float_status *status)
5424 {
5425     bool aSign;
5426     int aExp;
5427     uint64_t aSig;
5428 
5429     a = float64_squash_input_denormal(a, status);
5430     aSig = extractFloat64Frac( a );
5431     aExp = extractFloat64Exp( a );
5432     aSign = extractFloat64Sign( a );
5433     if ( aExp == 0x7FF ) {
5434         if (aSig) {
5435             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5436                                                status);
5437             return floatx80_silence_nan(res, status);
5438         }
5439         return packFloatx80(aSign,
5440                             floatx80_infinity_high,
5441                             floatx80_infinity_low);
5442     }
5443     if ( aExp == 0 ) {
5444         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5445         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5446     }
5447     return
5448         packFloatx80(
5449             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5450 
5451 }
5452 
5453 /*----------------------------------------------------------------------------
5454 | Returns the result of converting the double-precision floating-point value
5455 | `a' to the quadruple-precision floating-point format.  The conversion is
5456 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5457 | Arithmetic.
5458 *----------------------------------------------------------------------------*/
5459 
5460 float128 float64_to_float128(float64 a, float_status *status)
5461 {
5462     bool aSign;
5463     int aExp;
5464     uint64_t aSig, zSig0, zSig1;
5465 
5466     a = float64_squash_input_denormal(a, status);
5467     aSig = extractFloat64Frac( a );
5468     aExp = extractFloat64Exp( a );
5469     aSign = extractFloat64Sign( a );
5470     if ( aExp == 0x7FF ) {
5471         if (aSig) {
5472             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5473         }
5474         return packFloat128( aSign, 0x7FFF, 0, 0 );
5475     }
5476     if ( aExp == 0 ) {
5477         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5478         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5479         --aExp;
5480     }
5481     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5482     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5483 
5484 }
5485 
5486 
5487 /*----------------------------------------------------------------------------
5488 | Returns the remainder of the double-precision floating-point value `a'
5489 | with respect to the corresponding value `b'.  The operation is performed
5490 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5491 *----------------------------------------------------------------------------*/
5492 
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* inf REM x and NaN operands: NaN propagates; inf REM anything is
     * invalid. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* x REM inf = x (for finite x). */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    /* x REM 0 is invalid; normalize a subnormal divisor. */
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the integer bit explicit and left-justify both significands. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: a is already the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce a by multiples of b, producing up to 62 quotient bits per
     * iteration.  The quotient estimate is lowered by 2 so it never
     * exceeds the true quotient; aSig may go "negative" (wrap) and is
     * treated as a signed residue below. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    /* Final partial reduction for the remaining 0..64 exponent bits. */
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step past the exact quotient; the loop exits with aSig one
     * subtraction "too far" (negative as int64_t). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* Pick whichever of the two candidate remainders is nearer to zero,
     * breaking ties toward an even quotient (round-to-nearest-even). */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5574 
5575 /*----------------------------------------------------------------------------
5576 | Returns the binary log of the double-precision floating-point value `a'.
5577 | The operation is performed according to the IEC/IEEE Standard for Binary
5578 | Floating-Point Arithmetic.
5579 *----------------------------------------------------------------------------*/
5580 float64 float64_log2(float64 a, float_status *status)
5581 {
5582     bool aSign, zSign;
5583     int aExp;
5584     uint64_t aSig, aSig0, aSig1, zSig, i;
5585     a = float64_squash_input_denormal(a, status);
5586 
5587     aSig = extractFloat64Frac( a );
5588     aExp = extractFloat64Exp( a );
5589     aSign = extractFloat64Sign( a );
5590 
5591     if ( aExp == 0 ) {
5592         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5593         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5594     }
5595     if ( aSign ) {
5596         float_raise(float_flag_invalid, status);
5597         return float64_default_nan(status);
5598     }
5599     if ( aExp == 0x7FF ) {
5600         if (aSig) {
5601             return propagateFloat64NaN(a, float64_zero, status);
5602         }
5603         return a;
5604     }
5605 
5606     aExp -= 0x3FF;
5607     aSig |= UINT64_C(0x0010000000000000);
5608     zSign = aExp < 0;
5609     zSig = (uint64_t)aExp << 52;
5610     for (i = 1LL << 51; i > 0; i >>= 1) {
5611         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5612         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5613         if ( aSig & UINT64_C(0x0020000000000000) ) {
5614             aSig >>= 1;
5615             zSig |= i;
5616         }
5617     }
5618 
5619     if ( zSign )
5620         zSig = -zSig;
5621     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5622 }
5623 
5624 /*----------------------------------------------------------------------------
5625 | Returns the result of converting the extended double-precision floating-
5626 | point value `a' to the 32-bit two's complement integer format.  The
5627 | conversion is performed according to the IEC/IEEE Standard for Binary
5628 | Floating-Point Arithmetic---which means in particular that the conversion
5629 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5630 | largest positive integer is returned.  Otherwise, if the conversion
5631 | overflows, the largest integer with the same sign as `a' is returned.
5632 *----------------------------------------------------------------------------*/
5633 
5634 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5635 {
5636     bool aSign;
5637     int32_t aExp, shiftCount;
5638     uint64_t aSig;
5639 
5640     if (floatx80_invalid_encoding(a)) {
5641         float_raise(float_flag_invalid, status);
5642         return 1 << 31;
5643     }
5644     aSig = extractFloatx80Frac( a );
5645     aExp = extractFloatx80Exp( a );
5646     aSign = extractFloatx80Sign( a );
5647     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5648     shiftCount = 0x4037 - aExp;
5649     if ( shiftCount <= 0 ) shiftCount = 1;
5650     shift64RightJamming( aSig, shiftCount, &aSig );
5651     return roundAndPackInt32(aSign, aSig, status);
5652 
5653 }
5654 
5655 /*----------------------------------------------------------------------------
5656 | Returns the result of converting the extended double-precision floating-
5657 | point value `a' to the 32-bit two's complement integer format.  The
5658 | conversion is performed according to the IEC/IEEE Standard for Binary
5659 | Floating-Point Arithmetic, except that the conversion is always rounded
5660 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5661 | Otherwise, if the conversion overflows, the largest integer with the same
5662 | sign as `a' is returned.
5663 *----------------------------------------------------------------------------*/
5664 
5665 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5666 {
5667     bool aSign;
5668     int32_t aExp, shiftCount;
5669     uint64_t aSig, savedASig;
5670     int32_t z;
5671 
5672     if (floatx80_invalid_encoding(a)) {
5673         float_raise(float_flag_invalid, status);
5674         return 1 << 31;
5675     }
5676     aSig = extractFloatx80Frac( a );
5677     aExp = extractFloatx80Exp( a );
5678     aSign = extractFloatx80Sign( a );
5679     if ( 0x401E < aExp ) {
5680         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5681         goto invalid;
5682     }
5683     else if ( aExp < 0x3FFF ) {
5684         if (aExp || aSig) {
5685             float_raise(float_flag_inexact, status);
5686         }
5687         return 0;
5688     }
5689     shiftCount = 0x403E - aExp;
5690     savedASig = aSig;
5691     aSig >>= shiftCount;
5692     z = aSig;
5693     if ( aSign ) z = - z;
5694     if ( ( z < 0 ) ^ aSign ) {
5695  invalid:
5696         float_raise(float_flag_invalid, status);
5697         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5698     }
5699     if ( ( aSig<<shiftCount ) != savedASig ) {
5700         float_raise(float_flag_inexact, status);
5701     }
5702     return z;
5703 
5704 }
5705 
5706 /*----------------------------------------------------------------------------
5707 | Returns the result of converting the extended double-precision floating-
5708 | point value `a' to the 64-bit two's complement integer format.  The
5709 | conversion is performed according to the IEC/IEEE Standard for Binary
5710 | Floating-Point Arithmetic---which means in particular that the conversion
5711 | is rounded according to the current rounding mode.  If `a' is a NaN,
5712 | the largest positive integer is returned.  Otherwise, if the conversion
5713 | overflows, the largest integer with the same sign as `a' is returned.
5714 *----------------------------------------------------------------------------*/
5715 
5716 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5717 {
5718     bool aSign;
5719     int32_t aExp, shiftCount;
5720     uint64_t aSig, aSigExtra;
5721 
5722     if (floatx80_invalid_encoding(a)) {
5723         float_raise(float_flag_invalid, status);
5724         return 1ULL << 63;
5725     }
5726     aSig = extractFloatx80Frac( a );
5727     aExp = extractFloatx80Exp( a );
5728     aSign = extractFloatx80Sign( a );
5729     shiftCount = 0x403E - aExp;
5730     if ( shiftCount <= 0 ) {
5731         if ( shiftCount ) {
5732             float_raise(float_flag_invalid, status);
5733             if (!aSign || floatx80_is_any_nan(a)) {
5734                 return INT64_MAX;
5735             }
5736             return INT64_MIN;
5737         }
5738         aSigExtra = 0;
5739     }
5740     else {
5741         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5742     }
5743     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5744 
5745 }
5746 
5747 /*----------------------------------------------------------------------------
5748 | Returns the result of converting the extended double-precision floating-
5749 | point value `a' to the 64-bit two's complement integer format.  The
5750 | conversion is performed according to the IEC/IEEE Standard for Binary
5751 | Floating-Point Arithmetic, except that the conversion is always rounded
5752 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5753 | Otherwise, if the conversion overflows, the largest integer with the same
5754 | sign as `a' is returned.
5755 *----------------------------------------------------------------------------*/
5756 
5757 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5758 {
5759     bool aSign;
5760     int32_t aExp, shiftCount;
5761     uint64_t aSig;
5762     int64_t z;
5763 
5764     if (floatx80_invalid_encoding(a)) {
5765         float_raise(float_flag_invalid, status);
5766         return 1ULL << 63;
5767     }
5768     aSig = extractFloatx80Frac( a );
5769     aExp = extractFloatx80Exp( a );
5770     aSign = extractFloatx80Sign( a );
5771     shiftCount = aExp - 0x403E;
5772     if ( 0 <= shiftCount ) {
5773         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5774         if ( ( a.high != 0xC03E ) || aSig ) {
5775             float_raise(float_flag_invalid, status);
5776             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5777                 return INT64_MAX;
5778             }
5779         }
5780         return INT64_MIN;
5781     }
5782     else if ( aExp < 0x3FFF ) {
5783         if (aExp | aSig) {
5784             float_raise(float_flag_inexact, status);
5785         }
5786         return 0;
5787     }
5788     z = aSig>>( - shiftCount );
5789     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5790         float_raise(float_flag_inexact, status);
5791     }
5792     if ( aSign ) z = - z;
5793     return z;
5794 
5795 }
5796 
5797 /*----------------------------------------------------------------------------
5798 | Returns the result of converting the extended double-precision floating-
5799 | point value `a' to the single-precision floating-point format.  The
5800 | conversion is performed according to the IEC/IEEE Standard for Binary
5801 | Floating-Point Arithmetic.
5802 *----------------------------------------------------------------------------*/
5803 
5804 float32 floatx80_to_float32(floatx80 a, float_status *status)
5805 {
5806     bool aSign;
5807     int32_t aExp;
5808     uint64_t aSig;
5809 
5810     if (floatx80_invalid_encoding(a)) {
5811         float_raise(float_flag_invalid, status);
5812         return float32_default_nan(status);
5813     }
5814     aSig = extractFloatx80Frac( a );
5815     aExp = extractFloatx80Exp( a );
5816     aSign = extractFloatx80Sign( a );
5817     if ( aExp == 0x7FFF ) {
5818         if ( (uint64_t) ( aSig<<1 ) ) {
5819             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5820                                              status);
5821             return float32_silence_nan(res, status);
5822         }
5823         return packFloat32( aSign, 0xFF, 0 );
5824     }
5825     shift64RightJamming( aSig, 33, &aSig );
5826     if ( aExp || aSig ) aExp -= 0x3F81;
5827     return roundAndPackFloat32(aSign, aExp, aSig, status);
5828 
5829 }
5830 
5831 /*----------------------------------------------------------------------------
5832 | Returns the result of converting the extended double-precision floating-
5833 | point value `a' to the double-precision floating-point format.  The
5834 | conversion is performed according to the IEC/IEEE Standard for Binary
5835 | Floating-Point Arithmetic.
5836 *----------------------------------------------------------------------------*/
5837 
5838 float64 floatx80_to_float64(floatx80 a, float_status *status)
5839 {
5840     bool aSign;
5841     int32_t aExp;
5842     uint64_t aSig, zSig;
5843 
5844     if (floatx80_invalid_encoding(a)) {
5845         float_raise(float_flag_invalid, status);
5846         return float64_default_nan(status);
5847     }
5848     aSig = extractFloatx80Frac( a );
5849     aExp = extractFloatx80Exp( a );
5850     aSign = extractFloatx80Sign( a );
5851     if ( aExp == 0x7FFF ) {
5852         if ( (uint64_t) ( aSig<<1 ) ) {
5853             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5854                                              status);
5855             return float64_silence_nan(res, status);
5856         }
5857         return packFloat64( aSign, 0x7FF, 0 );
5858     }
5859     shift64RightJamming( aSig, 1, &zSig );
5860     if ( aExp || aSig ) aExp -= 0x3C01;
5861     return roundAndPackFloat64(aSign, aExp, zSig, status);
5862 
5863 }
5864 
5865 /*----------------------------------------------------------------------------
5866 | Returns the result of converting the extended double-precision floating-
5867 | point value `a' to the quadruple-precision floating-point format.  The
5868 | conversion is performed according to the IEC/IEEE Standard for Binary
5869 | Floating-Point Arithmetic.
5870 *----------------------------------------------------------------------------*/
5871 
5872 float128 floatx80_to_float128(floatx80 a, float_status *status)
5873 {
5874     bool aSign;
5875     int aExp;
5876     uint64_t aSig, zSig0, zSig1;
5877 
5878     if (floatx80_invalid_encoding(a)) {
5879         float_raise(float_flag_invalid, status);
5880         return float128_default_nan(status);
5881     }
5882     aSig = extractFloatx80Frac( a );
5883     aExp = extractFloatx80Exp( a );
5884     aSign = extractFloatx80Sign( a );
5885     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5886         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5887                                            status);
5888         return float128_silence_nan(res, status);
5889     }
5890     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5891     return packFloat128( aSign, aExp, zSig0, zSig1 );
5892 
5893 }
5894 
5895 /*----------------------------------------------------------------------------
5896 | Rounds the extended double-precision floating-point value `a'
5897 | to the precision provided by floatx80_rounding_precision and returns the
5898 | result as an extended double-precision floating-point value.
5899 | The operation is performed according to the IEC/IEEE Standard for Binary
5900 | Floating-Point Arithmetic.
5901 *----------------------------------------------------------------------------*/
5902 
5903 floatx80 floatx80_round(floatx80 a, float_status *status)
5904 {
5905     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5906                                 extractFloatx80Sign(a),
5907                                 extractFloatx80Exp(a),
5908                                 extractFloatx80Frac(a), 0, status);
5909 }
5910 
5911 /*----------------------------------------------------------------------------
5912 | Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
5914 | value.  The operation is performed according to the IEC/IEEE Standard for
5915 | Binary Floating-Point Arithmetic.
5916 *----------------------------------------------------------------------------*/
5917 
floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    /* Exponent >= 63: all 64 significand bits are integer bits, so the
     * value is already integral (or a NaN to propagate). */
    if ( 0x403E <= aExp ) {
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    /* |a| < 1: the result is +/-0 or +/-1 depending on rounding mode;
     * inexact unless a is exactly zero. */
    if ( aExp < 0x3FFF ) {
        if (    ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
         case float_round_nearest_even:
            /* Rounds to +/-1 only for 0.5 < |a| < 1 (exactly 0.5 has a
             * zero fraction below the integer bit and ties to even 0). */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* 0.5 <= |a| < 1 rounds away to +/-1. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
         case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
         case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: lastBitMask selects the integer LSB within the
     * significand; roundBitsMask covers the fraction bits below it. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        /* Exact tie (fraction was exactly half): clear the LSB to even. */
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    /* A carry out of the significand leaves it zero: renormalize to 1.0
     * and bump the exponent (low bits of z.high). */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6013 
6014 /*----------------------------------------------------------------------------
6015 | Returns the result of adding the absolute values of the extended double-
6016 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
6017 | negated before being returned.  `zSign' is ignored if the result is a NaN.
6018 | The addition is performed according to the IEC/IEEE Standard for Binary
6019 | Floating-Point Arithmetic.
6020 *----------------------------------------------------------------------------*/
6021 
static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align `b's significand to it. */
        if ( aExp == 0x7FFF ) {
            /* `a' is a NaN or infinity. */
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* bExp == 0 encodes an effective exponent of 1 (denormal), so
         * the alignment shift is one less. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: align `a's significand to it. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit still set means no carry out of the explicit integer bit:
     * the sum is already normalized. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* The sum carried out of the integer bit: halve it (keeping sticky
     * bits), restore the integer bit and bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6093 
6094 /*----------------------------------------------------------------------------
6095 | Returns the result of subtracting the absolute values of the extended
6096 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
6097 | difference is negated before being returned.  `zSign' is ignored if the
6098 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6099 | Standard for Binary Floating-Point Arithmetic.
6100 *----------------------------------------------------------------------------*/
6101 
static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* inf - inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* A zero exponent encodes an effective exponent of 1. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact zero result: negative only when rounding down. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* Align `a's significand to `b's exponent (denormal shift is one
     * less), keeping shifted-out bits in zSig1. */
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* The difference takes `b's sign. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
6162 
6163 /*----------------------------------------------------------------------------
6164 | Returns the result of adding the extended double-precision floating-point
6165 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6166 | Standard for Binary Floating-Point Arithmetic.
6167 *----------------------------------------------------------------------------*/
6168 
6169 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6170 {
6171     bool aSign, bSign;
6172 
6173     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6174         float_raise(float_flag_invalid, status);
6175         return floatx80_default_nan(status);
6176     }
6177     aSign = extractFloatx80Sign( a );
6178     bSign = extractFloatx80Sign( b );
6179     if ( aSign == bSign ) {
6180         return addFloatx80Sigs(a, b, aSign, status);
6181     }
6182     else {
6183         return subFloatx80Sigs(a, b, aSign, status);
6184     }
6185 
6186 }
6187 
6188 /*----------------------------------------------------------------------------
6189 | Returns the result of subtracting the extended double-precision floating-
6190 | point values `a' and `b'.  The operation is performed according to the
6191 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6192 *----------------------------------------------------------------------------*/
6193 
6194 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6195 {
6196     bool aSign, bSign;
6197 
6198     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6199         float_raise(float_flag_invalid, status);
6200         return floatx80_default_nan(status);
6201     }
6202     aSign = extractFloatx80Sign( a );
6203     bSign = extractFloatx80Sign( b );
6204     if ( aSign == bSign ) {
6205         return subFloatx80Sigs(a, b, aSign, status);
6206     }
6207     else {
6208         return addFloatx80Sigs(a, b, aSign, status);
6209     }
6210 
6211 }
6212 
6213 /*----------------------------------------------------------------------------
6214 | Returns the result of multiplying the extended double-precision floating-
6215 | point values `a' and `b'.  The operation is performed according to the
6216 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6217 *----------------------------------------------------------------------------*/
6218 
floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    /* Reject operands with invalid 80-bit encodings. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    /* 'a' is NaN or infinity. */
    if ( aExp == 0x7FFF ) {
        if (    (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf * 0 is invalid; Inf * anything-else is infinity. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* 'b' is NaN or infinity. */
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* Zeros short-circuit; subnormals are normalized before multiplying. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* Full 128-bit product of the two 64-bit significands. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* If the product's top bit is clear, shift left once to normalize. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6274 
6275 /*----------------------------------------------------------------------------
6276 | Returns the result of dividing the extended double-precision floating-point
6277 | value `a' by the corresponding value `b'.  The operation is performed
6278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6279 *----------------------------------------------------------------------------*/
6280 
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Reject operands with invalid 80-bit encodings. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    /* 'a' is NaN or infinity. */
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* 'b' is NaN or infinity; finite / Inf is a signed zero. */
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; nonzero / 0 raises divide-by-zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                       floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Keep the partial dividend below bSig so each quotient digit fits. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: estimate, then correct downward while the
     * remainder is negative (the estimate can be slightly high). */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word.  Only refine it (and compute the sticky bit)
     * when the estimate is close enough to a rounding boundary to matter. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any nonzero final remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6361 
6362 /*----------------------------------------------------------------------------
6363 | Returns the remainder of the extended double-precision floating-point value
6364 | `a' with respect to the corresponding value `b'.  The operation is performed
6365 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6366 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6367 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6368 | the absolute value of the integer quotient.
6369 *----------------------------------------------------------------------------*/
6370 
floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    /* Reject operands with invalid 80-bit encodings. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* Keep the original exponent to recognize pseudo-denormals later. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    /* 'a' is NaN or infinity; Inf rem anything is invalid. */
    if ( aExp == 0x7FFF ) {
        if (    (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    /* 'b' is NaN or infinity; finite rem Inf returns 'a' unchanged. */
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* anything rem 0 is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2 (or truncating mode): 'a' is already the result. */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit, then long division 62 bits at a time:
     * estimate each chunk (backed off by 2 so it never overshoots),
     * subtract, and accumulate the bits into *quotient. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial chunk of expDiff bits, with exact correction
         * (increment q while another divisor still fits). */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: if the complementary residue is smaller (or
         * equal with an odd quotient), use it and flip the sign, which
         * corresponds to rounding the quotient to nearest-even. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
                || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                        && ( q & 1 ) )
            ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    /* The remainder is exact; pack at full 80-bit precision. */
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}
6491 
6492 /*----------------------------------------------------------------------------
6493 | Returns the remainder of the extended double-precision floating-point value
6494 | `a' with respect to the corresponding value `b'.  The operation is performed
6495 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6496 *----------------------------------------------------------------------------*/
6497 
6498 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6499 {
6500     uint64_t quotient;
6501     return floatx80_modrem(a, b, false, &quotient, status);
6502 }
6503 
6504 /*----------------------------------------------------------------------------
6505 | Returns the remainder of the extended double-precision floating-point value
6506 | `a' with respect to the corresponding value `b', with the quotient truncated
6507 | toward zero.
6508 *----------------------------------------------------------------------------*/
6509 
6510 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6511 {
6512     uint64_t quotient;
6513     return floatx80_modrem(a, b, true, &quotient, status);
6514 }
6515 
6516 /*----------------------------------------------------------------------------
6517 | Returns the square root of the extended double-precision floating-point
6518 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6519 | for Binary Floating-Point Arithmetic.
6520 *----------------------------------------------------------------------------*/
6521 
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Reject operands with invalid 80-bit encodings. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* NaN propagates; sqrt(+Inf) is +Inf; sqrt(-Inf) is invalid. */
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    /* sqrt of a negative number is invalid, except sqrt(-0) == -0. */
    if ( aSign ) {
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Result exponent is half the unbiased input exponent, re-biased. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Initial 32-bit root estimate, refined to 64 bits via division. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* Position the significand so the exponent parity is absorbed. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high root word downward while zSig0^2 exceeds the input. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low root word; only refine when near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any nonzero final remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Assemble the 128-bit significand from the two root words. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6586 
6587 /*----------------------------------------------------------------------------
6588 | Returns the result of converting the quadruple-precision floating-point
6589 | value `a' to the 32-bit two's complement integer format.  The conversion
6590 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6591 | Arithmetic---which means in particular that the conversion is rounded
6592 | according to the current rounding mode.  If `a' is a NaN, the largest
6593 | positive integer is returned.  Otherwise, if the conversion overflows, the
6594 | largest integer with the same sign as `a' is returned.
6595 *----------------------------------------------------------------------------*/
6596 
int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Force NaNs positive so roundAndPackInt32 saturates to INT32_MAX. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Restore the implicit integer bit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Fold the low 64 fraction bits into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    /* roundAndPackInt32 applies the current rounding mode and handles
     * overflow saturation and the invalid/inexact flags. */
    return roundAndPackInt32(aSign, aSig0, status);

}
6615 
6616 /*----------------------------------------------------------------------------
6617 | Returns the result of converting the quadruple-precision floating-point
6618 | value `a' to the 32-bit two's complement integer format.  The conversion
6619 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6620 | Arithmetic, except that the conversion is always rounded toward zero.  If
6621 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6622 | conversion overflows, the largest integer with the same sign as `a' is
6623 | returned.
6624 *----------------------------------------------------------------------------*/
6625 
int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low 64 fraction bits into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    /* Exponent too large for int32; NaNs (sign cleared) saturate to MAX. */
    if ( 0x401E < aExp ) {
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    /* |a| < 1 truncates to 0; raise inexact unless a is exactly zero. */
    else if ( aExp < 0x3FFF ) {
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    /* Restore the implicit integer bit and truncate by shifting. */
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out mean the conversion was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6665 
6666 /*----------------------------------------------------------------------------
6667 | Returns the result of converting the quadruple-precision floating-point
6668 | value `a' to the 64-bit two's complement integer format.  The conversion
6669 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6670 | Arithmetic---which means in particular that the conversion is rounded
6671 | according to the current rounding mode.  If `a' is a NaN, the largest
6672 | positive integer is returned.  Otherwise, if the conversion overflows, the
6673 | largest integer with the same sign as `a' is returned.
6674 *----------------------------------------------------------------------------*/
6675 
int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Restore the implicit integer bit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        /* Magnitude at least 2^63: overflow, unless rounding later
         * brings it back in range (handled by roundAndPackInt64). */
        if ( 0x403E < aExp ) {
            float_raise(float_flag_invalid, status);
            /* Positive values and NaNs saturate to INT64_MAX;
             * negative values saturate to INT64_MIN. */
            if (    ! aSign
                 || (    ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Shift right, jamming lost bits into the sticky position. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    /* roundAndPackInt64 applies the current rounding mode and flags. */
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}
6708 
6709 /*----------------------------------------------------------------------------
6710 | Returns the result of converting the quadruple-precision floating-point
6711 | value `a' to the 64-bit two's complement integer format.  The conversion
6712 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6713 | Arithmetic, except that the conversion is always rounded toward zero.
6714 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6715 | the conversion overflows, the largest integer with the same sign as `a' is
6716 | returned.
6717 *----------------------------------------------------------------------------*/
6718 
int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Restore the implicit integer bit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Special case: values in [-2^63, -2^63 + 1) truncate to
             * exactly INT64_MIN; only raise inexact if fraction bits
             * are discarded. */
            if (    ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                /* Otherwise the magnitude overflows int64: saturate. */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        /* Exponent large enough that low fraction bits shift into z. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        /* |a| < 1 truncates to 0; raise inexact unless a is exactly zero. */
        if ( aExp < 0x3FFF ) {
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Any discarded fraction bits make the result inexact. */
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6771 
6772 /*----------------------------------------------------------------------------
6773 | Returns the result of converting the quadruple-precision floating-point value
6774 | `a' to the 64-bit unsigned integer format.  The conversion is
6775 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6776 | Arithmetic---which means in particular that the conversion is rounded
6777 | according to the current rounding mode.  If `a' is a NaN, the largest
6778 | positive integer is returned.  If the conversion overflows, the
6779 | largest unsigned integer is returned.  If 'a' is negative, the value is
6780 | rounded and zero is returned; negative values that do not round to zero
6781 | will raise the inexact exception.
6782 *----------------------------------------------------------------------------*/
6783 
uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values of magnitude > 1/2 cannot round to a valid
     * unsigned result: invalid; NaNs give UINT64_MAX, others 0.
     * (Smaller negatives fall through to roundAndPackUint64.) */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    /* Restore the implicit integer bit for normal numbers. */
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        /* Magnitude of at least 2^64 always overflows. */
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        /* Shift right, jamming lost bits into the sticky position. */
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    /* roundAndPackUint64 applies the current rounding mode and flags. */
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}
6818 
6819 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6820 {
6821     uint64_t v;
6822     signed char current_rounding_mode = status->float_rounding_mode;
6823 
6824     set_float_rounding_mode(float_round_to_zero, status);
6825     v = float128_to_uint64(a, status);
6826     set_float_rounding_mode(current_rounding_mode, status);
6827 
6828     return v;
6829 }
6830 
6831 /*----------------------------------------------------------------------------
6832 | Returns the result of converting the quadruple-precision floating-point
6833 | value `a' to the 32-bit unsigned integer format.  The conversion
6834 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6835 | Arithmetic except that the conversion is always rounded toward zero.
6836 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6837 | if the conversion overflows, the largest unsigned integer is returned.
6838 | If 'a' is negative, the value is rounded and zero is returned; negative
6839 | values that do not round to zero will raise the inexact exception.
6840 *----------------------------------------------------------------------------*/
6841 
6842 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6843 {
6844     uint64_t v;
6845     uint32_t res;
6846     int old_exc_flags = get_float_exception_flags(status);
6847 
6848     v = float128_to_uint64_round_to_zero(a, status);
6849     if (v > 0xffffffff) {
6850         res = 0xffffffff;
6851     } else {
6852         return v;
6853     }
6854     set_float_exception_flags(old_exc_flags, status);
6855     float_raise(float_flag_invalid, status);
6856     return res;
6857 }
6858 
6859 /*----------------------------------------------------------------------------
6860 | Returns the result of converting the quadruple-precision floating-point value
6861 | `a' to the 32-bit unsigned integer format.  The conversion is
6862 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6863 | Arithmetic---which means in particular that the conversion is rounded
6864 | according to the current rounding mode.  If `a' is a NaN, the largest
6865 | positive integer is returned.  If the conversion overflows, the
6866 | largest unsigned integer is returned.  If 'a' is negative, the value is
6867 | rounded and zero is returned; negative values that do not round to zero
6868 | will raise the inexact exception.
6869 *----------------------------------------------------------------------------*/
6870 
6871 uint32_t float128_to_uint32(float128 a, float_status *status)
6872 {
6873     uint64_t v;
6874     uint32_t res;
6875     int old_exc_flags = get_float_exception_flags(status);
6876 
6877     v = float128_to_uint64(a, status);
6878     if (v > 0xffffffff) {
6879         res = 0xffffffff;
6880     } else {
6881         return v;
6882     }
6883     set_float_exception_flags(old_exc_flags, status);
6884     float_raise(float_flag_invalid, status);
6885     return res;
6886 }
6887 
6888 /*----------------------------------------------------------------------------
6889 | Returns the result of converting the quadruple-precision floating-point
6890 | value `a' to the single-precision floating-point format.  The conversion
6891 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6892 | Arithmetic.
6893 *----------------------------------------------------------------------------*/
6894 
6895 float32 float128_to_float32(float128 a, float_status *status)
6896 {
6897     bool aSign;
6898     int32_t aExp;
6899     uint64_t aSig0, aSig1;
6900     uint32_t zSig;
6901 
6902     aSig1 = extractFloat128Frac1( a );
6903     aSig0 = extractFloat128Frac0( a );
6904     aExp = extractFloat128Exp( a );
6905     aSign = extractFloat128Sign( a );
6906     if ( aExp == 0x7FFF ) {
6907         if ( aSig0 | aSig1 ) {
6908             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6909         }
6910         return packFloat32( aSign, 0xFF, 0 );
6911     }
6912     aSig0 |= ( aSig1 != 0 );
6913     shift64RightJamming( aSig0, 18, &aSig0 );
6914     zSig = aSig0;
6915     if ( aExp || zSig ) {
6916         zSig |= 0x40000000;
6917         aExp -= 0x3F81;
6918     }
6919     return roundAndPackFloat32(aSign, aExp, zSig, status);
6920 
6921 }
6922 
6923 /*----------------------------------------------------------------------------
6924 | Returns the result of converting the quadruple-precision floating-point
6925 | value `a' to the double-precision floating-point format.  The conversion
6926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6927 | Arithmetic.
6928 *----------------------------------------------------------------------------*/
6929 
6930 float64 float128_to_float64(float128 a, float_status *status)
6931 {
6932     bool aSign;
6933     int32_t aExp;
6934     uint64_t aSig0, aSig1;
6935 
6936     aSig1 = extractFloat128Frac1( a );
6937     aSig0 = extractFloat128Frac0( a );
6938     aExp = extractFloat128Exp( a );
6939     aSign = extractFloat128Sign( a );
6940     if ( aExp == 0x7FFF ) {
6941         if ( aSig0 | aSig1 ) {
6942             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6943         }
6944         return packFloat64( aSign, 0x7FF, 0 );
6945     }
6946     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6947     aSig0 |= ( aSig1 != 0 );
6948     if ( aExp || aSig0 ) {
6949         aSig0 |= UINT64_C(0x4000000000000000);
6950         aExp -= 0x3C01;
6951     }
6952     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6953 
6954 }
6955 
6956 /*----------------------------------------------------------------------------
6957 | Returns the result of converting the quadruple-precision floating-point
6958 | value `a' to the extended double-precision floating-point format.  The
6959 | conversion is performed according to the IEC/IEEE Standard for Binary
6960 | Floating-Point Arithmetic.
6961 *----------------------------------------------------------------------------*/
6962 
6963 floatx80 float128_to_floatx80(float128 a, float_status *status)
6964 {
6965     bool aSign;
6966     int32_t aExp;
6967     uint64_t aSig0, aSig1;
6968 
6969     aSig1 = extractFloat128Frac1( a );
6970     aSig0 = extractFloat128Frac0( a );
6971     aExp = extractFloat128Exp( a );
6972     aSign = extractFloat128Sign( a );
6973     if ( aExp == 0x7FFF ) {
6974         if ( aSig0 | aSig1 ) {
6975             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6976                                                status);
6977             return floatx80_silence_nan(res, status);
6978         }
6979         return packFloatx80(aSign, floatx80_infinity_high,
6980                                    floatx80_infinity_low);
6981     }
6982     if ( aExp == 0 ) {
6983         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6984         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6985     }
6986     else {
6987         aSig0 |= UINT64_C(0x0001000000000000);
6988     }
6989     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6990     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6991 
6992 }
6993 
6994 /*----------------------------------------------------------------------------
6995 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6996 | returns the result as a quadruple-precision floating-point value.  The
6997 | operation is performed according to the IEC/IEEE Standard for Binary
6998 | Floating-Point Arithmetic.
6999 *----------------------------------------------------------------------------*/
7000 
float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Exponent >= 48: the rounding boundary lies in the low word. */
        if ( 0x406F <= aExp ) {
            /*
             * Exponent >= 112 (or Inf/NaN): the 112-bit fraction is
             * already integral.  Propagate NaNs, else return unchanged.
             */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /*
         * lastBitMask selects the lowest integer-valued bit of z.low;
         * the two-step shift avoids an out-of-range 64-bit shift when
         * that bit is bit 63.  roundBitsMask covers the fraction bits.
         */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half an ulp; on an exact tie, clear the last bit. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /*
                 * lastBitMask == 0: the last integer bit is the lsb of
                 * z.high, so the half-ulp is the top bit of z.low.
                 */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            /* Same as nearest-even but without the tie-to-even fixup. */
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            /* Truncation: the final mask below does all the work. */
            break;
        case float_round_up:
            /* Round toward +inf: bump magnitude for positive values only. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Round toward -inf: bump magnitude for negative values only. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Clear the fraction bits below the integer boundary. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: the result is zero or +/-1 depending on the mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Strictly above 0.5 rounds to +/-1; 0.5 exactly and
                 * everything smaller falls through to +/-0. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* Anything >= 0.5 rounds to +/-1. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                /* The nearest odd integer to any 0 < |a| < 1 is +/-1. */
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* 1 <= |a| < 2^48: the rounding boundary lies in the high word;
         * the entire low word consists of fraction bits. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* An exact tie (no fraction bits anywhere) rounds to even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );   /* low word acts as sticky */
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change of representation means the result was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
7152 
7153 /*----------------------------------------------------------------------------
7154 | Returns the result of adding the absolute values of the quadruple-precision
7155 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7156 | before being returned.  `zSign' is ignored if the result is a NaN.
7157 | The addition is performed according to the IEC/IEEE Standard for Binary
7158 | Floating-Point Arithmetic.
7159 *----------------------------------------------------------------------------*/
7160 
static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;   /* Inf + finite = Inf */
        }
        if ( bExp == 0 ) {
            /* b is subnormal: its effective exponent is 1, so shift one less. */
            --expDiff;
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);  /* implicit integer bit */
        }
        /* zSig2 collects the bits shifted out, jammed into a sticky word. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: mirror image of the case above. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /*
             * Both operands subnormal or zero: the raw sum is exact and
             * already positioned, so pack directly without rounding.
             */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Two implicit integer bits summed: mark the carry and shift down. */
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    /* Unequal-exponent path: restore a's integer bit and add. */
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* If no carry past bit 49, the sum is already normalized. */
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7243 
7244 /*----------------------------------------------------------------------------
7245 | Returns the result of subtracting the absolute values of the quadruple-
7246 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7247 | difference is negated before being returned.  `zSign' is ignored if the
7248 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7249 | Standard for Binary Floating-Point Arithmetic.
7250 *----------------------------------------------------------------------------*/
7251 
static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left 14 bits to preserve precision
     * through the subtraction; the final "zExp - 14" undoes this. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf - Inf is an invalid operation. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both subnormal: use their effective exponent of 1. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare magnitudes to decide the subtraction order. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact zero result: negative only in round-down mode. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        /* Implicit integer bit, now at bit 62 after the 14-bit pre-shift. */
        aSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;   /* the result takes the sign of the larger operand, b */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* The -14 compensates for the pre-shift applied at function entry. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
7331 
7332 /*----------------------------------------------------------------------------
7333 | Returns the result of adding the quadruple-precision floating-point values
7334 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7335 | for Binary Floating-Point Arithmetic.
7336 *----------------------------------------------------------------------------*/
7337 
7338 float128 float128_add(float128 a, float128 b, float_status *status)
7339 {
7340     bool aSign, bSign;
7341 
7342     aSign = extractFloat128Sign( a );
7343     bSign = extractFloat128Sign( b );
7344     if ( aSign == bSign ) {
7345         return addFloat128Sigs(a, b, aSign, status);
7346     }
7347     else {
7348         return subFloat128Sigs(a, b, aSign, status);
7349     }
7350 
7351 }
7352 
7353 /*----------------------------------------------------------------------------
7354 | Returns the result of subtracting the quadruple-precision floating-point
7355 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7356 | Standard for Binary Floating-Point Arithmetic.
7357 *----------------------------------------------------------------------------*/
7358 
7359 float128 float128_sub(float128 a, float128 b, float_status *status)
7360 {
7361     bool aSign, bSign;
7362 
7363     aSign = extractFloat128Sign( a );
7364     bSign = extractFloat128Sign( b );
7365     if ( aSign == bSign ) {
7366         return subFloat128Sigs(a, b, aSign, status);
7367     }
7368     else {
7369         return addFloat128Sigs(a, b, aSign, status);
7370     }
7371 
7372 }
7373 
7374 /*----------------------------------------------------------------------------
7375 | Returns the result of multiplying the quadruple-precision floating-point
7376 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7377 | Standard for Binary Floating-Point Arithmetic.
7378 *----------------------------------------------------------------------------*/
7379 
float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is NaN or Inf. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf * 0 is an invalid operation. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);  /* a's implicit integer bit */
    /* Shift b left 16 so the 256-bit product lands in rounding position. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* Account for b's (unset) integer bit: (b + 2^112)<<16 contributes
     * a * 2^128, i.e. aSig added into the top 128 product bits. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zSig2 |= ( zSig3 != 0 );   /* fold the lowest word into sticky bits */
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product in [2, 4): shift right one place and bump the exponent. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7436 
7437 /*----------------------------------------------------------------------------
7438 | Returns the result of dividing the quadruple-precision floating-point value
7439 | `a' by the corresponding value `b'.  The operation is performed according to
7440 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7441 *----------------------------------------------------------------------------*/
7442 
float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            goto invalid;   /* Inf / Inf */
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign, 0, 0, 0 );   /* finite / Inf = 0 */
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                /* 0/0 and Inf/Inf produce the default NaN. */
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            /* Nonzero / 0: signal division by zero, return infinity. */
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Position both significands, with explicit integer bits, at the top. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Halve a's significand when a >= b so the quotient stays < 1. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High 64 quotient bits: estimate, then correct via the remainder. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Low 64 quotient bits. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        /*
         * Near a rounding boundary: compute the exact remainder so the
         * sticky bit correctly reflects whether the quotient is exact.
         */
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7523 
7524 /*----------------------------------------------------------------------------
7525 | Returns the remainder of the quadruple-precision floating-point value `a'
7526 | with respect to the corresponding value `b'.  The operation is performed
7527 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7528 *----------------------------------------------------------------------------*/
7529 
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;   /* rem(Inf, y) is invalid */
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;   /* rem(finite, Inf) = a */
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            /* rem(x, 0) and rem(Inf, y) produce the default NaN. */
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;   /* |a| small enough: a is the remainder */
    /* Position the significands (explicit integer bit) near the top; a is
     * shifted one bit less when its exponent is the smaller one. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Close the exponent gap 61 bits per iteration using estimated
     * quotient digits, biased down by 4 to keep the remainder >= 0. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial quotient, scaled for the remaining exponent gap. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder goes negative, remembering the last
     * non-negative remainder in alternateASig0/1 and counting q. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean = (negative remainder) + (last non-negative remainder);
     * its sign selects the multiple of b nearest a, ties going to even q.
     * Writing through a uint64_t* into the int64_t is a deliberate
     * signed/unsigned alias of the same width. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7630 
7631 /*----------------------------------------------------------------------------
7632 | Returns the square root of the quadruple-precision floating-point value `a'.
7633 | The operation is performed according to the IEC/IEEE Standard for Binary
7634 | Floating-Point Arithmetic.
7635 *----------------------------------------------------------------------------*/
7636 
float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;   /* sqrt(+Inf) = +Inf */
        goto invalid;
    }
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;   /* sqrt(-0) = -0 */
 invalid:
        /* Square root of a negative number (or -Inf) is invalid. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* The result exponent is (roughly) half the input's unbiased exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* 32-bit seed estimate, refined toward 64 bits by one division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* Align the radicand; odd exponents shift one bit less. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high 64 root bits until a - zSig0^2 is non-negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 root bits. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        /*
         * Near a rounding boundary: compute the exact remainder to set
         * the sticky bit precisely.
         */
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7698 
/*----------------------------------------------------------------------------
| Compares the extended double-precision values `a' and `b' and returns the
| relation as a FloatRelation.  If `is_quiet' is false, any NaN operand
| raises the invalid exception; if true, only signaling NaNs do.  Invalid
| (unsupported) encodings always raise invalid and compare unordered.
*----------------------------------------------------------------------------*/

static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Encodings rejected by floatx80_invalid_encoding() are treated as
       unordered and always raise invalid.  */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* NaN detection: maximum exponent with nonzero fraction bits below
       the explicit integer bit (hence the <<1).  */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        /* Opposite signs: equal only if both are zeros (+0 == -0),
           otherwise the non-negative operand is the greater.  */
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* aSign==0 -> 1 (greater); aSign==1 -> -1 (less).  */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Compare magnitudes; a shared negative sign inverts the
               ordering, which the XOR accounts for.  */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7746 
7747 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7748 {
7749     return floatx80_compare_internal(a, b, 0, status);
7750 }
7751 
7752 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7753                                      float_status *status)
7754 {
7755     return floatx80_compare_internal(a, b, 1, status);
7756 }
7757 
/*----------------------------------------------------------------------------
| Compares the quadruple-precision values `a' and `b' and returns the
| relation as a FloatRelation.  If `is_quiet' is false, any NaN operand
| raises the invalid exception; if true, only signaling NaNs do.
*----------------------------------------------------------------------------*/

static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* NaN detection: maximum exponent with a nonzero 112-bit fraction.  */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: equal only if both are zeros (the <<1 discards
           the sign bits); otherwise the non-negative operand is greater.  */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* aSign==0 -> 1 (greater); aSign==1 -> -1 (less).  */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Compare magnitudes; a shared negative sign inverts the
               ordering, which the XOR accounts for.  */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7792 
7793 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7794 {
7795     return float128_compare_internal(a, b, 0, status);
7796 }
7797 
7798 FloatRelation float128_compare_quiet(float128 a, float128 b,
7799                                      float_status *status)
7800 {
7801     return float128_compare_internal(a, b, 1, status);
7802 }
7803 
7804 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7805 {
7806     bool aSign;
7807     int32_t aExp;
7808     uint64_t aSig;
7809 
7810     if (floatx80_invalid_encoding(a)) {
7811         float_raise(float_flag_invalid, status);
7812         return floatx80_default_nan(status);
7813     }
7814     aSig = extractFloatx80Frac( a );
7815     aExp = extractFloatx80Exp( a );
7816     aSign = extractFloatx80Sign( a );
7817 
7818     if ( aExp == 0x7FFF ) {
7819         if ( aSig<<1 ) {
7820             return propagateFloatx80NaN(a, a, status);
7821         }
7822         return a;
7823     }
7824 
7825     if (aExp == 0) {
7826         if (aSig == 0) {
7827             return a;
7828         }
7829         aExp++;
7830     }
7831 
7832     if (n > 0x10000) {
7833         n = 0x10000;
7834     } else if (n < -0x10000) {
7835         n = -0x10000;
7836     }
7837 
7838     aExp += n;
7839     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7840                                          aSign, aExp, aSig, 0, status);
7841 }
7842 
7843 float128 float128_scalbn(float128 a, int n, float_status *status)
7844 {
7845     bool aSign;
7846     int32_t aExp;
7847     uint64_t aSig0, aSig1;
7848 
7849     aSig1 = extractFloat128Frac1( a );
7850     aSig0 = extractFloat128Frac0( a );
7851     aExp = extractFloat128Exp( a );
7852     aSign = extractFloat128Sign( a );
7853     if ( aExp == 0x7FFF ) {
7854         if ( aSig0 | aSig1 ) {
7855             return propagateFloat128NaN(a, a, status);
7856         }
7857         return a;
7858     }
7859     if (aExp != 0) {
7860         aSig0 |= UINT64_C(0x0001000000000000);
7861     } else if (aSig0 == 0 && aSig1 == 0) {
7862         return a;
7863     } else {
7864         aExp++;
7865     }
7866 
7867     if (n > 0x10000) {
7868         n = 0x10000;
7869     } else if (n < -0x10000) {
7870         n = -0x10000;
7871     }
7872 
7873     aExp += n - 1;
7874     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7875                                          , status);
7876 
7877 }
7878 
7879 static void __attribute__((constructor)) softfloat_init(void)
7880 {
7881     union_float64 ua, ub, uc, ur;
7882 
7883     if (QEMU_NO_HARDFLOAT) {
7884         return;
7885     }
7886     /*
7887      * Test that the host's FMA is not obviously broken. For example,
7888      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7889      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7890      */
7891     ua.s = 0x0020000000000001ULL;
7892     ub.s = 0x3ca0000000000000ULL;
7893     uc.s = 0x0020000000000000ULL;
7894     ur.h = fma(ua.h, ub.h, uc.h);
7895     if (ur.s != 0x0020000000000001ULL) {
7896         force_soft_fma = true;
7897     }
7898 }
7899