xref: /openbmc/qemu/fpu/softfloat.c (revision aca84527)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
/*
 * Flush a denormal input operand to zero (preserving its sign) and raise
 * the input_denormal flag.  Unconditional: the flush_inputs_to_zero check
 * is performed by the GEN_INPUT_FLUSH1/2/3 wrappers below.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK
142 
/* Flush one input operand to zero, but only if the status requests it. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1
155 
/* Flush two input operands to zero, but only if the status requests it. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2
169 
/* Flush three input operands to zero, but only if the status requests it. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
184 
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   1
#else
# define QEMU_HARDFLOAT_USE_ISINF   0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
/* noinline: keep the flattened softfloat slow path out of its callers */
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* View the same bits as either the softfloat (s) or the host FP (h) type. */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Pre-/post-condition checks used by float32_gen2/float64_gen2 below. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Two-operand softfloat and host-FP implementations of one operation. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 static inline float32
343 float32_gen2(float32 xa, float32 xb, float_status *s,
344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
345              f32_check_fn pre, f32_check_fn post)
346 {
347     union_float32 ua, ub, ur;
348 
349     ua.s = xa;
350     ub.s = xb;
351 
352     if (unlikely(!can_use_fpu(s))) {
353         goto soft;
354     }
355 
356     float32_input_flush2(&ua.s, &ub.s, s);
357     if (unlikely(!pre(ua, ub))) {
358         goto soft;
359     }
360 
361     ur.h = hard(ua.h, ub.h);
362     if (unlikely(f32_is_inf(ur))) {
363         float_raise(float_flag_overflow, s);
364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
365         goto soft;
366     }
367     return ur.s;
368 
369  soft:
370     return soft(ua.s, ub.s, s);
371 }
372 
373 static inline float64
374 float64_gen2(float64 xa, float64 xb, float_status *s,
375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
376              f64_check_fn pre, f64_check_fn post)
377 {
378     union_float64 ua, ub, ur;
379 
380     ua.s = xa;
381     ub.s = xb;
382 
383     if (unlikely(!can_use_fpu(s))) {
384         goto soft;
385     }
386 
387     float64_input_flush2(&ua.s, &ub.s, s);
388     if (unlikely(!pre(ua, ub))) {
389         goto soft;
390     }
391 
392     ur.h = hard(ua.h, ub.h);
393     if (unlikely(f64_is_inf(ur))) {
394         float_raise(float_flag_overflow, s);
395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
396         goto soft;
397     }
398     return ur.s;
399 
400  soft:
401     return soft(ua.s, ub.s, s);
402 }
403 
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410     return float32_val(a) & 0x007FFFFF;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 static inline int extractFloat32Exp(float32 a)
418 {
419     return (float32_val(a) >> 23) & 0xFF;
420 }
421 
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline bool extractFloat32Sign(float32 a)
427 {
428     return float32_val(a) >> 31;
429 }
430 
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434 
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439 
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443 
444 static inline int extractFloat64Exp(float64 a)
445 {
446     return (float64_val(a) >> 52) & 0x7FF;
447 }
448 
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452 
453 static inline bool extractFloat64Sign(float64 a)
454 {
455     return float64_val(a) >> 63;
456 }
457 
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,   /* raw unpacked value, not yet canonicalized */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
471 
/* Bit mask corresponding to a FloatClass value, for set-membership tests. */
#define float_cmask(bit)  (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    /* Common combinations */
    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
484 
485 
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused)) bool is_nan(FloatClass c)
488 {
489     return unlikely(c >= float_class_qnan);
490 }
491 
492 static inline __attribute__((unused)) bool is_snan(FloatClass c)
493 {
494     return c == float_class_snan;
495 }
496 
497 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
498 {
499     return c == float_class_qnan;
500 }
501 
/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;     /* classification; see FloatClass above */
    bool sign;          /* true for negative */
    int32_t exp;        /* unbiased exponent */
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;
527 
/* As FloatParts64, but with a 128-bit fraction in two 64-bit words. */
typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;   /* most significant fraction word */
    uint64_t frac_lo;   /* least significant fraction word */
} FloatParts128;
535 
/* As FloatParts64, but with a 256-bit fraction in four 64-bit words. */
typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;
545 
/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT    63
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;
575 
/* Expand fields based on the size of exponent and fraction */
/*
 * Note: the (-F - 1) & 63 expressions reduce the shift amount mod 64,
 * so the same macro also serves float128's 112-bit fraction field.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = (-F - 1) & 63,                                 \
    .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
587 
/* IEEE binary16: 5 exponent bits, 10 fraction bits */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* binary16 with the ARM Alternative Half Precision modifier set */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* bfloat16: binary32 exponent range with a 7-bit fraction */
static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

/* IEEE binary32 */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* IEEE binary64 */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

/* IEEE binary128 */
static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};
612 
613 /* Unpack a float to parts, but do not canonicalize.  */
614 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
615 {
616     const int f_size = fmt->frac_size;
617     const int e_size = fmt->exp_size;
618 
619     *r = (FloatParts64) {
620         .cls = float_class_unclassified,
621         .sign = extract64(raw, f_size + e_size, 1),
622         .exp = extract64(raw, f_size, e_size),
623         .frac = extract64(raw, 0, f_size)
624     };
625 }
626 
/* Unpack each format that fits in 64 bits via the common raw unpacker. */
static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}
646 
647 static void float128_unpack_raw(FloatParts128 *p, float128 f)
648 {
649     const int f_size = float128_params.frac_size - 64;
650     const int e_size = float128_params.exp_size;
651 
652     *p = (FloatParts128) {
653         .cls = float_class_unclassified,
654         .sign = extract64(f.high, f_size + e_size, 1),
655         .exp = extract64(f.high, f_size, e_size),
656         .frac_hi = extract64(f.high, 0, f_size),
657         .frac_lo = f.low,
658     };
659 }
660 
661 /* Pack a float from parts, but do not canonicalize.  */
662 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
663 {
664     const int f_size = fmt->frac_size;
665     const int e_size = fmt->exp_size;
666     uint64_t ret;
667 
668     ret = (uint64_t)p->sign << (f_size + e_size);
669     ret = deposit64(ret, f_size, e_size, p->exp);
670     ret = deposit64(ret, 0, f_size, p->frac);
671     return ret;
672 }
673 
/* Pack each format that fits in 64 bits via the common raw packer. */
static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}
693 
694 static float128 float128_pack_raw(const FloatParts128 *p)
695 {
696     const int f_size = float128_params.frac_size - 64;
697     const int e_size = float128_params.exp_size;
698     uint64_t hi;
699 
700     hi = (uint64_t)p->sign << (f_size + e_size);
701     hi = deposit64(hi, f_size, e_size, p->exp);
702     hi = deposit64(hi, 0, f_size, p->frac_hi);
703     return make_float128(hi, p->frac_lo);
704 }
705 
706 /*----------------------------------------------------------------------------
707 | Functions and definitions to determine:  (1) whether tininess for underflow
708 | is detected before or after rounding by default, (2) what (if anything)
709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
711 | are propagated from function inputs to output.  These details are target-
712 | specific.
713 *----------------------------------------------------------------------------*/
714 #include "softfloat-specialize.c.inc"
715 
/*
 * Size dispatch: QEMU_GENERIC selects the parts128_* implementation when
 * the argument is a FloatParts128 *, and parts64_* otherwise.
 */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)

/* Select which of two NaN operands propagates to the result. */
static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

/* NaN selection for fused multiply-add, given class masks of a*b and a*b+c. */
static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

/* Convert raw unpacked parts into canonical decomposed form. */
static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

/* Round canonical parts back to the target format's layout. */
static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)

static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_mul(A, B, S) \
    PARTS_GENERIC_64_128(mul, A)(A, B, S)
789 
/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

/* As PARTS_GENERIC_64_128, but dispatching to the frac64_/frac128_ helpers. */
#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
796 
797 static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
798 {
799     return uadd64_overflow(a->frac, b->frac, &r->frac);
800 }
801 
802 static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
803 {
804     bool c = 0;
805     r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
806     r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
807     return c;
808 }
809 
810 #define frac_add(R, A, B)  FRAC_GENERIC_64_128(add, R)(R, A, B)
811 
/* R = A + C, where C is a 64-bit increment; returns the carry-out. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    /* The carry out of the low word becomes the high word's increment. */
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)
824 
825 static void frac64_allones(FloatParts64 *a)
826 {
827     a->frac = -1;
828 }
829 
830 static void frac128_allones(FloatParts128 *a)
831 {
832     a->frac_hi = a->frac_lo = -1;
833 }
834 
835 #define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)
836 
837 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
838 {
839     return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
840 }
841 
842 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
843 {
844     uint64_t ta = a->frac_hi, tb = b->frac_hi;
845     if (ta == tb) {
846         ta = a->frac_lo, tb = b->frac_lo;
847         if (ta == tb) {
848             return 0;
849         }
850     }
851     return ta < tb ? -1 : 1;
852 }
853 
854 #define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)
855 
856 static void frac64_clear(FloatParts64 *a)
857 {
858     a->frac = 0;
859 }
860 
861 static void frac128_clear(FloatParts128 *a)
862 {
863     a->frac_hi = a->frac_lo = 0;
864 }
865 
866 #define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)
867 
868 static bool frac64_eqz(FloatParts64 *a)
869 {
870     return a->frac == 0;
871 }
872 
873 static bool frac128_eqz(FloatParts128 *a)
874 {
875     return (a->frac_hi | a->frac_lo) == 0;
876 }
877 
878 #define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)
879 
/* Widening multiply: R (double-width parts) = A * B. */
static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B)  FRAC_GENERIC_64_128(mulw, A)(R, A, B)
892 
/* Two's-complement negate the fraction of A in place. */
static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    /* Compute 0 - frac, propagating the borrow from low to high word. */
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A)  FRAC_GENERIC_64_128(neg, A)(A)
906 
907 static int frac64_normalize(FloatParts64 *a)
908 {
909     if (a->frac) {
910         int shift = clz64(a->frac);
911         a->frac <<= shift;
912         return shift;
913     }
914     return 64;
915 }
916 
917 static int frac128_normalize(FloatParts128 *a)
918 {
919     if (a->frac_hi) {
920         int shl = clz64(a->frac_hi);
921         if (shl) {
922             int shr = 64 - shl;
923             a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
924             a->frac_lo = (a->frac_lo << shl);
925         }
926         return shl;
927     } else if (a->frac_lo) {
928         int shl = clz64(a->frac_lo);
929         a->frac_hi = (a->frac_lo << shl);
930         a->frac_lo = 0;
931         return shl + 64;
932     }
933     return 128;
934 }
935 
936 #define frac_normalize(A)  FRAC_GENERIC_64_128(normalize, A)(A)
937 
/* Shift the fraction of A left by C bits. */
static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

/* Shift the fraction of A right by C bits; shifted-out bits are lost. */
static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)
961 
/*
 * Shift the fraction of A right by C bits with jamming: bits shifted
 * out are OR-ed into the result lsb so inexactness is not lost.
 */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    shift64RightJamming(a->frac, c, &a->frac);
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    shift128RightJamming(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128(shrjam, A)(A, C)
973 
974 static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
975 {
976     return usub64_overflow(a->frac, b->frac, &r->frac);
977 }
978 
979 static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
980 {
981     bool c = 0;
982     r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
983     r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
984     return c;
985 }
986 
987 #define frac_sub(R, A, B)  FRAC_GENERIC_64_128(sub, R)(R, A, B)
988 
/*
 * Narrow a double-width fraction into R, jamming all discarded low
 * bits into the result lsb (sticky bit).
 */
static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}

#define frac_truncjam(R, A)  FRAC_GENERIC_64_128(truncjam, R)(R, A)
1001 
/*
 * Instantiate the type-parameterized parts implementation twice:
 * once with N=64 (double-width W=128) and once with N=128 (W=256).
 * Inside the included templates, partsN(NAME) expands to parts64_NAME
 * or parts128_NAME, and FloatPartsN/FloatPartsW name the working and
 * double-width structures.
 */
#define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN    glue(FloatParts,N)
#define FloatPartsW    glue(FloatParts,W)

#define N 64
#define W 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#define N 128
#define W 256

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#undef  partsN
#undef  FloatPartsN
#undef  FloatPartsW
1025 
1026 /*
1027  * Pack/unpack routines with a specific FloatFmt.
1028  */
1029 
/* Unpack a float16 into canonical parts, with an explicit format. */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

/* Unpack a float16 into canonical parts, IEEE half-precision format. */
static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

/* Unpack a bfloat16 into canonical parts. */
static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

/* Round canonical parts and repack as float16, with explicit format. */
static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

/* Round canonical parts and repack as IEEE half-precision float16. */
static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

/* Round canonical parts and repack as bfloat16. */
static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}
1070 
/* Unpack a float32 into canonical parts. */
static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

/* Round canonical parts and repack as float32. */
static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

/* Unpack a float64 into canonical parts. */
static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

/* Round canonical parts and repack as float64. */
static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}
1098 
/* Unpack a float128 into canonical parts. */
static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

/* Round canonical parts and repack as float128. */
static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}
1112 
1113 /*
1114  * Addition and subtraction
1115  */
1116 
1117 static float16 QEMU_FLATTEN
1118 float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
1119 {
1120     FloatParts64 pa, pb, *pr;
1121 
1122     float16_unpack_canonical(&pa, a, status);
1123     float16_unpack_canonical(&pb, b, status);
1124     pr = parts_addsub(&pa, &pb, status, subtract);
1125 
1126     return float16_round_pack_canonical(pr, status);
1127 }
1128 
1129 float16 float16_add(float16 a, float16 b, float_status *status)
1130 {
1131     return float16_addsub(a, b, status, false);
1132 }
1133 
1134 float16 float16_sub(float16 a, float16 b, float_status *status)
1135 {
1136     return float16_addsub(a, b, status, true);
1137 }
1138 
1139 static float32 QEMU_SOFTFLOAT_ATTR
1140 soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
1141 {
1142     FloatParts64 pa, pb, *pr;
1143 
1144     float32_unpack_canonical(&pa, a, status);
1145     float32_unpack_canonical(&pb, b, status);
1146     pr = parts_addsub(&pa, &pb, status, subtract);
1147 
1148     return float32_round_pack_canonical(pr, status);
1149 }
1150 
1151 static float32 soft_f32_add(float32 a, float32 b, float_status *status)
1152 {
1153     return soft_f32_addsub(a, b, status, false);
1154 }
1155 
1156 static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1157 {
1158     return soft_f32_addsub(a, b, status, true);
1159 }
1160 
1161 static float64 QEMU_SOFTFLOAT_ATTR
1162 soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
1163 {
1164     FloatParts64 pa, pb, *pr;
1165 
1166     float64_unpack_canonical(&pa, a, status);
1167     float64_unpack_canonical(&pb, b, status);
1168     pr = parts_addsub(&pa, &pb, status, subtract);
1169 
1170     return float64_round_pack_canonical(pr, status);
1171 }
1172 
1173 static float64 soft_f64_add(float64 a, float64 b, float_status *status)
1174 {
1175     return soft_f64_addsub(a, b, status, false);
1176 }
1177 
1178 static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1179 {
1180     return soft_f64_addsub(a, b, status, true);
1181 }
1182 
/* Host-FPU fast-path helpers for hardfloat add/sub. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}
1202 
1203 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1204 {
1205     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1206         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1207     }
1208     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1209 }
1210 
1211 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1212 {
1213     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1214         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1215     } else {
1216         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1217     }
1218 }
1219 
/* Choose between the host-FPU fast path and the softfloat fallback. */
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}
1233 
/* Public add/sub entry points, dispatching through the hardfloat path. */
float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}
1257 
1258 static bfloat16 QEMU_FLATTEN
1259 bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
1260 {
1261     FloatParts64 pa, pb, *pr;
1262 
1263     bfloat16_unpack_canonical(&pa, a, status);
1264     bfloat16_unpack_canonical(&pb, b, status);
1265     pr = parts_addsub(&pa, &pb, status, subtract);
1266 
1267     return bfloat16_round_pack_canonical(pr, status);
1268 }
1269 
1270 bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1271 {
1272     return bfloat16_addsub(a, b, status, false);
1273 }
1274 
1275 bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1276 {
1277     return bfloat16_addsub(a, b, status, true);
1278 }
1279 
1280 static float128 QEMU_FLATTEN
1281 float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
1282 {
1283     FloatParts128 pa, pb, *pr;
1284 
1285     float128_unpack_canonical(&pa, a, status);
1286     float128_unpack_canonical(&pb, b, status);
1287     pr = parts_addsub(&pa, &pb, status, subtract);
1288 
1289     return float128_round_pack_canonical(pr, status);
1290 }
1291 
1292 float128 float128_add(float128 a, float128 b, float_status *status)
1293 {
1294     return float128_addsub(a, b, status, false);
1295 }
1296 
1297 float128 float128_sub(float128 a, float128 b, float_status *status)
1298 {
1299     return float128_addsub(a, b, status, true);
1300 }
1301 
1302 /*
1303  * Multiplication
1304  */
1305 
1306 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1307 {
1308     FloatParts64 pa, pb, *pr;
1309 
1310     float16_unpack_canonical(&pa, a, status);
1311     float16_unpack_canonical(&pb, b, status);
1312     pr = parts_mul(&pa, &pb, status);
1313 
1314     return float16_round_pack_canonical(pr, status);
1315 }
1316 
1317 static float32 QEMU_SOFTFLOAT_ATTR
1318 soft_f32_mul(float32 a, float32 b, float_status *status)
1319 {
1320     FloatParts64 pa, pb, *pr;
1321 
1322     float32_unpack_canonical(&pa, a, status);
1323     float32_unpack_canonical(&pb, b, status);
1324     pr = parts_mul(&pa, &pb, status);
1325 
1326     return float32_round_pack_canonical(pr, status);
1327 }
1328 
1329 static float64 QEMU_SOFTFLOAT_ATTR
1330 soft_f64_mul(float64 a, float64 b, float_status *status)
1331 {
1332     FloatParts64 pa, pb, *pr;
1333 
1334     float64_unpack_canonical(&pa, a, status);
1335     float64_unpack_canonical(&pb, b, status);
1336     pr = parts_mul(&pa, &pb, status);
1337 
1338     return float64_round_pack_canonical(pr, status);
1339 }
1340 
/* Host-FPU fast-path helpers for hardfloat multiplication. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}
1350 
/* Public multiply entry points: hardfloat fast path, softfloat fallback. */
float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}
1364 
1365 bfloat16 QEMU_FLATTEN
1366 bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1367 {
1368     FloatParts64 pa, pb, *pr;
1369 
1370     bfloat16_unpack_canonical(&pa, a, status);
1371     bfloat16_unpack_canonical(&pb, b, status);
1372     pr = parts_mul(&pa, &pb, status);
1373 
1374     return bfloat16_round_pack_canonical(pr, status);
1375 }
1376 
1377 float128 QEMU_FLATTEN
1378 float128_mul(float128 a, float128 b, float_status *status)
1379 {
1380     FloatParts128 pa, pb, *pr;
1381 
1382     float128_unpack_canonical(&pa, a, status);
1383     float128_unpack_canonical(&pb, b, status);
1384     pr = parts_mul(&pa, &pb, status);
1385 
1386     return float128_round_pack_canonical(pr, status);
1387 }
1388 
1389 /*
1390  * Returns the result of multiplying the floating-point values `a' and
1391  * `b' then adding 'c', with no intermediate rounding step after the
1392  * multiplication. The operation is performed according to the
1393  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1394  * The flags argument allows the caller to select negation of the
1395  * addend, the intermediate product, or the final result. (The
1396  * difference between this and having the caller do a separate
1397  * negation is that negating externally will flip the sign bit on
1398  * NaNs.)
1399  */
1400 
/*
 * Fused multiply-add on FloatParts64: compute (a * b) + c with no
 * intermediate rounding after the multiplication.  'flags' is a
 * combination of float_muladd_* bits selecting negation of the addend,
 * the product, or the final result, and optional halving of the result.
 */
static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
    }

    /* (inf * 0) + c with no NaN among the inputs is invalid. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    /* Sign of the product, before any requested negations of the result. */
    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product: inf dominates, then zero, else normal. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* inf - inf: invalid operation. */
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* (+0) + (-0): the result sign follows the rounding mode. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Full 128-bit product of the two 64-bit fractions. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger; when
                 * the addend dominates, flip the result sign. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            /* Exact cancellation: zero sign follows the rounding mode. */
            if (hi == 0 && lo == 0) {
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the discarded low word into the sticky bit. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1574 
1575 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1576                                                 int flags, float_status *status)
1577 {
1578     FloatParts64 pa, pb, pc, pr;
1579 
1580     float16_unpack_canonical(&pa, a, status);
1581     float16_unpack_canonical(&pb, b, status);
1582     float16_unpack_canonical(&pc, c, status);
1583     pr = muladd_floats(pa, pb, pc, flags, status);
1584 
1585     return float16_round_pack_canonical(&pr, status);
1586 }
1587 
1588 static float32 QEMU_SOFTFLOAT_ATTR
1589 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1590                 float_status *status)
1591 {
1592     FloatParts64 pa, pb, pc, pr;
1593 
1594     float32_unpack_canonical(&pa, a, status);
1595     float32_unpack_canonical(&pb, b, status);
1596     float32_unpack_canonical(&pc, c, status);
1597     pr = muladd_floats(pa, pb, pc, flags, status);
1598 
1599     return float32_round_pack_canonical(&pr, status);
1600 }
1601 
1602 static float64 QEMU_SOFTFLOAT_ATTR
1603 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1604                 float_status *status)
1605 {
1606     FloatParts64 pa, pb, pc, pr;
1607 
1608     float64_unpack_canonical(&pa, a, status);
1609     float64_unpack_canonical(&pb, b, status);
1610     float64_unpack_canonical(&pc, c, status);
1611     pr = muladd_floats(pa, pb, pc, flags, status);
1612 
1613     return float64_round_pack_canonical(&pr, status);
1614 }
1615 
1616 static bool force_soft_fma;
1617 
/*
 * float32 fused multiply-add with a host-FPU (hardfloat) fast path.
 * Falls back to soft_f32_muladd whenever the fast path cannot be used.
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* The fast path requires softfloat-compatible FPU settings. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* Result halving is not expressible with the host fma. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Tiny result: recompute from the original inputs via
             * softfloat. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1688 
/*
 * float64 fused multiply-add with a host-FPU (hardfloat) fast path.
 * Falls back to soft_f64_muladd whenever the fast path cannot be used.
 */
float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* The fast path requires softfloat-compatible FPU settings. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* Result halving is not expressible with the host fma. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): the threshold here is FLT_MIN, not DBL_MIN.
             * That is conservative for float64 (more softfloat
             * fallbacks than strictly needed) — confirm intentional.
             * Tiny result: recompute from the original inputs. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}
1759 
1760 /*
1761  * Returns the result of multiplying the bfloat16 values `a'
1762  * and `b' then adding 'c', with no intermediate rounding step after the
1763  * multiplication.
1764  */
1765 
1766 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1767                                       int flags, float_status *status)
1768 {
1769     FloatParts64 pa, pb, pc, pr;
1770 
1771     bfloat16_unpack_canonical(&pa, a, status);
1772     bfloat16_unpack_canonical(&pb, b, status);
1773     bfloat16_unpack_canonical(&pc, c, status);
1774     pr = muladd_floats(pa, pb, pc, flags, status);
1775 
1776     return bfloat16_round_pack_canonical(&pr, status);
1777 }
1778 
1779 /*
1780  * Returns the result of dividing the floating-point value `a' by the
1781  * corresponding value `b'. The operation is performed according to
1782  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1783  */
1784 
/* Divide canonical parts a / b, handling all special-value cases. */
static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x: the class of a is already the result class. */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    /* All class combinations are handled above. */
    g_assert_not_reached();
}
1850 
1851 float16 float16_div(float16 a, float16 b, float_status *status)
1852 {
1853     FloatParts64 pa, pb, pr;
1854 
1855     float16_unpack_canonical(&pa, a, status);
1856     float16_unpack_canonical(&pb, b, status);
1857     pr = div_floats(pa, pb, status);
1858 
1859     return float16_round_pack_canonical(&pr, status);
1860 }
1861 
1862 static float32 QEMU_SOFTFLOAT_ATTR
1863 soft_f32_div(float32 a, float32 b, float_status *status)
1864 {
1865     FloatParts64 pa, pb, pr;
1866 
1867     float32_unpack_canonical(&pa, a, status);
1868     float32_unpack_canonical(&pb, b, status);
1869     pr = div_floats(pa, pb, status);
1870 
1871     return float32_round_pack_canonical(&pr, status);
1872 }
1873 
1874 static float64 QEMU_SOFTFLOAT_ATTR
1875 soft_f64_div(float64 a, float64 b, float_status *status)
1876 {
1877     FloatParts64 pa, pb, pr;
1878 
1879     float64_unpack_canonical(&pa, a, status);
1880     float64_unpack_canonical(&pb, b, status);
1881     pr = div_floats(pa, pb, status);
1882 
1883     return float64_round_pack_canonical(&pr, status);
1884 }
1885 
/* Native-arithmetic division, used on the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}
1895 
1896 static bool f32_div_pre(union_float32 a, union_float32 b)
1897 {
1898     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1899         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1900                fpclassify(b.h) == FP_NORMAL;
1901     }
1902     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1903 }
1904 
1905 static bool f64_div_pre(union_float64 a, union_float64 b)
1906 {
1907     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1908         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1909                fpclassify(b.h) == FP_NORMAL;
1910     }
1911     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1912 }
1913 
1914 static bool f32_div_post(union_float32 a, union_float32 b)
1915 {
1916     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1917         return fpclassify(a.h) != FP_ZERO;
1918     }
1919     return !float32_is_zero(a.s);
1920 }
1921 
1922 static bool f64_div_post(union_float64 a, union_float64 b)
1923 {
1924     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1925         return fpclassify(a.h) != FP_ZERO;
1926     }
1927     return !float64_is_zero(a.s);
1928 }
1929 
/*
 * Public division entry points: dispatch between the native-arithmetic
 * fast path and the softfloat slow path, guarded by the pre/post
 * predicates above.
 */
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}
1943 
1944 /*
1945  * Returns the result of dividing the bfloat16
1946  * value `a' by the corresponding value `b'.
1947  */
1948 
1949 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1950 {
1951     FloatParts64 pa, pb, pr;
1952 
1953     bfloat16_unpack_canonical(&pa, a, status);
1954     bfloat16_unpack_canonical(&pb, b, status);
1955     pr = div_floats(pa, pb, status);
1956 
1957     return bfloat16_round_pack_canonical(&pr, status);
1958 }
1959 
1960 /*
1961  * Float to Float conversions
1962  *
1963  * Returns the result of converting one float format to another. The
1964  * conversion is performed according to the IEC/IEEE Standard for
1965  * Binary Floating-Point Arithmetic.
1966  *
1967  * The float_to_float helper only needs to take care of raising
1968  * invalid exceptions and handling the conversion on NaNs.
1969  */
1970 
1971 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
1972                                  float_status *s)
1973 {
1974     if (dstf->arm_althp) {
1975         switch (a.cls) {
1976         case float_class_qnan:
1977         case float_class_snan:
1978             /* There is no NaN in the destination format.  Raise Invalid
1979              * and return a zero with the sign of the input NaN.
1980              */
1981             float_raise(float_flag_invalid, s);
1982             a.cls = float_class_zero;
1983             a.frac = 0;
1984             a.exp = 0;
1985             break;
1986 
1987         case float_class_inf:
1988             /* There is no Inf in the destination format.  Raise Invalid
1989              * and return the maximum normal with the correct sign.
1990              */
1991             float_raise(float_flag_invalid, s);
1992             a.cls = float_class_normal;
1993             a.exp = dstf->exp_max;
1994             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1995             break;
1996 
1997         default:
1998             break;
1999         }
2000     } else if (is_nan(a.cls)) {
2001         parts_return_nan(&a, s);
2002     }
2003     return a;
2004 }
2005 
2006 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
2007 {
2008     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2009     FloatParts64 pa, pr;
2010 
2011     float16a_unpack_canonical(&pa, a, s, fmt16);
2012     pr = float_to_float(pa, &float32_params, s);
2013     return float32_round_pack_canonical(&pr, s);
2014 }
2015 
2016 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2017 {
2018     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2019     FloatParts64 pa, pr;
2020 
2021     float16a_unpack_canonical(&pa, a, s, fmt16);
2022     pr = float_to_float(pa, &float64_params, s);
2023     return float64_round_pack_canonical(&pr, s);
2024 }
2025 
2026 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2027 {
2028     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2029     FloatParts64 pa, pr;
2030 
2031     float32_unpack_canonical(&pa, a, s);
2032     pr = float_to_float(pa, fmt16, s);
2033     return float16a_round_pack_canonical(&pr, s, fmt16);
2034 }
2035 
2036 static float64 QEMU_SOFTFLOAT_ATTR
2037 soft_float32_to_float64(float32 a, float_status *s)
2038 {
2039     FloatParts64 pa, pr;
2040 
2041     float32_unpack_canonical(&pa, a, s);
2042     pr = float_to_float(pa, &float64_params, s);
2043     return float64_round_pack_canonical(&pr, s);
2044 }
2045 
float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results.  */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;    /* host float -> double conversion */
        return ud.s;
    } else if (float32_is_zero(a)) {
        /* Preserve the sign of zero. */
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* Remaining classes (Inf, NaN, subnormal) take the slow path. */
        return soft_float32_to_float64(a, s);
    }
}
2061 
2062 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2063 {
2064     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2065     FloatParts64 pa, pr;
2066 
2067     float64_unpack_canonical(&pa, a, s);
2068     pr = float_to_float(pa, fmt16, s);
2069     return float16a_round_pack_canonical(&pr, s, fmt16);
2070 }
2071 
2072 float32 float64_to_float32(float64 a, float_status *s)
2073 {
2074     FloatParts64 pa, pr;
2075 
2076     float64_unpack_canonical(&pa, a, s);
2077     pr = float_to_float(pa, &float32_params, s);
2078     return float32_round_pack_canonical(&pr, s);
2079 }
2080 
2081 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2082 {
2083     FloatParts64 pa, pr;
2084 
2085     bfloat16_unpack_canonical(&pa, a, s);
2086     pr = float_to_float(pa, &float32_params, s);
2087     return float32_round_pack_canonical(&pr, s);
2088 }
2089 
2090 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2091 {
2092     FloatParts64 pa, pr;
2093 
2094     bfloat16_unpack_canonical(&pa, a, s);
2095     pr = float_to_float(pa, &float64_params, s);
2096     return float64_round_pack_canonical(&pr, s);
2097 }
2098 
2099 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2100 {
2101     FloatParts64 pa, pr;
2102 
2103     float32_unpack_canonical(&pa, a, s);
2104     pr = float_to_float(pa, &bfloat16_params, s);
2105     return bfloat16_round_pack_canonical(&pr, s);
2106 }
2107 
2108 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2109 {
2110     FloatParts64 pa, pr;
2111 
2112     float64_unpack_canonical(&pa, a, s);
2113     pr = float_to_float(pa, &bfloat16_params, s);
2114     return bfloat16_round_pack_canonical(&pr, s);
2115 }
2116 
2117 /*
2118  * Rounds the floating-point value `a' to an integer, and returns the
2119  * result as a floating-point value. The operation is performed
2120  * according to the IEC/IEEE Standard for Binary Floating-Point
2121  * Arithmetic.
2122  */
2123 
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                               int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp scale so that a.exp cannot overflow below;
         * the clamped values are already far outside the integral range. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional: |a| < 1, so the result is 0 or +/-1 */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* round to 1 only for |a| strictly greater than 0.5 */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* |a| == 0.5 also rounds away to 1 */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* Mixed integer/fraction: frac_lsb is the lowest integral bit
             * of the fraction, rnd_mask covers the fractional bits. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* do not increment when exactly halfway to an even value */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* increment only when the integral lsb is currently even */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* carry out of the fraction: renormalize */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                /* drop the fractional bits */
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2225 
2226 float16 float16_round_to_int(float16 a, float_status *s)
2227 {
2228     FloatParts64 pa, pr;
2229 
2230     float16_unpack_canonical(&pa, a, s);
2231     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2232     return float16_round_pack_canonical(&pr, s);
2233 }
2234 
2235 float32 float32_round_to_int(float32 a, float_status *s)
2236 {
2237     FloatParts64 pa, pr;
2238 
2239     float32_unpack_canonical(&pa, a, s);
2240     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2241     return float32_round_pack_canonical(&pr, s);
2242 }
2243 
2244 float64 float64_round_to_int(float64 a, float_status *s)
2245 {
2246     FloatParts64 pa, pr;
2247 
2248     float64_unpack_canonical(&pa, a, s);
2249     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2250     return float64_round_pack_canonical(&pr, s);
2251 }
2252 
2253 /*
2254  * Rounds the bfloat16 value `a' to an integer, and returns the
2255  * result as a bfloat16 value.
2256  */
2257 
2258 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2259 {
2260     FloatParts64 pa, pr;
2261 
2262     bfloat16_unpack_canonical(&pa, a, s);
2263     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2264     return bfloat16_round_pack_canonical(&pr, s);
2265 }
2266 
2267 /*
2268  * Returns the result of converting the floating-point value `a' to
2269  * the two's complement integer format. The conversion is performed
2270  * according to the IEC/IEEE Standard for Binary Floating-Point
2271  * Arithmetic---which means in particular that the conversion is
2272  * rounded according to the current rounding mode. If `a' is a NaN,
2273  * the largest positive integer is returned. Otherwise, if the
2274  * conversion overflows, the largest integer with the same sign as `a'
2275  * is returned.
2276 */
2277 
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* Saved so flags raised by rounding (e.g. inexact) can be discarded
     * when the conversion turns out to be invalid. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN converts to max, with invalid raised. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude too large for uint64: force saturation below. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* Negating 'min' as uint64 is well-defined even for INT64_MIN,
             * where -min would overflow as a signed value. */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}
2321 
/*
 * Scaled float-to-signed-integer conversions: the input is effectively
 * multiplied by 2**scale before rounding (round_to_int adds 'scale' to
 * the exponent), and out-of-range results saturate at the destination
 * limits with 'invalid' raised (round_to_int_and_pack).
 */
int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}
2411 
/* Unscaled conversions using the rounding mode from float_status. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
2461 
/* Truncating (round-to-zero) conversions, independent of the status mode. */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2506 
2507 /*
2508  * Returns the result of converting the floating-point value `a' to
2509  * the two's complement integer format.
2510  */
2511 
/* Scaled bfloat16-to-signed conversions; see the float variants above. */
int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}
2538 
/* Unscaled bfloat16 conversions: status rounding mode, then truncating. */
int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2568 
2569 /*
2570  *  Returns the result of converting the floating-point value `a' to
2571  *  the unsigned integer format. The conversion is performed according
2572  *  to the IEC/IEEE Standard for Binary Floating-Point
2573  *  Arithmetic---which means in particular that the conversion is
2574  *  rounded according to the current rounding mode. If `a' is a NaN,
2575  *  the largest unsigned integer is returned. Otherwise, if the
2576  *  conversion overflows, the largest unsigned integer is returned. If
2577  *  the 'a' is negative, the result is rounded and zero is returned;
2578  *  values that do not round to zero will raise the inexact exception
2579  *  flag.
2580  */
2581 
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Saved so flags raised by rounding (e.g. inexact) can be discarded
     * when the conversion turns out to be invalid. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN converts to max, with invalid raised. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        /* Negative values that rounded to zero were handled above;
         * a still-negative value cannot be represented: invalid, 0. */
        if (p.sign) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2626 
/*
 * Scaled float-to-unsigned conversions: the input is effectively
 * multiplied by 2**scale before rounding, and out-of-range results
 * saturate at [0, max] with 'invalid' raised (round_to_uint_and_pack).
 */
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}
2716 
/* Unscaled unsigned conversions using the status rounding mode. */
uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}
2766 
/*
 * Float -> unsigned conversions that truncate toward zero regardless
 * of the rounding mode in the float_status.
 */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2811 
2812 /*
2813  *  Returns the result of converting the bfloat16 value `a' to
2814  *  the unsigned integer format.
2815  */
2816 
/* bfloat16 -> unsigned: scale by 2**scale, round with `rmode', and
 * saturate at the bound passed to round_to_uint_and_pack. */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}
2843 
/*
 * bfloat16 -> unsigned wrappers: the plain variants use the rounding
 * mode from the float_status, the _round_to_zero variants truncate.
 */

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2873 
2874 /*
2875  * Integer to float conversions
2876  *
2877  * Returns the result of converting the two's complement integer `a'
2878  * to the floating-point format. The conversion is performed according
2879  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2880  */
2881 
2882 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2883 {
2884     FloatParts64 r = { .sign = false };
2885 
2886     if (a == 0) {
2887         r.cls = float_class_zero;
2888     } else {
2889         uint64_t f = a;
2890         int shift;
2891 
2892         r.cls = float_class_normal;
2893         if (a < 0) {
2894             f = -f;
2895             r.sign = true;
2896         }
2897         shift = clz64(f);
2898         scale = MIN(MAX(scale, -0x10000), 0x10000);
2899 
2900         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2901         r.frac = f << shift;
2902     }
2903 
2904     return r;
2905 }
2906 
/*
 * Signed integer -> float16.  The narrower-typed variants widen
 * losslessly to int64_t and share the one real implementation.
 */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}
2942 
/* Signed integer -> float32; same widening pattern as the float16 set. */

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}
2973 
/* Signed integer -> float64; same widening pattern as the float16 set. */

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}
3004 
3005 /*
3006  * Returns the result of converting the two's complement integer `a'
3007  * to the bfloat16 format.
3008  */
3009 
/* Signed integer -> bfloat16; same widening pattern as the float16 set. */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
3040 
3041 /*
3042  * Unsigned Integer to float conversions
3043  *
3044  * Returns the result of converting the unsigned integer `a' to the
3045  * floating-point format. The conversion is performed according to the
3046  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3047  */
3048 
3049 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3050 {
3051     FloatParts64 r = { .sign = false };
3052     int shift;
3053 
3054     if (a == 0) {
3055         r.cls = float_class_zero;
3056     } else {
3057         scale = MIN(MAX(scale, -0x10000), 0x10000);
3058         shift = clz64(a);
3059         r.cls = float_class_normal;
3060         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3061         r.frac = a << shift;
3062     }
3063 
3064     return r;
3065 }
3066 
/*
 * Unsigned integer -> float16.  Narrower-typed variants widen
 * losslessly to uint64_t and share the one real implementation.
 */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}
3102 
/* Unsigned integer -> float32; same widening pattern as the float16 set. */

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}
3133 
/* Unsigned integer -> float64; same widening pattern as the float16 set. */

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}
3164 
3165 /*
3166  * Returns the result of converting the unsigned integer `a' to the
3167  * bfloat16 format.
3168  */
3169 
/* Unsigned integer -> bfloat16; same widening pattern as the float16 set. */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3200 
3201 /* Float Min/Max */
3202 /* min() and max() functions. These can't be implemented as
3203  * 'compare and pick one input' because that would mishandle
3204  * NaNs and +0 vs -0.
3205  *
3206  * minnum() and maxnum() functions. These are similar to the min()
3207  * and max() functions but if one of the arguments is a QNaN and
3208  * the other is numerical then the numerical argument is returned.
3209  * SNaNs will get quietened before being returned.
3210  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3211  * and maxNum() operations. min() and max() are the typical min/max
3212  * semantics provided by many CPUs which predate that specification.
3213  *
3214  * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3216  */
/*
 * Core min/max on decomposed values.
 *   ismin: return the smaller operand rather than the larger.
 *   ieee:  IEEE 754-2008 minNum/maxNum semantics - a quiet NaN paired
 *          with a number yields the number.
 *   ismag: compare by magnitude first (minNumMag/maxNumMag); equal
 *          magnitudes fall through to the signed comparison below.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Both NaN, or non-ieee semantics: propagate a NaN. */
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /* Map each class onto a comparable exponent key: infinities
         * sort above every normal, zeros below.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude compare, ignoring signs, when magnitudes differ. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            /* a.sign flips the ordering for negative operands. */
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Differing signs: the negative operand is the smaller
             * one, which also handles -0 vs +0.
             */
            return a.sign ^ ismin ? b : a;
        }
    }
}
3287 
3288 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3289 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3290                                      float_status *s)                   \
3291 {                                                                       \
3292     FloatParts64 pa, pb, pr;                                            \
3293     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3294     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3295     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3296     return float ## sz ## _round_pack_canonical(&pr, s);                \
3297 }
3298 
3299 MINMAX(16, min, true, false, false)
3300 MINMAX(16, minnum, true, true, false)
3301 MINMAX(16, minnummag, true, true, true)
3302 MINMAX(16, max, false, false, false)
3303 MINMAX(16, maxnum, false, true, false)
3304 MINMAX(16, maxnummag, false, true, true)
3305 
3306 MINMAX(32, min, true, false, false)
3307 MINMAX(32, minnum, true, true, false)
3308 MINMAX(32, minnummag, true, true, true)
3309 MINMAX(32, max, false, false, false)
3310 MINMAX(32, maxnum, false, true, false)
3311 MINMAX(32, maxnummag, false, true, true)
3312 
3313 MINMAX(64, min, true, false, false)
3314 MINMAX(64, minnum, true, true, false)
3315 MINMAX(64, minnummag, true, true, true)
3316 MINMAX(64, max, false, false, false)
3317 MINMAX(64, maxnum, false, true, false)
3318 MINMAX(64, maxnummag, false, true, true)
3319 
3320 #undef MINMAX
3321 
3322 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3323 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3324 {                                                                       \
3325     FloatParts64 pa, pb, pr;                                            \
3326     bfloat16_unpack_canonical(&pa, a, s);                               \
3327     bfloat16_unpack_canonical(&pb, b, s);                               \
3328     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3329     return bfloat16_round_pack_canonical(&pr, s);                       \
3330 }
3331 
3332 BF16_MINMAX(min, true, false, false)
3333 BF16_MINMAX(minnum, true, true, false)
3334 BF16_MINMAX(minnummag, true, true, true)
3335 BF16_MINMAX(max, false, false, false)
3336 BF16_MINMAX(maxnum, false, true, false)
3337 BF16_MINMAX(maxnummag, false, true, true)
3338 
3339 #undef BF16_MINMAX
3340 
3341 /* Floating point compare */
/*
 * Core comparison on decomposed values.  Returns the FloatRelation
 * between `a' and `b'.  Any NaN operand makes the result unordered;
 * the invalid flag is raised unless this is a quiet compare and both
 * NaNs (if any) are quiet.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            /* +0 and -0 compare equal: the signs are ignored here. */
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both normal, differing signs: the negative one is smaller. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Both normal with the same sign: compare exponent then fraction,
     * with the sense reversed for negative values.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
3398 
/*
 * Instantiate the softfloat comparison helpers: unpack both operands
 * of the given width and defer to compare_floats.  `attr' selects the
 * inlining attribute for the generated function.
 */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3414 
/* float16 comparison: signaling variant raises invalid on any NaN,
 * the quiet variant only on signaling NaNs. */

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}
3424 
/*
 * float32 comparison with a hardfloat fast path.  The C99 quiet
 * comparison macros (isgreaterequal/isgreater/isless) do not signal,
 * so any ordered result can be returned directly; the unordered case
 * falls back to softfloat so the right status flags get set.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3453 
/* float32 comparison: signaling variant raises invalid on any NaN,
 * the quiet variant only on signaling NaNs. */

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}
3463 
/*
 * float64 comparison with a hardfloat fast path; mirrors f32_compare.
 * Ordered results come from the non-signaling C99 comparison macros,
 * unordered falls back to softfloat for correct flag handling.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3492 
/* float64 comparison: signaling variant raises invalid on any NaN,
 * the quiet variant only on signaling NaNs. */

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}
3502 
3503 static FloatRelation QEMU_FLATTEN
3504 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3505 {
3506     FloatParts64 pa, pb;
3507 
3508     bfloat16_unpack_canonical(&pa, a, s);
3509     bfloat16_unpack_canonical(&pb, b, s);
3510     return compare_floats(pa, pb, is_quiet, s);
3511 }
3512 
/* bfloat16 comparison: signaling variant raises invalid on any NaN,
 * the quiet variant only on signaling NaNs. */

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}
3522 
3523 /* Multiply A by 2 raised to the power N.  */
3524 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3525 {
3526     if (unlikely(is_nan(a.cls))) {
3527         parts_return_nan(&a, s);
3528     }
3529     if (a.cls == float_class_normal) {
3530         /* The largest float type (even though not supported by FloatParts64)
3531          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3532          * still allows rounding to infinity, without allowing overflow
3533          * within the int32_t that backs FloatParts64.exp.
3534          */
3535         n = MIN(MAX(n, -0x10000), 0x10000);
3536         a.exp += n;
3537     }
3538     return a;
3539 }
3540 
/*
 * Per-format scalbn entry points: unpack, adjust the exponent via
 * scalbn_decomposed, and round-pack back to the format.
 */

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}
3576 
3577 /*
3578  * Square Root
3579  *
3580  * The old softfloat code did an approximation step before zeroing in
3581  * on the final result. However for simpleness we just compute the
3582  * square root by iterating down from the implicit bit to enough extra
3583  * bits to ensure we get a correctly rounded result.
3584  *
3585  * This does mean however the calculation is slower than before,
3586  * especially for 64 bit floats.
3587  */
3588 
/*
 * Bit-by-bit square root on a decomposed value.  `p' supplies the
 * destination format's frac_shift, so only enough result bits are
 * computed for correct rounding in that format.  Special cases
 * (NaN, +-0, negative, +inf) are resolved before the main loop.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative number is invalid; produce default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top. Adding room for that is a
     * right shift. If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift. Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3648 
/*
 * Softfloat square-root entry points: unpack, run the bit-by-bit
 * sqrt with the format's parameters, and round-pack the result.
 */

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}
3677 
/*
 * float32 square root with a hardfloat fast path: use the host
 * sqrtf() for non-negative zeros/normals when the status permits
 * (can_use_fpu), otherwise fall back to the softfloat path.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Classify via the host FPU representation... */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        /* ...or via the softfloat bit pattern. */
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
3704 
/*
 * float64 square root with a hardfloat fast path; mirrors
 * float32_sqrt but uses the host double-precision sqrt().
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Classify via the host FPU representation... */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        /* ...or via the softfloat bit pattern. */
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3731 
/* Square root of a bfloat16 value via the generic parts code. */
bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}
3740 
3741 /*----------------------------------------------------------------------------
3742 | The pattern for a default generated NaN.
3743 *----------------------------------------------------------------------------*/
3744 
float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    /*
     * Build the default NaN at canonical fraction position, then shift
     * the fraction down into float16's raw field before packing.
     */
    parts_default_nan(&p, status);
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}
3753 
float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    /*
     * Build the default NaN at canonical fraction position, then shift
     * the fraction down into float32's raw field before packing.
     */
    parts_default_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}
3762 
float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    /*
     * Build the default NaN at canonical fraction position, then shift
     * the fraction down into float64's raw field before packing.
     */
    parts_default_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}
3771 
float128 float128_default_nan(float_status *status)
{
    FloatParts128 p;

    /*
     * Same as the narrower variants, but the 128-bit fraction spans two
     * words, so use the frac_shr() helper rather than a plain shift.
     */
    parts_default_nan(&p, status);
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}
3780 
bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    /*
     * Build the default NaN at canonical fraction position, then shift
     * the fraction down into bfloat16's raw field before packing.
     */
    parts_default_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}
3789 
3790 /*----------------------------------------------------------------------------
3791 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3792 *----------------------------------------------------------------------------*/
3793 
float16 float16_silence_nan(float16 a, float_status *status)
{
    FloatParts64 p;

    /*
     * Align the raw fraction up to the canonical position expected by
     * parts_silence_nan(), quieten it, then shift back down and repack.
     */
    float16_unpack_raw(&p, a);
    p.frac <<= float16_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}
3804 
float32 float32_silence_nan(float32 a, float_status *status)
{
    FloatParts64 p;

    /*
     * Align the raw fraction up to the canonical position expected by
     * parts_silence_nan(), quieten it, then shift back down and repack.
     */
    float32_unpack_raw(&p, a);
    p.frac <<= float32_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}
3815 
float64 float64_silence_nan(float64 a, float_status *status)
{
    FloatParts64 p;

    /*
     * Align the raw fraction up to the canonical position expected by
     * parts_silence_nan(), quieten it, then shift back down and repack.
     */
    float64_unpack_raw(&p, a);
    p.frac <<= float64_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}
3826 
bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
{
    FloatParts64 p;

    /*
     * Align the raw fraction up to the canonical position expected by
     * parts_silence_nan(), quieten it, then shift back down and repack.
     */
    bfloat16_unpack_raw(&p, a);
    p.frac <<= bfloat16_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}
3837 
float128 float128_silence_nan(float128 a, float_status *status)
{
    FloatParts128 p;

    /*
     * As the narrower variants, but the 128-bit fraction spans two
     * words, so use the frac_shl()/frac_shr() helpers for alignment.
     */
    float128_unpack_raw(&p, a);
    frac_shl(&p, float128_params.frac_shift);
    parts_silence_nan(&p, status);
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}
3848 
3849 /*----------------------------------------------------------------------------
3850 | If `a' is denormal and we are in flush-to-zero mode then set the
3851 | input-denormal exception and return zero. Otherwise just return the value.
3852 *----------------------------------------------------------------------------*/
3853 
/*
 * Return true (and raise input_denormal) if the raw-unpacked parts `p'
 * describe a denormal; the caller then substitutes a signed zero.
 */
static bool parts_squash_denormal(FloatParts64 p, float_status *status)
{
    /* Raw encoding of a denormal: biased exponent 0, non-zero fraction. */
    if (p.exp == 0 && p.frac != 0) {
        float_raise(float_flag_input_denormal, status);
        return true;
    }

    return false;
}
3863 
float16 float16_squash_input_denormal(float16 a, float_status *status)
{
    /* Only active in flush-inputs-to-zero mode; otherwise `a' passes through. */
    if (status->flush_inputs_to_zero) {
        FloatParts64 p;

        float16_unpack_raw(&p, a);
        if (parts_squash_denormal(p, status)) {
            /* Replace the denormal with a zero of the same sign. */
            return float16_set_sign(float16_zero, p.sign);
        }
    }
    return a;
}
3876 
float32 float32_squash_input_denormal(float32 a, float_status *status)
{
    /* Only active in flush-inputs-to-zero mode; otherwise `a' passes through. */
    if (status->flush_inputs_to_zero) {
        FloatParts64 p;

        float32_unpack_raw(&p, a);
        if (parts_squash_denormal(p, status)) {
            /* Replace the denormal with a zero of the same sign. */
            return float32_set_sign(float32_zero, p.sign);
        }
    }
    return a;
}
3889 
float64 float64_squash_input_denormal(float64 a, float_status *status)
{
    /* Only active in flush-inputs-to-zero mode; otherwise `a' passes through. */
    if (status->flush_inputs_to_zero) {
        FloatParts64 p;

        float64_unpack_raw(&p, a);
        if (parts_squash_denormal(p, status)) {
            /* Replace the denormal with a zero of the same sign. */
            return float64_set_sign(float64_zero, p.sign);
        }
    }
    return a;
}
3902 
bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
{
    /* Only active in flush-inputs-to-zero mode; otherwise `a' passes through. */
    if (status->flush_inputs_to_zero) {
        FloatParts64 p;

        bfloat16_unpack_raw(&p, a);
        if (parts_squash_denormal(p, status)) {
            /* Replace the denormal with a zero of the same sign. */
            return bfloat16_set_sign(bfloat16_zero, p.sign);
        }
    }
    return a;
}
3915 
3916 /*----------------------------------------------------------------------------
3917 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3918 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3919 | input.  If `zSign' is 1, the input is negated before being converted to an
3920 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3921 | is simply rounded to an integer, with the inexact exception raised if the
3922 | input cannot be represented exactly as an integer.  However, if the fixed-
3923 | point input is too large, the invalid exception is raised and the largest
3924 | positive or negative integer is returned.
3925 *----------------------------------------------------------------------------*/
3926 
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * The 7 bits below the binary point are the rounding bits; choose
     * the increment that, once added, makes the truncating shift below
     * implement the selected rounding mode.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;  /* half of the 0x80 unit in the last place */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away from zero only if the integer bit would stay even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the low bit to land on even. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /*
     * Overflow if the magnitude no longer fits in 32 bits, or if the
     * negation changed the sign unexpectedly (e.g. the INT32_MIN edge).
     */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3974 
3975 /*----------------------------------------------------------------------------
3976 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3977 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3978 | and returns the properly rounded 64-bit integer corresponding to the input.
3979 | If `zSign' is 1, the input is negated before being converted to an integer.
3980 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3981 | the inexact exception raised if the input cannot be represented exactly as
3982 | an integer.  However, if the fixed-point input is too large, the invalid
3983 | exception is raised and the largest positive or negative integer is
3984 | returned.
3985 *----------------------------------------------------------------------------*/
3986 
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                               float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * absZ1 holds everything below the binary point; decide whether the
     * integer part absZ0 must be bumped for the selected rounding mode.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of the fraction set means the fraction is >= 1/2. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        /* Wrap to zero means the magnitude exceeded 64 bits. */
        if ( absZ0 == 0 ) goto overflow;
        /* Exact tie under nearest-even: force the result even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Sign flipped unexpectedly: the value does not fit in int64_t. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
4036 
4037 /*----------------------------------------------------------------------------
4038 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4039 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4040 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4041 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
4042 | with the inexact exception raised if the input cannot be represented exactly
4043 | as an integer.  However, if the fixed-point input is too large, the invalid
4044 | exception is raised and the largest unsigned integer is returned.
4045 *----------------------------------------------------------------------------*/
4046 
/*
 * NOTE(review): the declared return type is int64_t although the value
 * produced is the 64-bit unsigned result absZ0; callers appear to
 * reinterpret the bits — confirm before changing the signature.
 */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /*
     * absZ1 holds everything below the binary point; decide whether the
     * integer part absZ0 must be bumped for the selected rounding mode.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of the fraction set means the fraction is >= 1/2. */
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Wrap to zero: the rounded magnitude exceeds UINT64_MAX. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exact tie under nearest-even: force the result even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* Any non-zero negative value cannot be represented as unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
4096 
4097 /*----------------------------------------------------------------------------
4098 | Normalizes the subnormal single-precision floating-point value represented
4099 | by the denormalized significand `aSig'.  The normalized exponent and
4100 | significand are stored at the locations pointed to by `zExpPtr' and
4101 | `zSigPtr', respectively.
4102 *----------------------------------------------------------------------------*/
4103 
4104 static void
4105  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
4106 {
4107     int8_t shiftCount;
4108 
4109     shiftCount = clz32(aSig) - 8;
4110     *zSigPtr = aSig<<shiftCount;
4111     *zExpPtr = 1 - shiftCount;
4112 
4113 }
4114 
4115 /*----------------------------------------------------------------------------
4116 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4117 | and significand `zSig', and returns the proper single-precision floating-
4118 | point value corresponding to the abstract input.  Ordinarily, the abstract
4119 | value is simply rounded and packed into the single-precision format, with
4120 | the inexact exception raised if the abstract input cannot be represented
4121 | exactly.  However, if the abstract value is too large, the overflow and
4122 | inexact exceptions are raised and an infinity or maximal finite value is
4123 | returned.  If the abstract value is too small, the input value is rounded to
4124 | a subnormal number, and the underflow and inexact exceptions are raised if
4125 | the abstract input cannot be represented exactly as a subnormal single-
4126 | precision floating-point number.
4127 |     The input significand `zSig' has its binary point between bits 30
4128 | and 29, which is 7 bits to the left of the usual location.  This shifted
4129 | significand must be normalized or smaller.  If `zSig' is not normalized,
4130 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4131 | and it must not require rounding.  In the usual case that `zSig' is
4132 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4133 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4134 | Binary Floating-Point Arithmetic.
4135 *----------------------------------------------------------------------------*/
4136 
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig carries 7 extra bits below the final significand; pick the
     * increment that turns the truncating >>7 below into the selected
     * rounding mode.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;  /* half of the 0x80 unit in the last place */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away from zero only if the last bit would stay even. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* Exponent outside [0, 0xFC]: possible overflow or underflow. */
    if ( 0xFD <= (uint16_t) zExp ) {
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * Overflow: produce infinity, except that round-to-odd and
             * the modes that never round away from zero yield the
             * largest finite value instead.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess: detected either before or after rounding. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the low bit to land on even. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4213 
4214 /*----------------------------------------------------------------------------
4215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4216 | and significand `zSig', and returns the proper single-precision floating-
4217 | point value corresponding to the abstract input.  This routine is just like
4218 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4219 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4220 | floating-point exponent.
4221 *----------------------------------------------------------------------------*/
4222 
4223 static float32
4224  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4225                               float_status *status)
4226 {
4227     int8_t shiftCount;
4228 
4229     shiftCount = clz32(zSig) - 1;
4230     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4231                                status);
4232 
4233 }
4234 
4235 /*----------------------------------------------------------------------------
4236 | Normalizes the subnormal double-precision floating-point value represented
4237 | by the denormalized significand `aSig'.  The normalized exponent and
4238 | significand are stored at the locations pointed to by `zExpPtr' and
4239 | `zSigPtr', respectively.
4240 *----------------------------------------------------------------------------*/
4241 
4242 static void
4243  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4244 {
4245     int8_t shiftCount;
4246 
4247     shiftCount = clz64(aSig) - 11;
4248     *zSigPtr = aSig<<shiftCount;
4249     *zExpPtr = 1 - shiftCount;
4250 
4251 }
4252 
4253 /*----------------------------------------------------------------------------
4254 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4255 | double-precision floating-point value, returning the result.  After being
4256 | shifted into the proper positions, the three fields are simply added
4257 | together to form the result.  This means that any integer portion of `zSig'
4258 | will be added into the exponent.  Since a properly normalized significand
4259 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4260 | than the desired result exponent whenever `zSig' is a complete, normalized
4261 | significand.
4262 *----------------------------------------------------------------------------*/
4263 
4264 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4265 {
4266 
4267     return make_float64(
4268         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4269 
4270 }
4271 
4272 /*----------------------------------------------------------------------------
4273 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4274 | and significand `zSig', and returns the proper double-precision floating-
4275 | point value corresponding to the abstract input.  Ordinarily, the abstract
4276 | value is simply rounded and packed into the double-precision format, with
4277 | the inexact exception raised if the abstract input cannot be represented
4278 | exactly.  However, if the abstract value is too large, the overflow and
4279 | inexact exceptions are raised and an infinity or maximal finite value is
4280 | returned.  If the abstract value is too small, the input value is rounded to
4281 | a subnormal number, and the underflow and inexact exceptions are raised if
4282 | the abstract input cannot be represented exactly as a subnormal double-
4283 | precision floating-point number.
4284 |     The input significand `zSig' has its binary point between bits 62
4285 | and 61, which is 10 bits to the left of the usual location.  This shifted
4286 | significand must be normalized or smaller.  If `zSig' is not normalized,
4287 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4288 | and it must not require rounding.  In the usual case that `zSig' is
4289 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4290 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4291 | Binary Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4293 
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig carries 10 extra bits below the final significand; pick the
     * increment that turns the truncating >>10 below into the selected
     * rounding mode.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;  /* half of the 0x400 unit in the last place */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round away from zero only if the last bit would stay even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* Exponent outside [0, 0x7FC]: possible overflow or underflow. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * Overflow: produce infinity, except that round-to-odd and
             * the modes that never round away from zero yield the
             * largest finite value instead.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess: detected either before or after rounding. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie under nearest-even: clear the low bit to land on even. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
4369 
4370 /*----------------------------------------------------------------------------
4371 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4372 | and significand `zSig', and returns the proper double-precision floating-
4373 | point value corresponding to the abstract input.  This routine is just like
4374 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4375 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4376 | floating-point exponent.
4377 *----------------------------------------------------------------------------*/
4378 
4379 static float64
4380  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4381                               float_status *status)
4382 {
4383     int8_t shiftCount;
4384 
4385     shiftCount = clz64(zSig) - 1;
4386     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4387                                status);
4388 
4389 }
4390 
4391 /*----------------------------------------------------------------------------
4392 | Normalizes the subnormal extended double-precision floating-point value
4393 | represented by the denormalized significand `aSig'.  The normalized exponent
4394 | and significand are stored at the locations pointed to by `zExpPtr' and
4395 | `zSigPtr', respectively.
4396 *----------------------------------------------------------------------------*/
4397 
4398 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
4399                                 uint64_t *zSigPtr)
4400 {
4401     int8_t shiftCount;
4402 
4403     shiftCount = clz64(aSig);
4404     *zSigPtr = aSig<<shiftCount;
4405     *zExpPtr = 1 - shiftCount;
4406 }
4407 
4408 /*----------------------------------------------------------------------------
4409 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4410 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4411 | and returns the proper extended double-precision floating-point value
4412 | corresponding to the abstract input.  Ordinarily, the abstract value is
4413 | rounded and packed into the extended double-precision format, with the
4414 | inexact exception raised if the abstract input cannot be represented
4415 | exactly.  However, if the abstract value is too large, the overflow and
4416 | inexact exceptions are raised and an infinity or maximal finite value is
4417 | returned.  If the abstract value is too small, the input value is rounded to
4418 | a subnormal number, and the underflow and inexact exceptions are raised if
4419 | the abstract input cannot be represented exactly as a subnormal extended
4420 | double-precision floating-point number.
4421 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4422 | number of bits as single or double precision, respectively.  Otherwise, the
4423 | result is rounded to the full precision of the extended double-precision
4424 | format.
4425 |     The input significand must be normalized or smaller.  If the input
4426 | significand is not normalized, `zExp' must be 0; in that case, the result
4427 | returned is a subnormal number, and it must not require rounding.  The
4428 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4429 | Floating-Point Arithmetic.
4430 *----------------------------------------------------------------------------*/
4431 
4432 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4433                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4434                               float_status *status)
4435 {
4436     int8_t roundingMode;
4437     bool roundNearestEven, increment, isTiny;
4438     int64_t roundIncrement, roundMask, roundBits;
4439 
4440     roundingMode = status->float_rounding_mode;
4441     roundNearestEven = ( roundingMode == float_round_nearest_even );
4442     if ( roundingPrecision == 80 ) goto precision80;
4443     if ( roundingPrecision == 64 ) {
4444         roundIncrement = UINT64_C(0x0000000000000400);
4445         roundMask = UINT64_C(0x00000000000007FF);
4446     }
4447     else if ( roundingPrecision == 32 ) {
4448         roundIncrement = UINT64_C(0x0000008000000000);
4449         roundMask = UINT64_C(0x000000FFFFFFFFFF);
4450     }
4451     else {
4452         goto precision80;
4453     }
4454     zSig0 |= ( zSig1 != 0 );
4455     switch (roundingMode) {
4456     case float_round_nearest_even:
4457     case float_round_ties_away:
4458         break;
4459     case float_round_to_zero:
4460         roundIncrement = 0;
4461         break;
4462     case float_round_up:
4463         roundIncrement = zSign ? 0 : roundMask;
4464         break;
4465     case float_round_down:
4466         roundIncrement = zSign ? roundMask : 0;
4467         break;
4468     default:
4469         abort();
4470     }
4471     roundBits = zSig0 & roundMask;
4472     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4473         if (    ( 0x7FFE < zExp )
4474              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4475            ) {
4476             goto overflow;
4477         }
4478         if ( zExp <= 0 ) {
4479             if (status->flush_to_zero) {
4480                 float_raise(float_flag_output_denormal, status);
4481                 return packFloatx80(zSign, 0, 0);
4482             }
4483             isTiny = status->tininess_before_rounding
4484                   || (zExp < 0 )
4485                   || (zSig0 <= zSig0 + roundIncrement);
4486             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4487             zExp = 0;
4488             roundBits = zSig0 & roundMask;
4489             if (isTiny && roundBits) {
4490                 float_raise(float_flag_underflow, status);
4491             }
4492             if (roundBits) {
4493                 float_raise(float_flag_inexact, status);
4494             }
4495             zSig0 += roundIncrement;
4496             if ( (int64_t) zSig0 < 0 ) zExp = 1;
4497             roundIncrement = roundMask + 1;
4498             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4499                 roundMask |= roundIncrement;
4500             }
4501             zSig0 &= ~ roundMask;
4502             return packFloatx80( zSign, zExp, zSig0 );
4503         }
4504     }
4505     if (roundBits) {
4506         float_raise(float_flag_inexact, status);
4507     }
4508     zSig0 += roundIncrement;
4509     if ( zSig0 < roundIncrement ) {
4510         ++zExp;
4511         zSig0 = UINT64_C(0x8000000000000000);
4512     }
4513     roundIncrement = roundMask + 1;
4514     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4515         roundMask |= roundIncrement;
4516     }
4517     zSig0 &= ~ roundMask;
4518     if ( zSig0 == 0 ) zExp = 0;
4519     return packFloatx80( zSign, zExp, zSig0 );
4520  precision80:
4521     switch (roundingMode) {
4522     case float_round_nearest_even:
4523     case float_round_ties_away:
4524         increment = ((int64_t)zSig1 < 0);
4525         break;
4526     case float_round_to_zero:
4527         increment = 0;
4528         break;
4529     case float_round_up:
4530         increment = !zSign && zSig1;
4531         break;
4532     case float_round_down:
4533         increment = zSign && zSig1;
4534         break;
4535     default:
4536         abort();
4537     }
4538     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4539         if (    ( 0x7FFE < zExp )
4540              || (    ( zExp == 0x7FFE )
4541                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4542                   && increment
4543                 )
4544            ) {
4545             roundMask = 0;
4546  overflow:
4547             float_raise(float_flag_overflow | float_flag_inexact, status);
4548             if (    ( roundingMode == float_round_to_zero )
4549                  || ( zSign && ( roundingMode == float_round_up ) )
4550                  || ( ! zSign && ( roundingMode == float_round_down ) )
4551                ) {
4552                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4553             }
4554             return packFloatx80(zSign,
4555                                 floatx80_infinity_high,
4556                                 floatx80_infinity_low);
4557         }
4558         if ( zExp <= 0 ) {
4559             isTiny = status->tininess_before_rounding
4560                   || (zExp < 0)
4561                   || !increment
4562                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4563             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4564             zExp = 0;
4565             if (isTiny && zSig1) {
4566                 float_raise(float_flag_underflow, status);
4567             }
4568             if (zSig1) {
4569                 float_raise(float_flag_inexact, status);
4570             }
4571             switch (roundingMode) {
4572             case float_round_nearest_even:
4573             case float_round_ties_away:
4574                 increment = ((int64_t)zSig1 < 0);
4575                 break;
4576             case float_round_to_zero:
4577                 increment = 0;
4578                 break;
4579             case float_round_up:
4580                 increment = !zSign && zSig1;
4581                 break;
4582             case float_round_down:
4583                 increment = zSign && zSig1;
4584                 break;
4585             default:
4586                 abort();
4587             }
4588             if ( increment ) {
4589                 ++zSig0;
4590                 if (!(zSig1 << 1) && roundNearestEven) {
4591                     zSig0 &= ~1;
4592                 }
4593                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4594             }
4595             return packFloatx80( zSign, zExp, zSig0 );
4596         }
4597     }
4598     if (zSig1) {
4599         float_raise(float_flag_inexact, status);
4600     }
4601     if ( increment ) {
4602         ++zSig0;
4603         if ( zSig0 == 0 ) {
4604             ++zExp;
4605             zSig0 = UINT64_C(0x8000000000000000);
4606         }
4607         else {
4608             if (!(zSig1 << 1) && roundNearestEven) {
4609                 zSig0 &= ~1;
4610             }
4611         }
4612     }
4613     else {
4614         if ( zSig0 == 0 ) zExp = 0;
4615     }
4616     return packFloatx80( zSign, zExp, zSig0 );
4617 
4618 }
4619 
4620 /*----------------------------------------------------------------------------
4621 | Takes an abstract floating-point value having sign `zSign', exponent
4622 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4623 | and returns the proper extended double-precision floating-point value
4624 | corresponding to the abstract input.  This routine is just like
4625 | `roundAndPackFloatx80' except that the input significand does not have to be
4626 | normalized.
4627 *----------------------------------------------------------------------------*/
4628 
4629 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4630                                        bool zSign, int32_t zExp,
4631                                        uint64_t zSig0, uint64_t zSig1,
4632                                        float_status *status)
4633 {
4634     int8_t shiftCount;
4635 
4636     if ( zSig0 == 0 ) {
4637         zSig0 = zSig1;
4638         zSig1 = 0;
4639         zExp -= 64;
4640     }
4641     shiftCount = clz64(zSig0);
4642     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4643     zExp -= shiftCount;
4644     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4645                                 zSig0, zSig1, status);
4646 
4647 }
4648 
4649 /*----------------------------------------------------------------------------
4650 | Returns the least-significant 64 fraction bits of the quadruple-precision
4651 | floating-point value `a'.
4652 *----------------------------------------------------------------------------*/
4653 
4654 static inline uint64_t extractFloat128Frac1( float128 a )
4655 {
4656 
4657     return a.low;
4658 
4659 }
4660 
4661 /*----------------------------------------------------------------------------
4662 | Returns the most-significant 48 fraction bits of the quadruple-precision
4663 | floating-point value `a'.
4664 *----------------------------------------------------------------------------*/
4665 
4666 static inline uint64_t extractFloat128Frac0( float128 a )
4667 {
4668 
4669     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4670 
4671 }
4672 
4673 /*----------------------------------------------------------------------------
4674 | Returns the exponent bits of the quadruple-precision floating-point value
4675 | `a'.
4676 *----------------------------------------------------------------------------*/
4677 
4678 static inline int32_t extractFloat128Exp( float128 a )
4679 {
4680 
4681     return ( a.high>>48 ) & 0x7FFF;
4682 
4683 }
4684 
4685 /*----------------------------------------------------------------------------
4686 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4687 *----------------------------------------------------------------------------*/
4688 
4689 static inline bool extractFloat128Sign(float128 a)
4690 {
4691     return a.high >> 63;
4692 }
4693 
4694 /*----------------------------------------------------------------------------
4695 | Normalizes the subnormal quadruple-precision floating-point value
4696 | represented by the denormalized significand formed by the concatenation of
4697 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4698 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4699 | significand are stored at the location pointed to by `zSig0Ptr', and the
4700 | least significant 64 bits of the normalized significand are stored at the
4701 | location pointed to by `zSig1Ptr'.
4702 *----------------------------------------------------------------------------*/
4703 
static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* All set bits live in the low word.  Shift so the leading bit
         * lands at bit 48 of the high word (overall bit 112). */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            /* Leading bit of aSig1 is within its top 15 bits: the value
             * must be split across both output words.  shiftCount is
             * negative here; '& 63' turns it into the equivalent left
             * shift of 64 + shiftCount, keeping the shift in range. */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        /* Extra 64 (minus the 1 below) accounts for the word promotion. */
        *zExpPtr = - shiftCount - 63;
    }
    else {
        /* High word nonzero: left-justify the whole 128-bit significand
         * so its leading bit reaches bit 48 of the high word. */
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}
4734 
4735 /*----------------------------------------------------------------------------
4736 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4737 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4738 | floating-point value, returning the result.  After being shifted into the
4739 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4740 | added together to form the most significant 32 bits of the result.  This
4741 | means that any integer portion of `zSig0' will be added into the exponent.
4742 | Since a properly normalized significand will have an integer portion equal
4743 | to 1, the `zExp' input should be 1 less than the desired result exponent
4744 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4745 | significand.
4746 *----------------------------------------------------------------------------*/
4747 
4748 static inline float128
4749 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4750 {
4751     float128 z;
4752 
4753     z.low = zSig1;
4754     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4755     return z;
4756 }
4757 
4758 /*----------------------------------------------------------------------------
4759 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4760 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4761 | and `zSig2', and returns the proper quadruple-precision floating-point value
4762 | corresponding to the abstract input.  Ordinarily, the abstract value is
4763 | simply rounded and packed into the quadruple-precision format, with the
4764 | inexact exception raised if the abstract input cannot be represented
4765 | exactly.  However, if the abstract value is too large, the overflow and
4766 | inexact exceptions are raised and an infinity or maximal finite value is
4767 | returned.  If the abstract value is too small, the input value is rounded to
4768 | a subnormal number, and the underflow and inexact exceptions are raised if
4769 | the abstract input cannot be represented exactly as a subnormal quadruple-
4770 | precision floating-point number.
4771 |     The input significand must be normalized or smaller.  If the input
4772 | significand is not normalized, `zExp' must be 0; in that case, the result
4773 | returned is a subnormal number, and it must not require rounding.  In the
4774 | usual case that the input significand is normalized, `zExp' must be 1 less
4775 | than the ``true'' floating-point exponent.  The handling of underflow and
4776 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4777 *----------------------------------------------------------------------------*/
4778 
static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Decide whether to increment the 128-bit significand, based on the
     * extra bits gathered in zSig2.  For nearest/ties-away the test is
     * "top bit of zSig2 set" (i.e. at least half an ulp). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round-to-odd: only bump when the result would stay even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Exponent at or beyond the representable range (the unsigned
         * compare also catches zExp < 0, the underflow side). */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign yield the
             * largest finite value instead of infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess is detected before rounding, or after rounding
             * when the incremented significand still falls short of the
             * smallest normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            /* Denormalize; shifted-out bits are jammed into zSig2. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Re-derive the rounding decision from the new sticky bits. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Ties-to-even: clear the low bit on an exact halfway case. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        /* A zero significand must be a true (signed) zero. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4890 
4891 /*----------------------------------------------------------------------------
4892 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4893 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4894 | returns the proper quadruple-precision floating-point value corresponding
4895 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4896 | except that the input significand has fewer bits and does not have to be
4897 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4898 | point exponent.
4899 *----------------------------------------------------------------------------*/
4900 
4901 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4902                                               uint64_t zSig0, uint64_t zSig1,
4903                                               float_status *status)
4904 {
4905     int8_t shiftCount;
4906     uint64_t zSig2;
4907 
4908     if ( zSig0 == 0 ) {
4909         zSig0 = zSig1;
4910         zSig1 = 0;
4911         zExp -= 64;
4912     }
4913     shiftCount = clz64(zSig0) - 15;
4914     if ( 0 <= shiftCount ) {
4915         zSig2 = 0;
4916         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4917     }
4918     else {
4919         shift128ExtraRightJamming(
4920             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4921     }
4922     zExp -= shiftCount;
4923     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4924 
4925 }
4926 
4927 
4928 /*----------------------------------------------------------------------------
4929 | Returns the result of converting the 32-bit two's complement integer `a'
4930 | to the extended double-precision floating-point format.  The conversion
4931 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4932 | Arithmetic.
4933 *----------------------------------------------------------------------------*/
4934 
4935 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4936 {
4937     bool zSign;
4938     uint32_t absA;
4939     int8_t shiftCount;
4940     uint64_t zSig;
4941 
4942     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4943     zSign = ( a < 0 );
4944     absA = zSign ? - a : a;
4945     shiftCount = clz32(absA) + 32;
4946     zSig = absA;
4947     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4948 
4949 }
4950 
4951 /*----------------------------------------------------------------------------
4952 | Returns the result of converting the 32-bit two's complement integer `a' to
4953 | the quadruple-precision floating-point format.  The conversion is performed
4954 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4955 *----------------------------------------------------------------------------*/
4956 
4957 float128 int32_to_float128(int32_t a, float_status *status)
4958 {
4959     bool zSign;
4960     uint32_t absA;
4961     int8_t shiftCount;
4962     uint64_t zSig0;
4963 
4964     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4965     zSign = ( a < 0 );
4966     absA = zSign ? - a : a;
4967     shiftCount = clz32(absA) + 17;
4968     zSig0 = absA;
4969     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4970 
4971 }
4972 
4973 /*----------------------------------------------------------------------------
4974 | Returns the result of converting the 64-bit two's complement integer `a'
4975 | to the extended double-precision floating-point format.  The conversion
4976 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4977 | Arithmetic.
4978 *----------------------------------------------------------------------------*/
4979 
4980 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4981 {
4982     bool zSign;
4983     uint64_t absA;
4984     int8_t shiftCount;
4985 
4986     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4987     zSign = ( a < 0 );
4988     absA = zSign ? - a : a;
4989     shiftCount = clz64(absA);
4990     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4991 
4992 }
4993 
4994 /*----------------------------------------------------------------------------
4995 | Returns the result of converting the 64-bit two's complement integer `a' to
4996 | the quadruple-precision floating-point format.  The conversion is performed
4997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4998 *----------------------------------------------------------------------------*/
4999 
5000 float128 int64_to_float128(int64_t a, float_status *status)
5001 {
5002     bool zSign;
5003     uint64_t absA;
5004     int8_t shiftCount;
5005     int32_t zExp;
5006     uint64_t zSig0, zSig1;
5007 
5008     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5009     zSign = ( a < 0 );
5010     absA = zSign ? - a : a;
5011     shiftCount = clz64(absA) + 49;
5012     zExp = 0x406E - shiftCount;
5013     if ( 64 <= shiftCount ) {
5014         zSig1 = 0;
5015         zSig0 = absA;
5016         shiftCount -= 64;
5017     }
5018     else {
5019         zSig1 = absA;
5020         zSig0 = 0;
5021     }
5022     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5023     return packFloat128( zSign, zExp, zSig0, zSig1 );
5024 
5025 }
5026 
5027 /*----------------------------------------------------------------------------
5028 | Returns the result of converting the 64-bit unsigned integer `a'
5029 | to the quadruple-precision floating-point format.  The conversion is performed
5030 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5031 *----------------------------------------------------------------------------*/
5032 
5033 float128 uint64_to_float128(uint64_t a, float_status *status)
5034 {
5035     if (a == 0) {
5036         return float128_zero;
5037     }
5038     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5039 }
5040 
5041 /*----------------------------------------------------------------------------
5042 | Returns the result of converting the single-precision floating-point value
5043 | `a' to the extended double-precision floating-point format.  The conversion
5044 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5045 | Arithmetic.
5046 *----------------------------------------------------------------------------*/
5047 
5048 floatx80 float32_to_floatx80(float32 a, float_status *status)
5049 {
5050     bool aSign;
5051     int aExp;
5052     uint32_t aSig;
5053 
5054     a = float32_squash_input_denormal(a, status);
5055     aSig = extractFloat32Frac( a );
5056     aExp = extractFloat32Exp( a );
5057     aSign = extractFloat32Sign( a );
5058     if ( aExp == 0xFF ) {
5059         if (aSig) {
5060             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
5061                                                status);
5062             return floatx80_silence_nan(res, status);
5063         }
5064         return packFloatx80(aSign,
5065                             floatx80_infinity_high,
5066                             floatx80_infinity_low);
5067     }
5068     if ( aExp == 0 ) {
5069         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5070         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5071     }
5072     aSig |= 0x00800000;
5073     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
5074 
5075 }
5076 
5077 /*----------------------------------------------------------------------------
5078 | Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
5080 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5081 | Arithmetic.
5082 *----------------------------------------------------------------------------*/
5083 
5084 float128 float32_to_float128(float32 a, float_status *status)
5085 {
5086     bool aSign;
5087     int aExp;
5088     uint32_t aSig;
5089 
5090     a = float32_squash_input_denormal(a, status);
5091     aSig = extractFloat32Frac( a );
5092     aExp = extractFloat32Exp( a );
5093     aSign = extractFloat32Sign( a );
5094     if ( aExp == 0xFF ) {
5095         if (aSig) {
5096             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
5097         }
5098         return packFloat128( aSign, 0x7FFF, 0, 0 );
5099     }
5100     if ( aExp == 0 ) {
5101         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5102         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5103         --aExp;
5104     }
5105     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
5106 
5107 }
5108 
5109 /*----------------------------------------------------------------------------
5110 | Returns the remainder of the single-precision floating-point value `a'
5111 | with respect to the corresponding value `b'.  The operation is performed
5112 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5113 *----------------------------------------------------------------------------*/
5114 
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* NaN operands propagate; rem(inf, x) is invalid. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* rem(x, inf) is x; rem(x, 0) is invalid. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the integer bits explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 32/64-bit division step. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| far smaller than |b|: a is already the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce iteratively with an
         * estimated 128/64-bit quotient, 62 exponent bits per pass. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Back the estimate off so it never exceeds the true quotient. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        /* Final partial step for the remaining exponent difference. */
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past zero so aSig goes negative, keeping the previous value;
     * then pick whichever of the two candidates is nearer (ties to the
     * even quotient), giving the IEEE round-to-nearest remainder. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5210 
5211 
5212 
5213 /*----------------------------------------------------------------------------
5214 | Returns the binary exponential of the single-precision floating-point value
5215 | `a'. The operation is performed according to the IEC/IEEE Standard for
5216 | Binary Floating-Point Arithmetic.
5217 |
5218 | Uses the following identities:
5219 |
5220 | 1. -------------------------------------------------------------------------
5221 |      x    x*ln(2)
5222 |     2  = e
5223 |
5224 | 2. -------------------------------------------------------------------------
5225 |                      2     3     4     5           n
5226 |      x        x     x     x     x     x           x
5227 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5228 |               1!    2!    3!    4!    5!          n!
5229 *----------------------------------------------------------------------------*/
5230 
/* Taylor-series coefficients 1/n! for n = 1..15, as IEEE double-precision
 * bit patterns; consumed by float32_exp2 below. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
5249 
5250 float32 float32_exp2(float32 a, float_status *status)
5251 {
5252     bool aSign;
5253     int aExp;
5254     uint32_t aSig;
5255     float64 r, x, xn;
5256     int i;
5257     a = float32_squash_input_denormal(a, status);
5258 
5259     aSig = extractFloat32Frac( a );
5260     aExp = extractFloat32Exp( a );
5261     aSign = extractFloat32Sign( a );
5262 
5263     if ( aExp == 0xFF) {
5264         if (aSig) {
5265             return propagateFloat32NaN(a, float32_zero, status);
5266         }
5267         return (aSign) ? float32_zero : a;
5268     }
5269     if (aExp == 0) {
5270         if (aSig == 0) return float32_one;
5271     }
5272 
5273     float_raise(float_flag_inexact, status);
5274 
5275     /* ******************************* */
5276     /* using float64 for approximation */
5277     /* ******************************* */
5278     x = float32_to_float64(a, status);
5279     x = float64_mul(x, float64_ln2, status);
5280 
5281     xn = x;
5282     r = float64_one;
5283     for (i = 0 ; i < 15 ; i++) {
5284         float64 f;
5285 
5286         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5287         r = float64_add(r, f, status);
5288 
5289         xn = float64_mul(xn, x, status);
5290     }
5291 
5292     return float64_to_float32(r, status);
5293 }
5294 
5295 /*----------------------------------------------------------------------------
5296 | Returns the binary log of the single-precision floating-point value `a'.
5297 | The operation is performed according to the IEC/IEEE Standard for Binary
5298 | Floating-Point Arithmetic.
5299 *----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -infinity; normalize subnormals before use. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative value is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        /* NaN propagates; log2(+inf) = +inf. */
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;
    }

    /* Integer part of the result is the unbiased exponent. */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    /* Produce fraction bits one at a time by repeated squaring of the
     * significand: each squaring doubles the remaining logarithm, and a
     * carry past the integer bit yields the next result bit. */
    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    /* zSig is a fixed-point value scaled by 2^23; 0x85 rebias accounts
     * for that scaling when packing. */
    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}
5344 
5345 /*----------------------------------------------------------------------------
5346 | Returns the result of converting the double-precision floating-point value
5347 | `a' to the extended double-precision floating-point format.  The conversion
5348 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5349 | Arithmetic.
5350 *----------------------------------------------------------------------------*/
5351 
floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert the payload and return it quietened. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the integer bit explicit, left-justify the significand in
     * 64 bits, and rebias the exponent (0x3C00 = 16383 - 1023).  */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}
5381 
5382 /*----------------------------------------------------------------------------
5383 | Returns the result of converting the double-precision floating-point value
5384 | `a' to the quadruple-precision floating-point format.  The conversion is
5385 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5386 | Arithmetic.
5387 *----------------------------------------------------------------------------*/
5388 
float128 float64_to_float128(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
        /* Compensate for the explicit integer bit that normalization
         * produced; float128 uses a hidden bit again.  */
        --aExp;
    }
    /* Spread the 52-bit fraction into the 112-bit float128 fraction and
     * rebias the exponent (0x3C00 = 16383 - 1023).  */
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

}
5414 
5415 
5416 /*----------------------------------------------------------------------------
5417 | Returns the remainder of the double-precision floating-point value `a'
5418 | with respect to the corresponding value `b'.  The operation is performed
5419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5420 *----------------------------------------------------------------------------*/
5421 
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* a is NaN (propagate), or a is infinity: rem(inf, b) is invalid. */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* rem(a, +/-inf) = a for finite a. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(a, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the integer bits explicit and left-justify both significands. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| is already smaller than |b|/2: a is the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        /* Lower the estimate by 2 so it cannot exceed the true quotient. */
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Final reduction: subtract bSig until the result goes negative, then
     * choose whichever of the last two candidates is nearer (ties toward
     * even q), yielding the IEEE round-to-nearest remainder.  */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5503 
5504 /*----------------------------------------------------------------------------
5505 | Returns the binary log of the double-precision floating-point value `a'.
5506 | The operation is performed according to the IEC/IEEE Standard for Binary
5507 | Floating-Point Arithmetic.
5508 *----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -infinity */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative value is invalid */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        /* log2(+infinity) = +infinity */
        return a;
    }

    /* The unbiased exponent is the integer part of the result; each loop
     * iteration squares the significand (doubling the log) and the
     * renormalization decision yields the next fraction bit.  */
    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}
5552 
5553 /*----------------------------------------------------------------------------
5554 | Returns the result of converting the extended double-precision floating-
5555 | point value `a' to the 32-bit two's complement integer format.  The
5556 | conversion is performed according to the IEC/IEEE Standard for Binary
5557 | Floating-Point Arithmetic---which means in particular that the conversion
5558 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5559 | largest positive integer is returned.  Otherwise, if the conversion
5560 | overflows, the largest integer with the same sign as `a' is returned.
5561 *----------------------------------------------------------------------------*/
5562 
int32_t floatx80_to_int32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* Treat NaNs as positive so the overflow path returns INT32_MAX. */
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
    shiftCount = 0x4037 - aExp;
    /* Clamp the shift so huge values still leave bits for roundAndPackInt32
     * to detect the overflow.  */
    if ( shiftCount <= 0 ) shiftCount = 1;
    shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32(aSign, aSig, status);

}
5583 
5584 /*----------------------------------------------------------------------------
5585 | Returns the result of converting the extended double-precision floating-
5586 | point value `a' to the 32-bit two's complement integer format.  The
5587 | conversion is performed according to the IEC/IEEE Standard for Binary
5588 | Floating-Point Arithmetic, except that the conversion is always rounded
5589 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5590 | Otherwise, if the conversion overflows, the largest integer with the same
5591 | sign as `a' is returned.
5592 *----------------------------------------------------------------------------*/
5593 
int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, savedASig;
    int32_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( 0x401E < aExp ) {
        /* Magnitude too large for int32; NaNs take the positive result. */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; inexact unless a is exactly zero. */
        if (aExp || aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    shiftCount = 0x403E - aExp;
    savedASig = aSig;
    aSig >>= shiftCount;
    z = aSig;
    if ( aSign ) z = - z;
    if ( ( z < 0 ) ^ aSign ) {
        /* The sign flipped: the truncated magnitude overflowed int32. */
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if ( ( aSig<<shiftCount ) != savedASig ) {
        /* Bits discarded by the truncation were non-zero. */
        float_raise(float_flag_inexact, status);
    }
    return z;

}
5634 
5635 /*----------------------------------------------------------------------------
5636 | Returns the result of converting the extended double-precision floating-
5637 | point value `a' to the 64-bit two's complement integer format.  The
5638 | conversion is performed according to the IEC/IEEE Standard for Binary
5639 | Floating-Point Arithmetic---which means in particular that the conversion
5640 | is rounded according to the current rounding mode.  If `a' is a NaN,
5641 | the largest positive integer is returned.  Otherwise, if the conversion
5642 | overflows, the largest integer with the same sign as `a' is returned.
5643 *----------------------------------------------------------------------------*/
5644 
int64_t floatx80_to_int64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, aSigExtra;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = 0x403E - aExp;
    if ( shiftCount <= 0 ) {
        if ( shiftCount ) {
            /* Magnitude exceeds the int64 range, or a is NaN/infinity. */
            float_raise(float_flag_invalid, status);
            if (!aSign || floatx80_is_any_nan(a)) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        /* shiftCount == 0: the significand is already aligned. */
        aSigExtra = 0;
    }
    else {
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
    }
    return roundAndPackInt64(aSign, aSig, aSigExtra, status);

}
5675 
5676 /*----------------------------------------------------------------------------
5677 | Returns the result of converting the extended double-precision floating-
5678 | point value `a' to the 64-bit two's complement integer format.  The
5679 | conversion is performed according to the IEC/IEEE Standard for Binary
5680 | Floating-Point Arithmetic, except that the conversion is always rounded
5681 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5682 | Otherwise, if the conversion overflows, the largest integer with the same
5683 | sign as `a' is returned.
5684 *----------------------------------------------------------------------------*/
5685 
int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        /* Exactly -2^63 (sign/exp word 0xC03E with only the integer bit
         * set) is the one in-range value here; anything else overflows.  */
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; inexact unless a is exactly zero. */
        if (aExp | aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        /* Bits discarded by the truncation were non-zero. */
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}
5725 
5726 /*----------------------------------------------------------------------------
5727 | Returns the result of converting the extended double-precision floating-
5728 | point value `a' to the single-precision floating-point format.  The
5729 | conversion is performed according to the IEC/IEEE Standard for Binary
5730 | Floating-Point Arithmetic.
5731 *----------------------------------------------------------------------------*/
5732 
float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: convert the payload and return it quietened. */
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Drop 33 low bits (jammed into a sticky bit) and rebias the exponent
     * for rounding to single precision.  */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);

}
5759 
5760 /*----------------------------------------------------------------------------
5761 | Returns the result of converting the extended double-precision floating-
5762 | point value `a' to the double-precision floating-point format.  The
5763 | conversion is performed according to the IEC/IEEE Standard for Binary
5764 | Floating-Point Arithmetic.
5765 *----------------------------------------------------------------------------*/
5766 
float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: convert the payload and return it quietened. */
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Drop one bit (jammed into a sticky bit) and rebias the exponent for
     * rounding to double precision.  */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}
5793 
5794 /*----------------------------------------------------------------------------
5795 | Returns the result of converting the extended double-precision floating-
5796 | point value `a' to the quadruple-precision floating-point format.  The
5797 | conversion is performed according to the IEC/IEEE Standard for Binary
5798 | Floating-Point Arithmetic.
5799 *----------------------------------------------------------------------------*/
5800 
float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        /* NaN: convert the payload and return it quietened. */
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    /* Shift out the explicit integer bit and spread the fraction across
     * the float128 fraction fields; both formats share the same bias.  */
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}
5823 
5824 /*----------------------------------------------------------------------------
5825 | Rounds the extended double-precision floating-point value `a'
5826 | to the precision provided by floatx80_rounding_precision and returns the
5827 | result as an extended double-precision floating-point value.
5828 | The operation is performed according to the IEC/IEEE Standard for Binary
5829 | Floating-Point Arithmetic.
5830 *----------------------------------------------------------------------------*/
5831 
5832 floatx80 floatx80_round(floatx80 a, float_status *status)
5833 {
5834     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5835                                 extractFloatx80Sign(a),
5836                                 extractFloatx80Exp(a),
5837                                 extractFloatx80Frac(a), 0, status);
5838 }
5839 
5840 /*----------------------------------------------------------------------------
5841 | Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
5843 | value.  The operation is performed according to the IEC/IEEE Standard for
5844 | Binary Floating-Point Arithmetic.
5845 *----------------------------------------------------------------------------*/
5846 
5847 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5848 {
5849     bool aSign;
5850     int32_t aExp;
5851     uint64_t lastBitMask, roundBitsMask;
5852     floatx80 z;
5853 
5854     if (floatx80_invalid_encoding(a)) {
5855         float_raise(float_flag_invalid, status);
5856         return floatx80_default_nan(status);
5857     }
5858     aExp = extractFloatx80Exp( a );
5859     if ( 0x403E <= aExp ) {
5860         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5861             return propagateFloatx80NaN(a, a, status);
5862         }
5863         return a;
5864     }
5865     if ( aExp < 0x3FFF ) {
5866         if (    ( aExp == 0 )
5867              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5868             return a;
5869         }
5870         float_raise(float_flag_inexact, status);
5871         aSign = extractFloatx80Sign( a );
5872         switch (status->float_rounding_mode) {
5873          case float_round_nearest_even:
5874             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5875                ) {
5876                 return
5877                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5878             }
5879             break;
5880         case float_round_ties_away:
5881             if (aExp == 0x3FFE) {
5882                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5883             }
5884             break;
5885          case float_round_down:
5886             return
5887                   aSign ?
5888                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5889                 : packFloatx80( 0, 0, 0 );
5890          case float_round_up:
5891             return
5892                   aSign ? packFloatx80( 1, 0, 0 )
5893                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5894 
5895         case float_round_to_zero:
5896             break;
5897         default:
5898             g_assert_not_reached();
5899         }
5900         return packFloatx80( aSign, 0, 0 );
5901     }
5902     lastBitMask = 1;
5903     lastBitMask <<= 0x403E - aExp;
5904     roundBitsMask = lastBitMask - 1;
5905     z = a;
5906     switch (status->float_rounding_mode) {
5907     case float_round_nearest_even:
5908         z.low += lastBitMask>>1;
5909         if ((z.low & roundBitsMask) == 0) {
5910             z.low &= ~lastBitMask;
5911         }
5912         break;
5913     case float_round_ties_away:
5914         z.low += lastBitMask >> 1;
5915         break;
5916     case float_round_to_zero:
5917         break;
5918     case float_round_up:
5919         if (!extractFloatx80Sign(z)) {
5920             z.low += roundBitsMask;
5921         }
5922         break;
5923     case float_round_down:
5924         if (extractFloatx80Sign(z)) {
5925             z.low += roundBitsMask;
5926         }
5927         break;
5928     default:
5929         abort();
5930     }
5931     z.low &= ~ roundBitsMask;
5932     if ( z.low == 0 ) {
5933         ++z.high;
5934         z.low = UINT64_C(0x8000000000000000);
5935     }
5936     if (z.low != a.low) {
5937         float_raise(float_flag_inexact, status);
5938     }
5939     return z;
5940 
5941 }
5942 
5943 /*----------------------------------------------------------------------------
5944 | Returns the result of adding the absolute values of the extended double-
5945 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5946 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5947 | The addition is performed according to the IEC/IEEE Standard for Binary
5948 | Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5950 
static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to it. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to it. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit still set means the addition did not carry out, so the
     * result is already normalized.  */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the significand: shift right one bit (keeping the
     * jammed sticky bit) and bump the exponent.  */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6022 
6023 /*----------------------------------------------------------------------------
6024 | Returns the result of subtracting the absolute values of the extended
6025 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
6026 | difference is negated before being returned.  `zSign' is ignored if the
6027 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6028 | Standard for Binary Floating-Point Arithmetic.
6029 *----------------------------------------------------------------------------*/
6030 
static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* inf - inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero's sign follows the rounding mode. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result's sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| > |b|: compute a - b with the caller-supplied sign. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
6091 
6092 /*----------------------------------------------------------------------------
6093 | Returns the result of adding the extended double-precision floating-point
6094 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6095 | Standard for Binary Floating-Point Arithmetic.
6096 *----------------------------------------------------------------------------*/
6097 
6098 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6099 {
6100     bool aSign, bSign;
6101 
6102     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6103         float_raise(float_flag_invalid, status);
6104         return floatx80_default_nan(status);
6105     }
6106     aSign = extractFloatx80Sign( a );
6107     bSign = extractFloatx80Sign( b );
6108     if ( aSign == bSign ) {
6109         return addFloatx80Sigs(a, b, aSign, status);
6110     }
6111     else {
6112         return subFloatx80Sigs(a, b, aSign, status);
6113     }
6114 
6115 }
6116 
6117 /*----------------------------------------------------------------------------
6118 | Returns the result of subtracting the extended double-precision floating-
6119 | point values `a' and `b'.  The operation is performed according to the
6120 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6121 *----------------------------------------------------------------------------*/
6122 
6123 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6124 {
6125     bool aSign, bSign;
6126 
6127     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6128         float_raise(float_flag_invalid, status);
6129         return floatx80_default_nan(status);
6130     }
6131     aSign = extractFloatx80Sign( a );
6132     bSign = extractFloatx80Sign( b );
6133     if ( aSign == bSign ) {
6134         return subFloatx80Sigs(a, b, aSign, status);
6135     }
6136     else {
6137         return addFloatx80Sigs(a, b, aSign, status);
6138     }
6139 
6140 }
6141 
6142 /*----------------------------------------------------------------------------
6143 | Returns the result of multiplying the extended double-precision floating-
6144 | point values `a' and `b'.  The operation is performed according to the
6145 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6146 *----------------------------------------------------------------------------*/
6147 
floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    /* Unsupported encodings are invalid operands yielding the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    /* The product's sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* 'a' is Inf or NaN; propagate if either operand is a NaN. */
        if (    (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf * 0 is an invalid operation. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* 0 * Inf is invalid as well. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* Zero operands give a signed zero; subnormals are normalized first. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    /* Bias of 0x3FFE (not 0x3FFF) anticipates the usual one-bit carry-out
       of multiplying two significands with their integer bits set. */
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* If the 128-bit product's top bit is clear, renormalize by one bit. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6203 
6204 /*----------------------------------------------------------------------------
6205 | Returns the result of dividing the extended double-precision floating-point
6206 | value `a' by the corresponding value `b'.  The operation is performed
6207 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6208 *----------------------------------------------------------------------------*/
6209 
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Unsupported encodings are invalid operands yielding the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    /* Quotient sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is an invalid operation. */
            goto invalid;
        }
        /* Inf / finite = Inf. */
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / Inf = signed zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; nonzero / 0 raises divide-by-zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                       floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Ensure the dividend significand is below the divisor so the 64-bit
       quotient estimate cannot overflow; compensate in the exponent. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* First 64 quotient bits: estimate, then correct downward while the
       remainder (rem0:rem1) is negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Second 64 quotient bits; only refine and compute the sticky bit when
       the low bits are small enough to matter for rounding. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any nonzero final remainder into the sticky (last) bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6290 
6291 /*----------------------------------------------------------------------------
6292 | Returns the remainder of the extended double-precision floating-point value
6293 | `a' with respect to the corresponding value `b'.  The operation is performed
6294 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6295 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6296 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6297 | the absolute value of the integer quotient.
6298 *----------------------------------------------------------------------------*/
6299 
floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    /* The quotient output is defined (zero) even on early-out paths. */
    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* Keep the original exponent to detect pseudo-denormal inputs later. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* NaN operands propagate; remainder of an infinity is invalid. */
        if (    (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Remainder with an infinite divisor returns 'a' unchanged... */
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Remainder by zero is invalid; otherwise normalize subnormal b. */
        if ( bSig == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|: for mod (or much smaller a) the remainder is 'a'
           itself; for IEEE rem with expDiff == -1 fall through so the
           nearest-multiple selection below can still round up. */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit, then produce further bits by long division in
       62-bit chunks, accumulating the low 64 quotient bits in *quotient. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        /* Under-estimate by 2 so the partial remainder stays nonnegative. */
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial chunk of fewer than 64 quotient bits. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        /* Fix up the deliberate under-estimate. */
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: if b - r is nearer than r (ties go to an even
           quotient), use it instead and flip the sign. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
                || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                        && ( q & 1 ) )
            ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}
6420 
6421 /*----------------------------------------------------------------------------
6422 | Returns the remainder of the extended double-precision floating-point value
6423 | `a' with respect to the corresponding value `b'.  The operation is performed
6424 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6425 *----------------------------------------------------------------------------*/
6426 
6427 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6428 {
6429     uint64_t quotient;
6430     return floatx80_modrem(a, b, false, &quotient, status);
6431 }
6432 
6433 /*----------------------------------------------------------------------------
6434 | Returns the remainder of the extended double-precision floating-point value
6435 | `a' with respect to the corresponding value `b', with the quotient truncated
6436 | toward zero.
6437 *----------------------------------------------------------------------------*/
6438 
6439 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6440 {
6441     uint64_t quotient;
6442     return floatx80_modrem(a, b, true, &quotient, status);
6443 }
6444 
6445 /*----------------------------------------------------------------------------
6446 | Returns the square root of the extended double-precision floating-point
6447 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6448 | for Binary Floating-Point Arithmetic.
6449 *----------------------------------------------------------------------------*/
6450 
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unsupported encodings are invalid operands yielding the default NaN. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt of a negative value is invalid, except -0 which returns -0. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Result exponent is half the unbiased input exponent, rebiased. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Initial 32-bit root estimate, then one refinement step via division
       to get a full 64-bit estimate zSig0. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* Align the operand; the extra shift for odd exponents keeps the
       square of the result correctly scaled. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct zSig0 downward while the remainder a - zSig0^2 is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 result bits; only refine and compute stickiness when the low
       bits are small enough to affect rounding. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any nonzero final remainder into the sticky (last) bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Reassemble the 128-bit significand from the two halves. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6515 
6516 /*----------------------------------------------------------------------------
6517 | Returns the result of converting the quadruple-precision floating-point
6518 | value `a' to the 32-bit two's complement integer format.  The conversion
6519 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6520 | Arithmetic---which means in particular that the conversion is rounded
6521 | according to the current rounding mode.  If `a' is a NaN, the largest
6522 | positive integer is returned.  Otherwise, if the conversion overflows, the
6523 | largest integer with the same sign as `a' is returned.
6524 *----------------------------------------------------------------------------*/
6525 
int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Force NaNs positive so rounding saturates to INT32_MAX, per the
       contract in the header comment. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Jam the low fraction word into the sticky (last) bit of aSig0. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    /* roundAndPackInt32 applies the current rounding mode and handles
       overflow (invalid + saturation). */
    return roundAndPackInt32(aSign, aSig0, status);

}
6544 
6545 /*----------------------------------------------------------------------------
6546 | Returns the result of converting the quadruple-precision floating-point
6547 | value `a' to the 32-bit two's complement integer format.  The conversion
6548 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6549 | Arithmetic, except that the conversion is always rounded toward zero.  If
6550 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6551 | conversion overflows, the largest integer with the same sign as `a' is
6552 | returned.
6553 *----------------------------------------------------------------------------*/
6554 
int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Jam the low fraction word into the sticky (last) bit of aSig0. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude at least 2^31: overflow.  NaNs are forced positive so
           they saturate to INT32_MAX. */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude below 1 truncates to 0; inexact unless exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    /* Make the implicit integer bit explicit and truncate by shifting. */
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* A sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits lost to the shift make the conversion inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6594 
6595 /*----------------------------------------------------------------------------
6596 | Returns the result of converting the quadruple-precision floating-point
6597 | value `a' to the 64-bit two's complement integer format.  The conversion
6598 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6599 | Arithmetic---which means in particular that the conversion is rounded
6600 | according to the current rounding mode.  If `a' is a NaN, the largest
6601 | positive integer is returned.  Otherwise, if the conversion overflows, the
6602 | largest integer with the same sign as `a' is returned.
6603 *----------------------------------------------------------------------------*/
6604 
int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude at least 2^63: overflow.  Positive values and NaNs
               saturate to INT64_MAX; only true negatives give INT64_MIN. */
            float_raise(float_flag_invalid, status);
            if (    ! aSign
                 || (    ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Right-shift with the discarded bits jammed into aSig1 so the
           rounding step below still sees them. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    /* roundAndPackInt64 applies the current rounding mode and handles
       overflow (invalid + saturation). */
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}
6637 
6638 /*----------------------------------------------------------------------------
6639 | Returns the result of converting the quadruple-precision floating-point
6640 | value `a' to the 64-bit two's complement integer format.  The conversion
6641 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6642 | Arithmetic, except that the conversion is always rounded toward zero.
6643 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6644 | the conversion overflows, the largest integer with the same sign as `a' is
6645 | returned.
6646 *----------------------------------------------------------------------------*/
6647 
int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Exactly -2^63 (possibly with low-order fraction bits that
               truncate away) is representable as INT64_MIN; anything else
               this large overflows. */
            if (    ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                /* Positive overflow and NaNs saturate to INT64_MAX. */
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        /* Shift the two significand words left into a single 64-bit
           integer; any aSig1 bits shifted out make the result inexact. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below 1 truncates to 0; inexact unless zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6700 
6701 /*----------------------------------------------------------------------------
6702 | Returns the result of converting the quadruple-precision floating-point value
6703 | `a' to the 64-bit unsigned integer format.  The conversion is
6704 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6705 | Arithmetic---which means in particular that the conversion is rounded
6706 | according to the current rounding mode.  If `a' is a NaN, the largest
6707 | positive integer is returned.  If the conversion overflows, the
6708 | largest unsigned integer is returned.  If 'a' is negative, the value is
6709 | rounded and zero is returned; negative values that do not round to zero
6710 | will raise the inexact exception.
6711 *----------------------------------------------------------------------------*/
6712 
uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values that cannot round to zero (magnitude > 0.5, i.e.
       exponent above 0x3FFE) are invalid; NaNs (sign bit set) saturate
       to UINT64_MAX, other negatives give 0. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    /* Make the implicit integer bit explicit for normal numbers. */
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        /* Magnitude at least 2^64 overflows to UINT64_MAX. */
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        /* Right-shift with discarded bits jammed into aSig1 for rounding. */
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    /* roundAndPackUint64 applies the current rounding mode and handles
       overflow and residual negative values. */
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}
6747 
6748 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6749 {
6750     uint64_t v;
6751     signed char current_rounding_mode = status->float_rounding_mode;
6752 
6753     set_float_rounding_mode(float_round_to_zero, status);
6754     v = float128_to_uint64(a, status);
6755     set_float_rounding_mode(current_rounding_mode, status);
6756 
6757     return v;
6758 }
6759 
6760 /*----------------------------------------------------------------------------
6761 | Returns the result of converting the quadruple-precision floating-point
6762 | value `a' to the 32-bit unsigned integer format.  The conversion
6763 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6764 | Arithmetic except that the conversion is always rounded toward zero.
6765 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6766 | if the conversion overflows, the largest unsigned integer is returned.
6767 | If 'a' is negative, the value is rounded and zero is returned; negative
6768 | values that do not round to zero will raise the inexact exception.
6769 *----------------------------------------------------------------------------*/
6770 
6771 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6772 {
6773     uint64_t v;
6774     uint32_t res;
6775     int old_exc_flags = get_float_exception_flags(status);
6776 
6777     v = float128_to_uint64_round_to_zero(a, status);
6778     if (v > 0xffffffff) {
6779         res = 0xffffffff;
6780     } else {
6781         return v;
6782     }
6783     set_float_exception_flags(old_exc_flags, status);
6784     float_raise(float_flag_invalid, status);
6785     return res;
6786 }
6787 
6788 /*----------------------------------------------------------------------------
6789 | Returns the result of converting the quadruple-precision floating-point value
6790 | `a' to the 32-bit unsigned integer format.  The conversion is
6791 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6792 | Arithmetic---which means in particular that the conversion is rounded
6793 | according to the current rounding mode.  If `a' is a NaN, the largest
6794 | positive integer is returned.  If the conversion overflows, the
6795 | largest unsigned integer is returned.  If 'a' is negative, the value is
6796 | rounded and zero is returned; negative values that do not round to zero
6797 | will raise the inexact exception.
6798 *----------------------------------------------------------------------------*/
6799 
6800 uint32_t float128_to_uint32(float128 a, float_status *status)
6801 {
6802     uint64_t v;
6803     uint32_t res;
6804     int old_exc_flags = get_float_exception_flags(status);
6805 
6806     v = float128_to_uint64(a, status);
6807     if (v > 0xffffffff) {
6808         res = 0xffffffff;
6809     } else {
6810         return v;
6811     }
6812     set_float_exception_flags(old_exc_flags, status);
6813     float_raise(float_flag_invalid, status);
6814     return res;
6815 }
6816 
6817 /*----------------------------------------------------------------------------
6818 | Returns the result of converting the quadruple-precision floating-point
6819 | value `a' to the single-precision floating-point format.  The conversion
6820 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6821 | Arithmetic.
6822 *----------------------------------------------------------------------------*/
6823 
float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN goes through the canonical-NaN conversion; infinity maps to
           a float32 infinity (exponent 0xFF). */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Jam the low fraction word into a sticky bit, then shift the
       significand down to float32 width, keeping round/sticky bits. */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        /* Set the explicit integer bit and rebias the exponent from
           float128 (0x3FFF) to the form roundAndPackFloat32 expects. */
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}
6851 
6852 /*----------------------------------------------------------------------------
6853 | Returns the result of converting the quadruple-precision floating-point
6854 | value `a' to the double-precision floating-point format.  The conversion
6855 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6856 | Arithmetic.
6857 *----------------------------------------------------------------------------*/
6858 
float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN goes through the canonical-NaN conversion; infinity maps to
           a float64 infinity (exponent 0x7FF). */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Align the 112-bit fraction for float64 rounding and jam the
       remaining low bits into a sticky bit. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        /* Set the explicit integer bit and rebias the exponent from
           float128 (0x3FFF) to the form roundAndPackFloat64 expects. */
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}
6884 
6885 /*----------------------------------------------------------------------------
6886 | Returns the result of converting the quadruple-precision floating-point
6887 | value `a' to the extended double-precision floating-point format.  The
6888 | conversion is performed according to the IEC/IEEE Standard for Binary
6889 | Floating-Point Arithmetic.
6890 *----------------------------------------------------------------------------*/
6891 
floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* Convert via the canonical NaN, then quiet it: this is a
               conversion, not an arithmetic propagation. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        /* Zero stays zero; subnormals are normalized before conversion. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the implicit integer bit explicit, as floatx80 requires. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Align the significand to floatx80's 64-bit layout and round at
       full 80-bit precision. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}
6922 
6923 /*----------------------------------------------------------------------------
6924 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6925 | returns the result as a quadruple-precision floating-point value.  The
6926 | operation is performed according to the IEC/IEEE Standard for Binary
6927 | Floating-Point Arithmetic.
6928 *----------------------------------------------------------------------------*/
6929 
6930 float128 float128_round_to_int(float128 a, float_status *status)
6931 {
6932     bool aSign;
6933     int32_t aExp;
6934     uint64_t lastBitMask, roundBitsMask;
6935     float128 z;
6936 
6937     aExp = extractFloat128Exp( a );
6938     if ( 0x402F <= aExp ) {
6939         if ( 0x406F <= aExp ) {
6940             if (    ( aExp == 0x7FFF )
6941                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6942                ) {
6943                 return propagateFloat128NaN(a, a, status);
6944             }
6945             return a;
6946         }
6947         lastBitMask = 1;
6948         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6949         roundBitsMask = lastBitMask - 1;
6950         z = a;
6951         switch (status->float_rounding_mode) {
6952         case float_round_nearest_even:
6953             if ( lastBitMask ) {
6954                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6955                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6956             }
6957             else {
6958                 if ( (int64_t) z.low < 0 ) {
6959                     ++z.high;
6960                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6961                 }
6962             }
6963             break;
6964         case float_round_ties_away:
6965             if (lastBitMask) {
6966                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6967             } else {
6968                 if ((int64_t) z.low < 0) {
6969                     ++z.high;
6970                 }
6971             }
6972             break;
6973         case float_round_to_zero:
6974             break;
6975         case float_round_up:
6976             if (!extractFloat128Sign(z)) {
6977                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6978             }
6979             break;
6980         case float_round_down:
6981             if (extractFloat128Sign(z)) {
6982                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6983             }
6984             break;
6985         case float_round_to_odd:
6986             /*
6987              * Note that if lastBitMask == 0, the last bit is the lsb
6988              * of high, and roundBitsMask == -1.
6989              */
6990             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
6991                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6992             }
6993             break;
6994         default:
6995             abort();
6996         }
6997         z.low &= ~ roundBitsMask;
6998     }
6999     else {
7000         if ( aExp < 0x3FFF ) {
7001             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7002             float_raise(float_flag_inexact, status);
7003             aSign = extractFloat128Sign( a );
7004             switch (status->float_rounding_mode) {
7005             case float_round_nearest_even:
7006                 if (    ( aExp == 0x3FFE )
7007                      && (   extractFloat128Frac0( a )
7008                           | extractFloat128Frac1( a ) )
7009                    ) {
7010                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7011                 }
7012                 break;
7013             case float_round_ties_away:
7014                 if (aExp == 0x3FFE) {
7015                     return packFloat128(aSign, 0x3FFF, 0, 0);
7016                 }
7017                 break;
7018             case float_round_down:
7019                 return
7020                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7021                     : packFloat128( 0, 0, 0, 0 );
7022             case float_round_up:
7023                 return
7024                       aSign ? packFloat128( 1, 0, 0, 0 )
7025                     : packFloat128( 0, 0x3FFF, 0, 0 );
7026 
7027             case float_round_to_odd:
7028                 return packFloat128(aSign, 0x3FFF, 0, 0);
7029 
7030             case float_round_to_zero:
7031                 break;
7032             }
7033             return packFloat128( aSign, 0, 0, 0 );
7034         }
7035         lastBitMask = 1;
7036         lastBitMask <<= 0x402F - aExp;
7037         roundBitsMask = lastBitMask - 1;
7038         z.low = 0;
7039         z.high = a.high;
7040         switch (status->float_rounding_mode) {
7041         case float_round_nearest_even:
7042             z.high += lastBitMask>>1;
7043             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7044                 z.high &= ~ lastBitMask;
7045             }
7046             break;
7047         case float_round_ties_away:
7048             z.high += lastBitMask>>1;
7049             break;
7050         case float_round_to_zero:
7051             break;
7052         case float_round_up:
7053             if (!extractFloat128Sign(z)) {
7054                 z.high |= ( a.low != 0 );
7055                 z.high += roundBitsMask;
7056             }
7057             break;
7058         case float_round_down:
7059             if (extractFloat128Sign(z)) {
7060                 z.high |= (a.low != 0);
7061                 z.high += roundBitsMask;
7062             }
7063             break;
7064         case float_round_to_odd:
7065             if ((z.high & lastBitMask) == 0) {
7066                 z.high |= (a.low != 0);
7067                 z.high += roundBitsMask;
7068             }
7069             break;
7070         default:
7071             abort();
7072         }
7073         z.high &= ~ roundBitsMask;
7074     }
7075     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7076         float_raise(float_flag_inexact, status);
7077     }
7078     return z;
7079 
7080 }
7081 
7082 /*----------------------------------------------------------------------------
7083 | Returns the result of dividing the quadruple-precision floating-point value
7084 | `a' by the corresponding value `b'.  The operation is performed according to
7085 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7086 *----------------------------------------------------------------------------*/
7087 
float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unpack both operands into sign, biased exponent and 112-bit fraction. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* Special cases: NaN operands, Inf/Inf, x/Inf, division by zero. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / Inf = zero of the result sign. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                /* 0/0 (or Inf/Inf, via the goto above) -> default NaN. */
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            /* Nonzero / zero -> signed infinity with divide-by-zero flag. */
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set the implicit integer bit and left-align both significands. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Ensure the dividend significand is below the divisor's. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /*
     * Estimate the high 64 quotient bits, then correct the (possibly
     * slightly too large) estimate until the partial remainder is
     * non-negative.
     */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Same estimate/correct step for the low 64 quotient bits. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        /* Estimate is near a rounding boundary: refine it exactly. */
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Fold any nonzero remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift back into place, collecting shifted-out bits for rounding. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7168 
7169 /*----------------------------------------------------------------------------
7170 | Returns the remainder of the quadruple-precision floating-point value `a'
7171 | with respect to the corresponding value `b'.  The operation is performed
7172 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7173 *----------------------------------------------------------------------------*/
7174 
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    /* Unpack operands; b's sign is irrelevant to the IEEE remainder. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* Special cases: NaNs propagate; rem(Inf, x) and rem(x, 0) are invalid. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(finite, Inf) = a unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b|/2: a is already the remainder. */
    if ( expDiff < -1 ) return a;
    /* Set implicit bits and left-align; drop one bit if expDiff == -1. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: subtract b once if it fits. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /*
     * Reduce a by (under)estimated multiples of b, 61 bits of quotient
     * at a time, until the remaining exponent difference is small.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        /* Bias the estimate low so the partial remainder stays >= 0. */
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial quotient, truncated to the integer bit position. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder goes negative, keeping the last
       non-negative value as the alternate candidate. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* Round to nearest-even: pick whichever candidate is closer to zero;
       on an exact tie (sigMean == 0) pick the one with even quotient. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7275 
7276 /*----------------------------------------------------------------------------
7277 | Returns the square root of the quadruple-precision floating-point value `a'.
7278 | The operation is performed according to the IEC/IEEE Standard for Binary
7279 | Floating-Point Arithmetic.
7280 *----------------------------------------------------------------------------*/
7281 
float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unpack the operand into sign, biased exponent and 112-bit fraction. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) = -0; any other negative operand is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is half the (unbiased) operand exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Seed the root with a 32-bit estimate, then refine to 64 bits. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the estimate downward until the remainder a - z^2 >= 0. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 root bits via the same estimate/correct scheme. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        /* Estimate is near a rounding boundary: refine it exactly. */
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any nonzero remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift back into place, collecting shifted-out bits for rounding. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7343 
7344 static inline FloatRelation
7345 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7346                           float_status *status)
7347 {
7348     bool aSign, bSign;
7349 
7350     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7351         float_raise(float_flag_invalid, status);
7352         return float_relation_unordered;
7353     }
7354     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7355           ( extractFloatx80Frac( a )<<1 ) ) ||
7356         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7357           ( extractFloatx80Frac( b )<<1 ) )) {
7358         if (!is_quiet ||
7359             floatx80_is_signaling_nan(a, status) ||
7360             floatx80_is_signaling_nan(b, status)) {
7361             float_raise(float_flag_invalid, status);
7362         }
7363         return float_relation_unordered;
7364     }
7365     aSign = extractFloatx80Sign( a );
7366     bSign = extractFloatx80Sign( b );
7367     if ( aSign != bSign ) {
7368 
7369         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7370              ( ( a.low | b.low ) == 0 ) ) {
7371             /* zero case */
7372             return float_relation_equal;
7373         } else {
7374             return 1 - (2 * aSign);
7375         }
7376     } else {
7377         /* Normalize pseudo-denormals before comparison.  */
7378         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7379             ++a.high;
7380         }
7381         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7382             ++b.high;
7383         }
7384         if (a.low == b.low && a.high == b.high) {
7385             return float_relation_equal;
7386         } else {
7387             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7388         }
7389     }
7390 }
7391 
7392 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7393 {
7394     return floatx80_compare_internal(a, b, 0, status);
7395 }
7396 
7397 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7398                                      float_status *status)
7399 {
7400     return floatx80_compare_internal(a, b, 1, status);
7401 }
7402 
7403 static inline FloatRelation
7404 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7405                           float_status *status)
7406 {
7407     bool aSign, bSign;
7408 
7409     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7410           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7411         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7412           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7413         if (!is_quiet ||
7414             float128_is_signaling_nan(a, status) ||
7415             float128_is_signaling_nan(b, status)) {
7416             float_raise(float_flag_invalid, status);
7417         }
7418         return float_relation_unordered;
7419     }
7420     aSign = extractFloat128Sign( a );
7421     bSign = extractFloat128Sign( b );
7422     if ( aSign != bSign ) {
7423         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7424             /* zero case */
7425             return float_relation_equal;
7426         } else {
7427             return 1 - (2 * aSign);
7428         }
7429     } else {
7430         if (a.low == b.low && a.high == b.high) {
7431             return float_relation_equal;
7432         } else {
7433             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7434         }
7435     }
7436 }
7437 
7438 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7439 {
7440     return float128_compare_internal(a, b, 0, status);
7441 }
7442 
7443 FloatRelation float128_compare_quiet(float128 a, float128 b,
7444                                      float_status *status)
7445 {
7446     return float128_compare_internal(a, b, 1, status);
7447 }
7448 
7449 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7450 {
7451     bool aSign;
7452     int32_t aExp;
7453     uint64_t aSig;
7454 
7455     if (floatx80_invalid_encoding(a)) {
7456         float_raise(float_flag_invalid, status);
7457         return floatx80_default_nan(status);
7458     }
7459     aSig = extractFloatx80Frac( a );
7460     aExp = extractFloatx80Exp( a );
7461     aSign = extractFloatx80Sign( a );
7462 
7463     if ( aExp == 0x7FFF ) {
7464         if ( aSig<<1 ) {
7465             return propagateFloatx80NaN(a, a, status);
7466         }
7467         return a;
7468     }
7469 
7470     if (aExp == 0) {
7471         if (aSig == 0) {
7472             return a;
7473         }
7474         aExp++;
7475     }
7476 
7477     if (n > 0x10000) {
7478         n = 0x10000;
7479     } else if (n < -0x10000) {
7480         n = -0x10000;
7481     }
7482 
7483     aExp += n;
7484     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7485                                          aSign, aExp, aSig, 0, status);
7486 }
7487 
7488 float128 float128_scalbn(float128 a, int n, float_status *status)
7489 {
7490     bool aSign;
7491     int32_t aExp;
7492     uint64_t aSig0, aSig1;
7493 
7494     aSig1 = extractFloat128Frac1( a );
7495     aSig0 = extractFloat128Frac0( a );
7496     aExp = extractFloat128Exp( a );
7497     aSign = extractFloat128Sign( a );
7498     if ( aExp == 0x7FFF ) {
7499         if ( aSig0 | aSig1 ) {
7500             return propagateFloat128NaN(a, a, status);
7501         }
7502         return a;
7503     }
7504     if (aExp != 0) {
7505         aSig0 |= UINT64_C(0x0001000000000000);
7506     } else if (aSig0 == 0 && aSig1 == 0) {
7507         return a;
7508     } else {
7509         aExp++;
7510     }
7511 
7512     if (n > 0x10000) {
7513         n = 0x10000;
7514     } else if (n < -0x10000) {
7515         n = -0x10000;
7516     }
7517 
7518     aExp += n - 1;
7519     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7520                                          , status);
7521 
7522 }
7523 
7524 static void __attribute__((constructor)) softfloat_init(void)
7525 {
7526     union_float64 ua, ub, uc, ur;
7527 
7528     if (QEMU_NO_HARDFLOAT) {
7529         return;
7530     }
7531     /*
7532      * Test that the host's FMA is not obviously broken. For example,
7533      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7534      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7535      */
7536     ua.s = 0x0020000000000001ULL;
7537     ub.s = 0x3ca0000000000000ULL;
7538     uc.s = 0x0020000000000000ULL;
7539     ur.h = fma(ua.h, ub.h, uc.h);
7540     if (ur.s != 0x0020000000000001ULL) {
7541         force_soft_fma = true;
7542     }
7543 }
7544