xref: /openbmc/qemu/fpu/softfloat.c (revision 22c355f4)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, guest and host FP instruction semantics differ subtly, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is neither fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that, since the guest rarely
120  * clears the exception flags, we can avoid recomputing some of them. This is
121  * particularly useful for the inexact flag, which is raised very frequently
122  * in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             float_raise(float_flag_input_denormal, s);                  \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
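/*
 * For illustration only (this is not additional generated code): after token
 * pasting, GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 * above expands to roughly:
 *
 *   static inline void float32_input_flush__nocheck(float32 *a, float_status *s)
 *   {
 *       if (unlikely(float32_is_denormal(*a))) {
 *           *a = float32_set_sign(float32_zero, float32_is_neg(*a));
 *           float_raise(float_flag_input_denormal, s);
 *       }
 *   }
 */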
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF   1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF   0
216 #endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226     IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 static inline float32
343 float32_gen2(float32 xa, float32 xb, float_status *s,
344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
345              f32_check_fn pre, f32_check_fn post)
346 {
347     union_float32 ua, ub, ur;
348 
349     ua.s = xa;
350     ub.s = xb;
351 
352     if (unlikely(!can_use_fpu(s))) {
353         goto soft;
354     }
355 
356     float32_input_flush2(&ua.s, &ub.s, s);
357     if (unlikely(!pre(ua, ub))) {
358         goto soft;
359     }
360 
361     ur.h = hard(ua.h, ub.h);
362     if (unlikely(f32_is_inf(ur))) {
363         float_raise(float_flag_overflow, s);
364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
365         goto soft;
366     }
367     return ur.s;
368 
369  soft:
370     return soft(ua.s, ub.s, s);
371 }
372 
373 static inline float64
374 float64_gen2(float64 xa, float64 xb, float_status *s,
375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
376              f64_check_fn pre, f64_check_fn post)
377 {
378     union_float64 ua, ub, ur;
379 
380     ua.s = xa;
381     ub.s = xb;
382 
383     if (unlikely(!can_use_fpu(s))) {
384         goto soft;
385     }
386 
387     float64_input_flush2(&ua.s, &ub.s, s);
388     if (unlikely(!pre(ua, ub))) {
389         goto soft;
390     }
391 
392     ur.h = hard(ua.h, ub.h);
393     if (unlikely(f64_is_inf(ur))) {
394         float_raise(float_flag_overflow, s);
395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
396         goto soft;
397     }
398     return ur.s;
399 
400  soft:
401     return soft(ua.s, ub.s, s);
402 }
403 
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410     return float32_val(a) & 0x007FFFFF;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 static inline int extractFloat32Exp(float32 a)
418 {
419     return (float32_val(a) >> 23) & 0xFF;
420 }
421 
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline bool extractFloat32Sign(float32 a)
427 {
428     return float32_val(a) >> 31;
429 }
430 
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434 
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439 
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443 
444 static inline int extractFloat64Exp(float64 a)
445 {
446     return (float64_val(a) >> 52) & 0x7FF;
447 }
448 
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452 
453 static inline bool extractFloat64Sign(float64 a)
454 {
455     return float64_val(a) >> 63;
456 }
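
/*
 * Worked example (illustrative only; the helper name is ours and it is not
 * used by the rest of the file): the double-precision encoding of 1.0 is
 * 0x3FF0000000000000, i.e. sign 0, biased exponent 0x3FF and fraction 0.
 */
static inline void example_extract_float64_one(void)
{
    float64 one = make_float64(0x3FF0000000000000ULL);

    g_assert(extractFloat64Sign(one) == false);
    g_assert(extractFloat64Exp(one) == 0x3FF);
    g_assert(extractFloat64Frac(one) == 0);
}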
457 
458 /*
459  * Classify a floating point number. Everything from float_class_qnan
460  * upwards is a NaN, so cls >= float_class_qnan tests for any NaN.
461  */
462 
463 typedef enum __attribute__ ((__packed__)) {
464     float_class_unclassified,
465     float_class_zero,
466     float_class_normal,
467     float_class_inf,
468     float_class_qnan,  /* all NaNs from here */
469     float_class_snan,
470 } FloatClass;
471 
472 #define float_cmask(bit)  (1u << (bit))
473 
474 enum {
475     float_cmask_zero    = float_cmask(float_class_zero),
476     float_cmask_normal  = float_cmask(float_class_normal),
477     float_cmask_inf     = float_cmask(float_class_inf),
478     float_cmask_qnan    = float_cmask(float_class_qnan),
479     float_cmask_snan    = float_cmask(float_class_snan),
480 
481     float_cmask_infzero = float_cmask_zero | float_cmask_inf,
482     float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
483 };
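
/*
 * Illustrative sketch (the helper name is ours): the masks above allow
 * several FloatClass values to be tested in one comparison.  For example,
 * the muladd code later in this file detects the invalid Inf * 0 product by
 * OR-ing the two operand masks and comparing against float_cmask_infzero.
 */
static inline bool example_is_inf_times_zero(FloatClass a, FloatClass b)
{
    int ab_mask = float_cmask(a) | float_cmask(b);

    return ab_mask == float_cmask_infzero;
}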
484 
485 
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused)) bool is_nan(FloatClass c)
488 {
489     return unlikely(c >= float_class_qnan);
490 }
491 
492 static inline __attribute__((unused)) bool is_snan(FloatClass c)
493 {
494     return c == float_class_snan;
495 }
496 
497 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
498 {
499     return c == float_class_qnan;
500 }
501 
502 /*
503  * Structure holding all of the decomposed parts of a float.
504  * The exponent is unbiased and the fraction is normalized.
505  *
506  * The fraction words are stored in big-endian word ordering,
507  * so that truncation from a larger format to a smaller format
508  * can be done simply by ignoring subsequent elements.
509  */
510 
511 typedef struct {
512     FloatClass cls;
513     bool sign;
514     int32_t exp;
515     union {
516         /* Routines that know the structure may reference the singular name. */
517         uint64_t frac;
518         /*
519          * Routines expanded with multiple structures reference "hi" and "lo"
520          * depending on the operation.  In FloatParts64, "hi" and "lo" are
521          * both the same word and aliased here.
522          */
523         uint64_t frac_hi;
524         uint64_t frac_lo;
525     };
526 } FloatParts64;
527 
528 typedef struct {
529     FloatClass cls;
530     bool sign;
531     int32_t exp;
532     uint64_t frac_hi;
533     uint64_t frac_lo;
534 } FloatParts128;
535 
536 /* These apply to the most significant word of each FloatPartsN. */
537 #define DECOMPOSED_BINARY_POINT    63
538 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
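
/*
 * Illustrative sketch (the helper name is ours): in this decomposed form the
 * value 1.0 has class normal, an unbiased exponent of 0, and a fraction
 * consisting of just the implicit integer bit at DECOMPOSED_BINARY_POINT.
 */
static inline FloatParts64 example_decomposed_one(void)
{
    FloatParts64 one = {
        .cls = float_class_normal,
        .sign = false,
        .exp = 0,
        .frac = DECOMPOSED_IMPLICIT_BIT,
    };
    return one;
}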
539 
540 /* Structure holding all of the relevant parameters for a format.
541  *   exp_size: the size of the exponent field
542  *   exp_bias: the offset applied to the exponent field
543  *   exp_max: the maximum normalised exponent
544  *   frac_size: the size of the fraction field
545  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
546  * The following are computed based on the size of the fraction
547  *   frac_lsb: least significant bit of fraction
548  *   frac_lsbm1: the bit below the least significant bit (for rounding)
549  *   round_mask/roundeven_mask: masks used for rounding
550  * The following optional modifiers are available:
551  *   arm_althp: handle ARM Alternative Half Precision
552  */
553 typedef struct {
554     int exp_size;
555     int exp_bias;
556     int exp_max;
557     int frac_size;
558     int frac_shift;
559     uint64_t frac_lsb;
560     uint64_t frac_lsbm1;
561     uint64_t round_mask;
562     uint64_t roundeven_mask;
563     bool arm_althp;
564 } FloatFmt;
565 
566 /* Expand fields based on the size of exponent and fraction */
567 #define FLOAT_PARAMS(E, F)                                           \
568     .exp_size       = E,                                             \
569     .exp_bias       = ((1 << E) - 1) >> 1,                           \
570     .exp_max        = (1 << E) - 1,                                  \
571     .frac_size      = F,                                             \
572     .frac_shift     = (-F - 1) & 63,                                 \
573     .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
574     .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
575     .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
576     .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
577 
578 static const FloatFmt float16_params = {
579     FLOAT_PARAMS(5, 10)
580 };
581 
582 static const FloatFmt float16_params_ahp = {
583     FLOAT_PARAMS(5, 10),
584     .arm_althp = true
585 };
586 
587 static const FloatFmt bfloat16_params = {
588     FLOAT_PARAMS(8, 7)
589 };
590 
591 static const FloatFmt float32_params = {
592     FLOAT_PARAMS(8, 23)
593 };
594 
595 static const FloatFmt float64_params = {
596     FLOAT_PARAMS(11, 52)
597 };
598 
599 static const FloatFmt float128_params = {
600     FLOAT_PARAMS(15, 112)
601 };
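
/*
 * Worked example: FLOAT_PARAMS(8, 23), as used for float32_params above,
 * evaluates to
 *   exp_size       = 8
 *   exp_bias       = 127
 *   exp_max        = 255
 *   frac_size      = 23
 *   frac_shift     = 40        ((-23 - 1) & 63)
 *   frac_lsb       = 1ull << 40
 *   frac_lsbm1     = 1ull << 39
 *   round_mask     = (1ull << 40) - 1
 *   roundeven_mask = (1ull << 41) - 1
 * so a normalized float32 fraction occupies bits 62..40 of the 64-bit word,
 * directly below the implicit bit at DECOMPOSED_BINARY_POINT.
 */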
602 
603 /* Unpack a float to parts, but do not canonicalize.  */
604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
605 {
606     const int f_size = fmt->frac_size;
607     const int e_size = fmt->exp_size;
608 
609     *r = (FloatParts64) {
610         .cls = float_class_unclassified,
611         .sign = extract64(raw, f_size + e_size, 1),
612         .exp = extract64(raw, f_size, e_size),
613         .frac = extract64(raw, 0, f_size)
614     };
615 }
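
/*
 * Illustrative sketch (the helper name is ours): unpacking the raw float32
 * encoding of 1.0f, 0x3F800000, yields sign 0, the still-biased exponent
 * 0x7F and a zero fraction.  sf_canonicalize() further below is what removes
 * the exponent bias and inserts the implicit bit.
 */
static inline void example_unpack_raw_float32_one(void)
{
    FloatParts64 p;

    unpack_raw64(&p, &float32_params, 0x3F800000);
    g_assert(!p.sign && p.exp == 0x7F && p.frac == 0);
}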
616 
617 static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
618 {
619     unpack_raw64(p, &float16_params, f);
620 }
621 
622 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
623 {
624     unpack_raw64(p, &bfloat16_params, f);
625 }
626 
627 static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
628 {
629     unpack_raw64(p, &float32_params, f);
630 }
631 
632 static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
633 {
634     unpack_raw64(p, &float64_params, f);
635 }
636 
637 static void float128_unpack_raw(FloatParts128 *p, float128 f)
638 {
639     const int f_size = float128_params.frac_size - 64;
640     const int e_size = float128_params.exp_size;
641 
642     *p = (FloatParts128) {
643         .cls = float_class_unclassified,
644         .sign = extract64(f.high, f_size + e_size, 1),
645         .exp = extract64(f.high, f_size, e_size),
646         .frac_hi = extract64(f.high, 0, f_size),
647         .frac_lo = f.low,
648     };
649 }
650 
651 /* Pack a float from parts, but do not canonicalize.  */
652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
653 {
654     const int f_size = fmt->frac_size;
655     const int e_size = fmt->exp_size;
656     uint64_t ret;
657 
658     ret = (uint64_t)p->sign << (f_size + e_size);
659     ret = deposit64(ret, f_size, e_size, p->exp);
660     ret = deposit64(ret, 0, f_size, p->frac);
661     return ret;
662 }
663 
664 static inline float16 float16_pack_raw(const FloatParts64 *p)
665 {
666     return make_float16(pack_raw64(p, &float16_params));
667 }
668 
669 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
670 {
671     return pack_raw64(p, &bfloat16_params);
672 }
673 
674 static inline float32 float32_pack_raw(const FloatParts64 *p)
675 {
676     return make_float32(pack_raw64(p, &float32_params));
677 }
678 
679 static inline float64 float64_pack_raw(const FloatParts64 *p)
680 {
681     return make_float64(pack_raw64(p, &float64_params));
682 }
683 
684 static float128 float128_pack_raw(const FloatParts128 *p)
685 {
686     const int f_size = float128_params.frac_size - 64;
687     const int e_size = float128_params.exp_size;
688     uint64_t hi;
689 
690     hi = (uint64_t)p->sign << (f_size + e_size);
691     hi = deposit64(hi, f_size, e_size, p->exp);
692     hi = deposit64(hi, 0, f_size, p->frac_hi);
693     return make_float128(hi, p->frac_lo);
694 }
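
/*
 * Illustrative sketch (the helper name is ours): pack_raw64() is the exact
 * inverse of unpack_raw64(), so a raw unpack/pack round trip preserves the
 * original bit pattern, including NaN payloads and denormal encodings.
 */
static inline bool example_float32_raw_roundtrip(float32 f)
{
    FloatParts64 p;

    float32_unpack_raw(&p, f);
    return float32_val(float32_pack_raw(&p)) == float32_val(f);
}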
695 
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine:  (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output.  These details are target-
702 | specific.
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
705 
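/*
 * PARTS_GENERIC_64_128(NAME, P) dispatches on the static type of P: a
 * FloatParts128 * argument selects parts128_NAME, while the default case
 * (FloatParts64 *) falls through to parts64_NAME.
 */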
706 #define PARTS_GENERIC_64_128(NAME, P) \
707     QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
708 
709 #define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
710 #define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)
711 
712 static void parts64_return_nan(FloatParts64 *a, float_status *s);
713 static void parts128_return_nan(FloatParts128 *a, float_status *s);
714 
715 #define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)
716 
717 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
718                                       float_status *s);
719 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
720                                         float_status *s);
721 
722 #define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
723 
724 /*
725  * Helper functions for softfloat-parts.c.inc, per-size operations.
726  */
727 
728 #define FRAC_GENERIC_64_128(NAME, P) \
729     QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
730 
731 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
732 {
733     return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
734 }
735 
736 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
737 {
738     uint64_t ta = a->frac_hi, tb = b->frac_hi;
739     if (ta == tb) {
740         ta = a->frac_lo, tb = b->frac_lo;
741         if (ta == tb) {
742             return 0;
743         }
744     }
745     return ta < tb ? -1 : 1;
746 }
747 
748 #define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)
749 
750 static void frac128_shl(FloatParts128 *a, int c)
751 {
752     shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
753 }
754 
755 #define frac_shl(A, C)             frac128_shl(A, C)
756 
757 static void frac128_shr(FloatParts128 *a, int c)
758 {
759     shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
760 }
761 
762 #define frac_shr(A, C)             frac128_shr(A, C)
763 
764 /* Canonicalize EXP and FRAC, setting CLS.  */
765 static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
766                                   float_status *status)
767 {
768     if (part.exp == parm->exp_max && !parm->arm_althp) {
769         if (part.frac == 0) {
770             part.cls = float_class_inf;
771         } else {
772             part.frac <<= parm->frac_shift;
773             part.cls = (parts_is_snan_frac(part.frac, status)
774                         ? float_class_snan : float_class_qnan);
775         }
776     } else if (part.exp == 0) {
777         if (likely(part.frac == 0)) {
778             part.cls = float_class_zero;
779         } else if (status->flush_inputs_to_zero) {
780             float_raise(float_flag_input_denormal, status);
781             part.cls = float_class_zero;
782             part.frac = 0;
783         } else {
784             int shift = clz64(part.frac);
785             part.cls = float_class_normal;
786             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
787             part.frac <<= shift;
788         }
789     } else {
790         part.cls = float_class_normal;
791         part.exp -= parm->exp_bias;
792         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
793     }
794     return part;
795 }
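
/*
 * Illustrative sketch (the helper name is ours): raw unpack followed by
 * canonicalization turns the float32 encoding of 1.0f into the decomposed
 * form described earlier: class normal, unbiased exponent 0, and a fraction
 * reduced to just the implicit bit.
 */
static inline void example_canonicalize_float32_one(float_status *s)
{
    FloatParts64 p;

    unpack_raw64(&p, &float32_params, 0x3F800000);
    p = sf_canonicalize(p, &float32_params, s);
    g_assert(p.cls == float_class_normal);
    g_assert(p.exp == 0 && p.frac == DECOMPOSED_IMPLICIT_BIT);
}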
796 
797 /* Round and uncanonicalize a floating-point number by parts. There
798  * are FRAC_SHIFT bits that may require rounding at the bottom of the
799  * fraction; these bits will be removed. The exponent will be biased
800  * by EXP_BIAS and must be bounded by [0, EXP_MAX-1].
801  */
802 
803 static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
804                                   const FloatFmt *parm)
805 {
806     const uint64_t frac_lsb = parm->frac_lsb;
807     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
808     const uint64_t round_mask = parm->round_mask;
809     const uint64_t roundeven_mask = parm->roundeven_mask;
810     const int exp_max = parm->exp_max;
811     const int frac_shift = parm->frac_shift;
812     uint64_t frac, inc;
813     int exp, flags = 0;
814     bool overflow_norm;
815 
816     frac = p.frac;
817     exp = p.exp;
818 
819     switch (p.cls) {
820     case float_class_normal:
821         switch (s->float_rounding_mode) {
822         case float_round_nearest_even:
823             overflow_norm = false;
824             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
825             break;
826         case float_round_ties_away:
827             overflow_norm = false;
828             inc = frac_lsbm1;
829             break;
830         case float_round_to_zero:
831             overflow_norm = true;
832             inc = 0;
833             break;
834         case float_round_up:
835             inc = p.sign ? 0 : round_mask;
836             overflow_norm = p.sign;
837             break;
838         case float_round_down:
839             inc = p.sign ? round_mask : 0;
840             overflow_norm = !p.sign;
841             break;
842         case float_round_to_odd:
843             overflow_norm = true;
844             inc = frac & frac_lsb ? 0 : round_mask;
845             break;
846         default:
847             g_assert_not_reached();
848         }
849 
850         exp += parm->exp_bias;
851         if (likely(exp > 0)) {
852             if (frac & round_mask) {
853                 flags |= float_flag_inexact;
854                 if (uadd64_overflow(frac, inc, &frac)) {
855                     frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
856                     exp++;
857                 }
858             }
859             frac >>= frac_shift;
860 
861             if (parm->arm_althp) {
862                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
863                 if (unlikely(exp > exp_max)) {
864                     /* Overflow.  Return the maximum normal.  */
865                     flags = float_flag_invalid;
866                     exp = exp_max;
867                     frac = -1;
868                 }
869             } else if (unlikely(exp >= exp_max)) {
870                 flags |= float_flag_overflow | float_flag_inexact;
871                 if (overflow_norm) {
872                     exp = exp_max - 1;
873                     frac = -1;
874                 } else {
875                     p.cls = float_class_inf;
876                     goto do_inf;
877                 }
878             }
879         } else if (s->flush_to_zero) {
880             flags |= float_flag_output_denormal;
881             p.cls = float_class_zero;
882             goto do_zero;
883         } else {
884             bool is_tiny = s->tininess_before_rounding || (exp < 0);
885 
886             if (!is_tiny) {
887                 uint64_t discard;
888                 is_tiny = !uadd64_overflow(frac, inc, &discard);
889             }
890 
891             shift64RightJamming(frac, 1 - exp, &frac);
892             if (frac & round_mask) {
893                 /* Need to recompute round-to-even.  */
894                 switch (s->float_rounding_mode) {
895                 case float_round_nearest_even:
896                     inc = ((frac & roundeven_mask) != frac_lsbm1
897                            ? frac_lsbm1 : 0);
898                     break;
899                 case float_round_to_odd:
900                     inc = frac & frac_lsb ? 0 : round_mask;
901                     break;
902                 default:
903                     break;
904                 }
905                 flags |= float_flag_inexact;
906                 frac += inc;
907             }
908 
909             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
910             frac >>= frac_shift;
911 
912             if (is_tiny && (flags & float_flag_inexact)) {
913                 flags |= float_flag_underflow;
914             }
915             if (exp == 0 && frac == 0) {
916                 p.cls = float_class_zero;
917             }
918         }
919         break;
920 
921     case float_class_zero:
922     do_zero:
923         exp = 0;
924         frac = 0;
925         break;
926 
927     case float_class_inf:
928     do_inf:
929         assert(!parm->arm_althp);
930         exp = exp_max;
931         frac = 0;
932         break;
933 
934     case float_class_qnan:
935     case float_class_snan:
936         assert(!parm->arm_althp);
937         exp = exp_max;
938         frac >>= parm->frac_shift;
939         break;
940 
941     default:
942         g_assert_not_reached();
943     }
944 
945     float_raise(flags, s);
946     p.exp = exp;
947     p.frac = frac;
948     return p;
949 }
950 
951 static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
952                                   bool inf_zero, float_status *s)
953 {
954     int which;
955 
956     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
957         float_raise(float_flag_invalid, s);
958     }
959 
960     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
961 
962     if (s->default_nan_mode) {
963         /* Note that this check is after pickNaNMulAdd so that function
964          * has an opportunity to set the Invalid flag.
965          */
966         which = 3;
967     }
968 
969     switch (which) {
970     case 0:
971         break;
972     case 1:
973         a = b;
974         break;
975     case 2:
976         a = c;
977         break;
978     case 3:
979         parts_default_nan(&a, s);
980         break;
981     default:
982         g_assert_not_reached();
983     }
984 
985     if (is_snan(a.cls)) {
986         parts_silence_nan(&a, s);
987     }
988     return a;
989 }
990 
991 #define partsN(NAME)   parts64_##NAME
992 #define FloatPartsN    FloatParts64
993 
994 #include "softfloat-parts.c.inc"
995 
996 #undef  partsN
997 #undef  FloatPartsN
998 #define partsN(NAME)   parts128_##NAME
999 #define FloatPartsN    FloatParts128
1000 
1001 #include "softfloat-parts.c.inc"
1002 
1003 #undef  partsN
1004 #undef  FloatPartsN
1005 
1006 /*
1007  * Pack/unpack routines with a specific FloatFmt.
1008  */
1009 
1010 static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
1011                                       float_status *s, const FloatFmt *params)
1012 {
1013     float16_unpack_raw(p, f);
1014     *p = sf_canonicalize(*p, params, s);
1015 }
1016 
1017 static void float16_unpack_canonical(FloatParts64 *p, float16 f,
1018                                      float_status *s)
1019 {
1020     float16a_unpack_canonical(p, f, s, &float16_params);
1021 }
1022 
1023 static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
1024                                       float_status *s)
1025 {
1026     bfloat16_unpack_raw(p, f);
1027     *p = sf_canonicalize(*p, &bfloat16_params, s);
1028 }
1029 
1030 static float16 float16a_round_pack_canonical(FloatParts64 *p,
1031                                              float_status *s,
1032                                              const FloatFmt *params)
1033 {
1034     *p = round_canonical(*p, s, params);
1035     return float16_pack_raw(p);
1036 }
1037 
1038 static float16 float16_round_pack_canonical(FloatParts64 *p,
1039                                             float_status *s)
1040 {
1041     return float16a_round_pack_canonical(p, s, &float16_params);
1042 }
1043 
1044 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
1045                                               float_status *s)
1046 {
1047     *p = round_canonical(*p, s, &bfloat16_params);
1048     return bfloat16_pack_raw(p);
1049 }
1050 
1051 static void float32_unpack_canonical(FloatParts64 *p, float32 f,
1052                                      float_status *s)
1053 {
1054     float32_unpack_raw(p, f);
1055     *p = sf_canonicalize(*p, &float32_params, s);
1056 }
1057 
1058 static float32 float32_round_pack_canonical(FloatParts64 *p,
1059                                             float_status *s)
1060 {
1061     *p = round_canonical(*p, s, &float32_params);
1062     return float32_pack_raw(p);
1063 }
1064 
1065 static void float64_unpack_canonical(FloatParts64 *p, float64 f,
1066                                      float_status *s)
1067 {
1068     float64_unpack_raw(p, f);
1069     *p = sf_canonicalize(*p, &float64_params, s);
1070 }
1071 
1072 static float64 float64_round_pack_canonical(FloatParts64 *p,
1073                                             float_status *s)
1074 {
1075     *p = round_canonical(*p, s, &float64_params);
1076     return float64_pack_raw(p);
1077 }
1078 
1079 /*
1080  * Returns the result of adding or subtracting the floating-point
1081  * values `a' and `b'. The operation is performed
1082  * according to the IEC/IEEE Standard for Binary Floating-Point
1083  * Arithmetic.
1084  */
1085 
1086 static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
1087                                 float_status *s)
1088 {
1089     bool a_sign = a.sign;
1090     bool b_sign = b.sign ^ subtract;
1091 
1092     if (a_sign != b_sign) {
1093         /* Subtraction */
1094 
1095         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1096             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
1097                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1098                 a.frac = a.frac - b.frac;
1099             } else {
1100                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1101                 a.frac = b.frac - a.frac;
1102                 a.exp = b.exp;
1103                 a_sign ^= 1;
1104             }
1105 
1106             if (a.frac == 0) {
1107                 a.cls = float_class_zero;
1108                 a.sign = s->float_rounding_mode == float_round_down;
1109             } else {
1110                 int shift = clz64(a.frac);
1111                 a.frac = a.frac << shift;
1112                 a.exp = a.exp - shift;
1113                 a.sign = a_sign;
1114             }
1115             return a;
1116         }
1117         if (is_nan(a.cls) || is_nan(b.cls)) {
1118             return *parts_pick_nan(&a, &b, s);
1119         }
1120         if (a.cls == float_class_inf) {
1121             if (b.cls == float_class_inf) {
1122                 float_raise(float_flag_invalid, s);
1123                 parts_default_nan(&a, s);
1124             }
1125             return a;
1126         }
1127         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1128             a.sign = s->float_rounding_mode == float_round_down;
1129             return a;
1130         }
1131         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1132             b.sign = a_sign ^ 1;
1133             return b;
1134         }
1135         if (b.cls == float_class_zero) {
1136             return a;
1137         }
1138     } else {
1139         /* Addition */
1140         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1141             if (a.exp > b.exp) {
1142                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1143             } else if (a.exp < b.exp) {
1144                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1145                 a.exp = b.exp;
1146             }
1147 
1148             if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
1149                 shift64RightJamming(a.frac, 1, &a.frac);
1150                 a.frac |= DECOMPOSED_IMPLICIT_BIT;
1151                 a.exp += 1;
1152             }
1153             return a;
1154         }
1155         if (is_nan(a.cls) || is_nan(b.cls)) {
1156             return *parts_pick_nan(&a, &b, s);
1157         }
1158         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1159             return a;
1160         }
1161         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1162             b.sign = b_sign;
1163             return b;
1164         }
1165     }
1166     g_assert_not_reached();
1167 }
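
/*
 * Illustrative sketch (the helper name is ours): one visible consequence of
 * the subtraction path above is IEEE 754's signed-zero rule for exact
 * cancellation: for finite x, x - x is +0 in every rounding mode except
 * round-down, where it is -0.
 */
static inline bool example_cancellation_is_minus_zero(float64 x,
                                                      float_status *s)
{
    return float64_is_neg(float64_sub(x, x, s));
}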
1168 
1169 /*
1170  * Returns the result of adding or subtracting the floating-point
1171  * values `a' and `b'. The operation is performed according to the
1172  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1173  */
1174 
1175 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1176 {
1177     FloatParts64 pa, pb, pr;
1178 
1179     float16_unpack_canonical(&pa, a, status);
1180     float16_unpack_canonical(&pb, b, status);
1181     pr = addsub_floats(pa, pb, false, status);
1182 
1183     return float16_round_pack_canonical(&pr, status);
1184 }
1185 
1186 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1187 {
1188     FloatParts64 pa, pb, pr;
1189 
1190     float16_unpack_canonical(&pa, a, status);
1191     float16_unpack_canonical(&pb, b, status);
1192     pr = addsub_floats(pa, pb, true, status);
1193 
1194     return float16_round_pack_canonical(&pr, status);
1195 }
1196 
1197 static float32 QEMU_SOFTFLOAT_ATTR
1198 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1199 {
1200     FloatParts64 pa, pb, pr;
1201 
1202     float32_unpack_canonical(&pa, a, status);
1203     float32_unpack_canonical(&pb, b, status);
1204     pr = addsub_floats(pa, pb, subtract, status);
1205 
1206     return float32_round_pack_canonical(&pr, status);
1207 }
1208 
1209 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1210 {
1211     return soft_f32_addsub(a, b, false, status);
1212 }
1213 
1214 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1215 {
1216     return soft_f32_addsub(a, b, true, status);
1217 }
1218 
1219 static float64 QEMU_SOFTFLOAT_ATTR
1220 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1221 {
1222     FloatParts64 pa, pb, pr;
1223 
1224     float64_unpack_canonical(&pa, a, status);
1225     float64_unpack_canonical(&pb, b, status);
1226     pr = addsub_floats(pa, pb, subtract, status);
1227 
1228     return float64_round_pack_canonical(&pr, status);
1229 }
1230 
1231 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1232 {
1233     return soft_f64_addsub(a, b, false, status);
1234 }
1235 
1236 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1237 {
1238     return soft_f64_addsub(a, b, true, status);
1239 }
1240 
1241 static float hard_f32_add(float a, float b)
1242 {
1243     return a + b;
1244 }
1245 
1246 static float hard_f32_sub(float a, float b)
1247 {
1248     return a - b;
1249 }
1250 
1251 static double hard_f64_add(double a, double b)
1252 {
1253     return a + b;
1254 }
1255 
1256 static double hard_f64_sub(double a, double b)
1257 {
1258     return a - b;
1259 }
1260 
1261 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1262 {
1263     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1264         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1265     }
1266     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1267 }
1268 
1269 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1270 {
1271     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1272         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1273     } else {
1274         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1275     }
1276 }
1277 
1278 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1279                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1280 {
1281     return float32_gen2(a, b, s, hard, soft,
1282                         f32_is_zon2, f32_addsubmul_post);
1283 }
1284 
1285 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1286                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1287 {
1288     return float64_gen2(a, b, s, hard, soft,
1289                         f64_is_zon2, f64_addsubmul_post);
1290 }
1291 
1292 float32 QEMU_FLATTEN
1293 float32_add(float32 a, float32 b, float_status *s)
1294 {
1295     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1296 }
1297 
1298 float32 QEMU_FLATTEN
1299 float32_sub(float32 a, float32 b, float_status *s)
1300 {
1301     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1302 }
1303 
1304 float64 QEMU_FLATTEN
1305 float64_add(float64 a, float64 b, float_status *s)
1306 {
1307     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1308 }
1309 
1310 float64 QEMU_FLATTEN
1311 float64_sub(float64 a, float64 b, float_status *s)
1312 {
1313     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1314 }
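
/*
 * Usage sketch (the values and helper name are ours): callers supply a
 * float_status that selects the rounding mode and accumulates exception
 * flags.  Here 1.0 + 2.0 is computed with round-to-nearest-even and the
 * result compared against the encoding of 3.0.
 */
static inline bool example_float64_add_usage(void)
{
    float_status s = { .float_rounding_mode = float_round_nearest_even };
    float64 r = float64_add(make_float64(0x3FF0000000000000ULL),   /* 1.0 */
                            make_float64(0x4000000000000000ULL),   /* 2.0 */
                            &s);

    return float64_eq(r, make_float64(0x4008000000000000ULL), &s); /* 3.0 */
}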
1315 
1316 /*
1317  * Returns the result of adding or subtracting the bfloat16
1318  * values `a' and `b'.
1319  */
1320 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1321 {
1322     FloatParts64 pa, pb, pr;
1323 
1324     bfloat16_unpack_canonical(&pa, a, status);
1325     bfloat16_unpack_canonical(&pb, b, status);
1326     pr = addsub_floats(pa, pb, false, status);
1327 
1328     return bfloat16_round_pack_canonical(&pr, status);
1329 }
1330 
1331 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1332 {
1333     FloatParts64 pa, pb, pr;
1334 
1335     bfloat16_unpack_canonical(&pa, a, status);
1336     bfloat16_unpack_canonical(&pb, b, status);
1337     pr = addsub_floats(pa, pb, true, status);
1338 
1339     return bfloat16_round_pack_canonical(&pr, status);
1340 }
1341 
1342 /*
1343  * Returns the result of multiplying the floating-point values `a' and
1344  * `b'. The operation is performed according to the IEC/IEEE Standard
1345  * for Binary Floating-Point Arithmetic.
1346  */
1347 
1348 static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1349 {
1350     bool sign = a.sign ^ b.sign;
1351 
1352     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1353         uint64_t hi, lo;
1354         int exp = a.exp + b.exp;
1355 
1356         mul64To128(a.frac, b.frac, &hi, &lo);
1357         if (hi & DECOMPOSED_IMPLICIT_BIT) {
1358             exp += 1;
1359         } else {
1360             hi <<= 1;
1361         }
1362         hi |= (lo != 0);
1363 
1364         /* Re-use a */
1365         a.exp = exp;
1366         a.sign = sign;
1367         a.frac = hi;
1368         return a;
1369     }
1370     /* handle all the NaN cases */
1371     if (is_nan(a.cls) || is_nan(b.cls)) {
1372         return *parts_pick_nan(&a, &b, s);
1373     }
1374     /* Inf * Zero == NaN */
1375     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1376         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1377         float_raise(float_flag_invalid, s);
1378         parts_default_nan(&a, s);
1379         return a;
1380     }
1381     /* Multiply by 0 or Inf */
1382     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1383         a.sign = sign;
1384         return a;
1385     }
1386     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1387         b.sign = sign;
1388         return b;
1389     }
1390     g_assert_not_reached();
1391 }
1392 
1393 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1394 {
1395     FloatParts64 pa, pb, pr;
1396 
1397     float16_unpack_canonical(&pa, a, status);
1398     float16_unpack_canonical(&pb, b, status);
1399     pr = mul_floats(pa, pb, status);
1400 
1401     return float16_round_pack_canonical(&pr, status);
1402 }
1403 
1404 static float32 QEMU_SOFTFLOAT_ATTR
1405 soft_f32_mul(float32 a, float32 b, float_status *status)
1406 {
1407     FloatParts64 pa, pb, pr;
1408 
1409     float32_unpack_canonical(&pa, a, status);
1410     float32_unpack_canonical(&pb, b, status);
1411     pr = mul_floats(pa, pb, status);
1412 
1413     return float32_round_pack_canonical(&pr, status);
1414 }
1415 
1416 static float64 QEMU_SOFTFLOAT_ATTR
1417 soft_f64_mul(float64 a, float64 b, float_status *status)
1418 {
1419     FloatParts64 pa, pb, pr;
1420 
1421     float64_unpack_canonical(&pa, a, status);
1422     float64_unpack_canonical(&pb, b, status);
1423     pr = mul_floats(pa, pb, status);
1424 
1425     return float64_round_pack_canonical(&pr, status);
1426 }
1427 
1428 static float hard_f32_mul(float a, float b)
1429 {
1430     return a * b;
1431 }
1432 
1433 static double hard_f64_mul(double a, double b)
1434 {
1435     return a * b;
1436 }
1437 
1438 float32 QEMU_FLATTEN
1439 float32_mul(float32 a, float32 b, float_status *s)
1440 {
1441     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1442                         f32_is_zon2, f32_addsubmul_post);
1443 }
1444 
1445 float64 QEMU_FLATTEN
1446 float64_mul(float64 a, float64 b, float_status *s)
1447 {
1448     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1449                         f64_is_zon2, f64_addsubmul_post);
1450 }
1451 
1452 /*
1453  * Returns the result of multiplying the bfloat16
1454  * values `a' and `b'.
1455  */
1456 
1457 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1458 {
1459     FloatParts64 pa, pb, pr;
1460 
1461     bfloat16_unpack_canonical(&pa, a, status);
1462     bfloat16_unpack_canonical(&pb, b, status);
1463     pr = mul_floats(pa, pb, status);
1464 
1465     return bfloat16_round_pack_canonical(&pr, status);
1466 }
1467 
1468 /*
1469  * Returns the result of multiplying the floating-point values `a' and
1470  * `b' then adding 'c', with no intermediate rounding step after the
1471  * multiplication. The operation is performed according to the
1472  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1473  * The flags argument allows the caller to select negation of the
1474  * addend, the intermediate product, or the final result. (The
1475  * difference between this and having the caller do a separate
1476  * negation is that negating externally will flip the sign bit on
1477  * NaNs.)
1478  */
1479 
1480 static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
1481                                 int flags, float_status *s)
1482 {
1483     bool inf_zero, p_sign;
1484     bool sign_flip = flags & float_muladd_negate_result;
1485     FloatClass p_class;
1486     uint64_t hi, lo;
1487     int p_exp;
1488     int ab_mask, abc_mask;
1489 
1490     ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
1491     abc_mask = float_cmask(c.cls) | ab_mask;
1492     inf_zero = ab_mask == float_cmask_infzero;
1493 
1494     /* It is implementation-defined whether the cases of (0,inf,qnan)
1495      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1496      * they return if they do), so we have to hand this information
1497      * off to the target-specific pick-a-NaN routine.
1498      */
1499     if (unlikely(abc_mask & float_cmask_anynan)) {
1500         return pick_nan_muladd(a, b, c, inf_zero, s);
1501     }
1502 
1503     if (inf_zero) {
1504         float_raise(float_flag_invalid, s);
1505         parts_default_nan(&a, s);
1506         return a;
1507     }
1508 
1509     if (flags & float_muladd_negate_c) {
1510         c.sign ^= 1;
1511     }
1512 
1513     p_sign = a.sign ^ b.sign;
1514 
1515     if (flags & float_muladd_negate_product) {
1516         p_sign ^= 1;
1517     }
1518 
1519     if (ab_mask & float_cmask_inf) {
1520         p_class = float_class_inf;
1521     } else if (ab_mask & float_cmask_zero) {
1522         p_class = float_class_zero;
1523     } else {
1524         p_class = float_class_normal;
1525     }
1526 
1527     if (c.cls == float_class_inf) {
1528         if (p_class == float_class_inf && p_sign != c.sign) {
1529             float_raise(float_flag_invalid, s);
1530             parts_default_nan(&c, s);
1531         } else {
1532             c.sign ^= sign_flip;
1533         }
1534         return c;
1535     }
1536 
1537     if (p_class == float_class_inf) {
1538         a.cls = float_class_inf;
1539         a.sign = p_sign ^ sign_flip;
1540         return a;
1541     }
1542 
1543     if (p_class == float_class_zero) {
1544         if (c.cls == float_class_zero) {
1545             if (p_sign != c.sign) {
1546                 p_sign = s->float_rounding_mode == float_round_down;
1547             }
1548             c.sign = p_sign;
1549         } else if (flags & float_muladd_halve_result) {
1550             c.exp -= 1;
1551         }
1552         c.sign ^= sign_flip;
1553         return c;
1554     }
1555 
1556     /* a & b should be normals now... */
1557     assert(a.cls == float_class_normal &&
1558            b.cls == float_class_normal);
1559 
1560     p_exp = a.exp + b.exp;
1561 
1562     mul64To128(a.frac, b.frac, &hi, &lo);
1563 
1564     /* Renormalize to the msb. */
1565     if (hi & DECOMPOSED_IMPLICIT_BIT) {
1566         p_exp += 1;
1567     } else {
1568         shortShift128Left(hi, lo, 1, &hi, &lo);
1569     }
1570 
1571     /* + add/sub */
1572     if (c.cls != float_class_zero) {
1573         int exp_diff = p_exp - c.exp;
1574         if (p_sign == c.sign) {
1575             /* Addition */
1576             if (exp_diff <= 0) {
1577                 shift64RightJamming(hi, -exp_diff, &hi);
1578                 p_exp = c.exp;
1579                 if (uadd64_overflow(hi, c.frac, &hi)) {
1580                     shift64RightJamming(hi, 1, &hi);
1581                     hi |= DECOMPOSED_IMPLICIT_BIT;
1582                     p_exp += 1;
1583                 }
1584             } else {
1585                 uint64_t c_hi, c_lo, over;
1586                 shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
1587                 add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
1588                 if (over) {
1589                     shift64RightJamming(hi, 1, &hi);
1590                     hi |= DECOMPOSED_IMPLICIT_BIT;
1591                     p_exp += 1;
1592                 }
1593             }
1594         } else {
1595             /* Subtraction */
1596             uint64_t c_hi = c.frac, c_lo = 0;
1597 
1598             if (exp_diff <= 0) {
1599                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1600                 if (exp_diff == 0 &&
1601                     (hi > c_hi ||
1602                      (hi == c_hi && lo >= c_lo))) {
1603                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1604                 } else {
1605                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1606                     p_sign ^= 1;
1607                     p_exp = c.exp;
1608                 }
1609             } else {
1610                 shift128RightJamming(c_hi, c_lo,
1611                                      exp_diff,
1612                                      &c_hi, &c_lo);
1613                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1614             }
1615 
1616             if (hi == 0 && lo == 0) {
1617                 a.cls = float_class_zero;
1618                 a.sign = s->float_rounding_mode == float_round_down;
1619                 a.sign ^= sign_flip;
1620                 return a;
1621             } else {
1622                 int shift;
1623                 if (hi != 0) {
1624                     shift = clz64(hi);
1625                 } else {
1626                     shift = clz64(lo) + 64;
1627                 }
1628                 /* Normalizing to a binary point of 124 is the
1629                    correct adjust for the exponent.  However since we're
1630                    shifting, we might as well put the binary point back
1631                    at 63 where we really want it.  Therefore shift as
1632                    if we're leaving 1 bit at the top of the word, but
1633                    adjust the exponent as if we're leaving 3 bits.  */
1634                 shift128Left(hi, lo, shift, &hi, &lo);
1635                 p_exp -= shift;
1636             }
1637         }
1638     }
1639     hi |= (lo != 0);
1640 
1641     if (flags & float_muladd_halve_result) {
1642         p_exp -= 1;
1643     }
1644 
1645     /* finally prepare our result */
1646     a.cls = float_class_normal;
1647     a.sign = p_sign ^ sign_flip;
1648     a.exp = p_exp;
1649     a.frac = hi;
1650 
1651     return a;
1652 }
1653 
1654 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1655                                                 int flags, float_status *status)
1656 {
1657     FloatParts64 pa, pb, pc, pr;
1658 
1659     float16_unpack_canonical(&pa, a, status);
1660     float16_unpack_canonical(&pb, b, status);
1661     float16_unpack_canonical(&pc, c, status);
1662     pr = muladd_floats(pa, pb, pc, flags, status);
1663 
1664     return float16_round_pack_canonical(&pr, status);
1665 }
1666 
1667 static float32 QEMU_SOFTFLOAT_ATTR
1668 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1669                 float_status *status)
1670 {
1671     FloatParts64 pa, pb, pc, pr;
1672 
1673     float32_unpack_canonical(&pa, a, status);
1674     float32_unpack_canonical(&pb, b, status);
1675     float32_unpack_canonical(&pc, c, status);
1676     pr = muladd_floats(pa, pb, pc, flags, status);
1677 
1678     return float32_round_pack_canonical(&pr, status);
1679 }
1680 
1681 static float64 QEMU_SOFTFLOAT_ATTR
1682 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1683                 float_status *status)
1684 {
1685     FloatParts64 pa, pb, pc, pr;
1686 
1687     float64_unpack_canonical(&pa, a, status);
1688     float64_unpack_canonical(&pb, b, status);
1689     float64_unpack_canonical(&pc, c, status);
1690     pr = muladd_floats(pa, pb, pc, flags, status);
1691 
1692     return float64_round_pack_canonical(&pr, status);
1693 }
1694 
1695 static bool force_soft_fma;
1696 
1697 float32 QEMU_FLATTEN
1698 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1699 {
1700     union_float32 ua, ub, uc, ur;
1701 
1702     ua.s = xa;
1703     ub.s = xb;
1704     uc.s = xc;
1705 
1706     if (unlikely(!can_use_fpu(s))) {
1707         goto soft;
1708     }
1709     if (unlikely(flags & float_muladd_halve_result)) {
1710         goto soft;
1711     }
1712 
1713     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1714     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1715         goto soft;
1716     }
1717 
1718     if (unlikely(force_soft_fma)) {
1719         goto soft;
1720     }
1721 
1722     /*
1723      * When (a || b) == 0, there's no need to check for under/over flow,
1724      * since we know the addend is (normal || 0) and the product is 0.
1725      */
1726     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1727         union_float32 up;
1728         bool prod_sign;
1729 
1730         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1731         prod_sign ^= !!(flags & float_muladd_negate_product);
1732         up.s = float32_set_sign(float32_zero, prod_sign);
1733 
1734         if (flags & float_muladd_negate_c) {
1735             uc.h = -uc.h;
1736         }
1737         ur.h = up.h + uc.h;
1738     } else {
1739         union_float32 ua_orig = ua;
1740         union_float32 uc_orig = uc;
1741 
1742         if (flags & float_muladd_negate_product) {
1743             ua.h = -ua.h;
1744         }
1745         if (flags & float_muladd_negate_c) {
1746             uc.h = -uc.h;
1747         }
1748 
1749         ur.h = fmaf(ua.h, ub.h, uc.h);
1750 
1751         if (unlikely(f32_is_inf(ur))) {
1752             float_raise(float_flag_overflow, s);
1753         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1754             ua = ua_orig;
1755             uc = uc_orig;
1756             goto soft;
1757         }
1758     }
1759     if (flags & float_muladd_negate_result) {
1760         return float32_chs(ur.s);
1761     }
1762     return ur.s;
1763 
1764  soft:
1765     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1766 }
1767 
1768 float64 QEMU_FLATTEN
1769 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1770 {
1771     union_float64 ua, ub, uc, ur;
1772 
1773     ua.s = xa;
1774     ub.s = xb;
1775     uc.s = xc;
1776 
1777     if (unlikely(!can_use_fpu(s))) {
1778         goto soft;
1779     }
1780     if (unlikely(flags & float_muladd_halve_result)) {
1781         goto soft;
1782     }
1783 
1784     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1785     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1786         goto soft;
1787     }
1788 
1789     if (unlikely(force_soft_fma)) {
1790         goto soft;
1791     }
1792 
1793     /*
1794      * When (a || b) == 0, there's no need to check for under/over flow,
1795      * since we know the addend is (normal || 0) and the product is 0.
1796      */
1797     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1798         union_float64 up;
1799         bool prod_sign;
1800 
1801         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1802         prod_sign ^= !!(flags & float_muladd_negate_product);
1803         up.s = float64_set_sign(float64_zero, prod_sign);
1804 
1805         if (flags & float_muladd_negate_c) {
1806             uc.h = -uc.h;
1807         }
1808         ur.h = up.h + uc.h;
1809     } else {
1810         union_float64 ua_orig = ua;
1811         union_float64 uc_orig = uc;
1812 
1813         if (flags & float_muladd_negate_product) {
1814             ua.h = -ua.h;
1815         }
1816         if (flags & float_muladd_negate_c) {
1817             uc.h = -uc.h;
1818         }
1819 
1820         ur.h = fma(ua.h, ub.h, uc.h);
1821 
1822         if (unlikely(f64_is_inf(ur))) {
1823             float_raise(float_flag_overflow, s);
1824         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1825             ua = ua_orig;
1826             uc = uc_orig;
1827             goto soft;
1828         }
1829     }
1830     if (flags & float_muladd_negate_result) {
1831         return float64_chs(ur.s);
1832     }
1833     return ur.s;
1834 
1835  soft:
1836     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1837 }
1838 
1839 /*
1840  * Returns the result of multiplying the bfloat16 values `a'
1841  * and `b' then adding `c', with no intermediate rounding step after the
1842  * multiplication.
1843  */
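/*
 * Being fused, this can differ from a separate multiply and add, which
 * round twice.  A sketch, assuming bfloat16 operands a, b, c, a
 * float_status *s, and the bfloat16_mul()/bfloat16_add() helpers
 * defined elsewhere in this file:
 *
 *     bfloat16 fused = bfloat16_muladd(a, b, c, 0, s);
 *     bfloat16 split = bfloat16_add(bfloat16_mul(a, b, s), c, s);
 *
 * fused rounds only the final sum, so it can disagree with split in the
 * last place whenever the intermediate product needed rounding.
 */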
1844 
1845 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1846                                       int flags, float_status *status)
1847 {
1848     FloatParts64 pa, pb, pc, pr;
1849 
1850     bfloat16_unpack_canonical(&pa, a, status);
1851     bfloat16_unpack_canonical(&pb, b, status);
1852     bfloat16_unpack_canonical(&pc, c, status);
1853     pr = muladd_floats(pa, pb, pc, flags, status);
1854 
1855     return bfloat16_round_pack_canonical(&pr, status);
1856 }
1857 
1858 /*
1859  * Returns the result of dividing the floating-point value `a' by the
1860  * corresponding value `b'. The operation is performed according to
1861  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1862  */
1863 
1864 static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1865 {
1866     bool sign = a.sign ^ b.sign;
1867 
1868     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1869         uint64_t n0, n1, q, r;
1870         int exp = a.exp - b.exp;
1871 
1872         /*
1873          * We want a 2*N / N-bit division to produce exactly an N-bit
1874          * result, so that we do not lose any precision and so that we
1875          * do not have to renormalize afterward.  If A.frac < B.frac,
1876          * then division would produce an (N-1)-bit result; shift A left
1877          * by one to produce an N-bit result, and decrement the
1878          * exponent to match.
1879          *
1880          * The udiv_qrnnd algorithm that we're using requires normalization,
1881          * i.e. the msb of the denominator must be set, which is already true.
1882          */
1883         if (a.frac < b.frac) {
1884             exp -= 1;
1885             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1886         } else {
1887             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
1888         }
1889         q = udiv_qrnnd(&r, n1, n0, b.frac);
1890 
1891         /* Set lsb if there is a remainder, to set inexact. */
1892         a.frac = q | (r != 0);
1893         a.sign = sign;
1894         a.exp = exp;
1895         return a;
1896     }
1897     /* handle all the NaN cases */
1898     if (is_nan(a.cls) || is_nan(b.cls)) {
1899         return *parts_pick_nan(&a, &b, s);
1900     }
1901     /* 0/0 or Inf/Inf */
1902     if (a.cls == b.cls &&
1903         (a.cls == float_class_inf ||
1904          a.cls == float_class_zero)) {
1905         float_raise(float_flag_invalid, s);
1906         parts_default_nan(&a, s);
1907         return a;
1908     }
1909     /* Inf / x or 0 / x */
1910     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1911         a.sign = sign;
1912         return a;
1913     }
1914     /* Div 0 => Inf */
1915     if (b.cls == float_class_zero) {
1916         float_raise(float_flag_divbyzero, s);
1917         a.cls = float_class_inf;
1918         a.sign = sign;
1919         return a;
1920     }
1921     /* Div by Inf */
1922     if (b.cls == float_class_inf) {
1923         a.cls = float_class_zero;
1924         a.sign = sign;
1925         return a;
1926     }
1927     g_assert_not_reached();
1928 }
1929 
1930 float16 float16_div(float16 a, float16 b, float_status *status)
1931 {
1932     FloatParts64 pa, pb, pr;
1933 
1934     float16_unpack_canonical(&pa, a, status);
1935     float16_unpack_canonical(&pb, b, status);
1936     pr = div_floats(pa, pb, status);
1937 
1938     return float16_round_pack_canonical(&pr, status);
1939 }
1940 
1941 static float32 QEMU_SOFTFLOAT_ATTR
1942 soft_f32_div(float32 a, float32 b, float_status *status)
1943 {
1944     FloatParts64 pa, pb, pr;
1945 
1946     float32_unpack_canonical(&pa, a, status);
1947     float32_unpack_canonical(&pb, b, status);
1948     pr = div_floats(pa, pb, status);
1949 
1950     return float32_round_pack_canonical(&pr, status);
1951 }
1952 
1953 static float64 QEMU_SOFTFLOAT_ATTR
1954 soft_f64_div(float64 a, float64 b, float_status *status)
1955 {
1956     FloatParts64 pa, pb, pr;
1957 
1958     float64_unpack_canonical(&pa, a, status);
1959     float64_unpack_canonical(&pb, b, status);
1960     pr = div_floats(pa, pb, status);
1961 
1962     return float64_round_pack_canonical(&pr, status);
1963 }
1964 
1965 static float hard_f32_div(float a, float b)
1966 {
1967     return a / b;
1968 }
1969 
1970 static double hard_f64_div(double a, double b)
1971 {
1972     return a / b;
1973 }
1974 
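/*
 * Guards for the hardfloat fast path: the *_div_pre() predicates accept
 * only a zero-or-normal dividend and a strictly normal divisor, and the
 * *_div_post() predicates additionally require a non-zero quotient;
 * anything else is recomputed by soft_f32_div()/soft_f64_div() through
 * float32_gen2()/float64_gen2() below.
 */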
1975 static bool f32_div_pre(union_float32 a, union_float32 b)
1976 {
1977     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1978         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1979                fpclassify(b.h) == FP_NORMAL;
1980     }
1981     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1982 }
1983 
1984 static bool f64_div_pre(union_float64 a, union_float64 b)
1985 {
1986     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1987         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1988                fpclassify(b.h) == FP_NORMAL;
1989     }
1990     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1991 }
1992 
1993 static bool f32_div_post(union_float32 a, union_float32 b)
1994 {
1995     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1996         return fpclassify(a.h) != FP_ZERO;
1997     }
1998     return !float32_is_zero(a.s);
1999 }
2000 
2001 static bool f64_div_post(union_float64 a, union_float64 b)
2002 {
2003     if (QEMU_HARDFLOAT_2F64_USE_FP) {
2004         return fpclassify(a.h) != FP_ZERO;
2005     }
2006     return !float64_is_zero(a.s);
2007 }
2008 
2009 float32 QEMU_FLATTEN
2010 float32_div(float32 a, float32 b, float_status *s)
2011 {
2012     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
2013                         f32_div_pre, f32_div_post);
2014 }
2015 
2016 float64 QEMU_FLATTEN
2017 float64_div(float64 a, float64 b, float_status *s)
2018 {
2019     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
2020                         f64_div_pre, f64_div_post);
2021 }
2022 
2023 /*
2024  * Returns the result of dividing the bfloat16
2025  * value `a' by the corresponding value `b'.
2026  */
2027 
2028 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
2029 {
2030     FloatParts64 pa, pb, pr;
2031 
2032     bfloat16_unpack_canonical(&pa, a, status);
2033     bfloat16_unpack_canonical(&pb, b, status);
2034     pr = div_floats(pa, pb, status);
2035 
2036     return bfloat16_round_pack_canonical(&pr, status);
2037 }
2038 
2039 /*
2040  * Float to Float conversions
2041  *
2042  * Returns the result of converting one float format to another. The
2043  * conversion is performed according to the IEC/IEEE Standard for
2044  * Binary Floating-Point Arithmetic.
2045  *
2046  * The float_to_float helper only needs to take care of raising
2047  * invalid exceptions and handling the conversion on NaNs.
2048  */
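/*
 * For instance, converting to the Arm alternative half-precision format
 * (ieee == false in the helpers below), which has no Inf or NaN
 * encodings, substitutes representable values as implemented by
 * float_to_float() below:
 *
 *     float16 h = float32_to_float16(f, false, s);
 *
 * gives the largest AHP normal (with f's sign) when f is infinite, and
 * a zero carrying the NaN's sign when f is a NaN; both cases raise the
 * invalid exception.  With ieee == true the usual IEEE half-precision
 * rules apply.
 */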
2049 
2050 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
2051                                  float_status *s)
2052 {
2053     if (dstf->arm_althp) {
2054         switch (a.cls) {
2055         case float_class_qnan:
2056         case float_class_snan:
2057             /* There is no NaN in the destination format.  Raise Invalid
2058              * and return a zero with the sign of the input NaN.
2059              */
2060             float_raise(float_flag_invalid, s);
2061             a.cls = float_class_zero;
2062             a.frac = 0;
2063             a.exp = 0;
2064             break;
2065 
2066         case float_class_inf:
2067             /* There is no Inf in the destination format.  Raise Invalid
2068              * and return the maximum normal with the correct sign.
2069              */
2070             float_raise(float_flag_invalid, s);
2071             a.cls = float_class_normal;
2072             a.exp = dstf->exp_max;
2073             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
2074             break;
2075 
2076         default:
2077             break;
2078         }
2079     } else if (is_nan(a.cls)) {
2080         parts_return_nan(&a, s);
2081     }
2082     return a;
2083 }
2084 
2085 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
2086 {
2087     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2088     FloatParts64 pa, pr;
2089 
2090     float16a_unpack_canonical(&pa, a, s, fmt16);
2091     pr = float_to_float(pa, &float32_params, s);
2092     return float32_round_pack_canonical(&pr, s);
2093 }
2094 
2095 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2096 {
2097     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2098     FloatParts64 pa, pr;
2099 
2100     float16a_unpack_canonical(&pa, a, s, fmt16);
2101     pr = float_to_float(pa, &float64_params, s);
2102     return float64_round_pack_canonical(&pr, s);
2103 }
2104 
2105 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2106 {
2107     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2108     FloatParts64 pa, pr;
2109 
2110     float32_unpack_canonical(&pa, a, s);
2111     pr = float_to_float(pa, fmt16, s);
2112     return float16a_round_pack_canonical(&pr, s, fmt16);
2113 }
2114 
2115 static float64 QEMU_SOFTFLOAT_ATTR
2116 soft_float32_to_float64(float32 a, float_status *s)
2117 {
2118     FloatParts64 pa, pr;
2119 
2120     float32_unpack_canonical(&pa, a, s);
2121     pr = float_to_float(pa, &float64_params, s);
2122     return float64_round_pack_canonical(&pr, s);
2123 }
2124 
2125 float64 float32_to_float64(float32 a, float_status *s)
2126 {
2127     if (likely(float32_is_normal(a))) {
2128         /* Widening conversion can never produce inexact results.  */
2129         union_float32 uf;
2130         union_float64 ud;
2131         uf.s = a;
2132         ud.h = uf.h;
2133         return ud.s;
2134     } else if (float32_is_zero(a)) {
2135         return float64_set_sign(float64_zero, float32_is_neg(a));
2136     } else {
2137         return soft_float32_to_float64(a, s);
2138     }
2139 }
2140 
2141 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2142 {
2143     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2144     FloatParts64 pa, pr;
2145 
2146     float64_unpack_canonical(&pa, a, s);
2147     pr = float_to_float(pa, fmt16, s);
2148     return float16a_round_pack_canonical(&pr, s, fmt16);
2149 }
2150 
2151 float32 float64_to_float32(float64 a, float_status *s)
2152 {
2153     FloatParts64 pa, pr;
2154 
2155     float64_unpack_canonical(&pa, a, s);
2156     pr = float_to_float(pa, &float32_params, s);
2157     return float32_round_pack_canonical(&pr, s);
2158 }
2159 
2160 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2161 {
2162     FloatParts64 pa, pr;
2163 
2164     bfloat16_unpack_canonical(&pa, a, s);
2165     pr = float_to_float(pa, &float32_params, s);
2166     return float32_round_pack_canonical(&pr, s);
2167 }
2168 
2169 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2170 {
2171     FloatParts64 pa, pr;
2172 
2173     bfloat16_unpack_canonical(&pa, a, s);
2174     pr = float_to_float(pa, &float64_params, s);
2175     return float64_round_pack_canonical(&pr, s);
2176 }
2177 
2178 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2179 {
2180     FloatParts64 pa, pr;
2181 
2182     float32_unpack_canonical(&pa, a, s);
2183     pr = float_to_float(pa, &bfloat16_params, s);
2184     return bfloat16_round_pack_canonical(&pr, s);
2185 }
2186 
2187 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2188 {
2189     FloatParts64 pa, pr;
2190 
2191     float64_unpack_canonical(&pa, a, s);
2192     pr = float_to_float(pa, &bfloat16_params, s);
2193     return bfloat16_round_pack_canonical(&pr, s);
2194 }
2195 
2196 /*
2197  * Rounds the floating-point value `a' to an integer, and returns the
2198  * result as a floating-point value. The operation is performed
2199  * according to the IEC/IEEE Standard for Binary Floating-Point
2200  * Arithmetic.
2201  */
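/*
 * For example, rounding 2.5 with float64_round_to_int() under the
 * different settings of s->float_rounding_mode:
 *
 *     nearest_even, to_zero, down  ->  2.0
 *     ties_away, up, to_odd        ->  3.0
 *
 * with the inexact flag raised in every case, since 2.5 is not already
 * integral.
 */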
2202 
2203 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
2204                                int scale, float_status *s)
2205 {
2206     switch (a.cls) {
2207     case float_class_qnan:
2208     case float_class_snan:
2209         parts_return_nan(&a, s);
2210         break;
2211 
2212     case float_class_zero:
2213     case float_class_inf:
2214         /* already "integral" */
2215         break;
2216 
2217     case float_class_normal:
2218         scale = MIN(MAX(scale, -0x10000), 0x10000);
2219         a.exp += scale;
2220 
2221         if (a.exp >= DECOMPOSED_BINARY_POINT) {
2222             /* already integral */
2223             break;
2224         }
2225         if (a.exp < 0) {
2226             bool one;
2227             /* all fractional */
2228             float_raise(float_flag_inexact, s);
2229             switch (rmode) {
2230             case float_round_nearest_even:
2231                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
2232                 break;
2233             case float_round_ties_away:
2234                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2235                 break;
2236             case float_round_to_zero:
2237                 one = false;
2238                 break;
2239             case float_round_up:
2240                 one = !a.sign;
2241                 break;
2242             case float_round_down:
2243                 one = a.sign;
2244                 break;
2245             case float_round_to_odd:
2246                 one = true;
2247                 break;
2248             default:
2249                 g_assert_not_reached();
2250             }
2251 
2252             if (one) {
2253                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2254                 a.exp = 0;
2255             } else {
2256                 a.cls = float_class_zero;
2257             }
2258         } else {
2259             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2260             uint64_t frac_lsbm1 = frac_lsb >> 1;
2261             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2262             uint64_t rnd_mask = rnd_even_mask >> 1;
2263             uint64_t inc;
2264 
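            /*
             * frac_lsb is the weight of one integer unit within the
             * fraction (the implicit bit shifted right by the exponent),
             * so rnd_mask covers exactly the fractional bits that must
             * end up zero, frac_lsbm1 marks the half-way point for the
             * ties cases, and rnd_even_mask additionally takes in the
             * integer lsb for the ties-to-even test.
             */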
2265             switch (rmode) {
2266             case float_round_nearest_even:
2267                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2268                 break;
2269             case float_round_ties_away:
2270                 inc = frac_lsbm1;
2271                 break;
2272             case float_round_to_zero:
2273                 inc = 0;
2274                 break;
2275             case float_round_up:
2276                 inc = a.sign ? 0 : rnd_mask;
2277                 break;
2278             case float_round_down:
2279                 inc = a.sign ? rnd_mask : 0;
2280                 break;
2281             case float_round_to_odd:
2282                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2283                 break;
2284             default:
2285                 g_assert_not_reached();
2286             }
2287 
2288             if (a.frac & rnd_mask) {
2289                 float_raise(float_flag_inexact, s);
2290                 if (uadd64_overflow(a.frac, inc, &a.frac)) {
2291                     a.frac >>= 1;
2292                     a.frac |= DECOMPOSED_IMPLICIT_BIT;
2293                     a.exp++;
2294                 }
2295                 a.frac &= ~rnd_mask;
2296             }
2297         }
2298         break;
2299     default:
2300         g_assert_not_reached();
2301     }
2302     return a;
2303 }
2304 
2305 float16 float16_round_to_int(float16 a, float_status *s)
2306 {
2307     FloatParts64 pa, pr;
2308 
2309     float16_unpack_canonical(&pa, a, s);
2310     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2311     return float16_round_pack_canonical(&pr, s);
2312 }
2313 
2314 float32 float32_round_to_int(float32 a, float_status *s)
2315 {
2316     FloatParts64 pa, pr;
2317 
2318     float32_unpack_canonical(&pa, a, s);
2319     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2320     return float32_round_pack_canonical(&pr, s);
2321 }
2322 
2323 float64 float64_round_to_int(float64 a, float_status *s)
2324 {
2325     FloatParts64 pa, pr;
2326 
2327     float64_unpack_canonical(&pa, a, s);
2328     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2329     return float64_round_pack_canonical(&pr, s);
2330 }
2331 
2332 /*
2333  * Rounds the bfloat16 value `a' to an integer, and returns the
2334  * result as a bfloat16 value.
2335  */
2336 
2337 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2338 {
2339     FloatParts64 pa, pr;
2340 
2341     bfloat16_unpack_canonical(&pa, a, s);
2342     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2343     return bfloat16_round_pack_canonical(&pr, s);
2344 }
2345 
2346 /*
2347  * Returns the result of converting the floating-point value `a' to
2348  * the two's complement integer format. The conversion is performed
2349  * according to the IEC/IEEE Standard for Binary Floating-Point
2350  * Arithmetic---which means in particular that the conversion is
2351  * rounded according to the current rounding mode. If `a' is a NaN,
2352  * the largest positive integer is returned. Otherwise, if the
2353  * conversion overflows, the largest integer with the same sign as `a'
2354  * is returned.
2355  */
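/*
 * For example, with the int32 helpers below under round-to-nearest-even:
 *
 *     NaN input                          ->  INT32_MAX, invalid raised
 *     -Inf, or anything below INT32_MIN  ->  INT32_MIN, invalid raised
 *     3.25                               ->  3, inexact raised
 *
 * A sketch of the behaviour of round_to_int_and_pack(); the caller's
 * float_status selects the rounding mode actually used.
 */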
2356 
2357 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
2358                                      int scale, int64_t min, int64_t max,
2359                                      float_status *s)
2360 {
2361     uint64_t r;
2362     int orig_flags = get_float_exception_flags(s);
2363     FloatParts64 p = round_to_int(in, rmode, scale, s);
2364 
2365     switch (p.cls) {
2366     case float_class_snan:
2367     case float_class_qnan:
2368         s->float_exception_flags = orig_flags | float_flag_invalid;
2369         return max;
2370     case float_class_inf:
2371         s->float_exception_flags = orig_flags | float_flag_invalid;
2372         return p.sign ? min : max;
2373     case float_class_zero:
2374         return 0;
2375     case float_class_normal:
2376         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2377             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2378         } else {
2379             r = UINT64_MAX;
2380         }
2381         if (p.sign) {
2382             if (r <= -(uint64_t) min) {
2383                 return -r;
2384             } else {
2385                 s->float_exception_flags = orig_flags | float_flag_invalid;
2386                 return min;
2387             }
2388         } else {
2389             if (r <= max) {
2390                 return r;
2391             } else {
2392                 s->float_exception_flags = orig_flags | float_flag_invalid;
2393                 return max;
2394             }
2395         }
2396     default:
2397         g_assert_not_reached();
2398     }
2399 }
2400 
2401 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2402                               float_status *s)
2403 {
2404     FloatParts64 p;
2405 
2406     float16_unpack_canonical(&p, a, s);
2407     return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
2408 }
2409 
2410 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2411                                 float_status *s)
2412 {
2413     FloatParts64 p;
2414 
2415     float16_unpack_canonical(&p, a, s);
2416     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2417 }
2418 
2419 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2420                                 float_status *s)
2421 {
2422     FloatParts64 p;
2423 
2424     float16_unpack_canonical(&p, a, s);
2425     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2426 }
2427 
2428 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2429                                 float_status *s)
2430 {
2431     FloatParts64 p;
2432 
2433     float16_unpack_canonical(&p, a, s);
2434     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2435 }
2436 
2437 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2438                                 float_status *s)
2439 {
2440     FloatParts64 p;
2441 
2442     float32_unpack_canonical(&p, a, s);
2443     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2444 }
2445 
2446 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2447                                 float_status *s)
2448 {
2449     FloatParts64 p;
2450 
2451     float32_unpack_canonical(&p, a, s);
2452     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2453 }
2454 
2455 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2456                                 float_status *s)
2457 {
2458     FloatParts64 p;
2459 
2460     float32_unpack_canonical(&p, a, s);
2461     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2462 }
2463 
2464 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2465                                 float_status *s)
2466 {
2467     FloatParts64 p;
2468 
2469     float64_unpack_canonical(&p, a, s);
2470     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2471 }
2472 
2473 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2474                                 float_status *s)
2475 {
2476     FloatParts64 p;
2477 
2478     float64_unpack_canonical(&p, a, s);
2479     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2480 }
2481 
2482 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2483                                 float_status *s)
2484 {
2485     FloatParts64 p;
2486 
2487     float64_unpack_canonical(&p, a, s);
2488     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2489 }
2490 
2491 int8_t float16_to_int8(float16 a, float_status *s)
2492 {
2493     return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2494 }
2495 
2496 int16_t float16_to_int16(float16 a, float_status *s)
2497 {
2498     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2499 }
2500 
2501 int32_t float16_to_int32(float16 a, float_status *s)
2502 {
2503     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2504 }
2505 
2506 int64_t float16_to_int64(float16 a, float_status *s)
2507 {
2508     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2509 }
2510 
2511 int16_t float32_to_int16(float32 a, float_status *s)
2512 {
2513     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2514 }
2515 
2516 int32_t float32_to_int32(float32 a, float_status *s)
2517 {
2518     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2519 }
2520 
2521 int64_t float32_to_int64(float32 a, float_status *s)
2522 {
2523     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2524 }
2525 
2526 int16_t float64_to_int16(float64 a, float_status *s)
2527 {
2528     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2529 }
2530 
2531 int32_t float64_to_int32(float64 a, float_status *s)
2532 {
2533     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2534 }
2535 
2536 int64_t float64_to_int64(float64 a, float_status *s)
2537 {
2538     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2539 }
2540 
2541 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2542 {
2543     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2544 }
2545 
2546 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2547 {
2548     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2549 }
2550 
2551 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2552 {
2553     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2554 }
2555 
2556 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2557 {
2558     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2559 }
2560 
2561 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2562 {
2563     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2564 }
2565 
2566 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2567 {
2568     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2569 }
2570 
2571 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2572 {
2573     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2574 }
2575 
2576 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2577 {
2578     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2579 }
2580 
2581 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2582 {
2583     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2584 }
2585 
2586 /*
2587  * Returns the result of converting the floating-point value `a' to
2588  * the two's complement integer format.
2589  */
2590 
2591 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2592                                  float_status *s)
2593 {
2594     FloatParts64 p;
2595 
2596     bfloat16_unpack_canonical(&p, a, s);
2597     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2598 }
2599 
2600 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2601                                  float_status *s)
2602 {
2603     FloatParts64 p;
2604 
2605     bfloat16_unpack_canonical(&p, a, s);
2606     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2607 }
2608 
2609 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2610                                  float_status *s)
2611 {
2612     FloatParts64 p;
2613 
2614     bfloat16_unpack_canonical(&p, a, s);
2615     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2616 }
2617 
2618 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2619 {
2620     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2621 }
2622 
2623 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2624 {
2625     return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2626 }
2627 
2628 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2629 {
2630     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2631 }
2632 
2633 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2634 {
2635     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2636 }
2637 
2638 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2639 {
2640     return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2641 }
2642 
2643 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2644 {
2645     return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2646 }
2647 
2648 /*
2649  *  Returns the result of converting the floating-point value `a' to
2650  *  the unsigned integer format. The conversion is performed according
2651  *  to the IEC/IEEE Standard for Binary Floating-Point
2652  *  Arithmetic---which means in particular that the conversion is
2653  *  rounded according to the current rounding mode. If `a' is a NaN,
2654  *  the largest unsigned integer is returned. Otherwise, if the
2655  *  conversion overflows, the largest unsigned integer is returned. If
2656  *  `a' is negative, zero is returned; negative values that do not
2657  *  round to zero raise the invalid exception flag, while those that
2658  *  round to zero inexactly raise only the inexact flag.
2659  */
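/*
 * For example, with the uint32 helpers below under round-to-nearest-even:
 *
 *     NaN input  ->  UINT32_MAX, invalid raised
 *     -0.25      ->  0, only inexact raised (the value rounds to zero)
 *     -1.5       ->  0, invalid raised
 *
 * A sketch of the behaviour of round_to_uint_and_pack() below.
 */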
2660 
2661 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
2662                                        int scale, uint64_t max,
2663                                        float_status *s)
2664 {
2665     int orig_flags = get_float_exception_flags(s);
2666     FloatParts64 p = round_to_int(in, rmode, scale, s);
2667     uint64_t r;
2668 
2669     switch (p.cls) {
2670     case float_class_snan:
2671     case float_class_qnan:
2672         s->float_exception_flags = orig_flags | float_flag_invalid;
2673         return max;
2674     case float_class_inf:
2675         s->float_exception_flags = orig_flags | float_flag_invalid;
2676         return p.sign ? 0 : max;
2677     case float_class_zero:
2678         return 0;
2679     case float_class_normal:
2680         if (p.sign) {
2681             s->float_exception_flags = orig_flags | float_flag_invalid;
2682             return 0;
2683         }
2684 
2685         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2686             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2687         } else {
2688             s->float_exception_flags = orig_flags | float_flag_invalid;
2689             return max;
2690         }
2691 
2692         /* For uint64 this will never trip, but if p.exp is too large
2693          * to shift a decomposed fraction we shall have exited via the
2694          * 3rd leg above.
2695          */
2696         if (r > max) {
2697             s->float_exception_flags = orig_flags | float_flag_invalid;
2698             return max;
2699         }
2700         return r;
2701     default:
2702         g_assert_not_reached();
2703     }
2704 }
2705 
2706 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2707                                 float_status *s)
2708 {
2709     FloatParts64 p;
2710 
2711     float16_unpack_canonical(&p, a, s);
2712     return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
2713 }
2714 
2715 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2716                                   float_status *s)
2717 {
2718     FloatParts64 p;
2719 
2720     float16_unpack_canonical(&p, a, s);
2721     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2722 }
2723 
2724 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2725                                   float_status *s)
2726 {
2727     FloatParts64 p;
2728 
2729     float16_unpack_canonical(&p, a, s);
2730     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2731 }
2732 
2733 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2734                                   float_status *s)
2735 {
2736     FloatParts64 p;
2737 
2738     float16_unpack_canonical(&p, a, s);
2739     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2740 }
2741 
2742 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2743                                   float_status *s)
2744 {
2745     FloatParts64 p;
2746 
2747     float32_unpack_canonical(&p, a, s);
2748     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2749 }
2750 
2751 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2752                                   float_status *s)
2753 {
2754     FloatParts64 p;
2755 
2756     float32_unpack_canonical(&p, a, s);
2757     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2758 }
2759 
2760 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2761                                   float_status *s)
2762 {
2763     FloatParts64 p;
2764 
2765     float32_unpack_canonical(&p, a, s);
2766     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2767 }
2768 
2769 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2770                                   float_status *s)
2771 {
2772     FloatParts64 p;
2773 
2774     float64_unpack_canonical(&p, a, s);
2775     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2776 }
2777 
2778 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2779                                   float_status *s)
2780 {
2781     FloatParts64 p;
2782 
2783     float64_unpack_canonical(&p, a, s);
2784     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2785 }
2786 
2787 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2788                                   float_status *s)
2789 {
2790     FloatParts64 p;
2791 
2792     float64_unpack_canonical(&p, a, s);
2793     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2794 }
2795 
2796 uint8_t float16_to_uint8(float16 a, float_status *s)
2797 {
2798     return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2799 }
2800 
2801 uint16_t float16_to_uint16(float16 a, float_status *s)
2802 {
2803     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2804 }
2805 
2806 uint32_t float16_to_uint32(float16 a, float_status *s)
2807 {
2808     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2809 }
2810 
2811 uint64_t float16_to_uint64(float16 a, float_status *s)
2812 {
2813     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2814 }
2815 
2816 uint16_t float32_to_uint16(float32 a, float_status *s)
2817 {
2818     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2819 }
2820 
2821 uint32_t float32_to_uint32(float32 a, float_status *s)
2822 {
2823     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2824 }
2825 
2826 uint64_t float32_to_uint64(float32 a, float_status *s)
2827 {
2828     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2829 }
2830 
2831 uint16_t float64_to_uint16(float64 a, float_status *s)
2832 {
2833     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2834 }
2835 
2836 uint32_t float64_to_uint32(float64 a, float_status *s)
2837 {
2838     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2839 }
2840 
2841 uint64_t float64_to_uint64(float64 a, float_status *s)
2842 {
2843     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2844 }
2845 
2846 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2847 {
2848     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2849 }
2850 
2851 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2852 {
2853     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2854 }
2855 
2856 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2857 {
2858     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2859 }
2860 
2861 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2862 {
2863     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2864 }
2865 
2866 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2867 {
2868     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2869 }
2870 
2871 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2872 {
2873     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2874 }
2875 
2876 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2877 {
2878     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2879 }
2880 
2881 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2882 {
2883     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2884 }
2885 
2886 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2887 {
2888     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2889 }
2890 
2891 /*
2892  *  Returns the result of converting the bfloat16 value `a' to
2893  *  the unsigned integer format.
2894  */
2895 
2896 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2897                                    int scale, float_status *s)
2898 {
2899     FloatParts64 p;
2900 
2901     bfloat16_unpack_canonical(&p, a, s);
2902     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2903 }
2904 
2905 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2906                                    int scale, float_status *s)
2907 {
2908     FloatParts64 p;
2909 
2910     bfloat16_unpack_canonical(&p, a, s);
2911     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2912 }
2913 
2914 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2915                                    int scale, float_status *s)
2916 {
2917     FloatParts64 p;
2918 
2919     bfloat16_unpack_canonical(&p, a, s);
2920     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2921 }
2922 
2923 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2924 {
2925     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2926 }
2927 
2928 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2929 {
2930     return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2931 }
2932 
2933 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2934 {
2935     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2936 }
2937 
2938 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2939 {
2940     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2941 }
2942 
2943 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2944 {
2945     return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2946 }
2947 
2948 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2949 {
2950     return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2951 }
2952 
2953 /*
2954  * Integer to float conversions
2955  *
2956  * Returns the result of converting the two's complement integer `a'
2957  * to the floating-point format. The conversion is performed according
2958  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2959  */
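/*
 * For instance, int32_to_float64() is always exact (32 bits fit in the
 * 53-bit double significand), whereas a narrower destination may have
 * to round.  Under the default nearest-even rounding:
 *
 *     float32 f = int32_to_float32(16777217, s);
 *
 * returns 16777216.0 and raises inexact, because 2^24 + 1 has one more
 * significant bit than a float32 can hold.
 */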
2960 
2961 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2962 {
2963     FloatParts64 r = { .sign = false };
2964 
2965     if (a == 0) {
2966         r.cls = float_class_zero;
2967     } else {
2968         uint64_t f = a;
2969         int shift;
2970 
2971         r.cls = float_class_normal;
2972         if (a < 0) {
2973             f = -f;
2974             r.sign = true;
2975         }
2976         shift = clz64(f);
2977         scale = MIN(MAX(scale, -0x10000), 0x10000);
2978 
2979         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2980         r.frac = f << shift;
2981     }
2982 
2983     return r;
2984 }
2985 
2986 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2987 {
2988     FloatParts64 pa = int_to_float(a, scale, status);
2989     return float16_round_pack_canonical(&pa, status);
2990 }
2991 
2992 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2993 {
2994     return int64_to_float16_scalbn(a, scale, status);
2995 }
2996 
2997 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2998 {
2999     return int64_to_float16_scalbn(a, scale, status);
3000 }
3001 
3002 float16 int64_to_float16(int64_t a, float_status *status)
3003 {
3004     return int64_to_float16_scalbn(a, 0, status);
3005 }
3006 
3007 float16 int32_to_float16(int32_t a, float_status *status)
3008 {
3009     return int64_to_float16_scalbn(a, 0, status);
3010 }
3011 
3012 float16 int16_to_float16(int16_t a, float_status *status)
3013 {
3014     return int64_to_float16_scalbn(a, 0, status);
3015 }
3016 
3017 float16 int8_to_float16(int8_t a, float_status *status)
3018 {
3019     return int64_to_float16_scalbn(a, 0, status);
3020 }
3021 
3022 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
3023 {
3024     FloatParts64 pa = int_to_float(a, scale, status);
3025     return float32_round_pack_canonical(&pa, status);
3026 }
3027 
3028 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
3029 {
3030     return int64_to_float32_scalbn(a, scale, status);
3031 }
3032 
3033 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
3034 {
3035     return int64_to_float32_scalbn(a, scale, status);
3036 }
3037 
3038 float32 int64_to_float32(int64_t a, float_status *status)
3039 {
3040     return int64_to_float32_scalbn(a, 0, status);
3041 }
3042 
3043 float32 int32_to_float32(int32_t a, float_status *status)
3044 {
3045     return int64_to_float32_scalbn(a, 0, status);
3046 }
3047 
3048 float32 int16_to_float32(int16_t a, float_status *status)
3049 {
3050     return int64_to_float32_scalbn(a, 0, status);
3051 }
3052 
3053 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
3054 {
3055     FloatParts64 pa = int_to_float(a, scale, status);
3056     return float64_round_pack_canonical(&pa, status);
3057 }
3058 
3059 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
3060 {
3061     return int64_to_float64_scalbn(a, scale, status);
3062 }
3063 
3064 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
3065 {
3066     return int64_to_float64_scalbn(a, scale, status);
3067 }
3068 
3069 float64 int64_to_float64(int64_t a, float_status *status)
3070 {
3071     return int64_to_float64_scalbn(a, 0, status);
3072 }
3073 
3074 float64 int32_to_float64(int32_t a, float_status *status)
3075 {
3076     return int64_to_float64_scalbn(a, 0, status);
3077 }
3078 
3079 float64 int16_to_float64(int16_t a, float_status *status)
3080 {
3081     return int64_to_float64_scalbn(a, 0, status);
3082 }
3083 
3084 /*
3085  * Returns the result of converting the two's complement integer `a'
3086  * to the bfloat16 format.
3087  */
3088 
3089 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
3090 {
3091     FloatParts64 pa = int_to_float(a, scale, status);
3092     return bfloat16_round_pack_canonical(&pa, status);
3093 }
3094 
3095 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
3096 {
3097     return int64_to_bfloat16_scalbn(a, scale, status);
3098 }
3099 
3100 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
3101 {
3102     return int64_to_bfloat16_scalbn(a, scale, status);
3103 }
3104 
3105 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
3106 {
3107     return int64_to_bfloat16_scalbn(a, 0, status);
3108 }
3109 
3110 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
3111 {
3112     return int64_to_bfloat16_scalbn(a, 0, status);
3113 }
3114 
3115 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
3116 {
3117     return int64_to_bfloat16_scalbn(a, 0, status);
3118 }
3119 
3120 /*
3121  * Unsigned Integer to float conversions
3122  *
3123  * Returns the result of converting the unsigned integer `a' to the
3124  * floating-point format. The conversion is performed according to the
3125  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3126  */
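/*
 * The scale argument of the *_scalbn() variants multiplies the result
 * by 2^scale, which suits fixed-point sources.  For example, treating a
 * 32-bit register value as a Q0.32 fraction:
 *
 *     float64 f = uint32_to_float64_scalbn(reg, -32, s);
 *
 * yields reg / 2^32 exactly, since a 32-bit value always fits in the
 * float64 significand.
 */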
3127 
3128 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3129 {
3130     FloatParts64 r = { .sign = false };
3131     int shift;
3132 
3133     if (a == 0) {
3134         r.cls = float_class_zero;
3135     } else {
3136         scale = MIN(MAX(scale, -0x10000), 0x10000);
3137         shift = clz64(a);
3138         r.cls = float_class_normal;
3139         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3140         r.frac = a << shift;
3141     }
3142 
3143     return r;
3144 }
3145 
3146 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
3147 {
3148     FloatParts64 pa = uint_to_float(a, scale, status);
3149     return float16_round_pack_canonical(&pa, status);
3150 }
3151 
3152 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
3153 {
3154     return uint64_to_float16_scalbn(a, scale, status);
3155 }
3156 
3157 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
3158 {
3159     return uint64_to_float16_scalbn(a, scale, status);
3160 }
3161 
3162 float16 uint64_to_float16(uint64_t a, float_status *status)
3163 {
3164     return uint64_to_float16_scalbn(a, 0, status);
3165 }
3166 
3167 float16 uint32_to_float16(uint32_t a, float_status *status)
3168 {
3169     return uint64_to_float16_scalbn(a, 0, status);
3170 }
3171 
3172 float16 uint16_to_float16(uint16_t a, float_status *status)
3173 {
3174     return uint64_to_float16_scalbn(a, 0, status);
3175 }
3176 
3177 float16 uint8_to_float16(uint8_t a, float_status *status)
3178 {
3179     return uint64_to_float16_scalbn(a, 0, status);
3180 }
3181 
3182 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
3183 {
3184     FloatParts64 pa = uint_to_float(a, scale, status);
3185     return float32_round_pack_canonical(&pa, status);
3186 }
3187 
3188 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
3189 {
3190     return uint64_to_float32_scalbn(a, scale, status);
3191 }
3192 
3193 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
3194 {
3195     return uint64_to_float32_scalbn(a, scale, status);
3196 }
3197 
3198 float32 uint64_to_float32(uint64_t a, float_status *status)
3199 {
3200     return uint64_to_float32_scalbn(a, 0, status);
3201 }
3202 
3203 float32 uint32_to_float32(uint32_t a, float_status *status)
3204 {
3205     return uint64_to_float32_scalbn(a, 0, status);
3206 }
3207 
3208 float32 uint16_to_float32(uint16_t a, float_status *status)
3209 {
3210     return uint64_to_float32_scalbn(a, 0, status);
3211 }
3212 
3213 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
3214 {
3215     FloatParts64 pa = uint_to_float(a, scale, status);
3216     return float64_round_pack_canonical(&pa, status);
3217 }
3218 
3219 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
3220 {
3221     return uint64_to_float64_scalbn(a, scale, status);
3222 }
3223 
3224 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
3225 {
3226     return uint64_to_float64_scalbn(a, scale, status);
3227 }
3228 
3229 float64 uint64_to_float64(uint64_t a, float_status *status)
3230 {
3231     return uint64_to_float64_scalbn(a, 0, status);
3232 }
3233 
3234 float64 uint32_to_float64(uint32_t a, float_status *status)
3235 {
3236     return uint64_to_float64_scalbn(a, 0, status);
3237 }
3238 
3239 float64 uint16_to_float64(uint16_t a, float_status *status)
3240 {
3241     return uint64_to_float64_scalbn(a, 0, status);
3242 }
3243 
3244 /*
3245  * Returns the result of converting the unsigned integer `a' to the
3246  * bfloat16 format.
3247  */
3248 
3249 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3250 {
3251     FloatParts64 pa = uint_to_float(a, scale, status);
3252     return bfloat16_round_pack_canonical(&pa, status);
3253 }
3254 
3255 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3256 {
3257     return uint64_to_bfloat16_scalbn(a, scale, status);
3258 }
3259 
3260 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3261 {
3262     return uint64_to_bfloat16_scalbn(a, scale, status);
3263 }
3264 
3265 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3266 {
3267     return uint64_to_bfloat16_scalbn(a, 0, status);
3268 }
3269 
3270 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3271 {
3272     return uint64_to_bfloat16_scalbn(a, 0, status);
3273 }
3274 
3275 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3276 {
3277     return uint64_to_bfloat16_scalbn(a, 0, status);
3278 }
3279 
3280 /* Float Min/Max */
3281 /* min() and max() functions. These can't be implemented as
3282  * 'compare and pick one input' because that would mishandle
3283  * NaNs and +0 vs -0.
3284  *
3285  * minnum() and maxnum() functions. These are similar to the min()
3286  * and max() functions but if one of the arguments is a QNaN and
3287  * the other is numerical then the numerical argument is returned.
3288  * SNaNs will get quietened before being returned.
3289  * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
3290  * and maxNum() operations. min() and max() are the typical min/max
3291  * semantics provided by many CPUs which predate that specification.
3292  *
3293  * minnummag() and maxnummag() functions correspond to the minNumMag()
3294  * and maxNumMag() operations from IEEE 754-2008.
3295  */
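
/*
 * Illustrative example (not from the original sources) of the difference
 * described above, assuming `st' is a caller-initialised float_status:
 *
 *     float32 qnan = float32_default_nan(&st);
 *     float32 one  = make_float32(0x3f800000);
 *
 *     float32_min(qnan, one, &st);     returns a NaN (legacy semantics)
 *     float32_minnum(qnan, one, &st);  returns 1.0 (IEEE 754-2008 minNum)
 */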
3296 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
3297                                 bool ieee, bool ismag, float_status *s)
3298 {
3299     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3300         if (ieee) {
3301             /* Takes two floating-point values `a' and `b', one of
3302              * which is a NaN, and returns the appropriate NaN
3303              * result. If either `a' or `b' is a signaling NaN,
3304              * the invalid exception is raised.
3305              */
3306             if (is_snan(a.cls) || is_snan(b.cls)) {
3307                 return *parts_pick_nan(&a, &b, s);
3308             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3309                 return b;
3310             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3311                 return a;
3312             }
3313         }
3314         return *parts_pick_nan(&a, &b, s);
3315     } else {
3316         int a_exp, b_exp;
3317 
3318         switch (a.cls) {
3319         case float_class_normal:
3320             a_exp = a.exp;
3321             break;
3322         case float_class_inf:
3323             a_exp = INT_MAX;
3324             break;
3325         case float_class_zero:
3326             a_exp = INT_MIN;
3327             break;
3328         default:
3329             g_assert_not_reached();
3330             break;
3331         }
3332         switch (b.cls) {
3333         case float_class_normal:
3334             b_exp = b.exp;
3335             break;
3336         case float_class_inf:
3337             b_exp = INT_MAX;
3338             break;
3339         case float_class_zero:
3340             b_exp = INT_MIN;
3341             break;
3342         default:
3343             g_assert_not_reached();
3344             break;
3345         }
3346 
3347         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3348             bool a_less = a_exp < b_exp;
3349             if (a_exp == b_exp) {
3350                 a_less = a.frac < b.frac;
3351             }
3352             return a_less ^ ismin ? b : a;
3353         }
3354 
3355         if (a.sign == b.sign) {
3356             bool a_less = a_exp < b_exp;
3357             if (a_exp == b_exp) {
3358                 a_less = a.frac < b.frac;
3359             }
3360             return a.sign ^ a_less ^ ismin ? b : a;
3361         } else {
3362             return a.sign ^ ismin ? b : a;
3363         }
3364     }
3365 }
3366 
3367 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3368 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3369                                      float_status *s)                   \
3370 {                                                                       \
3371     FloatParts64 pa, pb, pr;                                            \
3372     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3373     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3374     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3375     return float ## sz ## _round_pack_canonical(&pr, s);                \
3376 }
3377 
3378 MINMAX(16, min, true, false, false)
3379 MINMAX(16, minnum, true, true, false)
3380 MINMAX(16, minnummag, true, true, true)
3381 MINMAX(16, max, false, false, false)
3382 MINMAX(16, maxnum, false, true, false)
3383 MINMAX(16, maxnummag, false, true, true)
3384 
3385 MINMAX(32, min, true, false, false)
3386 MINMAX(32, minnum, true, true, false)
3387 MINMAX(32, minnummag, true, true, true)
3388 MINMAX(32, max, false, false, false)
3389 MINMAX(32, maxnum, false, true, false)
3390 MINMAX(32, maxnummag, false, true, true)
3391 
3392 MINMAX(64, min, true, false, false)
3393 MINMAX(64, minnum, true, true, false)
3394 MINMAX(64, minnummag, true, true, true)
3395 MINMAX(64, max, false, false, false)
3396 MINMAX(64, maxnum, false, true, false)
3397 MINMAX(64, maxnummag, false, true, true)
3398 
3399 #undef MINMAX
3400 
3401 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3402 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3403 {                                                                       \
3404     FloatParts64 pa, pb, pr;                                            \
3405     bfloat16_unpack_canonical(&pa, a, s);                               \
3406     bfloat16_unpack_canonical(&pb, b, s);                               \
3407     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3408     return bfloat16_round_pack_canonical(&pr, s);                       \
3409 }
3410 
3411 BF16_MINMAX(min, true, false, false)
3412 BF16_MINMAX(minnum, true, true, false)
3413 BF16_MINMAX(minnummag, true, true, true)
3414 BF16_MINMAX(max, false, false, false)
3415 BF16_MINMAX(maxnum, false, true, false)
3416 BF16_MINMAX(maxnummag, false, true, true)
3417 
3418 #undef BF16_MINMAX
3419 
3420 /* Floating point compare */
3421 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
3422                                     float_status *s)
3423 {
3424     if (is_nan(a.cls) || is_nan(b.cls)) {
3425         if (!is_quiet ||
3426             a.cls == float_class_snan ||
3427             b.cls == float_class_snan) {
3428             float_raise(float_flag_invalid, s);
3429         }
3430         return float_relation_unordered;
3431     }
3432 
3433     if (a.cls == float_class_zero) {
3434         if (b.cls == float_class_zero) {
3435             return float_relation_equal;
3436         }
3437         return b.sign ? float_relation_greater : float_relation_less;
3438     } else if (b.cls == float_class_zero) {
3439         return a.sign ? float_relation_less : float_relation_greater;
3440     }
3441 
3442     /* The only really important thing about infinity is its sign. If
3443      * both operands are infinities, the sign determines which one is smaller.
3444      */
3445     if (a.cls == float_class_inf) {
3446         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3447             return float_relation_equal;
3448         }
3449         return a.sign ? float_relation_less : float_relation_greater;
3450     } else if (b.cls == float_class_inf) {
3451         return b.sign ? float_relation_greater : float_relation_less;
3452     }
3453 
3454     if (a.sign != b.sign) {
3455         return a.sign ? float_relation_less : float_relation_greater;
3456     }
3457 
3458     if (a.exp == b.exp) {
3459         if (a.frac == b.frac) {
3460             return float_relation_equal;
3461         }
3462         if (a.sign) {
3463             return a.frac > b.frac ?
3464                 float_relation_less : float_relation_greater;
3465         } else {
3466             return a.frac > b.frac ?
3467                 float_relation_greater : float_relation_less;
3468         }
3469     } else {
3470         if (a.sign) {
3471             return a.exp > b.exp ? float_relation_less : float_relation_greater;
3472         } else {
3473             return a.exp > b.exp ? float_relation_greater : float_relation_less;
3474         }
3475     }
3476 }
3477 
3478 #define COMPARE(name, attr, sz)                                         \
3479 static int attr                                                         \
3480 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
3481 {                                                                       \
3482     FloatParts64 pa, pb;                                                \
3483     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3484     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3485     return compare_floats(pa, pb, is_quiet, s);                         \
3486 }
3487 
3488 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3489 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3490 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3491 
3492 #undef COMPARE
3493 
3494 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3495 {
3496     return soft_f16_compare(a, b, false, s);
3497 }
3498 
3499 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3500 {
3501     return soft_f16_compare(a, b, true, s);
3502 }
3503 
3504 static FloatRelation QEMU_FLATTEN
3505 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3506 {
3507     union_float32 ua, ub;
3508 
3509     ua.s = xa;
3510     ub.s = xb;
3511 
3512     if (QEMU_NO_HARDFLOAT) {
3513         goto soft;
3514     }
3515 
3516     float32_input_flush2(&ua.s, &ub.s, s);
3517     if (isgreaterequal(ua.h, ub.h)) {
3518         if (isgreater(ua.h, ub.h)) {
3519             return float_relation_greater;
3520         }
3521         return float_relation_equal;
3522     }
3523     if (likely(isless(ua.h, ub.h))) {
3524         return float_relation_less;
3525     }
3526     /* The only condition remaining is unordered.
3527      * Fall through to set flags.
3528      */
3529  soft:
3530     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3531 }
3532 
3533 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3534 {
3535     return f32_compare(a, b, false, s);
3536 }
3537 
3538 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3539 {
3540     return f32_compare(a, b, true, s);
3541 }
3542 
3543 static FloatRelation QEMU_FLATTEN
3544 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3545 {
3546     union_float64 ua, ub;
3547 
3548     ua.s = xa;
3549     ub.s = xb;
3550 
3551     if (QEMU_NO_HARDFLOAT) {
3552         goto soft;
3553     }
3554 
3555     float64_input_flush2(&ua.s, &ub.s, s);
3556     if (isgreaterequal(ua.h, ub.h)) {
3557         if (isgreater(ua.h, ub.h)) {
3558             return float_relation_greater;
3559         }
3560         return float_relation_equal;
3561     }
3562     if (likely(isless(ua.h, ub.h))) {
3563         return float_relation_less;
3564     }
3565     /* The only condition remaining is unordered.
3566      * Fall through to set flags.
3567      */
3568  soft:
3569     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3570 }
3571 
3572 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3573 {
3574     return f64_compare(a, b, false, s);
3575 }
3576 
3577 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3578 {
3579     return f64_compare(a, b, true, s);
3580 }
3581 
3582 static FloatRelation QEMU_FLATTEN
3583 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3584 {
3585     FloatParts64 pa, pb;
3586 
3587     bfloat16_unpack_canonical(&pa, a, s);
3588     bfloat16_unpack_canonical(&pb, b, s);
3589     return compare_floats(pa, pb, is_quiet, s);
3590 }
3591 
3592 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3593 {
3594     return soft_bf16_compare(a, b, false, s);
3595 }
3596 
3597 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3598 {
3599     return soft_bf16_compare(a, b, true, s);
3600 }
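
/*
 * Illustrative sketch (not part of the original sources): the compare
 * helpers return a FloatRelation, and the _quiet variants only raise the
 * invalid flag for signaling NaNs.  Assuming `a' and `b' are float32
 * operands and `st' is a caller-initialised float_status:
 *
 *     switch (float32_compare_quiet(a, b, &st)) {
 *     case float_relation_less:       a < b
 *     case float_relation_equal:      a == b (including +0 == -0)
 *     case float_relation_greater:    a > b
 *     case float_relation_unordered:  at least one operand is a NaN
 *     }
 */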
3601 
3602 /* Multiply A by 2 raised to the power N.  */
3603 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3604 {
3605     if (unlikely(is_nan(a.cls))) {
3606         parts_return_nan(&a, s);
3607     }
3608     if (a.cls == float_class_normal) {
3609         /* The largest float type (even though not supported by FloatParts64)
3610          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3611          * is float128, which has a 15-bit exponent.  Bounding N to 16 bits
3612          * within the int32_t that backs FloatParts64.exp.
3613          */
3614         n = MIN(MAX(n, -0x10000), 0x10000);
3615         a.exp += n;
3616     }
3617     return a;
3618 }
3619 
3620 float16 float16_scalbn(float16 a, int n, float_status *status)
3621 {
3622     FloatParts64 pa, pr;
3623 
3624     float16_unpack_canonical(&pa, a, status);
3625     pr = scalbn_decomposed(pa, n, status);
3626     return float16_round_pack_canonical(&pr, status);
3627 }
3628 
3629 float32 float32_scalbn(float32 a, int n, float_status *status)
3630 {
3631     FloatParts64 pa, pr;
3632 
3633     float32_unpack_canonical(&pa, a, status);
3634     pr = scalbn_decomposed(pa, n, status);
3635     return float32_round_pack_canonical(&pr, status);
3636 }
3637 
3638 float64 float64_scalbn(float64 a, int n, float_status *status)
3639 {
3640     FloatParts64 pa, pr;
3641 
3642     float64_unpack_canonical(&pa, a, status);
3643     pr = scalbn_decomposed(pa, n, status);
3644     return float64_round_pack_canonical(&pr, status);
3645 }
3646 
3647 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3648 {
3649     FloatParts64 pa, pr;
3650 
3651     bfloat16_unpack_canonical(&pa, a, status);
3652     pr = scalbn_decomposed(pa, n, status);
3653     return bfloat16_round_pack_canonical(&pr, status);
3654 }
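
/*
 * Illustrative sketch (not from the original sources): float*_scalbn()
 * scales by a power of two without an intermediate multiplication.  For a
 * finite float64 `x' and caller-initialised float_status `st':
 *
 *     float64_scalbn(x, 3, &st);     x * 8, exact unless it overflows
 *     float64_scalbn(x, -1, &st);    x / 2, rounding only if the result
 *                                    becomes subnormal
 *
 * NaN inputs are quietened, and very large |n| values are clamped in
 * scalbn_decomposed() above so the result still overflows or underflows
 * as expected.
 */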
3655 
3656 /*
3657  * Square Root
3658  *
3659  * The old softfloat code did an approximation step before zeroing in
3660  * on the final result. However, for simplicity, we just compute the
3661  * square root by iterating down from the implicit bit to enough extra
3662  * bits to ensure we get a correctly rounded result.
3663  *
3664  * This does mean, however, that the calculation is slower than before,
3665  * especially for 64-bit floats.
3666  */
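
/*
 * Illustrative sketch (not part of the original code): the same
 * shift-and-subtract, bit-at-a-time idea applied to a plain integer, which
 * may help when reading sqrt_float() below.  Each iteration decides one
 * result bit and keeps the remainder only if the trial subtraction fits:
 *
 *     uint32_t isqrt(uint32_t a)
 *     {
 *         uint32_t r = 0, bit = 1u << 30;
 *         while (bit > a) {
 *             bit >>= 2;
 *         }
 *         while (bit) {
 *             if (a >= r + bit) {
 *                 a -= r + bit;
 *                 r = (r >> 1) + bit;
 *             } else {
 *                 r >>= 1;
 *             }
 *             bit >>= 2;
 *         }
 *         return r;
 *     }
 *
 * sqrt_float() runs the analogous loop over the fraction, keeping extra
 * low-order bits so that the final rounding step is correct.
 */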
3667 
3668 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
3669 {
3670     uint64_t a_frac, r_frac, s_frac;
3671     int bit, last_bit;
3672 
3673     if (is_nan(a.cls)) {
3674         parts_return_nan(&a, s);
3675         return a;
3676     }
3677     if (a.cls == float_class_zero) {
3678         return a;  /* sqrt(+-0) = +-0 */
3679     }
3680     if (a.sign) {
3681         float_raise(float_flag_invalid, s);
3682         parts_default_nan(&a, s);
3683         return a;
3684     }
3685     if (a.cls == float_class_inf) {
3686         return a;  /* sqrt(+inf) = +inf */
3687     }
3688 
3689     assert(a.cls == float_class_normal);
3690 
3691     /* We need two overflow bits at the top. Adding room for that is a
3692      * right shift. If the exponent is odd, we can discard the low bit
3693      * by multiplying the fraction by 2; that's a left shift. Combine
3694      * those and we shift right by 1 if the exponent is odd, otherwise 2.
3695      */
3696     a_frac = a.frac >> (2 - (a.exp & 1));
3697     a.exp >>= 1;
3698 
3699     /* Bit-by-bit computation of sqrt.  */
3700     r_frac = 0;
3701     s_frac = 0;
3702 
3703     /* Iterate from implicit bit down to the 3 extra bits to compute a
3704      * properly rounded result. Remember we've inserted two more bits
3705      * at the top, so these positions are two less.
3706      */
3707     bit = DECOMPOSED_BINARY_POINT - 2;
3708     last_bit = MAX(p->frac_shift - 4, 0);
3709     do {
3710         uint64_t q = 1ULL << bit;
3711         uint64_t t_frac = s_frac + q;
3712         if (t_frac <= a_frac) {
3713             s_frac = t_frac + q;
3714             a_frac -= t_frac;
3715             r_frac += q;
3716         }
3717         a_frac <<= 1;
3718     } while (--bit >= last_bit);
3719 
3720     /* Undo the right shift done above. If there is any remaining
3721      * fraction, the result is inexact. Set the sticky bit.
3722      */
3723     a.frac = (r_frac << 2) + (a_frac != 0);
3724 
3725     return a;
3726 }
3727 
3728 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3729 {
3730     FloatParts64 pa, pr;
3731 
3732     float16_unpack_canonical(&pa, a, status);
3733     pr = sqrt_float(pa, status, &float16_params);
3734     return float16_round_pack_canonical(&pr, status);
3735 }
3736 
3737 static float32 QEMU_SOFTFLOAT_ATTR
3738 soft_f32_sqrt(float32 a, float_status *status)
3739 {
3740     FloatParts64 pa, pr;
3741 
3742     float32_unpack_canonical(&pa, a, status);
3743     pr = sqrt_float(pa, status, &float32_params);
3744     return float32_round_pack_canonical(&pr, status);
3745 }
3746 
3747 static float64 QEMU_SOFTFLOAT_ATTR
3748 soft_f64_sqrt(float64 a, float_status *status)
3749 {
3750     FloatParts64 pa, pr;
3751 
3752     float64_unpack_canonical(&pa, a, status);
3753     pr = sqrt_float(pa, status, &float64_params);
3754     return float64_round_pack_canonical(&pr, status);
3755 }
3756 
3757 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3758 {
3759     union_float32 ua, ur;
3760 
3761     ua.s = xa;
3762     if (unlikely(!can_use_fpu(s))) {
3763         goto soft;
3764     }
3765 
3766     float32_input_flush1(&ua.s, s);
3767     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3768         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3769                        fpclassify(ua.h) == FP_ZERO) ||
3770                      signbit(ua.h))) {
3771             goto soft;
3772         }
3773     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3774                         float32_is_neg(ua.s))) {
3775         goto soft;
3776     }
3777     ur.h = sqrtf(ua.h);
3778     return ur.s;
3779 
3780  soft:
3781     return soft_f32_sqrt(ua.s, s);
3782 }
3783 
3784 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3785 {
3786     union_float64 ua, ur;
3787 
3788     ua.s = xa;
3789     if (unlikely(!can_use_fpu(s))) {
3790         goto soft;
3791     }
3792 
3793     float64_input_flush1(&ua.s, s);
3794     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3795         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3796                        fpclassify(ua.h) == FP_ZERO) ||
3797                      signbit(ua.h))) {
3798             goto soft;
3799         }
3800     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3801                         float64_is_neg(ua.s))) {
3802         goto soft;
3803     }
3804     ur.h = sqrt(ua.h);
3805     return ur.s;
3806 
3807  soft:
3808     return soft_f64_sqrt(ua.s, s);
3809 }
3810 
3811 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3812 {
3813     FloatParts64 pa, pr;
3814 
3815     bfloat16_unpack_canonical(&pa, a, status);
3816     pr = sqrt_float(pa, status, &bfloat16_params);
3817     return bfloat16_round_pack_canonical(&pr, status);
3818 }
3819 
3820 /*----------------------------------------------------------------------------
3821 | The pattern for a default generated NaN.
3822 *----------------------------------------------------------------------------*/
3823 
3824 float16 float16_default_nan(float_status *status)
3825 {
3826     FloatParts64 p;
3827 
3828     parts_default_nan(&p, status);
3829     p.frac >>= float16_params.frac_shift;
3830     return float16_pack_raw(&p);
3831 }
3832 
3833 float32 float32_default_nan(float_status *status)
3834 {
3835     FloatParts64 p;
3836 
3837     parts_default_nan(&p, status);
3838     p.frac >>= float32_params.frac_shift;
3839     return float32_pack_raw(&p);
3840 }
3841 
3842 float64 float64_default_nan(float_status *status)
3843 {
3844     FloatParts64 p;
3845 
3846     parts_default_nan(&p, status);
3847     p.frac >>= float64_params.frac_shift;
3848     return float64_pack_raw(&p);
3849 }
3850 
3851 float128 float128_default_nan(float_status *status)
3852 {
3853     FloatParts128 p;
3854 
3855     parts_default_nan(&p, status);
3856     frac_shr(&p, float128_params.frac_shift);
3857     return float128_pack_raw(&p);
3858 }
3859 
3860 bfloat16 bfloat16_default_nan(float_status *status)
3861 {
3862     FloatParts64 p;
3863 
3864     parts_default_nan(&p, status);
3865     p.frac >>= bfloat16_params.frac_shift;
3866     return bfloat16_pack_raw(&p);
3867 }
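
/*
 * Illustrative sketch (not from the original sources): target code
 * typically produces the default NaN when an operation is invalid and no
 * input NaN payload should be propagated, e.g. (with `st' the target's
 * float_status):
 *
 *     float_raise(float_flag_invalid, st);
 *     return float32_default_nan(st);
 *
 * The sign and payload of the resulting pattern are target dependent and
 * are chosen by parts_default_nan().
 */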
3868 
3869 /*----------------------------------------------------------------------------
3870 | Returns a quiet NaN from a signaling NaN for the floating-point value `a'.
3871 *----------------------------------------------------------------------------*/
3872 
3873 float16 float16_silence_nan(float16 a, float_status *status)
3874 {
3875     FloatParts64 p;
3876 
3877     float16_unpack_raw(&p, a);
3878     p.frac <<= float16_params.frac_shift;
3879     parts_silence_nan(&p, status);
3880     p.frac >>= float16_params.frac_shift;
3881     return float16_pack_raw(&p);
3882 }
3883 
3884 float32 float32_silence_nan(float32 a, float_status *status)
3885 {
3886     FloatParts64 p;
3887 
3888     float32_unpack_raw(&p, a);
3889     p.frac <<= float32_params.frac_shift;
3890     parts_silence_nan(&p, status);
3891     p.frac >>= float32_params.frac_shift;
3892     return float32_pack_raw(&p);
3893 }
3894 
3895 float64 float64_silence_nan(float64 a, float_status *status)
3896 {
3897     FloatParts64 p;
3898 
3899     float64_unpack_raw(&p, a);
3900     p.frac <<= float64_params.frac_shift;
3901     parts_silence_nan(&p, status);
3902     p.frac >>= float64_params.frac_shift;
3903     return float64_pack_raw(&p);
3904 }
3905 
3906 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3907 {
3908     FloatParts64 p;
3909 
3910     bfloat16_unpack_raw(&p, a);
3911     p.frac <<= bfloat16_params.frac_shift;
3912     parts_silence_nan(&p, status);
3913     p.frac >>= bfloat16_params.frac_shift;
3914     return bfloat16_pack_raw(&p);
3915 }
3916 
3917 float128 float128_silence_nan(float128 a, float_status *status)
3918 {
3919     FloatParts128 p;
3920 
3921     float128_unpack_raw(&p, a);
3922     frac_shl(&p, float128_params.frac_shift);
3923     parts_silence_nan(&p, status);
3924     frac_shr(&p, float128_params.frac_shift);
3925     return float128_pack_raw(&p);
3926 }
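
/*
 * Illustrative sketch (not from the original sources): a target that must
 * propagate signaling NaN inputs as quiet NaNs, rather than returning the
 * default NaN, might do something like the following, using the
 * float32_is_signaling_nan() helper declared in the softfloat headers:
 *
 *     if (float32_is_signaling_nan(a, &st)) {
 *         float_raise(float_flag_invalid, &st);
 *         a = float32_silence_nan(a, &st);
 *     }
 */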
3927 
3928 /*----------------------------------------------------------------------------
3929 | If `a' is denormal and we are in flush-to-zero mode then set the
3930 | input-denormal exception and return zero. Otherwise just return the value.
3931 *----------------------------------------------------------------------------*/
3932 
3933 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3934 {
3935     if (p.exp == 0 && p.frac != 0) {
3936         float_raise(float_flag_input_denormal, status);
3937         return true;
3938     }
3939 
3940     return false;
3941 }
3942 
3943 float16 float16_squash_input_denormal(float16 a, float_status *status)
3944 {
3945     if (status->flush_inputs_to_zero) {
3946         FloatParts64 p;
3947 
3948         float16_unpack_raw(&p, a);
3949         if (parts_squash_denormal(p, status)) {
3950             return float16_set_sign(float16_zero, p.sign);
3951         }
3952     }
3953     return a;
3954 }
3955 
3956 float32 float32_squash_input_denormal(float32 a, float_status *status)
3957 {
3958     if (status->flush_inputs_to_zero) {
3959         FloatParts64 p;
3960 
3961         float32_unpack_raw(&p, a);
3962         if (parts_squash_denormal(p, status)) {
3963             return float32_set_sign(float32_zero, p.sign);
3964         }
3965     }
3966     return a;
3967 }
3968 
3969 float64 float64_squash_input_denormal(float64 a, float_status *status)
3970 {
3971     if (status->flush_inputs_to_zero) {
3972         FloatParts64 p;
3973 
3974         float64_unpack_raw(&p, a);
3975         if (parts_squash_denormal(p, status)) {
3976             return float64_set_sign(float64_zero, p.sign);
3977         }
3978     }
3979     return a;
3980 }
3981 
3982 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3983 {
3984     if (status->flush_inputs_to_zero) {
3985         FloatParts64 p;
3986 
3987         bfloat16_unpack_raw(&p, a);
3988         if (parts_squash_denormal(p, status)) {
3989             return bfloat16_set_sign(bfloat16_zero, p.sign);
3990         }
3991     }
3992     return a;
3993 }
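
/*
 * Illustrative sketch (not from the original sources): these helpers are
 * no-ops unless flush_inputs_to_zero is set, so target code can apply them
 * unconditionally before inspecting raw operand bits, e.g. (with `st' the
 * target's float_status):
 *
 *     a = float64_squash_input_denormal(a, st);
 *     b = float64_squash_input_denormal(b, st);
 */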
3994 
3995 /*----------------------------------------------------------------------------
3996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3997 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3998 | input.  If `zSign' is 1, the input is negated before being converted to an
3999 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
4000 | is simply rounded to an integer, with the inexact exception raised if the
4001 | input cannot be represented exactly as an integer.  However, if the fixed-
4002 | point input is too large, the invalid exception is raised and the largest
4003 | positive or negative integer is returned.
4004 *----------------------------------------------------------------------------*/
4005 
4006 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
4007                                  float_status *status)
4008 {
4009     int8_t roundingMode;
4010     bool roundNearestEven;
4011     int8_t roundIncrement, roundBits;
4012     int32_t z;
4013 
4014     roundingMode = status->float_rounding_mode;
4015     roundNearestEven = ( roundingMode == float_round_nearest_even );
4016     switch (roundingMode) {
4017     case float_round_nearest_even:
4018     case float_round_ties_away:
4019         roundIncrement = 0x40;
4020         break;
4021     case float_round_to_zero:
4022         roundIncrement = 0;
4023         break;
4024     case float_round_up:
4025         roundIncrement = zSign ? 0 : 0x7f;
4026         break;
4027     case float_round_down:
4028         roundIncrement = zSign ? 0x7f : 0;
4029         break;
4030     case float_round_to_odd:
4031         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
4032         break;
4033     default:
4034         abort();
4035     }
4036     roundBits = absZ & 0x7F;
4037     absZ = ( absZ + roundIncrement )>>7;
4038     if (!(roundBits ^ 0x40) && roundNearestEven) {
4039         absZ &= ~1;
4040     }
4041     z = absZ;
4042     if ( zSign ) z = - z;
4043     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
4044         float_raise(float_flag_invalid, status);
4045         return zSign ? INT32_MIN : INT32_MAX;
4046     }
4047     if (roundBits) {
4048         float_raise(float_flag_inexact, status);
4049     }
4050     return z;
4051 
4052 }
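
/*
 * Worked example (illustrative, not from the original sources): with the
 * binary point between bits 6 and 7, `absZ' represents absZ / 128.  Under
 * round-to-nearest-even, absZ = 0xC0 (1.5) gives (0xC0 + 0x40) >> 7 = 2,
 * while absZ = 0x140 (2.5) gives (0x140 + 0x40) >> 7 = 3, which the
 * ties-to-even fixup then clears back down to 2.
 */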
4053 
4054 /*----------------------------------------------------------------------------
4055 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4056 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4057 | and returns the properly rounded 64-bit integer corresponding to the input.
4058 | If `zSign' is 1, the input is negated before being converted to an integer.
4059 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4060 | the inexact exception raised if the input cannot be represented exactly as
4061 | an integer.  However, if the fixed-point input is too large, the invalid
4062 | exception is raised and the largest positive or negative integer is
4063 | returned.
4064 *----------------------------------------------------------------------------*/
4065 
4066 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
4067                                float_status *status)
4068 {
4069     int8_t roundingMode;
4070     bool roundNearestEven, increment;
4071     int64_t z;
4072 
4073     roundingMode = status->float_rounding_mode;
4074     roundNearestEven = ( roundingMode == float_round_nearest_even );
4075     switch (roundingMode) {
4076     case float_round_nearest_even:
4077     case float_round_ties_away:
4078         increment = ((int64_t) absZ1 < 0);
4079         break;
4080     case float_round_to_zero:
4081         increment = 0;
4082         break;
4083     case float_round_up:
4084         increment = !zSign && absZ1;
4085         break;
4086     case float_round_down:
4087         increment = zSign && absZ1;
4088         break;
4089     case float_round_to_odd:
4090         increment = !(absZ0 & 1) && absZ1;
4091         break;
4092     default:
4093         abort();
4094     }
4095     if ( increment ) {
4096         ++absZ0;
4097         if ( absZ0 == 0 ) goto overflow;
4098         if (!(absZ1 << 1) && roundNearestEven) {
4099             absZ0 &= ~1;
4100         }
4101     }
4102     z = absZ0;
4103     if ( zSign ) z = - z;
4104     if ( z && ( ( z < 0 ) ^ zSign ) ) {
4105  overflow:
4106         float_raise(float_flag_invalid, status);
4107         return zSign ? INT64_MIN : INT64_MAX;
4108     }
4109     if (absZ1) {
4110         float_raise(float_flag_inexact, status);
4111     }
4112     return z;
4113 
4114 }
4115 
4116 /*----------------------------------------------------------------------------
4117 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4118 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4119 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4120 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
4121 | with the inexact exception raised if the input cannot be represented exactly
4122 | as an integer.  However, if the fixed-point input is too large, the invalid
4123 | exception is raised and the largest unsigned integer is returned.
4124 *----------------------------------------------------------------------------*/
4125 
4126 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
4127                                 uint64_t absZ1, float_status *status)
4128 {
4129     int8_t roundingMode;
4130     bool roundNearestEven, increment;
4131 
4132     roundingMode = status->float_rounding_mode;
4133     roundNearestEven = (roundingMode == float_round_nearest_even);
4134     switch (roundingMode) {
4135     case float_round_nearest_even:
4136     case float_round_ties_away:
4137         increment = ((int64_t)absZ1 < 0);
4138         break;
4139     case float_round_to_zero:
4140         increment = 0;
4141         break;
4142     case float_round_up:
4143         increment = !zSign && absZ1;
4144         break;
4145     case float_round_down:
4146         increment = zSign && absZ1;
4147         break;
4148     case float_round_to_odd:
4149         increment = !(absZ0 & 1) && absZ1;
4150         break;
4151     default:
4152         abort();
4153     }
4154     if (increment) {
4155         ++absZ0;
4156         if (absZ0 == 0) {
4157             float_raise(float_flag_invalid, status);
4158             return UINT64_MAX;
4159         }
4160         if (!(absZ1 << 1) && roundNearestEven) {
4161             absZ0 &= ~1;
4162         }
4163     }
4164 
4165     if (zSign && absZ0) {
4166         float_raise(float_flag_invalid, status);
4167         return 0;
4168     }
4169 
4170     if (absZ1) {
4171         float_raise(float_flag_inexact, status);
4172     }
4173     return absZ0;
4174 }
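
/*
 * Worked example (illustrative, not from the original sources): here the
 * value is absZ0 + absZ1 / 2**64.  With round-to-nearest-even,
 * absZ0 = 5 and absZ1 = 0x8000000000000000 (exactly 5.5) round up to 6,
 * while absZ0 = 6 with the same fraction (exactly 6.5) first increments
 * to 7 and is then brought back to 6 by the ties-to-even fixup.
 */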
4175 
4176 /*----------------------------------------------------------------------------
4177 | Normalizes the subnormal single-precision floating-point value represented
4178 | by the denormalized significand `aSig'.  The normalized exponent and
4179 | significand are stored at the locations pointed to by `zExpPtr' and
4180 | `zSigPtr', respectively.
4181 *----------------------------------------------------------------------------*/
4182 
4183 static void
4184  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
4185 {
4186     int8_t shiftCount;
4187 
4188     shiftCount = clz32(aSig) - 8;
4189     *zSigPtr = aSig<<shiftCount;
4190     *zExpPtr = 1 - shiftCount;
4191 
4192 }
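
/*
 * Worked example (illustrative, not from the original sources): the
 * smallest denormal input aSig = 1 has clz32(1) = 31, so shiftCount = 23,
 * placing the leading bit at bit 23 (where the packed format's implicit
 * one would sit) and giving *zExpPtr = 1 - 23 = -22.
 */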
4193 
4194 /*----------------------------------------------------------------------------
4195 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4196 | and significand `zSig', and returns the proper single-precision floating-
4197 | point value corresponding to the abstract input.  Ordinarily, the abstract
4198 | value is simply rounded and packed into the single-precision format, with
4199 | the inexact exception raised if the abstract input cannot be represented
4200 | exactly.  However, if the abstract value is too large, the overflow and
4201 | inexact exceptions are raised and an infinity or maximal finite value is
4202 | returned.  If the abstract value is too small, the input value is rounded to
4203 | a subnormal number, and the underflow and inexact exceptions are raised if
4204 | the abstract input cannot be represented exactly as a subnormal single-
4205 | precision floating-point number.
4206 |     The input significand `zSig' has its binary point between bits 30
4207 | and 29, which is 7 bits to the left of the usual location.  This shifted
4208 | significand must be normalized or smaller.  If `zSig' is not normalized,
4209 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4210 | and it must not require rounding.  In the usual case that `zSig' is
4211 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4212 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4213 | Binary Floating-Point Arithmetic.
4214 *----------------------------------------------------------------------------*/
4215 
4216 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4217                                    float_status *status)
4218 {
4219     int8_t roundingMode;
4220     bool roundNearestEven;
4221     int8_t roundIncrement, roundBits;
4222     bool isTiny;
4223 
4224     roundingMode = status->float_rounding_mode;
4225     roundNearestEven = ( roundingMode == float_round_nearest_even );
4226     switch (roundingMode) {
4227     case float_round_nearest_even:
4228     case float_round_ties_away:
4229         roundIncrement = 0x40;
4230         break;
4231     case float_round_to_zero:
4232         roundIncrement = 0;
4233         break;
4234     case float_round_up:
4235         roundIncrement = zSign ? 0 : 0x7f;
4236         break;
4237     case float_round_down:
4238         roundIncrement = zSign ? 0x7f : 0;
4239         break;
4240     case float_round_to_odd:
4241         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4242         break;
4243     default:
4244         abort();
4245         break;
4246     }
4247     roundBits = zSig & 0x7F;
4248     if ( 0xFD <= (uint16_t) zExp ) {
4249         if (    ( 0xFD < zExp )
4250              || (    ( zExp == 0xFD )
4251                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
4252            ) {
4253             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4254                                    roundIncrement != 0;
4255             float_raise(float_flag_overflow | float_flag_inexact, status);
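            /*
             * -!overflow_to_inf is 0 when we overflow to infinity and
             * all-ones otherwise; adding all-ones to the 0xFF exponent
             * inside packFloat32() wraps the result down to the largest
             * finite float32 instead of the infinity encoding.
             */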
4256             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
4257         }
4258         if ( zExp < 0 ) {
4259             if (status->flush_to_zero) {
4260                 float_raise(float_flag_output_denormal, status);
4261                 return packFloat32(zSign, 0, 0);
4262             }
4263             isTiny = status->tininess_before_rounding
4264                   || (zExp < -1)
4265                   || (zSig + roundIncrement < 0x80000000);
4266             shift32RightJamming( zSig, - zExp, &zSig );
4267             zExp = 0;
4268             roundBits = zSig & 0x7F;
4269             if (isTiny && roundBits) {
4270                 float_raise(float_flag_underflow, status);
4271             }
4272             if (roundingMode == float_round_to_odd) {
4273                 /*
4274                  * For round-to-odd case, the roundIncrement depends on
4275                  * zSig which just changed.
4276                  */
4277                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4278             }
4279         }
4280     }
4281     if (roundBits) {
4282         float_raise(float_flag_inexact, status);
4283     }
4284     zSig = ( zSig + roundIncrement )>>7;
4285     if (!(roundBits ^ 0x40) && roundNearestEven) {
4286         zSig &= ~1;
4287     }
4288     if ( zSig == 0 ) zExp = 0;
4289     return packFloat32( zSign, zExp, zSig );
4290 
4291 }
4292 
4293 /*----------------------------------------------------------------------------
4294 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4295 | and significand `zSig', and returns the proper single-precision floating-
4296 | point value corresponding to the abstract input.  This routine is just like
4297 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4298 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4299 | floating-point exponent.
4300 *----------------------------------------------------------------------------*/
4301 
4302 static float32
4303  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4304                               float_status *status)
4305 {
4306     int8_t shiftCount;
4307 
4308     shiftCount = clz32(zSig) - 1;
4309     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4310                                status);
4311 
4312 }
4313 
4314 /*----------------------------------------------------------------------------
4315 | Normalizes the subnormal double-precision floating-point value represented
4316 | by the denormalized significand `aSig'.  The normalized exponent and
4317 | significand are stored at the locations pointed to by `zExpPtr' and
4318 | `zSigPtr', respectively.
4319 *----------------------------------------------------------------------------*/
4320 
4321 static void
4322  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4323 {
4324     int8_t shiftCount;
4325 
4326     shiftCount = clz64(aSig) - 11;
4327     *zSigPtr = aSig<<shiftCount;
4328     *zExpPtr = 1 - shiftCount;
4329 
4330 }
4331 
4332 /*----------------------------------------------------------------------------
4333 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4334 | double-precision floating-point value, returning the result.  After being
4335 | shifted into the proper positions, the three fields are simply added
4336 | together to form the result.  This means that any integer portion of `zSig'
4337 | will be added into the exponent.  Since a properly normalized significand
4338 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4339 | than the desired result exponent whenever `zSig' is a complete, normalized
4340 | significand.
4341 *----------------------------------------------------------------------------*/
4342 
4343 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4344 {
4345 
4346     return make_float64(
4347         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4348 
4349 }
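
/*
 * Worked example (illustrative, not from the original sources): packing
 * 1.0 uses zSign = 0, zExp = 0x3FE and zSig = 0x0010000000000000 (the
 * integer bit); the integer bit carries into the exponent field so the
 * result is 0x3FF0000000000000, the float64 encoding of 1.0.
 */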
4350 
4351 /*----------------------------------------------------------------------------
4352 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4353 | and significand `zSig', and returns the proper double-precision floating-
4354 | point value corresponding to the abstract input.  Ordinarily, the abstract
4355 | value is simply rounded and packed into the double-precision format, with
4356 | the inexact exception raised if the abstract input cannot be represented
4357 | exactly.  However, if the abstract value is too large, the overflow and
4358 | inexact exceptions are raised and an infinity or maximal finite value is
4359 | returned.  If the abstract value is too small, the input value is rounded to
4360 | a subnormal number, and the underflow and inexact exceptions are raised if
4361 | the abstract input cannot be represented exactly as a subnormal double-
4362 | precision floating-point number.
4363 |     The input significand `zSig' has its binary point between bits 62
4364 | and 61, which is 10 bits to the left of the usual location.  This shifted
4365 | significand must be normalized or smaller.  If `zSig' is not normalized,
4366 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4367 | and it must not require rounding.  In the usual case that `zSig' is
4368 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4369 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4370 | Binary Floating-Point Arithmetic.
4371 *----------------------------------------------------------------------------*/
4372 
4373 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4374                                    float_status *status)
4375 {
4376     int8_t roundingMode;
4377     bool roundNearestEven;
4378     int roundIncrement, roundBits;
4379     bool isTiny;
4380 
4381     roundingMode = status->float_rounding_mode;
4382     roundNearestEven = ( roundingMode == float_round_nearest_even );
4383     switch (roundingMode) {
4384     case float_round_nearest_even:
4385     case float_round_ties_away:
4386         roundIncrement = 0x200;
4387         break;
4388     case float_round_to_zero:
4389         roundIncrement = 0;
4390         break;
4391     case float_round_up:
4392         roundIncrement = zSign ? 0 : 0x3ff;
4393         break;
4394     case float_round_down:
4395         roundIncrement = zSign ? 0x3ff : 0;
4396         break;
4397     case float_round_to_odd:
4398         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4399         break;
4400     default:
4401         abort();
4402     }
4403     roundBits = zSig & 0x3FF;
4404     if ( 0x7FD <= (uint16_t) zExp ) {
4405         if (    ( 0x7FD < zExp )
4406              || (    ( zExp == 0x7FD )
4407                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4408            ) {
4409             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4410                                    roundIncrement != 0;
4411             float_raise(float_flag_overflow | float_flag_inexact, status);
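            /*
             * As in roundAndPackFloat32(): -(!overflow_to_inf) is 0 when
             * overflowing to infinity and all-ones otherwise, which turns
             * the 0x7FF exponent into the largest finite float64 encoding
             * instead.
             */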
4412             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4413         }
4414         if ( zExp < 0 ) {
4415             if (status->flush_to_zero) {
4416                 float_raise(float_flag_output_denormal, status);
4417                 return packFloat64(zSign, 0, 0);
4418             }
4419             isTiny = status->tininess_before_rounding
4420                   || (zExp < -1)
4421                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4422             shift64RightJamming( zSig, - zExp, &zSig );
4423             zExp = 0;
4424             roundBits = zSig & 0x3FF;
4425             if (isTiny && roundBits) {
4426                 float_raise(float_flag_underflow, status);
4427             }
4428             if (roundingMode == float_round_to_odd) {
4429                 /*
4430                  * For round-to-odd case, the roundIncrement depends on
4431                  * zSig which just changed.
4432                  */
4433                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4434             }
4435         }
4436     }
4437     if (roundBits) {
4438         float_raise(float_flag_inexact, status);
4439     }
4440     zSig = ( zSig + roundIncrement )>>10;
4441     if (!(roundBits ^ 0x200) && roundNearestEven) {
4442         zSig &= ~1;
4443     }
4444     if ( zSig == 0 ) zExp = 0;
4445     return packFloat64( zSign, zExp, zSig );
4446 
4447 }
4448 
4449 /*----------------------------------------------------------------------------
4450 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4451 | and significand `zSig', and returns the proper double-precision floating-
4452 | point value corresponding to the abstract input.  This routine is just like
4453 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4454 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4455 | floating-point exponent.
4456 *----------------------------------------------------------------------------*/
4457 
4458 static float64
4459  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4460                               float_status *status)
4461 {
4462     int8_t shiftCount;
4463 
4464     shiftCount = clz64(zSig) - 1;
4465     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4466                                status);
4467 
4468 }
4469 
4470 /*----------------------------------------------------------------------------
4471 | Normalizes the subnormal extended double-precision floating-point value
4472 | represented by the denormalized significand `aSig'.  The normalized exponent
4473 | and significand are stored at the locations pointed to by `zExpPtr' and
4474 | `zSigPtr', respectively.
4475 *----------------------------------------------------------------------------*/
4476 
4477 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
4478                                 uint64_t *zSigPtr)
4479 {
4480     int8_t shiftCount;
4481 
4482     shiftCount = clz64(aSig);
4483     *zSigPtr = aSig<<shiftCount;
4484     *zExpPtr = 1 - shiftCount;
4485 }
4486 
4487 /*----------------------------------------------------------------------------
4488 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4489 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4490 | and returns the proper extended double-precision floating-point value
4491 | corresponding to the abstract input.  Ordinarily, the abstract value is
4492 | rounded and packed into the extended double-precision format, with the
4493 | inexact exception raised if the abstract input cannot be represented
4494 | exactly.  However, if the abstract value is too large, the overflow and
4495 | inexact exceptions are raised and an infinity or maximal finite value is
4496 | returned.  If the abstract value is too small, the input value is rounded to
4497 | a subnormal number, and the underflow and inexact exceptions are raised if
4498 | the abstract input cannot be represented exactly as a subnormal extended
4499 | double-precision floating-point number.
4500 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4501 | number of bits as single or double precision, respectively.  Otherwise, the
4502 | result is rounded to the full precision of the extended double-precision
4503 | format.
4504 |     The input significand must be normalized or smaller.  If the input
4505 | significand is not normalized, `zExp' must be 0; in that case, the result
4506 | returned is a subnormal number, and it must not require rounding.  The
4507 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4508 | Floating-Point Arithmetic.
4509 *----------------------------------------------------------------------------*/
4510 
4511 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4512                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4513                               float_status *status)
4514 {
4515     int8_t roundingMode;
4516     bool roundNearestEven, increment, isTiny;
4517     int64_t roundIncrement, roundMask, roundBits;
4518 
4519     roundingMode = status->float_rounding_mode;
4520     roundNearestEven = ( roundingMode == float_round_nearest_even );
4521     if ( roundingPrecision == 80 ) goto precision80;
4522     if ( roundingPrecision == 64 ) {
4523         roundIncrement = UINT64_C(0x0000000000000400);
4524         roundMask = UINT64_C(0x00000000000007FF);
4525     }
4526     else if ( roundingPrecision == 32 ) {
4527         roundIncrement = UINT64_C(0x0000008000000000);
4528         roundMask = UINT64_C(0x000000FFFFFFFFFF);
4529     }
4530     else {
4531         goto precision80;
4532     }
4533     zSig0 |= ( zSig1 != 0 );
4534     switch (roundingMode) {
4535     case float_round_nearest_even:
4536     case float_round_ties_away:
4537         break;
4538     case float_round_to_zero:
4539         roundIncrement = 0;
4540         break;
4541     case float_round_up:
4542         roundIncrement = zSign ? 0 : roundMask;
4543         break;
4544     case float_round_down:
4545         roundIncrement = zSign ? roundMask : 0;
4546         break;
4547     default:
4548         abort();
4549     }
4550     roundBits = zSig0 & roundMask;
4551     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4552         if (    ( 0x7FFE < zExp )
4553              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4554            ) {
4555             goto overflow;
4556         }
4557         if ( zExp <= 0 ) {
4558             if (status->flush_to_zero) {
4559                 float_raise(float_flag_output_denormal, status);
4560                 return packFloatx80(zSign, 0, 0);
4561             }
4562             isTiny = status->tininess_before_rounding
4563                   || (zExp < 0 )
4564                   || (zSig0 <= zSig0 + roundIncrement);
4565             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4566             zExp = 0;
4567             roundBits = zSig0 & roundMask;
4568             if (isTiny && roundBits) {
4569                 float_raise(float_flag_underflow, status);
4570             }
4571             if (roundBits) {
4572                 float_raise(float_flag_inexact, status);
4573             }
4574             zSig0 += roundIncrement;
4575             if ( (int64_t) zSig0 < 0 ) zExp = 1;
4576             roundIncrement = roundMask + 1;
4577             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4578                 roundMask |= roundIncrement;
4579             }
4580             zSig0 &= ~ roundMask;
4581             return packFloatx80( zSign, zExp, zSig0 );
4582         }
4583     }
4584     if (roundBits) {
4585         float_raise(float_flag_inexact, status);
4586     }
4587     zSig0 += roundIncrement;
4588     if ( zSig0 < roundIncrement ) {
4589         ++zExp;
4590         zSig0 = UINT64_C(0x8000000000000000);
4591     }
4592     roundIncrement = roundMask + 1;
4593     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4594         roundMask |= roundIncrement;
4595     }
4596     zSig0 &= ~ roundMask;
4597     if ( zSig0 == 0 ) zExp = 0;
4598     return packFloatx80( zSign, zExp, zSig0 );
4599  precision80:
4600     switch (roundingMode) {
4601     case float_round_nearest_even:
4602     case float_round_ties_away:
4603         increment = ((int64_t)zSig1 < 0);
4604         break;
4605     case float_round_to_zero:
4606         increment = 0;
4607         break;
4608     case float_round_up:
4609         increment = !zSign && zSig1;
4610         break;
4611     case float_round_down:
4612         increment = zSign && zSig1;
4613         break;
4614     default:
4615         abort();
4616     }
4617     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4618         if (    ( 0x7FFE < zExp )
4619              || (    ( zExp == 0x7FFE )
4620                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4621                   && increment
4622                 )
4623            ) {
4624             roundMask = 0;
4625  overflow:
4626             float_raise(float_flag_overflow | float_flag_inexact, status);
4627             if (    ( roundingMode == float_round_to_zero )
4628                  || ( zSign && ( roundingMode == float_round_up ) )
4629                  || ( ! zSign && ( roundingMode == float_round_down ) )
4630                ) {
4631                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4632             }
4633             return packFloatx80(zSign,
4634                                 floatx80_infinity_high,
4635                                 floatx80_infinity_low);
4636         }
4637         if ( zExp <= 0 ) {
4638             isTiny = status->tininess_before_rounding
4639                   || (zExp < 0)
4640                   || !increment
4641                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4642             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4643             zExp = 0;
4644             if (isTiny && zSig1) {
4645                 float_raise(float_flag_underflow, status);
4646             }
4647             if (zSig1) {
4648                 float_raise(float_flag_inexact, status);
4649             }
4650             switch (roundingMode) {
4651             case float_round_nearest_even:
4652             case float_round_ties_away:
4653                 increment = ((int64_t)zSig1 < 0);
4654                 break;
4655             case float_round_to_zero:
4656                 increment = 0;
4657                 break;
4658             case float_round_up:
4659                 increment = !zSign && zSig1;
4660                 break;
4661             case float_round_down:
4662                 increment = zSign && zSig1;
4663                 break;
4664             default:
4665                 abort();
4666             }
4667             if ( increment ) {
4668                 ++zSig0;
4669                 if (!(zSig1 << 1) && roundNearestEven) {
4670                     zSig0 &= ~1;
4671                 }
4672                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4673             }
4674             return packFloatx80( zSign, zExp, zSig0 );
4675         }
4676     }
4677     if (zSig1) {
4678         float_raise(float_flag_inexact, status);
4679     }
4680     if ( increment ) {
4681         ++zSig0;
4682         if ( zSig0 == 0 ) {
4683             ++zExp;
4684             zSig0 = UINT64_C(0x8000000000000000);
4685         }
4686         else {
4687             if (!(zSig1 << 1) && roundNearestEven) {
4688                 zSig0 &= ~1;
4689             }
4690         }
4691     }
4692     else {
4693         if ( zSig0 == 0 ) zExp = 0;
4694     }
4695     return packFloatx80( zSign, zExp, zSig0 );
4696 
4697 }
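
/*
 * Illustrative note (not from the original sources): the roundingPrecision
 * argument mirrors the x87 precision-control setting.  For example, a
 * target emulating an x87 configured for double precision would pass 64,
 * keeping the extended exponent range but rounding to a 53-bit
 * significand; 32 rounds to a 24-bit significand, and any other value
 * rounds to the full 64 bits.
 */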
4698 
4699 /*----------------------------------------------------------------------------
4700 | Takes an abstract floating-point value having sign `zSign', exponent
4701 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4702 | and returns the proper extended double-precision floating-point value
4703 | corresponding to the abstract input.  This routine is just like
4704 | `roundAndPackFloatx80' except that the input significand does not have to be
4705 | normalized.
4706 *----------------------------------------------------------------------------*/
4707 
4708 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4709                                        bool zSign, int32_t zExp,
4710                                        uint64_t zSig0, uint64_t zSig1,
4711                                        float_status *status)
4712 {
4713     int8_t shiftCount;
4714 
4715     if ( zSig0 == 0 ) {
4716         zSig0 = zSig1;
4717         zSig1 = 0;
4718         zExp -= 64;
4719     }
4720     shiftCount = clz64(zSig0);
4721     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4722     zExp -= shiftCount;
4723     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4724                                 zSig0, zSig1, status);
4725 
4726 }
4727 
4728 /*----------------------------------------------------------------------------
4729 | Returns the least-significant 64 fraction bits of the quadruple-precision
4730 | floating-point value `a'.
4731 *----------------------------------------------------------------------------*/
4732 
4733 static inline uint64_t extractFloat128Frac1( float128 a )
4734 {
4735 
4736     return a.low;
4737 
4738 }
4739 
4740 /*----------------------------------------------------------------------------
4741 | Returns the most-significant 48 fraction bits of the quadruple-precision
4742 | floating-point value `a'.
4743 *----------------------------------------------------------------------------*/
4744 
4745 static inline uint64_t extractFloat128Frac0( float128 a )
4746 {
4747 
4748     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4749 
4750 }
4751 
4752 /*----------------------------------------------------------------------------
4753 | Returns the exponent bits of the quadruple-precision floating-point value
4754 | `a'.
4755 *----------------------------------------------------------------------------*/
4756 
4757 static inline int32_t extractFloat128Exp( float128 a )
4758 {
4759 
4760     return ( a.high>>48 ) & 0x7FFF;
4761 
4762 }
4763 
4764 /*----------------------------------------------------------------------------
4765 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4766 *----------------------------------------------------------------------------*/
4767 
4768 static inline bool extractFloat128Sign(float128 a)
4769 {
4770     return a.high >> 63;
4771 }
4772 
4773 /*----------------------------------------------------------------------------
4774 | Normalizes the subnormal quadruple-precision floating-point value
4775 | represented by the denormalized significand formed by the concatenation of
4776 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4777 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4778 | significand are stored at the location pointed to by `zSig0Ptr', and the
4779 | least significant 64 bits of the normalized significand are stored at the
4780 | location pointed to by `zSig1Ptr'.
4781 *----------------------------------------------------------------------------*/
4782 
4783 static void
4784  normalizeFloat128Subnormal(
4785      uint64_t aSig0,
4786      uint64_t aSig1,
4787      int32_t *zExpPtr,
4788      uint64_t *zSig0Ptr,
4789      uint64_t *zSig1Ptr
4790  )
4791 {
4792     int8_t shiftCount;
4793 
4794     if ( aSig0 == 0 ) {
4795         shiftCount = clz64(aSig1) - 15;
4796         if ( shiftCount < 0 ) {
4797             *zSig0Ptr = aSig1>>( - shiftCount );
4798             *zSig1Ptr = aSig1<<( shiftCount & 63 );
4799         }
4800         else {
4801             *zSig0Ptr = aSig1<<shiftCount;
4802             *zSig1Ptr = 0;
4803         }
4804         *zExpPtr = - shiftCount - 63;
4805     }
4806     else {
4807         shiftCount = clz64(aSig0) - 15;
4808         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4809         *zExpPtr = 1 - shiftCount;
4810     }
4811 
4812 }
4813 
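/*----------------------------------------------------------------------------
| Illustrative sketch (not used by the routines above): normalizing the
| smallest quadruple-precision subnormal, whose significand is `aSig0' = 0 and
| `aSig1' = 1.  clz64(1) is 63, so the shift count is 48, the single set bit
| lands in the integer-bit position (bit 48 of `zSig0'), and the normalized
| exponent becomes -48 - 63 = -111.  The helper name below is illustrative
| only and is not part of the SoftFloat code.
*----------------------------------------------------------------------------*/

static inline void normalizeFloat128Subnormal_example(void)
{
    int32_t zExp;
    uint64_t zSig0, zSig1;

    normalizeFloat128Subnormal(0, 1, &zExp, &zSig0, &zSig1);
    /* Here zExp == -111, zSig0 == UINT64_C(0x0001000000000000), zSig1 == 0. */
}
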
4814 /*----------------------------------------------------------------------------
4815 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4816 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4817 | floating-point value, returning the result.  After being shifted into the
4818 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4819 | added together to form the most significant 32 bits of the result.  This
4820 | means that any integer portion of `zSig0' will be added into the exponent.
4821 | Since a properly normalized significand will have an integer portion equal
4822 | to 1, the `zExp' input should be 1 less than the desired result exponent
4823 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4824 | significand.
4825 *----------------------------------------------------------------------------*/
4826 
4827 static inline float128
4828 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4829 {
4830     float128 z;
4831 
4832     z.low = zSig1;
4833     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4834     return z;
4835 }
4836 
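/*----------------------------------------------------------------------------
| Illustrative sketch of the packing convention described above: for +1.0 the
| significand's integer bit sits in bit 48 of `zSig0', so the exponent
| argument is 0x3FFE, one less than the biased exponent 0x3FFF; adding the
| fields lets the integer bit carry into the exponent field.  The helper name
| below is illustrative only and is not part of the SoftFloat code.
*----------------------------------------------------------------------------*/

static inline float128 packFloat128_example_one(void)
{
    /* high becomes (0x3FFE << 48) + 0x0001000000000000 = 0x3FFF000000000000,
       which is +1.0 in the quadruple-precision format.  */
    return packFloat128(0, 0x3FFE, UINT64_C(0x0001000000000000), 0);
}
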
4837 /*----------------------------------------------------------------------------
4838 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4839 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4840 | and `zSig2', and returns the proper quadruple-precision floating-point value
4841 | corresponding to the abstract input.  Ordinarily, the abstract value is
4842 | simply rounded and packed into the quadruple-precision format, with the
4843 | inexact exception raised if the abstract input cannot be represented
4844 | exactly.  However, if the abstract value is too large, the overflow and
4845 | inexact exceptions are raised and an infinity or maximal finite value is
4846 | returned.  If the abstract value is too small, the input value is rounded to
4847 | a subnormal number, and the underflow and inexact exceptions are raised if
4848 | the abstract input cannot be represented exactly as a subnormal quadruple-
4849 | precision floating-point number.
4850 |     The input significand must be normalized or smaller.  If the input
4851 | significand is not normalized, `zExp' must be 0; in that case, the result
4852 | returned is a subnormal number, and it must not require rounding.  In the
4853 | usual case that the input significand is normalized, `zExp' must be 1 less
4854 | than the ``true'' floating-point exponent.  The handling of underflow and
4855 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4856 *----------------------------------------------------------------------------*/
4857 
4858 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4859                                      uint64_t zSig0, uint64_t zSig1,
4860                                      uint64_t zSig2, float_status *status)
4861 {
4862     int8_t roundingMode;
4863     bool roundNearestEven, increment, isTiny;
4864 
4865     roundingMode = status->float_rounding_mode;
4866     roundNearestEven = ( roundingMode == float_round_nearest_even );
4867     switch (roundingMode) {
4868     case float_round_nearest_even:
4869     case float_round_ties_away:
4870         increment = ((int64_t)zSig2 < 0);
4871         break;
4872     case float_round_to_zero:
4873         increment = 0;
4874         break;
4875     case float_round_up:
4876         increment = !zSign && zSig2;
4877         break;
4878     case float_round_down:
4879         increment = zSign && zSig2;
4880         break;
4881     case float_round_to_odd:
4882         increment = !(zSig1 & 0x1) && zSig2;
4883         break;
4884     default:
4885         abort();
4886     }
4887     if ( 0x7FFD <= (uint32_t) zExp ) {
4888         if (    ( 0x7FFD < zExp )
4889              || (    ( zExp == 0x7FFD )
4890                   && eq128(
4891                          UINT64_C(0x0001FFFFFFFFFFFF),
4892                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4893                          zSig0,
4894                          zSig1
4895                      )
4896                   && increment
4897                 )
4898            ) {
4899             float_raise(float_flag_overflow | float_flag_inexact, status);
4900             if (    ( roundingMode == float_round_to_zero )
4901                  || ( zSign && ( roundingMode == float_round_up ) )
4902                  || ( ! zSign && ( roundingMode == float_round_down ) )
4903                  || (roundingMode == float_round_to_odd)
4904                ) {
4905                 return
4906                     packFloat128(
4907                         zSign,
4908                         0x7FFE,
4909                         UINT64_C(0x0000FFFFFFFFFFFF),
4910                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4911                     );
4912             }
4913             return packFloat128( zSign, 0x7FFF, 0, 0 );
4914         }
4915         if ( zExp < 0 ) {
4916             if (status->flush_to_zero) {
4917                 float_raise(float_flag_output_denormal, status);
4918                 return packFloat128(zSign, 0, 0, 0);
4919             }
4920             isTiny = status->tininess_before_rounding
4921                   || (zExp < -1)
4922                   || !increment
4923                   || lt128(zSig0, zSig1,
4924                            UINT64_C(0x0001FFFFFFFFFFFF),
4925                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4926             shift128ExtraRightJamming(
4927                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4928             zExp = 0;
4929             if (isTiny && zSig2) {
4930                 float_raise(float_flag_underflow, status);
4931             }
4932             switch (roundingMode) {
4933             case float_round_nearest_even:
4934             case float_round_ties_away:
4935                 increment = ((int64_t)zSig2 < 0);
4936                 break;
4937             case float_round_to_zero:
4938                 increment = 0;
4939                 break;
4940             case float_round_up:
4941                 increment = !zSign && zSig2;
4942                 break;
4943             case float_round_down:
4944                 increment = zSign && zSig2;
4945                 break;
4946             case float_round_to_odd:
4947                 increment = !(zSig1 & 0x1) && zSig2;
4948                 break;
4949             default:
4950                 abort();
4951             }
4952         }
4953     }
4954     if (zSig2) {
4955         float_raise(float_flag_inexact, status);
4956     }
4957     if ( increment ) {
4958         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4959         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4960             zSig1 &= ~1;
4961         }
4962     }
4963     else {
4964         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4965     }
4966     return packFloat128( zSign, zExp, zSig0, zSig1 );
4967 
4968 }
4969 
4970 /*----------------------------------------------------------------------------
4971 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4972 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4973 | returns the proper quadruple-precision floating-point value corresponding
4974 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4975 | except that the input significand has fewer bits and does not have to be
4976 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4977 | point exponent.
4978 *----------------------------------------------------------------------------*/
4979 
4980 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4981                                               uint64_t zSig0, uint64_t zSig1,
4982                                               float_status *status)
4983 {
4984     int8_t shiftCount;
4985     uint64_t zSig2;
4986 
4987     if ( zSig0 == 0 ) {
4988         zSig0 = zSig1;
4989         zSig1 = 0;
4990         zExp -= 64;
4991     }
4992     shiftCount = clz64(zSig0) - 15;
4993     if ( 0 <= shiftCount ) {
4994         zSig2 = 0;
4995         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4996     }
4997     else {
4998         shift128ExtraRightJamming(
4999             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
5000     }
5001     zExp -= shiftCount;
5002     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
5003 
5004 }
5005 
5006 
5007 /*----------------------------------------------------------------------------
5008 | Returns the result of converting the 32-bit two's complement integer `a'
5009 | to the extended double-precision floating-point format.  The conversion
5010 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5011 | Arithmetic.
5012 *----------------------------------------------------------------------------*/
5013 
5014 floatx80 int32_to_floatx80(int32_t a, float_status *status)
5015 {
5016     bool zSign;
5017     uint32_t absA;
5018     int8_t shiftCount;
5019     uint64_t zSig;
5020 
5021     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5022     zSign = ( a < 0 );
5023     absA = zSign ? - a : a;
5024     shiftCount = clz32(absA) + 32;
5025     zSig = absA;
5026     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
5027 
5028 }
5029 
5030 /*----------------------------------------------------------------------------
5031 | Returns the result of converting the 32-bit two's complement integer `a' to
5032 | the quadruple-precision floating-point format.  The conversion is performed
5033 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5034 *----------------------------------------------------------------------------*/
5035 
5036 float128 int32_to_float128(int32_t a, float_status *status)
5037 {
5038     bool zSign;
5039     uint32_t absA;
5040     int8_t shiftCount;
5041     uint64_t zSig0;
5042 
5043     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5044     zSign = ( a < 0 );
5045     absA = zSign ? - a : a;
5046     shiftCount = clz32(absA) + 17;
5047     zSig0 = absA;
5048     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
5049 
5050 }
5051 
5052 /*----------------------------------------------------------------------------
5053 | Returns the result of converting the 64-bit two's complement integer `a'
5054 | to the extended double-precision floating-point format.  The conversion
5055 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5056 | Arithmetic.
5057 *----------------------------------------------------------------------------*/
5058 
5059 floatx80 int64_to_floatx80(int64_t a, float_status *status)
5060 {
5061     bool zSign;
5062     uint64_t absA;
5063     int8_t shiftCount;
5064 
5065     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5066     zSign = ( a < 0 );
5067     absA = zSign ? - a : a;
5068     shiftCount = clz64(absA);
5069     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
5070 
5071 }
5072 
5073 /*----------------------------------------------------------------------------
5074 | Returns the result of converting the 64-bit two's complement integer `a' to
5075 | the quadruple-precision floating-point format.  The conversion is performed
5076 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5077 *----------------------------------------------------------------------------*/
5078 
5079 float128 int64_to_float128(int64_t a, float_status *status)
5080 {
5081     bool zSign;
5082     uint64_t absA;
5083     int8_t shiftCount;
5084     int32_t zExp;
5085     uint64_t zSig0, zSig1;
5086 
5087     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5088     zSign = ( a < 0 );
5089     absA = zSign ? - a : a;
5090     shiftCount = clz64(absA) + 49;
5091     zExp = 0x406E - shiftCount;
5092     if ( 64 <= shiftCount ) {
5093         zSig1 = 0;
5094         zSig0 = absA;
5095         shiftCount -= 64;
5096     }
5097     else {
5098         zSig1 = absA;
5099         zSig0 = 0;
5100     }
5101     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5102     return packFloat128( zSign, zExp, zSig0, zSig1 );
5103 
5104 }
5105 
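/*----------------------------------------------------------------------------
| Illustrative sketch of the shift arithmetic above for the input 3: clz64(3)
| is 62, so the shift count is 111 and `zExp' is 0x406E - 111 = 0x3FFF.  Since
| the shift count is at least 64 the value starts in `zSig0', the remaining
| shift of 47 moves the leading 1 to bit 48, and the packed result has high
| part 0x4000800000000000, i.e. +3.0.  The helper name below is illustrative
| only and is not part of the SoftFloat code.
*----------------------------------------------------------------------------*/

static inline float128 int64_to_float128_example_three(float_status *status)
{
    /* Returns +3.0 exactly; the result high part is 0x4000800000000000. */
    return int64_to_float128(3, status);
}
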
5106 /*----------------------------------------------------------------------------
5107 | Returns the result of converting the 64-bit unsigned integer `a'
5108 | to the quadruple-precision floating-point format.  The conversion is performed
5109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5110 *----------------------------------------------------------------------------*/
5111 
5112 float128 uint64_to_float128(uint64_t a, float_status *status)
5113 {
5114     if (a == 0) {
5115         return float128_zero;
5116     }
5117     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5118 }
5119 
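/*----------------------------------------------------------------------------
| Illustrative usage sketch: an unsigned value with bit 63 set, such as 2^63,
| cannot be routed through int64_to_float128, but it converts exactly here
| because the 113-bit quadruple-precision significand holds all 64 significant
| bits.  The helper name below is illustrative only and is not part of the
| SoftFloat code.
*----------------------------------------------------------------------------*/

static inline float128 uint64_to_float128_example_pow63(float_status *status)
{
    /* Converts 2^63 exactly; no exception flags are raised. */
    return uint64_to_float128(UINT64_C(1) << 63, status);
}
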
5120 /*----------------------------------------------------------------------------
5121 | Returns the result of converting the single-precision floating-point value
5122 | `a' to the extended double-precision floating-point format.  The conversion
5123 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5124 | Arithmetic.
5125 *----------------------------------------------------------------------------*/
5126 
5127 floatx80 float32_to_floatx80(float32 a, float_status *status)
5128 {
5129     bool aSign;
5130     int aExp;
5131     uint32_t aSig;
5132 
5133     a = float32_squash_input_denormal(a, status);
5134     aSig = extractFloat32Frac( a );
5135     aExp = extractFloat32Exp( a );
5136     aSign = extractFloat32Sign( a );
5137     if ( aExp == 0xFF ) {
5138         if (aSig) {
5139             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
5140                                                status);
5141             return floatx80_silence_nan(res, status);
5142         }
5143         return packFloatx80(aSign,
5144                             floatx80_infinity_high,
5145                             floatx80_infinity_low);
5146     }
5147     if ( aExp == 0 ) {
5148         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5149         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5150     }
5151     aSig |= 0x00800000;
5152     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
5153 
5154 }
5155 
5156 /*----------------------------------------------------------------------------
5157 | Returns the result of converting the single-precision floating-point value
5158 | `a' to the quadruple-precision floating-point format.  The conversion is
5159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5160 | Arithmetic.
5161 *----------------------------------------------------------------------------*/
5162 
5163 float128 float32_to_float128(float32 a, float_status *status)
5164 {
5165     bool aSign;
5166     int aExp;
5167     uint32_t aSig;
5168 
5169     a = float32_squash_input_denormal(a, status);
5170     aSig = extractFloat32Frac( a );
5171     aExp = extractFloat32Exp( a );
5172     aSign = extractFloat32Sign( a );
5173     if ( aExp == 0xFF ) {
5174         if (aSig) {
5175             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
5176         }
5177         return packFloat128( aSign, 0x7FFF, 0, 0 );
5178     }
5179     if ( aExp == 0 ) {
5180         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5181         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5182         --aExp;
5183     }
5184     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
5185 
5186 }
5187 
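/*----------------------------------------------------------------------------
| Illustrative sketch of the conversion above for the value 3.0: the single-
| precision operand has exponent field 0x80 and fraction 0x400000, so the
| result exponent is 0x80 + 0x3F80 = 0x4000 and the fraction, shifted left by
| 25 into the top 48 fraction bits, gives a quadruple-precision high part of
| 0x4000800000000000.  The helper name below is illustrative only; the input
| is built with int32_to_float32() declared in softfloat.h.
*----------------------------------------------------------------------------*/

static inline float128 float32_to_float128_example_three(float_status *status)
{
    /* Returns +3.0 exactly; the result high part is 0x4000800000000000. */
    return float32_to_float128(int32_to_float32(3, status), status);
}
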
5188 /*----------------------------------------------------------------------------
5189 | Returns the remainder of the single-precision floating-point value `a'
5190 | with respect to the corresponding value `b'.  The operation is performed
5191 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5192 *----------------------------------------------------------------------------*/
5193 
5194 float32 float32_rem(float32 a, float32 b, float_status *status)
5195 {
5196     bool aSign, zSign;
5197     int aExp, bExp, expDiff;
5198     uint32_t aSig, bSig;
5199     uint32_t q;
5200     uint64_t aSig64, bSig64, q64;
5201     uint32_t alternateASig;
5202     int32_t sigMean;
5203     a = float32_squash_input_denormal(a, status);
5204     b = float32_squash_input_denormal(b, status);
5205 
5206     aSig = extractFloat32Frac( a );
5207     aExp = extractFloat32Exp( a );
5208     aSign = extractFloat32Sign( a );
5209     bSig = extractFloat32Frac( b );
5210     bExp = extractFloat32Exp( b );
5211     if ( aExp == 0xFF ) {
5212         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
5213             return propagateFloat32NaN(a, b, status);
5214         }
5215         float_raise(float_flag_invalid, status);
5216         return float32_default_nan(status);
5217     }
5218     if ( bExp == 0xFF ) {
5219         if (bSig) {
5220             return propagateFloat32NaN(a, b, status);
5221         }
5222         return a;
5223     }
5224     if ( bExp == 0 ) {
5225         if ( bSig == 0 ) {
5226             float_raise(float_flag_invalid, status);
5227             return float32_default_nan(status);
5228         }
5229         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
5230     }
5231     if ( aExp == 0 ) {
5232         if ( aSig == 0 ) return a;
5233         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5234     }
5235     expDiff = aExp - bExp;
5236     aSig |= 0x00800000;
5237     bSig |= 0x00800000;
5238     if ( expDiff < 32 ) {
5239         aSig <<= 8;
5240         bSig <<= 8;
5241         if ( expDiff < 0 ) {
5242             if ( expDiff < -1 ) return a;
5243             aSig >>= 1;
5244         }
5245         q = ( bSig <= aSig );
5246         if ( q ) aSig -= bSig;
5247         if ( 0 < expDiff ) {
5248             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
5249             q >>= 32 - expDiff;
5250             bSig >>= 2;
5251             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5252         }
5253         else {
5254             aSig >>= 2;
5255             bSig >>= 2;
5256         }
5257     }
5258     else {
5259         if ( bSig <= aSig ) aSig -= bSig;
5260         aSig64 = ( (uint64_t) aSig )<<40;
5261         bSig64 = ( (uint64_t) bSig )<<40;
5262         expDiff -= 64;
5263         while ( 0 < expDiff ) {
5264             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5265             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5266             aSig64 = - ( ( bSig * q64 )<<38 );
5267             expDiff -= 62;
5268         }
5269         expDiff += 64;
5270         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5271         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5272         q = q64>>( 64 - expDiff );
5273         bSig <<= 6;
5274         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
5275     }
5276     do {
5277         alternateASig = aSig;
5278         ++q;
5279         aSig -= bSig;
5280     } while ( 0 <= (int32_t) aSig );
5281     sigMean = aSig + alternateASig;
5282     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5283         aSig = alternateASig;
5284     }
5285     zSign = ( (int32_t) aSig < 0 );
5286     if ( zSign ) aSig = - aSig;
5287     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5288 }
5289 
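/*----------------------------------------------------------------------------
| Illustrative usage sketch: the IEC/IEEE remainder uses the quotient rounded
| to nearest-even rather than truncated, so 5 REM 2 is 1 (quotient 2.5 rounds
| to 2) while 7 REM 2 is -1 (quotient 3.5 rounds to 4).  The helper name below
| is illustrative only; the inputs are built with int32_to_float32() declared
| in softfloat.h.
*----------------------------------------------------------------------------*/

static inline float32 float32_rem_example(float_status *status)
{
    float32 seven = int32_to_float32(7, status);
    float32 two = int32_to_float32(2, status);

    /* Returns -1.0, since 7 - 4 * 2 = -1 with the quotient rounded to even. */
    return float32_rem(seven, two, status);
}
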
5290 
5291 
5292 /*----------------------------------------------------------------------------
5293 | Returns the binary exponential of the single-precision floating-point value
5294 | `a'. The operation is performed according to the IEC/IEEE Standard for
5295 | Binary Floating-Point Arithmetic.
5296 |
5297 | Uses the following identities:
5298 |
5299 | 1. -------------------------------------------------------------------------
5300 |      x    x*ln(2)
5301 |     2  = e
5302 |
5303 | 2. -------------------------------------------------------------------------
5304 |                      2     3     4     5           n
5305 |      x        x     x     x     x     x           x
5306 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5307 |               1!    2!    3!    4!    5!          n!
5308 *----------------------------------------------------------------------------*/
5309 
5310 static const float64 float32_exp2_coefficients[15] =
5311 {
5312     const_float64( 0x3ff0000000000000ll ), /*  1 */
5313     const_float64( 0x3fe0000000000000ll ), /*  2 */
5314     const_float64( 0x3fc5555555555555ll ), /*  3 */
5315     const_float64( 0x3fa5555555555555ll ), /*  4 */
5316     const_float64( 0x3f81111111111111ll ), /*  5 */
5317     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
5318     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
5319     const_float64( 0x3efa01a01a01a01all ), /*  8 */
5320     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
5321     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5322     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5323     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5324     const_float64( 0x3de6124613a86d09ll ), /* 13 */
5325     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5326     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5327 };
5328 
5329 float32 float32_exp2(float32 a, float_status *status)
5330 {
5331     bool aSign;
5332     int aExp;
5333     uint32_t aSig;
5334     float64 r, x, xn;
5335     int i;
5336     a = float32_squash_input_denormal(a, status);
5337 
5338     aSig = extractFloat32Frac( a );
5339     aExp = extractFloat32Exp( a );
5340     aSign = extractFloat32Sign( a );
5341 
5342     if ( aExp == 0xFF) {
5343         if (aSig) {
5344             return propagateFloat32NaN(a, float32_zero, status);
5345         }
5346         return (aSign) ? float32_zero : a;
5347     }
5348     if (aExp == 0) {
5349         if (aSig == 0) return float32_one;
5350     }
5351 
5352     float_raise(float_flag_inexact, status);
5353 
5354     /* ******************************* */
5355     /* using float64 for approximation */
5356     /* ******************************* */
5357     x = float32_to_float64(a, status);
5358     x = float64_mul(x, float64_ln2, status);
5359 
5360     xn = x;
5361     r = float64_one;
5362     for (i = 0 ; i < 15 ; i++) {
5363         float64 f;
5364 
5365         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5366         r = float64_add(r, f, status);
5367 
5368         xn = float64_mul(xn, x, status);
5369     }
5370 
5371     return float64_to_float32(r, status);
5372 }
5373 
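/*----------------------------------------------------------------------------
| Illustrative usage sketch: with the identities above, 2^1 is evaluated as
| e^(1 * ln 2) through the double-precision series, so the result is (very
| nearly) 2.0 and the inexact flag is always raised.  The helper name below is
| illustrative only and is not part of the SoftFloat code.
*----------------------------------------------------------------------------*/

static inline float32 float32_exp2_example(float_status *status)
{
    /* float32_one is the packed single-precision constant 1.0. */
    return float32_exp2(float32_one, status);
}
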
5374 /*----------------------------------------------------------------------------
5375 | Returns the binary log of the single-precision floating-point value `a'.
5376 | The operation is performed according to the IEC/IEEE Standard for Binary
5377 | Floating-Point Arithmetic.
5378 *----------------------------------------------------------------------------*/
5379 float32 float32_log2(float32 a, float_status *status)
5380 {
5381     bool aSign, zSign;
5382     int aExp;
5383     uint32_t aSig, zSig, i;
5384 
5385     a = float32_squash_input_denormal(a, status);
5386     aSig = extractFloat32Frac( a );
5387     aExp = extractFloat32Exp( a );
5388     aSign = extractFloat32Sign( a );
5389 
5390     if ( aExp == 0 ) {
5391         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5392         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5393     }
5394     if ( aSign ) {
5395         float_raise(float_flag_invalid, status);
5396         return float32_default_nan(status);
5397     }
5398     if ( aExp == 0xFF ) {
5399         if (aSig) {
5400             return propagateFloat32NaN(a, float32_zero, status);
5401         }
5402         return a;
5403     }
5404 
5405     aExp -= 0x7F;
5406     aSig |= 0x00800000;
5407     zSign = aExp < 0;
5408     zSig = (uint32_t)aExp << 23;
5409 
5410     for (i = 1 << 22; i > 0; i >>= 1) {
5411         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5412         if ( aSig & 0x01000000 ) {
5413             aSig >>= 1;
5414             zSig |= i;
5415         }
5416     }
5417 
5418     if ( zSign )
5419         zSig = -zSig;
5420 
5421     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5422 }
5423 
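/*----------------------------------------------------------------------------
| Illustrative usage sketch: for the input 8.0 the unbiased exponent is 3 and
| the significand is exactly 1.0, so the bit-by-bit squaring loop above never
| sets a fraction bit and the result is exactly 3.0.  The helper name below is
| illustrative only; the input is built with int32_to_float32() declared in
| softfloat.h.
*----------------------------------------------------------------------------*/

static inline float32 float32_log2_example(float_status *status)
{
    /* Returns +3.0 exactly. */
    return float32_log2(int32_to_float32(8, status), status);
}
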
5424 /*----------------------------------------------------------------------------
5425 | Returns the result of converting the double-precision floating-point value
5426 | `a' to the extended double-precision floating-point format.  The conversion
5427 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5428 | Arithmetic.
5429 *----------------------------------------------------------------------------*/
5430 
5431 floatx80 float64_to_floatx80(float64 a, float_status *status)
5432 {
5433     bool aSign;
5434     int aExp;
5435     uint64_t aSig;
5436 
5437     a = float64_squash_input_denormal(a, status);
5438     aSig = extractFloat64Frac( a );
5439     aExp = extractFloat64Exp( a );
5440     aSign = extractFloat64Sign( a );
5441     if ( aExp == 0x7FF ) {
5442         if (aSig) {
5443             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5444                                                status);
5445             return floatx80_silence_nan(res, status);
5446         }
5447         return packFloatx80(aSign,
5448                             floatx80_infinity_high,
5449                             floatx80_infinity_low);
5450     }
5451     if ( aExp == 0 ) {
5452         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5453         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5454     }
5455     return
5456         packFloatx80(
5457             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5458 
5459 }
5460 
5461 /*----------------------------------------------------------------------------
5462 | Returns the result of converting the double-precision floating-point value
5463 | `a' to the quadruple-precision floating-point format.  The conversion is
5464 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5465 | Arithmetic.
5466 *----------------------------------------------------------------------------*/
5467 
5468 float128 float64_to_float128(float64 a, float_status *status)
5469 {
5470     bool aSign;
5471     int aExp;
5472     uint64_t aSig, zSig0, zSig1;
5473 
5474     a = float64_squash_input_denormal(a, status);
5475     aSig = extractFloat64Frac( a );
5476     aExp = extractFloat64Exp( a );
5477     aSign = extractFloat64Sign( a );
5478     if ( aExp == 0x7FF ) {
5479         if (aSig) {
5480             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5481         }
5482         return packFloat128( aSign, 0x7FFF, 0, 0 );
5483     }
5484     if ( aExp == 0 ) {
5485         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5486         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5487         --aExp;
5488     }
5489     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5490     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5491 
5492 }
5493 
5494 
5495 /*----------------------------------------------------------------------------
5496 | Returns the remainder of the double-precision floating-point value `a'
5497 | with respect to the corresponding value `b'.  The operation is performed
5498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5499 *----------------------------------------------------------------------------*/
5500 
5501 float64 float64_rem(float64 a, float64 b, float_status *status)
5502 {
5503     bool aSign, zSign;
5504     int aExp, bExp, expDiff;
5505     uint64_t aSig, bSig;
5506     uint64_t q, alternateASig;
5507     int64_t sigMean;
5508 
5509     a = float64_squash_input_denormal(a, status);
5510     b = float64_squash_input_denormal(b, status);
5511     aSig = extractFloat64Frac( a );
5512     aExp = extractFloat64Exp( a );
5513     aSign = extractFloat64Sign( a );
5514     bSig = extractFloat64Frac( b );
5515     bExp = extractFloat64Exp( b );
5516     if ( aExp == 0x7FF ) {
5517         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5518             return propagateFloat64NaN(a, b, status);
5519         }
5520         float_raise(float_flag_invalid, status);
5521         return float64_default_nan(status);
5522     }
5523     if ( bExp == 0x7FF ) {
5524         if (bSig) {
5525             return propagateFloat64NaN(a, b, status);
5526         }
5527         return a;
5528     }
5529     if ( bExp == 0 ) {
5530         if ( bSig == 0 ) {
5531             float_raise(float_flag_invalid, status);
5532             return float64_default_nan(status);
5533         }
5534         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5535     }
5536     if ( aExp == 0 ) {
5537         if ( aSig == 0 ) return a;
5538         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5539     }
5540     expDiff = aExp - bExp;
5541     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5542     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5543     if ( expDiff < 0 ) {
5544         if ( expDiff < -1 ) return a;
5545         aSig >>= 1;
5546     }
5547     q = ( bSig <= aSig );
5548     if ( q ) aSig -= bSig;
5549     expDiff -= 64;
5550     while ( 0 < expDiff ) {
5551         q = estimateDiv128To64( aSig, 0, bSig );
5552         q = ( 2 < q ) ? q - 2 : 0;
5553         aSig = - ( ( bSig>>2 ) * q );
5554         expDiff -= 62;
5555     }
5556     expDiff += 64;
5557     if ( 0 < expDiff ) {
5558         q = estimateDiv128To64( aSig, 0, bSig );
5559         q = ( 2 < q ) ? q - 2 : 0;
5560         q >>= 64 - expDiff;
5561         bSig >>= 2;
5562         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5563     }
5564     else {
5565         aSig >>= 2;
5566         bSig >>= 2;
5567     }
5568     do {
5569         alternateASig = aSig;
5570         ++q;
5571         aSig -= bSig;
5572     } while ( 0 <= (int64_t) aSig );
5573     sigMean = aSig + alternateASig;
5574     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5575         aSig = alternateASig;
5576     }
5577     zSign = ( (int64_t) aSig < 0 );
5578     if ( zSign ) aSig = - aSig;
5579     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5580 
5581 }
5582 
5583 /*----------------------------------------------------------------------------
5584 | Returns the binary log of the double-precision floating-point value `a'.
5585 | The operation is performed according to the IEC/IEEE Standard for Binary
5586 | Floating-Point Arithmetic.
5587 *----------------------------------------------------------------------------*/
5588 float64 float64_log2(float64 a, float_status *status)
5589 {
5590     bool aSign, zSign;
5591     int aExp;
5592     uint64_t aSig, aSig0, aSig1, zSig, i;
5593     a = float64_squash_input_denormal(a, status);
5594 
5595     aSig = extractFloat64Frac( a );
5596     aExp = extractFloat64Exp( a );
5597     aSign = extractFloat64Sign( a );
5598 
5599     if ( aExp == 0 ) {
5600         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5601         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5602     }
5603     if ( aSign ) {
5604         float_raise(float_flag_invalid, status);
5605         return float64_default_nan(status);
5606     }
5607     if ( aExp == 0x7FF ) {
5608         if (aSig) {
5609             return propagateFloat64NaN(a, float64_zero, status);
5610         }
5611         return a;
5612     }
5613 
5614     aExp -= 0x3FF;
5615     aSig |= UINT64_C(0x0010000000000000);
5616     zSign = aExp < 0;
5617     zSig = (uint64_t)aExp << 52;
5618     for (i = 1LL << 51; i > 0; i >>= 1) {
5619         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5620         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5621         if ( aSig & UINT64_C(0x0020000000000000) ) {
5622             aSig >>= 1;
5623             zSig |= i;
5624         }
5625     }
5626 
5627     if ( zSign )
5628         zSig = -zSig;
5629     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5630 }
5631 
5632 /*----------------------------------------------------------------------------
5633 | Returns the result of converting the extended double-precision floating-
5634 | point value `a' to the 32-bit two's complement integer format.  The
5635 | conversion is performed according to the IEC/IEEE Standard for Binary
5636 | Floating-Point Arithmetic---which means in particular that the conversion
5637 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5638 | largest positive integer is returned.  Otherwise, if the conversion
5639 | overflows, the largest integer with the same sign as `a' is returned.
5640 *----------------------------------------------------------------------------*/
5641 
5642 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5643 {
5644     bool aSign;
5645     int32_t aExp, shiftCount;
5646     uint64_t aSig;
5647 
5648     if (floatx80_invalid_encoding(a)) {
5649         float_raise(float_flag_invalid, status);
5650         return INT32_MIN;
5651     }
5652     aSig = extractFloatx80Frac( a );
5653     aExp = extractFloatx80Exp( a );
5654     aSign = extractFloatx80Sign( a );
5655     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5656     shiftCount = 0x4037 - aExp;
5657     if ( shiftCount <= 0 ) shiftCount = 1;
5658     shift64RightJamming( aSig, shiftCount, &aSig );
5659     return roundAndPackInt32(aSign, aSig, status);
5660 
5661 }
5662 
5663 /*----------------------------------------------------------------------------
5664 | Returns the result of converting the extended double-precision floating-
5665 | point value `a' to the 32-bit two's complement integer format.  The
5666 | conversion is performed according to the IEC/IEEE Standard for Binary
5667 | Floating-Point Arithmetic, except that the conversion is always rounded
5668 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5669 | Otherwise, if the conversion overflows, the largest integer with the same
5670 | sign as `a' is returned.
5671 *----------------------------------------------------------------------------*/
5672 
5673 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5674 {
5675     bool aSign;
5676     int32_t aExp, shiftCount;
5677     uint64_t aSig, savedASig;
5678     int32_t z;
5679 
5680     if (floatx80_invalid_encoding(a)) {
5681         float_raise(float_flag_invalid, status);
5682         return INT32_MIN;
5683     }
5684     aSig = extractFloatx80Frac( a );
5685     aExp = extractFloatx80Exp( a );
5686     aSign = extractFloatx80Sign( a );
5687     if ( 0x401E < aExp ) {
5688         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5689         goto invalid;
5690     }
5691     else if ( aExp < 0x3FFF ) {
5692         if (aExp || aSig) {
5693             float_raise(float_flag_inexact, status);
5694         }
5695         return 0;
5696     }
5697     shiftCount = 0x403E - aExp;
5698     savedASig = aSig;
5699     aSig >>= shiftCount;
5700     z = aSig;
5701     if ( aSign ) z = - z;
5702     if ( ( z < 0 ) ^ aSign ) {
5703  invalid:
5704         float_raise(float_flag_invalid, status);
5705         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5706     }
5707     if ( ( aSig<<shiftCount ) != savedASig ) {
5708         float_raise(float_flag_inexact, status);
5709     }
5710     return z;
5711 
5712 }
5713 
5714 /*----------------------------------------------------------------------------
5715 | Returns the result of converting the extended double-precision floating-
5716 | point value `a' to the 64-bit two's complement integer format.  The
5717 | conversion is performed according to the IEC/IEEE Standard for Binary
5718 | Floating-Point Arithmetic---which means in particular that the conversion
5719 | is rounded according to the current rounding mode.  If `a' is a NaN,
5720 | the largest positive integer is returned.  Otherwise, if the conversion
5721 | overflows, the largest integer with the same sign as `a' is returned.
5722 *----------------------------------------------------------------------------*/
5723 
5724 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5725 {
5726     bool aSign;
5727     int32_t aExp, shiftCount;
5728     uint64_t aSig, aSigExtra;
5729 
5730     if (floatx80_invalid_encoding(a)) {
5731         float_raise(float_flag_invalid, status);
5732         return 1ULL << 63;
5733     }
5734     aSig = extractFloatx80Frac( a );
5735     aExp = extractFloatx80Exp( a );
5736     aSign = extractFloatx80Sign( a );
5737     shiftCount = 0x403E - aExp;
5738     if ( shiftCount <= 0 ) {
5739         if ( shiftCount ) {
5740             float_raise(float_flag_invalid, status);
5741             if (!aSign || floatx80_is_any_nan(a)) {
5742                 return INT64_MAX;
5743             }
5744             return INT64_MIN;
5745         }
5746         aSigExtra = 0;
5747     }
5748     else {
5749         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5750     }
5751     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5752 
5753 }
5754 
5755 /*----------------------------------------------------------------------------
5756 | Returns the result of converting the extended double-precision floating-
5757 | point value `a' to the 64-bit two's complement integer format.  The
5758 | conversion is performed according to the IEC/IEEE Standard for Binary
5759 | Floating-Point Arithmetic, except that the conversion is always rounded
5760 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5761 | Otherwise, if the conversion overflows, the largest integer with the same
5762 | sign as `a' is returned.
5763 *----------------------------------------------------------------------------*/
5764 
5765 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5766 {
5767     bool aSign;
5768     int32_t aExp, shiftCount;
5769     uint64_t aSig;
5770     int64_t z;
5771 
5772     if (floatx80_invalid_encoding(a)) {
5773         float_raise(float_flag_invalid, status);
5774         return 1ULL << 63;
5775     }
5776     aSig = extractFloatx80Frac( a );
5777     aExp = extractFloatx80Exp( a );
5778     aSign = extractFloatx80Sign( a );
5779     shiftCount = aExp - 0x403E;
5780     if ( 0 <= shiftCount ) {
5781         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5782         if ( ( a.high != 0xC03E ) || aSig ) {
5783             float_raise(float_flag_invalid, status);
5784             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5785                 return INT64_MAX;
5786             }
5787         }
5788         return INT64_MIN;
5789     }
5790     else if ( aExp < 0x3FFF ) {
5791         if (aExp | aSig) {
5792             float_raise(float_flag_inexact, status);
5793         }
5794         return 0;
5795     }
5796     z = aSig>>( - shiftCount );
5797     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5798         float_raise(float_flag_inexact, status);
5799     }
5800     if ( aSign ) z = - z;
5801     return z;
5802 
5803 }
5804 
5805 /*----------------------------------------------------------------------------
5806 | Returns the result of converting the extended double-precision floating-
5807 | point value `a' to the single-precision floating-point format.  The
5808 | conversion is performed according to the IEC/IEEE Standard for Binary
5809 | Floating-Point Arithmetic.
5810 *----------------------------------------------------------------------------*/
5811 
5812 float32 floatx80_to_float32(floatx80 a, float_status *status)
5813 {
5814     bool aSign;
5815     int32_t aExp;
5816     uint64_t aSig;
5817 
5818     if (floatx80_invalid_encoding(a)) {
5819         float_raise(float_flag_invalid, status);
5820         return float32_default_nan(status);
5821     }
5822     aSig = extractFloatx80Frac( a );
5823     aExp = extractFloatx80Exp( a );
5824     aSign = extractFloatx80Sign( a );
5825     if ( aExp == 0x7FFF ) {
5826         if ( (uint64_t) ( aSig<<1 ) ) {
5827             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5828                                              status);
5829             return float32_silence_nan(res, status);
5830         }
5831         return packFloat32( aSign, 0xFF, 0 );
5832     }
5833     shift64RightJamming( aSig, 33, &aSig );
5834     if ( aExp || aSig ) aExp -= 0x3F81;
5835     return roundAndPackFloat32(aSign, aExp, aSig, status);
5836 
5837 }
5838 
5839 /*----------------------------------------------------------------------------
5840 | Returns the result of converting the extended double-precision floating-
5841 | point value `a' to the double-precision floating-point format.  The
5842 | conversion is performed according to the IEC/IEEE Standard for Binary
5843 | Floating-Point Arithmetic.
5844 *----------------------------------------------------------------------------*/
5845 
5846 float64 floatx80_to_float64(floatx80 a, float_status *status)
5847 {
5848     bool aSign;
5849     int32_t aExp;
5850     uint64_t aSig, zSig;
5851 
5852     if (floatx80_invalid_encoding(a)) {
5853         float_raise(float_flag_invalid, status);
5854         return float64_default_nan(status);
5855     }
5856     aSig = extractFloatx80Frac( a );
5857     aExp = extractFloatx80Exp( a );
5858     aSign = extractFloatx80Sign( a );
5859     if ( aExp == 0x7FFF ) {
5860         if ( (uint64_t) ( aSig<<1 ) ) {
5861             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5862                                              status);
5863             return float64_silence_nan(res, status);
5864         }
5865         return packFloat64( aSign, 0x7FF, 0 );
5866     }
5867     shift64RightJamming( aSig, 1, &zSig );
5868     if ( aExp || aSig ) aExp -= 0x3C01;
5869     return roundAndPackFloat64(aSign, aExp, zSig, status);
5870 
5871 }
5872 
5873 /*----------------------------------------------------------------------------
5874 | Returns the result of converting the extended double-precision floating-
5875 | point value `a' to the quadruple-precision floating-point format.  The
5876 | conversion is performed according to the IEC/IEEE Standard for Binary
5877 | Floating-Point Arithmetic.
5878 *----------------------------------------------------------------------------*/
5879 
5880 float128 floatx80_to_float128(floatx80 a, float_status *status)
5881 {
5882     bool aSign;
5883     int aExp;
5884     uint64_t aSig, zSig0, zSig1;
5885 
5886     if (floatx80_invalid_encoding(a)) {
5887         float_raise(float_flag_invalid, status);
5888         return float128_default_nan(status);
5889     }
5890     aSig = extractFloatx80Frac( a );
5891     aExp = extractFloatx80Exp( a );
5892     aSign = extractFloatx80Sign( a );
5893     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5894         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5895                                            status);
5896         return float128_silence_nan(res, status);
5897     }
5898     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5899     return packFloat128( aSign, aExp, zSig0, zSig1 );
5900 
5901 }
5902 
5903 /*----------------------------------------------------------------------------
5904 | Rounds the extended double-precision floating-point value `a'
5905 | to the precision provided by floatx80_rounding_precision and returns the
5906 | result as an extended double-precision floating-point value.
5907 | The operation is performed according to the IEC/IEEE Standard for Binary
5908 | Floating-Point Arithmetic.
5909 *----------------------------------------------------------------------------*/
5910 
5911 floatx80 floatx80_round(floatx80 a, float_status *status)
5912 {
5913     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5914                                 extractFloatx80Sign(a),
5915                                 extractFloatx80Exp(a),
5916                                 extractFloatx80Frac(a), 0, status);
5917 }
5918 
5919 /*----------------------------------------------------------------------------
5920 | Rounds the extended double-precision floating-point value `a' to an integer,
5921 | and returns the result as an extended double-precision floating-point
5922 | value.  The operation is performed according to the IEC/IEEE Standard for
5923 | Binary Floating-Point Arithmetic.
5924 *----------------------------------------------------------------------------*/
5925 
5926 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5927 {
5928     bool aSign;
5929     int32_t aExp;
5930     uint64_t lastBitMask, roundBitsMask;
5931     floatx80 z;
5932 
5933     if (floatx80_invalid_encoding(a)) {
5934         float_raise(float_flag_invalid, status);
5935         return floatx80_default_nan(status);
5936     }
5937     aExp = extractFloatx80Exp( a );
5938     if ( 0x403E <= aExp ) {
5939         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5940             return propagateFloatx80NaN(a, a, status);
5941         }
5942         return a;
5943     }
5944     if ( aExp < 0x3FFF ) {
5945         if (    ( aExp == 0 )
5946              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5947             return a;
5948         }
5949         float_raise(float_flag_inexact, status);
5950         aSign = extractFloatx80Sign( a );
5951         switch (status->float_rounding_mode) {
5952          case float_round_nearest_even:
5953             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5954                ) {
5955                 return
5956                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5957             }
5958             break;
5959         case float_round_ties_away:
5960             if (aExp == 0x3FFE) {
5961                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5962             }
5963             break;
5964          case float_round_down:
5965             return
5966                   aSign ?
5967                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5968                 : packFloatx80( 0, 0, 0 );
5969          case float_round_up:
5970             return
5971                   aSign ? packFloatx80( 1, 0, 0 )
5972                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5973 
5974         case float_round_to_zero:
5975             break;
5976         default:
5977             g_assert_not_reached();
5978         }
5979         return packFloatx80( aSign, 0, 0 );
5980     }
5981     lastBitMask = 1;
5982     lastBitMask <<= 0x403E - aExp;
5983     roundBitsMask = lastBitMask - 1;
5984     z = a;
5985     switch (status->float_rounding_mode) {
5986     case float_round_nearest_even:
5987         z.low += lastBitMask>>1;
5988         if ((z.low & roundBitsMask) == 0) {
5989             z.low &= ~lastBitMask;
5990         }
5991         break;
5992     case float_round_ties_away:
5993         z.low += lastBitMask >> 1;
5994         break;
5995     case float_round_to_zero:
5996         break;
5997     case float_round_up:
5998         if (!extractFloatx80Sign(z)) {
5999             z.low += roundBitsMask;
6000         }
6001         break;
6002     case float_round_down:
6003         if (extractFloatx80Sign(z)) {
6004             z.low += roundBitsMask;
6005         }
6006         break;
6007     default:
6008         abort();
6009     }
6010     z.low &= ~ roundBitsMask;
6011     if ( z.low == 0 ) {
6012         ++z.high;
6013         z.low = UINT64_C(0x8000000000000000);
6014     }
6015     if (z.low != a.low) {
6016         float_raise(float_flag_inexact, status);
6017     }
6018     return z;
6019 
6020 }
6021 
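/*----------------------------------------------------------------------------
| Illustrative sketch of the mask arithmetic above: for 2.5, packed as
| exponent 0x4000 with significand 0xA000000000000000, `lastBitMask' is
| 1 << (0x403E - 0x4000), i.e. bit 62.  Under float_round_nearest_even the tie
| goes to the even neighbour 2.0 and the inexact flag is raised.  The helper
| name below is illustrative only and is not part of the SoftFloat code.
*----------------------------------------------------------------------------*/

static inline floatx80 floatx80_round_to_int_example(float_status *status)
{
    /* 1.01b * 2^1 = 2.5 in the extended double-precision format. */
    floatx80 two_and_a_half = packFloatx80(0, 0x4000,
                                           UINT64_C(0xA000000000000000));

    /* With nearest-even rounding this returns 2.0 (exponent 0x4000,
       significand 0x8000000000000000) and raises float_flag_inexact. */
    return floatx80_round_to_int(two_and_a_half, status);
}
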
6022 /*----------------------------------------------------------------------------
6023 | Returns the result of adding the absolute values of the extended double-
6024 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
6025 | negated before being returned.  `zSign' is ignored if the result is a NaN.
6026 | The addition is performed according to the IEC/IEEE Standard for Binary
6027 | Floating-Point Arithmetic.
6028 *----------------------------------------------------------------------------*/
6029 
6030 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
6031                                 float_status *status)
6032 {
6033     int32_t aExp, bExp, zExp;
6034     uint64_t aSig, bSig, zSig0, zSig1;
6035     int32_t expDiff;
6036 
6037     aSig = extractFloatx80Frac( a );
6038     aExp = extractFloatx80Exp( a );
6039     bSig = extractFloatx80Frac( b );
6040     bExp = extractFloatx80Exp( b );
6041     expDiff = aExp - bExp;
6042     if ( 0 < expDiff ) {
6043         if ( aExp == 0x7FFF ) {
6044             if ((uint64_t)(aSig << 1)) {
6045                 return propagateFloatx80NaN(a, b, status);
6046             }
6047             return a;
6048         }
6049         if ( bExp == 0 ) --expDiff;
6050         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6051         zExp = aExp;
6052     }
6053     else if ( expDiff < 0 ) {
6054         if ( bExp == 0x7FFF ) {
6055             if ((uint64_t)(bSig << 1)) {
6056                 return propagateFloatx80NaN(a, b, status);
6057             }
6058             return packFloatx80(zSign,
6059                                 floatx80_infinity_high,
6060                                 floatx80_infinity_low);
6061         }
6062         if ( aExp == 0 ) ++expDiff;
6063         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6064         zExp = bExp;
6065     }
6066     else {
6067         if ( aExp == 0x7FFF ) {
6068             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
6069                 return propagateFloatx80NaN(a, b, status);
6070             }
6071             return a;
6072         }
6073         zSig1 = 0;
6074         zSig0 = aSig + bSig;
6075         if ( aExp == 0 ) {
6076             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
6077                 /* At least one of the values is a pseudo-denormal,
6078                  * and there is a carry out of the result.  */
6079                 zExp = 1;
6080                 goto shiftRight1;
6081             }
6082             if (zSig0 == 0) {
6083                 return packFloatx80(zSign, 0, 0);
6084             }
6085             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
6086             goto roundAndPack;
6087         }
6088         zExp = aExp;
6089         goto shiftRight1;
6090     }
6091     zSig0 = aSig + bSig;
6092     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
6093  shiftRight1:
6094     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
6095     zSig0 |= UINT64_C(0x8000000000000000);
6096     ++zExp;
6097  roundAndPack:
6098     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6099                                 zSign, zExp, zSig0, zSig1, status);
6100 }
6101 
6102 /*----------------------------------------------------------------------------
6103 | Returns the result of subtracting the absolute values of the extended
6104 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
6105 | difference is negated before being returned.  `zSign' is ignored if the
6106 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6107 | Standard for Binary Floating-Point Arithmetic.
6108 *----------------------------------------------------------------------------*/
6109 
6110 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
6111                                 float_status *status)
6112 {
6113     int32_t aExp, bExp, zExp;
6114     uint64_t aSig, bSig, zSig0, zSig1;
6115     int32_t expDiff;
6116 
6117     aSig = extractFloatx80Frac( a );
6118     aExp = extractFloatx80Exp( a );
6119     bSig = extractFloatx80Frac( b );
6120     bExp = extractFloatx80Exp( b );
6121     expDiff = aExp - bExp;
6122     if ( 0 < expDiff ) goto aExpBigger;
6123     if ( expDiff < 0 ) goto bExpBigger;
6124     if ( aExp == 0x7FFF ) {
6125         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
6126             return propagateFloatx80NaN(a, b, status);
6127         }
6128         float_raise(float_flag_invalid, status);
6129         return floatx80_default_nan(status);
6130     }
6131     if ( aExp == 0 ) {
6132         aExp = 1;
6133         bExp = 1;
6134     }
6135     zSig1 = 0;
6136     if ( bSig < aSig ) goto aBigger;
6137     if ( aSig < bSig ) goto bBigger;
6138     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
6139  bExpBigger:
6140     if ( bExp == 0x7FFF ) {
6141         if ((uint64_t)(bSig << 1)) {
6142             return propagateFloatx80NaN(a, b, status);
6143         }
6144         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
6145                             floatx80_infinity_low);
6146     }
6147     if ( aExp == 0 ) ++expDiff;
6148     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6149  bBigger:
6150     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
6151     zExp = bExp;
6152     zSign ^= 1;
6153     goto normalizeRoundAndPack;
6154  aExpBigger:
6155     if ( aExp == 0x7FFF ) {
6156         if ((uint64_t)(aSig << 1)) {
6157             return propagateFloatx80NaN(a, b, status);
6158         }
6159         return a;
6160     }
6161     if ( bExp == 0 ) --expDiff;
6162     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6163  aBigger:
6164     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
6165     zExp = aExp;
6166  normalizeRoundAndPack:
6167     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6168                                          zSign, zExp, zSig0, zSig1, status);
6169 }
6170 
6171 /*----------------------------------------------------------------------------
6172 | Returns the result of adding the extended double-precision floating-point
6173 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6174 | Standard for Binary Floating-Point Arithmetic.
6175 *----------------------------------------------------------------------------*/
6176 
6177 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6178 {
6179     bool aSign, bSign;
6180 
6181     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6182         float_raise(float_flag_invalid, status);
6183         return floatx80_default_nan(status);
6184     }
6185     aSign = extractFloatx80Sign( a );
6186     bSign = extractFloatx80Sign( b );
6187     if ( aSign == bSign ) {
6188         return addFloatx80Sigs(a, b, aSign, status);
6189     }
6190     else {
6191         return subFloatx80Sigs(a, b, aSign, status);
6192     }
6193 
6194 }
6195 
6196 /*----------------------------------------------------------------------------
6197 | Returns the result of subtracting the extended double-precision floating-
6198 | point values `a' and `b'.  The operation is performed according to the
6199 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6200 *----------------------------------------------------------------------------*/
6201 
6202 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6203 {
6204     bool aSign, bSign;
6205 
6206     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6207         float_raise(float_flag_invalid, status);
6208         return floatx80_default_nan(status);
6209     }
6210     aSign = extractFloatx80Sign( a );
6211     bSign = extractFloatx80Sign( b );
6212     if ( aSign == bSign ) {
6213         return subFloatx80Sigs(a, b, aSign, status);
6214     }
6215     else {
6216         return addFloatx80Sigs(a, b, aSign, status);
6217     }
6218 
6219 }
6220 
6221 /*----------------------------------------------------------------------------
6222 | Returns the result of multiplying the extended double-precision floating-
6223 | point values `a' and `b'.  The operation is performed according to the
6224 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6225 *----------------------------------------------------------------------------*/
6226 
6227 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6228 {
6229     bool aSign, bSign, zSign;
6230     int32_t aExp, bExp, zExp;
6231     uint64_t aSig, bSig, zSig0, zSig1;
6232 
6233     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6234         float_raise(float_flag_invalid, status);
6235         return floatx80_default_nan(status);
6236     }
6237     aSig = extractFloatx80Frac( a );
6238     aExp = extractFloatx80Exp( a );
6239     aSign = extractFloatx80Sign( a );
6240     bSig = extractFloatx80Frac( b );
6241     bExp = extractFloatx80Exp( b );
6242     bSign = extractFloatx80Sign( b );
6243     zSign = aSign ^ bSign;
6244     if ( aExp == 0x7FFF ) {
6245         if (    (uint64_t) ( aSig<<1 )
6246              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6247             return propagateFloatx80NaN(a, b, status);
6248         }
6249         if ( ( bExp | bSig ) == 0 ) goto invalid;
6250         return packFloatx80(zSign, floatx80_infinity_high,
6251                                    floatx80_infinity_low);
6252     }
6253     if ( bExp == 0x7FFF ) {
6254         if ((uint64_t)(bSig << 1)) {
6255             return propagateFloatx80NaN(a, b, status);
6256         }
6257         if ( ( aExp | aSig ) == 0 ) {
6258  invalid:
6259             float_raise(float_flag_invalid, status);
6260             return floatx80_default_nan(status);
6261         }
6262         return packFloatx80(zSign, floatx80_infinity_high,
6263                                    floatx80_infinity_low);
6264     }
6265     if ( aExp == 0 ) {
6266         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6267         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6268     }
6269     if ( bExp == 0 ) {
6270         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6271         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6272     }
6273     zExp = aExp + bExp - 0x3FFE;
6274     mul64To128( aSig, bSig, &zSig0, &zSig1 );
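    /*
     * The 128-bit product of two normalized significands has its leading
     * one in bit 127 or bit 126.  If bit 127 is clear (zSig0 is positive as
     * a signed value), renormalize with a one-bit left shift.
     */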
6275     if ( 0 < (int64_t) zSig0 ) {
6276         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6277         --zExp;
6278     }
6279     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6280                                 zSign, zExp, zSig0, zSig1, status);
6281 }
6282 
6283 /*----------------------------------------------------------------------------
6284 | Returns the result of dividing the extended double-precision floating-point
6285 | value `a' by the corresponding value `b'.  The operation is performed
6286 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6287 *----------------------------------------------------------------------------*/
6288 
6289 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6290 {
6291     bool aSign, bSign, zSign;
6292     int32_t aExp, bExp, zExp;
6293     uint64_t aSig, bSig, zSig0, zSig1;
6294     uint64_t rem0, rem1, rem2, term0, term1, term2;
6295 
6296     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6297         float_raise(float_flag_invalid, status);
6298         return floatx80_default_nan(status);
6299     }
6300     aSig = extractFloatx80Frac( a );
6301     aExp = extractFloatx80Exp( a );
6302     aSign = extractFloatx80Sign( a );
6303     bSig = extractFloatx80Frac( b );
6304     bExp = extractFloatx80Exp( b );
6305     bSign = extractFloatx80Sign( b );
6306     zSign = aSign ^ bSign;
6307     if ( aExp == 0x7FFF ) {
6308         if ((uint64_t)(aSig << 1)) {
6309             return propagateFloatx80NaN(a, b, status);
6310         }
6311         if ( bExp == 0x7FFF ) {
6312             if ((uint64_t)(bSig << 1)) {
6313                 return propagateFloatx80NaN(a, b, status);
6314             }
6315             goto invalid;
6316         }
6317         return packFloatx80(zSign, floatx80_infinity_high,
6318                                    floatx80_infinity_low);
6319     }
6320     if ( bExp == 0x7FFF ) {
6321         if ((uint64_t)(bSig << 1)) {
6322             return propagateFloatx80NaN(a, b, status);
6323         }
6324         return packFloatx80( zSign, 0, 0 );
6325     }
6326     if ( bExp == 0 ) {
6327         if ( bSig == 0 ) {
6328             if ( ( aExp | aSig ) == 0 ) {
6329  invalid:
6330                 float_raise(float_flag_invalid, status);
6331                 return floatx80_default_nan(status);
6332             }
6333             float_raise(float_flag_divbyzero, status);
6334             return packFloatx80(zSign, floatx80_infinity_high,
6335                                        floatx80_infinity_low);
6336         }
6337         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6338     }
6339     if ( aExp == 0 ) {
6340         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6341         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6342     }
6343     zExp = aExp - bExp + 0x3FFE;
6344     rem1 = 0;
6345     if ( bSig <= aSig ) {
6346         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6347         ++zExp;
6348     }
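    /*
     * estimateDiv128To64() can return a quotient digit that is slightly too
     * large; the correction loop below decrements it until the partial
     * remainder is non-negative.
     */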
6349     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6350     mul64To128( bSig, zSig0, &term0, &term1 );
6351     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6352     while ( (int64_t) rem0 < 0 ) {
6353         --zSig0;
6354         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6355     }
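    /*
     * Compute a second 64-bit quotient digit.  An exact remainder (and thus
     * the sticky bit) is only needed when the digit is small enough for the
     * rounding decision to be in doubt.
     */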
6356     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6357     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6358         mul64To128( bSig, zSig1, &term1, &term2 );
6359         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6360         while ( (int64_t) rem1 < 0 ) {
6361             --zSig1;
6362             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6363         }
6364         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6365     }
6366     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6367                                 zSign, zExp, zSig0, zSig1, status);
6368 }
6369 
6370 /*----------------------------------------------------------------------------
6371 | Returns the remainder of the extended double-precision floating-point value
6372 | `a' with respect to the corresponding value `b'.  The operation is performed
6373 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6374 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6375 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6376 | the absolute value of the integer quotient.
6377 *----------------------------------------------------------------------------*/
6378 
6379 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6380                          float_status *status)
6381 {
6382     bool aSign, zSign;
6383     int32_t aExp, bExp, expDiff, aExpOrig;
6384     uint64_t aSig0, aSig1, bSig;
6385     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6386 
6387     *quotient = 0;
6388     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6389         float_raise(float_flag_invalid, status);
6390         return floatx80_default_nan(status);
6391     }
6392     aSig0 = extractFloatx80Frac( a );
6393     aExpOrig = aExp = extractFloatx80Exp( a );
6394     aSign = extractFloatx80Sign( a );
6395     bSig = extractFloatx80Frac( b );
6396     bExp = extractFloatx80Exp( b );
6397     if ( aExp == 0x7FFF ) {
6398         if (    (uint64_t) ( aSig0<<1 )
6399              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6400             return propagateFloatx80NaN(a, b, status);
6401         }
6402         goto invalid;
6403     }
6404     if ( bExp == 0x7FFF ) {
6405         if ((uint64_t)(bSig << 1)) {
6406             return propagateFloatx80NaN(a, b, status);
6407         }
6408         if (aExp == 0 && aSig0 >> 63) {
6409             /*
6410              * Pseudo-denormal argument must be returned in normalized
6411              * form.
6412              */
6413             return packFloatx80(aSign, 1, aSig0);
6414         }
6415         return a;
6416     }
6417     if ( bExp == 0 ) {
6418         if ( bSig == 0 ) {
6419  invalid:
6420             float_raise(float_flag_invalid, status);
6421             return floatx80_default_nan(status);
6422         }
6423         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6424     }
6425     if ( aExp == 0 ) {
6426         if ( aSig0 == 0 ) return a;
6427         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6428     }
6429     zSign = aSign;
6430     expDiff = aExp - bExp;
6431     aSig1 = 0;
6432     if ( expDiff < 0 ) {
6433         if ( mod || expDiff < -1 ) {
6434             if (aExp == 1 && aExpOrig == 0) {
6435                 /*
6436                  * Pseudo-denormal argument must be returned in
6437                  * normalized form.
6438                  */
6439                 return packFloatx80(aSign, aExp, aSig0);
6440             }
6441             return a;
6442         }
6443         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6444         expDiff = 0;
6445     }
6446     *quotient = q = ( bSig <= aSig0 );
6447     if ( q ) aSig0 -= bSig;
6448     expDiff -= 64;
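    /*
     * Long-division loop: each pass retires 62 quotient bits, using an
     * estimate deliberately reduced by 2 so the partial remainder never
     * goes negative.
     */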
6449     while ( 0 < expDiff ) {
6450         q = estimateDiv128To64( aSig0, aSig1, bSig );
6451         q = ( 2 < q ) ? q - 2 : 0;
6452         mul64To128( bSig, q, &term0, &term1 );
6453         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6454         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6455         expDiff -= 62;
6456         *quotient <<= 62;
6457         *quotient += q;
6458     }
6459     expDiff += 64;
6460     if ( 0 < expDiff ) {
6461         q = estimateDiv128To64( aSig0, aSig1, bSig );
6462         q = ( 2 < q ) ? q - 2 : 0;
6463         q >>= 64 - expDiff;
6464         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6465         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6466         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6467         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6468             ++q;
6469             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6470         }
6471         if (expDiff < 64) {
6472             *quotient <<= expDiff;
6473         } else {
6474             *quotient = 0;
6475         }
6476         *quotient += q;
6477     }
6478     else {
6479         term1 = 0;
6480         term0 = bSig;
6481     }
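    /*
     * For the IEEE remainder (mod == false), choose whichever of r and
     * b - r is smaller, i.e. whether r or r - b is nearer to zero; a tie is
     * broken toward an even quotient.  Taking b - r flips the result's sign
     * and bumps the quotient.
     */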
6482     if (!mod) {
6483         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6484         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6485                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6486                         && ( q & 1 ) )
6487             ) {
6488             aSig0 = alternateASig0;
6489             aSig1 = alternateASig1;
6490             zSign = ! zSign;
6491             ++*quotient;
6492         }
6493     }
6494     return
6495         normalizeRoundAndPackFloatx80(
6496             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6497 
6498 }
6499 
6500 /*----------------------------------------------------------------------------
6501 | Returns the remainder of the extended double-precision floating-point value
6502 | `a' with respect to the corresponding value `b'.  The operation is performed
6503 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6504 *----------------------------------------------------------------------------*/
6505 
6506 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6507 {
6508     uint64_t quotient;
6509     return floatx80_modrem(a, b, false, &quotient, status);
6510 }
6511 
6512 /*----------------------------------------------------------------------------
6513 | Returns the remainder of the extended double-precision floating-point value
6514 | `a' with respect to the corresponding value `b', with the quotient truncated
6515 | toward zero.
6516 *----------------------------------------------------------------------------*/
6517 
6518 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6519 {
6520     uint64_t quotient;
6521     return floatx80_modrem(a, b, true, &quotient, status);
6522 }
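
/*
 * Illustrative sketch (not part of the original SoftFloat code): a caller
 * picks between the two wrappers above depending on whether it wants the
 * IEEE remainder or the truncated-quotient variant, supplying its own
 * float_status, e.g.:
 *
 *     float_status st = { .float_rounding_mode = float_round_nearest_even };
 *     floatx80 r_ieee  = floatx80_rem(a, b, &st);  // quotient rounded to nearest
 *     floatx80 r_trunc = floatx80_mod(a, b, &st);  // quotient truncated toward zero
 */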
6523 
6524 /*----------------------------------------------------------------------------
6525 | Returns the square root of the extended double-precision floating-point
6526 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6527 | for Binary Floating-Point Arithmetic.
6528 *----------------------------------------------------------------------------*/
6529 
6530 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6531 {
6532     bool aSign;
6533     int32_t aExp, zExp;
6534     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6535     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6536 
6537     if (floatx80_invalid_encoding(a)) {
6538         float_raise(float_flag_invalid, status);
6539         return floatx80_default_nan(status);
6540     }
6541     aSig0 = extractFloatx80Frac( a );
6542     aExp = extractFloatx80Exp( a );
6543     aSign = extractFloatx80Sign( a );
6544     if ( aExp == 0x7FFF ) {
6545         if ((uint64_t)(aSig0 << 1)) {
6546             return propagateFloatx80NaN(a, a, status);
6547         }
6548         if ( ! aSign ) return a;
6549         goto invalid;
6550     }
6551     if ( aSign ) {
6552         if ( ( aExp | aSig0 ) == 0 ) return a;
6553  invalid:
6554         float_raise(float_flag_invalid, status);
6555         return floatx80_default_nan(status);
6556     }
6557     if ( aExp == 0 ) {
6558         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6559         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6560     }
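    /*
     * sqrt(m * 2^e) = sqrt(m) * 2^(e/2): halve the unbiased exponent, seed
     * a 32-bit estimate of the root, and refine it to 64 bits with a
     * Newton-style step; the remainder loop below corrects any remaining
     * error in the top digit.
     */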
6561     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6562     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6563     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6564     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6565     doubleZSig0 = zSig0<<1;
6566     mul64To128( zSig0, zSig0, &term0, &term1 );
6567     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6568     while ( (int64_t) rem0 < 0 ) {
6569         --zSig0;
6570         doubleZSig0 -= 2;
6571         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6572     }
6573     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6574     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6575         if ( zSig1 == 0 ) zSig1 = 1;
6576         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6577         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6578         mul64To128( zSig1, zSig1, &term2, &term3 );
6579         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6580         while ( (int64_t) rem1 < 0 ) {
6581             --zSig1;
6582             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6583             term3 |= 1;
6584             term2 |= doubleZSig0;
6585             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6586         }
6587         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6588     }
6589     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6590     zSig0 |= doubleZSig0;
6591     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6592                                 0, zExp, zSig0, zSig1, status);
6593 }
6594 
6595 /*----------------------------------------------------------------------------
6596 | Returns the result of converting the quadruple-precision floating-point
6597 | value `a' to the 32-bit two's complement integer format.  The conversion
6598 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6599 | Arithmetic---which means in particular that the conversion is rounded
6600 | according to the current rounding mode.  If `a' is a NaN, the largest
6601 | positive integer is returned.  Otherwise, if the conversion overflows, the
6602 | largest integer with the same sign as `a' is returned.
6603 *----------------------------------------------------------------------------*/
6604 
6605 int32_t float128_to_int32(float128 a, float_status *status)
6606 {
6607     bool aSign;
6608     int32_t aExp, shiftCount;
6609     uint64_t aSig0, aSig1;
6610 
6611     aSig1 = extractFloat128Frac1( a );
6612     aSig0 = extractFloat128Frac0( a );
6613     aExp = extractFloat128Exp( a );
6614     aSign = extractFloat128Sign( a );
6615     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6616     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6617     aSig0 |= ( aSig1 != 0 );
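    /*
     * The low fraction bits have been folded into a sticky bit; the shift
     * below aligns the fixed-point value as roundAndPackInt32() expects,
     * with shift64RightJamming() preserving the sticky information.
     */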
6618     shiftCount = 0x4028 - aExp;
6619     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6620     return roundAndPackInt32(aSign, aSig0, status);
6621 
6622 }
6623 
6624 /*----------------------------------------------------------------------------
6625 | Returns the result of converting the quadruple-precision floating-point
6626 | value `a' to the 32-bit two's complement integer format.  The conversion
6627 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6628 | Arithmetic, except that the conversion is always rounded toward zero.  If
6629 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6630 | conversion overflows, the largest integer with the same sign as `a' is
6631 | returned.
6632 *----------------------------------------------------------------------------*/
6633 
6634 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6635 {
6636     bool aSign;
6637     int32_t aExp, shiftCount;
6638     uint64_t aSig0, aSig1, savedASig;
6639     int32_t z;
6640 
6641     aSig1 = extractFloat128Frac1( a );
6642     aSig0 = extractFloat128Frac0( a );
6643     aExp = extractFloat128Exp( a );
6644     aSign = extractFloat128Sign( a );
6645     aSig0 |= ( aSig1 != 0 );
6646     if ( 0x401E < aExp ) {
6647         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6648         goto invalid;
6649     }
6650     else if ( aExp < 0x3FFF ) {
6651         if (aExp || aSig0) {
6652             float_raise(float_flag_inexact, status);
6653         }
6654         return 0;
6655     }
6656     aSig0 |= UINT64_C(0x0001000000000000);
6657     shiftCount = 0x402F - aExp;
6658     savedASig = aSig0;
6659     aSig0 >>= shiftCount;
6660     z = aSig0;
6661     if ( aSign ) z = - z;
6662     if ( ( z < 0 ) ^ aSign ) {
6663  invalid:
6664         float_raise(float_flag_invalid, status);
6665         return aSign ? INT32_MIN : INT32_MAX;
6666     }
6667     if ( ( aSig0<<shiftCount ) != savedASig ) {
6668         float_raise(float_flag_inexact, status);
6669     }
6670     return z;
6671 
6672 }
6673 
6674 /*----------------------------------------------------------------------------
6675 | Returns the result of converting the quadruple-precision floating-point
6676 | value `a' to the 64-bit two's complement integer format.  The conversion
6677 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6678 | Arithmetic---which means in particular that the conversion is rounded
6679 | according to the current rounding mode.  If `a' is a NaN, the largest
6680 | positive integer is returned.  Otherwise, if the conversion overflows, the
6681 | largest integer with the same sign as `a' is returned.
6682 *----------------------------------------------------------------------------*/
6683 
6684 int64_t float128_to_int64(float128 a, float_status *status)
6685 {
6686     bool aSign;
6687     int32_t aExp, shiftCount;
6688     uint64_t aSig0, aSig1;
6689 
6690     aSig1 = extractFloat128Frac1( a );
6691     aSig0 = extractFloat128Frac0( a );
6692     aExp = extractFloat128Exp( a );
6693     aSign = extractFloat128Sign( a );
6694     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6695     shiftCount = 0x402F - aExp;
6696     if ( shiftCount <= 0 ) {
6697         if ( 0x403E < aExp ) {
6698             float_raise(float_flag_invalid, status);
6699             if (    ! aSign
6700                  || (    ( aExp == 0x7FFF )
6701                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6702                     )
6703                ) {
6704                 return INT64_MAX;
6705             }
6706             return INT64_MIN;
6707         }
6708         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6709     }
6710     else {
6711         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6712     }
6713     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6714 
6715 }
6716 
6717 /*----------------------------------------------------------------------------
6718 | Returns the result of converting the quadruple-precision floating-point
6719 | value `a' to the 64-bit two's complement integer format.  The conversion
6720 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6721 | Arithmetic, except that the conversion is always rounded toward zero.
6722 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6723 | the conversion overflows, the largest integer with the same sign as `a' is
6724 | returned.
6725 *----------------------------------------------------------------------------*/
6726 
6727 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6728 {
6729     bool aSign;
6730     int32_t aExp, shiftCount;
6731     uint64_t aSig0, aSig1;
6732     int64_t z;
6733 
6734     aSig1 = extractFloat128Frac1( a );
6735     aSig0 = extractFloat128Frac0( a );
6736     aExp = extractFloat128Exp( a );
6737     aSign = extractFloat128Sign( a );
6738     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6739     shiftCount = aExp - 0x402F;
6740     if ( 0 < shiftCount ) {
6741         if ( 0x403E <= aExp ) {
6742             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
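            /*
             * The magnitude is at least 2^63.  The only values that still
             * truncate into int64_t range are those that become exactly
             * -2^63; their discarded fraction bits raise inexact, while
             * anything else overflows and raises invalid.
             */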
6743             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6744                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6745                 if (aSig1) {
6746                     float_raise(float_flag_inexact, status);
6747                 }
6748             }
6749             else {
6750                 float_raise(float_flag_invalid, status);
6751                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6752                     return INT64_MAX;
6753                 }
6754             }
6755             return INT64_MIN;
6756         }
6757         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6758         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6759             float_raise(float_flag_inexact, status);
6760         }
6761     }
6762     else {
6763         if ( aExp < 0x3FFF ) {
6764             if ( aExp | aSig0 | aSig1 ) {
6765                 float_raise(float_flag_inexact, status);
6766             }
6767             return 0;
6768         }
6769         z = aSig0>>( - shiftCount );
6770         if (    aSig1
6771              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6772             float_raise(float_flag_inexact, status);
6773         }
6774     }
6775     if ( aSign ) z = - z;
6776     return z;
6777 
6778 }
6779 
6780 /*----------------------------------------------------------------------------
6781 | Returns the result of converting the quadruple-precision floating-point value
6782 | `a' to the 64-bit unsigned integer format.  The conversion is
6783 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6784 | Arithmetic---which means in particular that the conversion is rounded
6785 | according to the current rounding mode.  If `a' is a NaN, the largest
6786 | positive integer is returned.  If the conversion overflows, the
6787 | largest unsigned integer is returned.  If 'a' is negative, the value is
6788 | rounded and zero is returned; negative values that do not round to zero
6789 | will raise the invalid exception.
6790 *----------------------------------------------------------------------------*/
6791 
6792 uint64_t float128_to_uint64(float128 a, float_status *status)
6793 {
6794     bool aSign;
6795     int aExp;
6796     int shiftCount;
6797     uint64_t aSig0, aSig1;
6798 
6799     aSig0 = extractFloat128Frac0(a);
6800     aSig1 = extractFloat128Frac1(a);
6801     aExp = extractFloat128Exp(a);
6802     aSign = extractFloat128Sign(a);
6803     if (aSign && (aExp > 0x3FFE)) {
6804         float_raise(float_flag_invalid, status);
6805         if (float128_is_any_nan(a)) {
6806             return UINT64_MAX;
6807         } else {
6808             return 0;
6809         }
6810     }
6811     if (aExp) {
6812         aSig0 |= UINT64_C(0x0001000000000000);
6813     }
6814     shiftCount = 0x402F - aExp;
6815     if (shiftCount <= 0) {
6816         if (0x403E < aExp) {
6817             float_raise(float_flag_invalid, status);
6818             return UINT64_MAX;
6819         }
6820         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6821     } else {
6822         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6823     }
6824     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6825 }
6826 
6827 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6828 {
6829     uint64_t v;
6830     signed char current_rounding_mode = status->float_rounding_mode;
6831 
6832     set_float_rounding_mode(float_round_to_zero, status);
6833     v = float128_to_uint64(a, status);
6834     set_float_rounding_mode(current_rounding_mode, status);
6835 
6836     return v;
6837 }
6838 
6839 /*----------------------------------------------------------------------------
6840 | Returns the result of converting the quadruple-precision floating-point
6841 | value `a' to the 32-bit unsigned integer format.  The conversion
6842 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6843 | Arithmetic except that the conversion is always rounded toward zero.
6844 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6845 | if the conversion overflows, the largest unsigned integer is returned.
6846 | If 'a' is negative, the value is rounded and zero is returned; negative
6847 | values that do not round to zero will raise the invalid exception.
6848 *----------------------------------------------------------------------------*/
6849 
6850 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6851 {
6852     uint64_t v;
6853     uint32_t res;
6854     int old_exc_flags = get_float_exception_flags(status);
6855 
6856     v = float128_to_uint64_round_to_zero(a, status);
6857     if (v > 0xffffffff) {
6858         res = 0xffffffff;
6859     } else {
6860         return v;
6861     }
6862     set_float_exception_flags(old_exc_flags, status);
6863     float_raise(float_flag_invalid, status);
6864     return res;
6865 }
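
/*
 * Illustrative sketch (not part of the original code): because out-of-range
 * inputs saturate, a caller distinguishes a genuine 0xffffffff result from
 * an overflow by inspecting the accumulated exception flags, e.g.:
 *
 *     set_float_exception_flags(0, &st);
 *     uint32_t v = float128_to_uint32_round_to_zero(x, &st);
 *     if (get_float_exception_flags(&st) & float_flag_invalid) {
 *         // x was a NaN or did not fit in 32 bits
 *     }
 */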
6866 
6867 /*----------------------------------------------------------------------------
6868 | Returns the result of converting the quadruple-precision floating-point value
6869 | `a' to the 32-bit unsigned integer format.  The conversion is
6870 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6871 | Arithmetic---which means in particular that the conversion is rounded
6872 | according to the current rounding mode.  If `a' is a NaN, the largest
6873 | positive integer is returned.  If the conversion overflows, the
6874 | largest unsigned integer is returned.  If 'a' is negative, the value is
6875 | rounded and zero is returned; negative values that do not round to zero
6876 | will raise the invalid exception.
6877 *----------------------------------------------------------------------------*/
6878 
6879 uint32_t float128_to_uint32(float128 a, float_status *status)
6880 {
6881     uint64_t v;
6882     uint32_t res;
6883     int old_exc_flags = get_float_exception_flags(status);
6884 
6885     v = float128_to_uint64(a, status);
6886     if (v > 0xffffffff) {
6887         res = 0xffffffff;
6888     } else {
6889         return v;
6890     }
6891     set_float_exception_flags(old_exc_flags, status);
6892     float_raise(float_flag_invalid, status);
6893     return res;
6894 }
6895 
6896 /*----------------------------------------------------------------------------
6897 | Returns the result of converting the quadruple-precision floating-point
6898 | value `a' to the single-precision floating-point format.  The conversion
6899 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6900 | Arithmetic.
6901 *----------------------------------------------------------------------------*/
6902 
6903 float32 float128_to_float32(float128 a, float_status *status)
6904 {
6905     bool aSign;
6906     int32_t aExp;
6907     uint64_t aSig0, aSig1;
6908     uint32_t zSig;
6909 
6910     aSig1 = extractFloat128Frac1( a );
6911     aSig0 = extractFloat128Frac0( a );
6912     aExp = extractFloat128Exp( a );
6913     aSign = extractFloat128Sign( a );
6914     if ( aExp == 0x7FFF ) {
6915         if ( aSig0 | aSig1 ) {
6916             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6917         }
6918         return packFloat32( aSign, 0xFF, 0 );
6919     }
6920     aSig0 |= ( aSig1 != 0 );
6921     shift64RightJamming( aSig0, 18, &aSig0 );
6922     zSig = aSig0;
6923     if ( aExp || zSig ) {
6924         zSig |= 0x40000000;
6925         aExp -= 0x3F81;
6926     }
6927     return roundAndPackFloat32(aSign, aExp, zSig, status);
6928 
6929 }
6930 
6931 /*----------------------------------------------------------------------------
6932 | Returns the result of converting the quadruple-precision floating-point
6933 | value `a' to the double-precision floating-point format.  The conversion
6934 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6935 | Arithmetic.
6936 *----------------------------------------------------------------------------*/
6937 
6938 float64 float128_to_float64(float128 a, float_status *status)
6939 {
6940     bool aSign;
6941     int32_t aExp;
6942     uint64_t aSig0, aSig1;
6943 
6944     aSig1 = extractFloat128Frac1( a );
6945     aSig0 = extractFloat128Frac0( a );
6946     aExp = extractFloat128Exp( a );
6947     aSign = extractFloat128Sign( a );
6948     if ( aExp == 0x7FFF ) {
6949         if ( aSig0 | aSig1 ) {
6950             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6951         }
6952         return packFloat64( aSign, 0x7FF, 0 );
6953     }
6954     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6955     aSig0 |= ( aSig1 != 0 );
6956     if ( aExp || aSig0 ) {
6957         aSig0 |= UINT64_C(0x4000000000000000);
6958         aExp -= 0x3C01;
6959     }
6960     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6961 
6962 }
6963 
6964 /*----------------------------------------------------------------------------
6965 | Returns the result of converting the quadruple-precision floating-point
6966 | value `a' to the extended double-precision floating-point format.  The
6967 | conversion is performed according to the IEC/IEEE Standard for Binary
6968 | Floating-Point Arithmetic.
6969 *----------------------------------------------------------------------------*/
6970 
6971 floatx80 float128_to_floatx80(float128 a, float_status *status)
6972 {
6973     bool aSign;
6974     int32_t aExp;
6975     uint64_t aSig0, aSig1;
6976 
6977     aSig1 = extractFloat128Frac1( a );
6978     aSig0 = extractFloat128Frac0( a );
6979     aExp = extractFloat128Exp( a );
6980     aSign = extractFloat128Sign( a );
6981     if ( aExp == 0x7FFF ) {
6982         if ( aSig0 | aSig1 ) {
6983             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6984                                                status);
6985             return floatx80_silence_nan(res, status);
6986         }
6987         return packFloatx80(aSign, floatx80_infinity_high,
6988                                    floatx80_infinity_low);
6989     }
6990     if ( aExp == 0 ) {
6991         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6992         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6993     }
6994     else {
6995         aSig0 |= UINT64_C(0x0001000000000000);
6996     }
6997     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6998     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6999 
7000 }
7001 
7002 /*----------------------------------------------------------------------------
7003 | Rounds the quadruple-precision floating-point value `a' to an integer, and
7004 | returns the result as a quadruple-precision floating-point value.  The
7005 | operation is performed according to the IEC/IEEE Standard for Binary
7006 | Floating-Point Arithmetic.
7007 *----------------------------------------------------------------------------*/
7008 
7009 float128 float128_round_to_int(float128 a, float_status *status)
7010 {
7011     bool aSign;
7012     int32_t aExp;
7013     uint64_t lastBitMask, roundBitsMask;
7014     float128 z;
7015 
7016     aExp = extractFloat128Exp( a );
7017     if ( 0x402F <= aExp ) {
7018         if ( 0x406F <= aExp ) {
7019             if (    ( aExp == 0x7FFF )
7020                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
7021                ) {
7022                 return propagateFloat128NaN(a, a, status);
7023             }
7024             return a;
7025         }
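        /*
         * The fraction bits to be discarded all lie in z.low: lastBitMask
         * marks the lowest integer bit (it is zero when that bit is the lsb
         * of z.high) and roundBitsMask covers the bits below it.
         */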
7026         lastBitMask = 1;
7027         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
7028         roundBitsMask = lastBitMask - 1;
7029         z = a;
7030         switch (status->float_rounding_mode) {
7031         case float_round_nearest_even:
7032             if ( lastBitMask ) {
7033                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
7034                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
7035             }
7036             else {
7037                 if ( (int64_t) z.low < 0 ) {
7038                     ++z.high;
7039                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7040                 }
7041             }
7042             break;
7043         case float_round_ties_away:
7044             if (lastBitMask) {
7045                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7046             } else {
7047                 if ((int64_t) z.low < 0) {
7048                     ++z.high;
7049                 }
7050             }
7051             break;
7052         case float_round_to_zero:
7053             break;
7054         case float_round_up:
7055             if (!extractFloat128Sign(z)) {
7056                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7057             }
7058             break;
7059         case float_round_down:
7060             if (extractFloat128Sign(z)) {
7061                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7062             }
7063             break;
7064         case float_round_to_odd:
7065             /*
7066              * Note that if lastBitMask == 0, the last bit is the lsb
7067              * of high, and roundBitsMask == -1.
7068              */
7069             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7070                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7071             }
7072             break;
7073         default:
7074             abort();
7075         }
7076         z.low &= ~ roundBitsMask;
7077     }
7078     else {
7079         if ( aExp < 0x3FFF ) {
7080             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7081             float_raise(float_flag_inexact, status);
7082             aSign = extractFloat128Sign( a );
7083             switch (status->float_rounding_mode) {
7084             case float_round_nearest_even:
7085                 if (    ( aExp == 0x3FFE )
7086                      && (   extractFloat128Frac0( a )
7087                           | extractFloat128Frac1( a ) )
7088                    ) {
7089                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7090                 }
7091                 break;
7092             case float_round_ties_away:
7093                 if (aExp == 0x3FFE) {
7094                     return packFloat128(aSign, 0x3FFF, 0, 0);
7095                 }
7096                 break;
7097             case float_round_down:
7098                 return
7099                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7100                     : packFloat128( 0, 0, 0, 0 );
7101             case float_round_up:
7102                 return
7103                       aSign ? packFloat128( 1, 0, 0, 0 )
7104                     : packFloat128( 0, 0x3FFF, 0, 0 );
7105 
7106             case float_round_to_odd:
7107                 return packFloat128(aSign, 0x3FFF, 0, 0);
7108 
7109             case float_round_to_zero:
7110                 break;
7111             }
7112             return packFloat128( aSign, 0, 0, 0 );
7113         }
7114         lastBitMask = 1;
7115         lastBitMask <<= 0x402F - aExp;
7116         roundBitsMask = lastBitMask - 1;
7117         z.low = 0;
7118         z.high = a.high;
7119         switch (status->float_rounding_mode) {
7120         case float_round_nearest_even:
7121             z.high += lastBitMask>>1;
7122             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7123                 z.high &= ~ lastBitMask;
7124             }
7125             break;
7126         case float_round_ties_away:
7127             z.high += lastBitMask>>1;
7128             break;
7129         case float_round_to_zero:
7130             break;
7131         case float_round_up:
7132             if (!extractFloat128Sign(z)) {
7133                 z.high |= ( a.low != 0 );
7134                 z.high += roundBitsMask;
7135             }
7136             break;
7137         case float_round_down:
7138             if (extractFloat128Sign(z)) {
7139                 z.high |= (a.low != 0);
7140                 z.high += roundBitsMask;
7141             }
7142             break;
7143         case float_round_to_odd:
7144             if ((z.high & lastBitMask) == 0) {
7145                 z.high |= (a.low != 0);
7146                 z.high += roundBitsMask;
7147             }
7148             break;
7149         default:
7150             abort();
7151         }
7152         z.high &= ~ roundBitsMask;
7153     }
7154     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7155         float_raise(float_flag_inexact, status);
7156     }
7157     return z;
7158 
7159 }
7160 
7161 /*----------------------------------------------------------------------------
7162 | Returns the result of adding the absolute values of the quadruple-precision
7163 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7164 | before being returned.  `zSign' is ignored if the result is a NaN.
7165 | The addition is performed according to the IEC/IEEE Standard for Binary
7166 | Floating-Point Arithmetic.
7167 *----------------------------------------------------------------------------*/
7168 
7169 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
7170                                 float_status *status)
7171 {
7172     int32_t aExp, bExp, zExp;
7173     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7174     int32_t expDiff;
7175 
7176     aSig1 = extractFloat128Frac1( a );
7177     aSig0 = extractFloat128Frac0( a );
7178     aExp = extractFloat128Exp( a );
7179     bSig1 = extractFloat128Frac1( b );
7180     bSig0 = extractFloat128Frac0( b );
7181     bExp = extractFloat128Exp( b );
7182     expDiff = aExp - bExp;
7183     if ( 0 < expDiff ) {
7184         if ( aExp == 0x7FFF ) {
7185             if (aSig0 | aSig1) {
7186                 return propagateFloat128NaN(a, b, status);
7187             }
7188             return a;
7189         }
7190         if ( bExp == 0 ) {
7191             --expDiff;
7192         }
7193         else {
7194             bSig0 |= UINT64_C(0x0001000000000000);
7195         }
7196         shift128ExtraRightJamming(
7197             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7198         zExp = aExp;
7199     }
7200     else if ( expDiff < 0 ) {
7201         if ( bExp == 0x7FFF ) {
7202             if (bSig0 | bSig1) {
7203                 return propagateFloat128NaN(a, b, status);
7204             }
7205             return packFloat128( zSign, 0x7FFF, 0, 0 );
7206         }
7207         if ( aExp == 0 ) {
7208             ++expDiff;
7209         }
7210         else {
7211             aSig0 |= UINT64_C(0x0001000000000000);
7212         }
7213         shift128ExtraRightJamming(
7214             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7215         zExp = bExp;
7216     }
7217     else {
7218         if ( aExp == 0x7FFF ) {
7219             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7220                 return propagateFloat128NaN(a, b, status);
7221             }
7222             return a;
7223         }
7224         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7225         if ( aExp == 0 ) {
7226             if (status->flush_to_zero) {
7227                 if (zSig0 | zSig1) {
7228                     float_raise(float_flag_output_denormal, status);
7229                 }
7230                 return packFloat128(zSign, 0, 0, 0);
7231             }
7232             return packFloat128( zSign, 0, zSig0, zSig1 );
7233         }
7234         zSig2 = 0;
7235         zSig0 |= UINT64_C(0x0002000000000000);
7236         zExp = aExp;
7237         goto shiftRight1;
7238     }
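    /*
     * Exponents differ: the smaller operand was aligned above with its
     * implicit bit already handled, so only a's implicit bit remains to be
     * made explicit.  If the sum carries into the next bit position it is
     * renormalized with a one-bit right shift before rounding.
     */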
7239     aSig0 |= UINT64_C(0x0001000000000000);
7240     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7241     --zExp;
7242     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7243     ++zExp;
7244  shiftRight1:
7245     shift128ExtraRightJamming(
7246         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7247  roundAndPack:
7248     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7249 
7250 }
7251 
7252 /*----------------------------------------------------------------------------
7253 | Returns the result of subtracting the absolute values of the quadruple-
7254 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7255 | difference is negated before being returned.  `zSign' is ignored if the
7256 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7257 | Standard for Binary Floating-Point Arithmetic.
7258 *----------------------------------------------------------------------------*/
7259 
7260 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
7261                                 float_status *status)
7262 {
7263     int32_t aExp, bExp, zExp;
7264     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7265     int32_t expDiff;
7266 
7267     aSig1 = extractFloat128Frac1( a );
7268     aSig0 = extractFloat128Frac0( a );
7269     aExp = extractFloat128Exp( a );
7270     bSig1 = extractFloat128Frac1( b );
7271     bSig0 = extractFloat128Frac0( b );
7272     bExp = extractFloat128Exp( b );
7273     expDiff = aExp - bExp;
7274     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7275     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
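    /*
     * Work on significands shifted left by 14 so no precision is lost while
     * aligning and subtracting; the bias is undone by the "zExp - 14"
     * passed to normalizeRoundAndPackFloat128() at the end.
     */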
7276     if ( 0 < expDiff ) goto aExpBigger;
7277     if ( expDiff < 0 ) goto bExpBigger;
7278     if ( aExp == 0x7FFF ) {
7279         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7280             return propagateFloat128NaN(a, b, status);
7281         }
7282         float_raise(float_flag_invalid, status);
7283         return float128_default_nan(status);
7284     }
7285     if ( aExp == 0 ) {
7286         aExp = 1;
7287         bExp = 1;
7288     }
7289     if ( bSig0 < aSig0 ) goto aBigger;
7290     if ( aSig0 < bSig0 ) goto bBigger;
7291     if ( bSig1 < aSig1 ) goto aBigger;
7292     if ( aSig1 < bSig1 ) goto bBigger;
7293     return packFloat128(status->float_rounding_mode == float_round_down,
7294                         0, 0, 0);
7295  bExpBigger:
7296     if ( bExp == 0x7FFF ) {
7297         if (bSig0 | bSig1) {
7298             return propagateFloat128NaN(a, b, status);
7299         }
7300         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7301     }
7302     if ( aExp == 0 ) {
7303         ++expDiff;
7304     }
7305     else {
7306         aSig0 |= UINT64_C(0x4000000000000000);
7307     }
7308     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7309     bSig0 |= UINT64_C(0x4000000000000000);
7310  bBigger:
7311     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7312     zExp = bExp;
7313     zSign ^= 1;
7314     goto normalizeRoundAndPack;
7315  aExpBigger:
7316     if ( aExp == 0x7FFF ) {
7317         if (aSig0 | aSig1) {
7318             return propagateFloat128NaN(a, b, status);
7319         }
7320         return a;
7321     }
7322     if ( bExp == 0 ) {
7323         --expDiff;
7324     }
7325     else {
7326         bSig0 |= UINT64_C(0x4000000000000000);
7327     }
7328     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7329     aSig0 |= UINT64_C(0x4000000000000000);
7330  aBigger:
7331     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7332     zExp = aExp;
7333  normalizeRoundAndPack:
7334     --zExp;
7335     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7336                                          status);
7337 
7338 }
7339 
7340 /*----------------------------------------------------------------------------
7341 | Returns the result of adding the quadruple-precision floating-point values
7342 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7343 | for Binary Floating-Point Arithmetic.
7344 *----------------------------------------------------------------------------*/
7345 
7346 float128 float128_add(float128 a, float128 b, float_status *status)
7347 {
7348     bool aSign, bSign;
7349 
7350     aSign = extractFloat128Sign( a );
7351     bSign = extractFloat128Sign( b );
7352     if ( aSign == bSign ) {
7353         return addFloat128Sigs(a, b, aSign, status);
7354     }
7355     else {
7356         return subFloat128Sigs(a, b, aSign, status);
7357     }
7358 
7359 }
7360 
7361 /*----------------------------------------------------------------------------
7362 | Returns the result of subtracting the quadruple-precision floating-point
7363 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7364 | Standard for Binary Floating-Point Arithmetic.
7365 *----------------------------------------------------------------------------*/
7366 
7367 float128 float128_sub(float128 a, float128 b, float_status *status)
7368 {
7369     bool aSign, bSign;
7370 
7371     aSign = extractFloat128Sign( a );
7372     bSign = extractFloat128Sign( b );
7373     if ( aSign == bSign ) {
7374         return subFloat128Sigs(a, b, aSign, status);
7375     }
7376     else {
7377         return addFloat128Sigs(a, b, aSign, status);
7378     }
7379 
7380 }
7381 
7382 /*----------------------------------------------------------------------------
7383 | Returns the result of multiplying the quadruple-precision floating-point
7384 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7385 | Standard for Binary Floating-Point Arithmetic.
7386 *----------------------------------------------------------------------------*/
7387 
7388 float128 float128_mul(float128 a, float128 b, float_status *status)
7389 {
7390     bool aSign, bSign, zSign;
7391     int32_t aExp, bExp, zExp;
7392     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7393 
7394     aSig1 = extractFloat128Frac1( a );
7395     aSig0 = extractFloat128Frac0( a );
7396     aExp = extractFloat128Exp( a );
7397     aSign = extractFloat128Sign( a );
7398     bSig1 = extractFloat128Frac1( b );
7399     bSig0 = extractFloat128Frac0( b );
7400     bExp = extractFloat128Exp( b );
7401     bSign = extractFloat128Sign( b );
7402     zSign = aSign ^ bSign;
7403     if ( aExp == 0x7FFF ) {
7404         if (    ( aSig0 | aSig1 )
7405              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7406             return propagateFloat128NaN(a, b, status);
7407         }
7408         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7409         return packFloat128( zSign, 0x7FFF, 0, 0 );
7410     }
7411     if ( bExp == 0x7FFF ) {
7412         if (bSig0 | bSig1) {
7413             return propagateFloat128NaN(a, b, status);
7414         }
7415         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7416  invalid:
7417             float_raise(float_flag_invalid, status);
7418             return float128_default_nan(status);
7419         }
7420         return packFloat128( zSign, 0x7FFF, 0, 0 );
7421     }
7422     if ( aExp == 0 ) {
7423         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7424         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7425     }
7426     if ( bExp == 0 ) {
7427         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7428         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7429     }
7430     zExp = aExp + bExp - 0x4000;
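    /*
     * b's significand is pre-shifted left 16 places without its implicit
     * bit; that bit's contribution (a * 2^128) is restored by adding aSig
     * into the top 128 bits of the 256-bit product.
     */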
7431     aSig0 |= UINT64_C(0x0001000000000000);
7432     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7433     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7434     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7435     zSig2 |= ( zSig3 != 0 );
7436     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7437         shift128ExtraRightJamming(
7438             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7439         ++zExp;
7440     }
7441     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7442 
7443 }
7444 
7445 /*----------------------------------------------------------------------------
7446 | Returns the result of dividing the quadruple-precision floating-point value
7447 | `a' by the corresponding value `b'.  The operation is performed according to
7448 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7449 *----------------------------------------------------------------------------*/
7450 
7451 float128 float128_div(float128 a, float128 b, float_status *status)
7452 {
7453     bool aSign, bSign, zSign;
7454     int32_t aExp, bExp, zExp;
7455     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7456     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7457 
7458     aSig1 = extractFloat128Frac1( a );
7459     aSig0 = extractFloat128Frac0( a );
7460     aExp = extractFloat128Exp( a );
7461     aSign = extractFloat128Sign( a );
7462     bSig1 = extractFloat128Frac1( b );
7463     bSig0 = extractFloat128Frac0( b );
7464     bExp = extractFloat128Exp( b );
7465     bSign = extractFloat128Sign( b );
7466     zSign = aSign ^ bSign;
7467     if ( aExp == 0x7FFF ) {
7468         if (aSig0 | aSig1) {
7469             return propagateFloat128NaN(a, b, status);
7470         }
7471         if ( bExp == 0x7FFF ) {
7472             if (bSig0 | bSig1) {
7473                 return propagateFloat128NaN(a, b, status);
7474             }
7475             goto invalid;
7476         }
7477         return packFloat128( zSign, 0x7FFF, 0, 0 );
7478     }
7479     if ( bExp == 0x7FFF ) {
7480         if (bSig0 | bSig1) {
7481             return propagateFloat128NaN(a, b, status);
7482         }
7483         return packFloat128( zSign, 0, 0, 0 );
7484     }
7485     if ( bExp == 0 ) {
7486         if ( ( bSig0 | bSig1 ) == 0 ) {
7487             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7488  invalid:
7489                 float_raise(float_flag_invalid, status);
7490                 return float128_default_nan(status);
7491             }
7492             float_raise(float_flag_divbyzero, status);
7493             return packFloat128( zSign, 0x7FFF, 0, 0 );
7494         }
7495         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7496     }
7497     if ( aExp == 0 ) {
7498         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7499         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7500     }
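    /*
     * Shift both significands, implicit bits included, to the top of their
     * 128-bit words, and halve the dividend when it is not smaller than the
     * divisor so each estimated quotient digit fits in 64 bits.
     */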
7501     zExp = aExp - bExp + 0x3FFD;
7502     shortShift128Left(
7503         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7504     shortShift128Left(
7505         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7506     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7507         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7508         ++zExp;
7509     }
7510     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7511     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7512     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7513     while ( (int64_t) rem0 < 0 ) {
7514         --zSig0;
7515         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7516     }
7517     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7518     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7519         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7520         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7521         while ( (int64_t) rem1 < 0 ) {
7522             --zSig1;
7523             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7524         }
7525         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7526     }
7527     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7528     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7529 
7530 }
7531 
7532 /*----------------------------------------------------------------------------
7533 | Returns the remainder of the quadruple-precision floating-point value `a'
7534 | with respect to the corresponding value `b'.  The operation is performed
7535 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7536 *----------------------------------------------------------------------------*/
7537 
7538 float128 float128_rem(float128 a, float128 b, float_status *status)
7539 {
7540     bool aSign, zSign;
7541     int32_t aExp, bExp, expDiff;
7542     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7543     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7544     int64_t sigMean0;
7545 
7546     aSig1 = extractFloat128Frac1( a );
7547     aSig0 = extractFloat128Frac0( a );
7548     aExp = extractFloat128Exp( a );
7549     aSign = extractFloat128Sign( a );
7550     bSig1 = extractFloat128Frac1( b );
7551     bSig0 = extractFloat128Frac0( b );
7552     bExp = extractFloat128Exp( b );
7553     if ( aExp == 0x7FFF ) {
7554         if (    ( aSig0 | aSig1 )
7555              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7556             return propagateFloat128NaN(a, b, status);
7557         }
7558         goto invalid;
7559     }
7560     if ( bExp == 0x7FFF ) {
7561         if (bSig0 | bSig1) {
7562             return propagateFloat128NaN(a, b, status);
7563         }
7564         return a;
7565     }
7566     if ( bExp == 0 ) {
7567         if ( ( bSig0 | bSig1 ) == 0 ) {
7568  invalid:
7569             float_raise(float_flag_invalid, status);
7570             return float128_default_nan(status);
7571         }
7572         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7573     }
7574     if ( aExp == 0 ) {
7575         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7576         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7577     }
7578     expDiff = aExp - bExp;
7579     if ( expDiff < -1 ) return a;
7580     shortShift128Left(
7581         aSig0 | UINT64_C(0x0001000000000000),
7582         aSig1,
7583         15 - ( expDiff < 0 ),
7584         &aSig0,
7585         &aSig1
7586     );
7587     shortShift128Left(
7588         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7589     q = le128( bSig0, bSig1, aSig0, aSig1 );
7590     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7591     expDiff -= 64;
7592     while ( 0 < expDiff ) {
7593         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7594         q = ( 4 < q ) ? q - 4 : 0;
7595         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7596         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7597         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7598         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7599         expDiff -= 61;
7600     }
7601     if ( -64 < expDiff ) {
7602         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7603         q = ( 4 < q ) ? q - 4 : 0;
7604         q >>= - expDiff;
7605         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7606         expDiff += 52;
7607         if ( expDiff < 0 ) {
7608             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7609         }
7610         else {
7611             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7612         }
7613         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7614         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7615     }
7616     else {
7617         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7618         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7619     }
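    /*
     * Final reduction: keep subtracting the aligned divisor until the
     * partial remainder goes negative, then choose between the last two
     * candidates so that the implied quotient is rounded to nearest,
     * breaking ties towards an even quotient.
     */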
7620     do {
7621         alternateASig0 = aSig0;
7622         alternateASig1 = aSig1;
7623         ++q;
7624         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7625     } while ( 0 <= (int64_t) aSig0 );
7626     add128( aSig0, aSig1, alternateASig0, alternateASig1,
7627             (uint64_t *)&sigMean0, &sigMean1 );
7628     if (    ( sigMean0 < 0 )
7629          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7630         aSig0 = alternateASig0;
7631         aSig1 = alternateASig1;
7632     }
7633     zSign = ( (int64_t) aSig0 < 0 );
7634     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7635     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7636                                          status);
7637 }
7638 
7639 /*----------------------------------------------------------------------------
7640 | Returns the square root of the quadruple-precision floating-point value `a'.
7641 | The operation is performed according to the IEC/IEEE Standard for Binary
7642 | Floating-Point Arithmetic.
7643 *----------------------------------------------------------------------------*/
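/*----------------------------------------------------------------------------
| Illustrative sketch only (not part of the library), assuming the caller
| owns an already-initialised `float_status' named `st':
|
|     set_float_exception_flags(0, &st);
|     float128 root = float128_sqrt(x, &st);
|     bool bad = get_float_exception_flags(&st) & float_flag_invalid;
|
| The result is correctly rounded in the current rounding mode.  Taking the
| square root of a negative operand other than -0 raises the invalid
| exception and returns the default NaN (NaN operands are propagated as
| usual); the square root of -0 is -0.
*----------------------------------------------------------------------------*/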
7644 
7645 float128 float128_sqrt(float128 a, float_status *status)
7646 {
7647     bool aSign;
7648     int32_t aExp, zExp;
7649     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7650     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7651 
7652     aSig1 = extractFloat128Frac1( a );
7653     aSig0 = extractFloat128Frac0( a );
7654     aExp = extractFloat128Exp( a );
7655     aSign = extractFloat128Sign( a );
7656     if ( aExp == 0x7FFF ) {
7657         if (aSig0 | aSig1) {
7658             return propagateFloat128NaN(a, a, status);
7659         }
7660         if ( ! aSign ) return a;
7661         goto invalid;
7662     }
7663     if ( aSign ) {
7664         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7665  invalid:
7666         float_raise(float_flag_invalid, status);
7667         return float128_default_nan(status);
7668     }
7669     if ( aExp == 0 ) {
7670         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7671         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7672     }
7673     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7674     aSig0 |= UINT64_C(0x0001000000000000);
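    /*
     * Start from a 32-bit estimate of the square root and refine it to a
     * 64-bit value, approximately (a / r + r) / 2 in fixed point.  The
     * loop below then corrects any over-estimate against the exact
     * remainder, and the low 64 bits are derived and corrected the same
     * way when they matter for rounding.
     */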
7675     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7676     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7677     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7678     doubleZSig0 = zSig0<<1;
7679     mul64To128( zSig0, zSig0, &term0, &term1 );
7680     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7681     while ( (int64_t) rem0 < 0 ) {
7682         --zSig0;
7683         doubleZSig0 -= 2;
7684         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7685     }
7686     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7687     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7688         if ( zSig1 == 0 ) zSig1 = 1;
7689         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7690         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7691         mul64To128( zSig1, zSig1, &term2, &term3 );
7692         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7693         while ( (int64_t) rem1 < 0 ) {
7694             --zSig1;
7695             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7696             term3 |= 1;
7697             term2 |= doubleZSig0;
7698             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7699         }
7700         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7701     }
7702     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7703     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7705 }
7706 
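/*----------------------------------------------------------------------------
| Compares the extended double-precision floating-point values `a' and `b'
| and returns the relation as a FloatRelation value (less, equal, greater,
| or unordered).  If `is_quiet' is false, or if either operand is a
| signaling NaN, the invalid exception is raised when a NaN operand is
| encountered.  Invalid encodings always raise the invalid exception and
| compare as unordered.
*----------------------------------------------------------------------------*/
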
7707 static inline FloatRelation
7708 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7709                           float_status *status)
7710 {
7711     bool aSign, bSign;
7712 
7713     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7714         float_raise(float_flag_invalid, status);
7715         return float_relation_unordered;
7716     }
7717     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7718           ( extractFloatx80Frac( a )<<1 ) ) ||
7719         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7720           ( extractFloatx80Frac( b )<<1 ) )) {
7721         if (!is_quiet ||
7722             floatx80_is_signaling_nan(a, status) ||
7723             floatx80_is_signaling_nan(b, status)) {
7724             float_raise(float_flag_invalid, status);
7725         }
7726         return float_relation_unordered;
7727     }
7728     aSign = extractFloatx80Sign( a );
7729     bSign = extractFloatx80Sign( b );
7730     if ( aSign != bSign ) {
7732         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7733              ( ( a.low | b.low ) == 0 ) ) {
7734             /* zero case */
7735             return float_relation_equal;
7736         } else {
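            /* Signs differ and the operands are not both zero, so the
             * non-negative operand is the greater one. */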
7737             return 1 - (2 * aSign);
7738         }
7739     } else {
7740         /* Normalize pseudo-denormals before comparison.  */
7741         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7742             ++a.high;
7743         }
7744         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7745             ++b.high;
7746         }
7747         if (a.low == b.low && a.high == b.high) {
7748             return float_relation_equal;
7749         } else {
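            /* Same sign: compare as sign-and-magnitude integers.  For two
             * negative values the smaller magnitude is the greater value,
             * hence the XOR with the sign. */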
7750             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7751         }
7752     }
7753 }
7754 
7755 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7756 {
7757     return floatx80_compare_internal(a, b, false, status);
7758 }
7759 
7760 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7761                                      float_status *status)
7762 {
7763     return floatx80_compare_internal(a, b, true, status);
7764 }
7765 
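/*----------------------------------------------------------------------------
| Compares the quadruple-precision floating-point values `a' and `b' and
| returns the relation as a FloatRelation value (less, equal, greater, or
| unordered).  If `is_quiet' is false, or if either operand is a signaling
| NaN, the invalid exception is raised when a NaN operand is encountered.
*----------------------------------------------------------------------------*/
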
7766 static inline FloatRelation
7767 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7768                           float_status *status)
7769 {
7770     bool aSign, bSign;
7771 
7772     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7773           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7774         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7775           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7776         if (!is_quiet ||
7777             float128_is_signaling_nan(a, status) ||
7778             float128_is_signaling_nan(b, status)) {
7779             float_raise(float_flag_invalid, status);
7780         }
7781         return float_relation_unordered;
7782     }
7783     aSign = extractFloat128Sign( a );
7784     bSign = extractFloat128Sign( b );
7785     if ( aSign != bSign ) {
7786         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7787             /* zero case */
7788             return float_relation_equal;
7789         } else {
7790             return 1 - (2 * aSign);
7791         }
7792     } else {
7793         if (a.low == b.low && a.high == b.high) {
7794             return float_relation_equal;
7795         } else {
7796             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7797         }
7798     }
7799 }
7800 
7801 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7802 {
7803     return float128_compare_internal(a, b, false, status);
7804 }
7805 
7806 FloatRelation float128_compare_quiet(float128 a, float128 b,
7807                                      float_status *status)
7808 {
7809     return float128_compare_internal(a, b, true, status);
7810 }
7811 
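/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point value `a' by 2 raised to the power `n'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
| `n' is clamped to [-0x10000, 0x10000], which is already enough to force
| overflow or underflow of any finite input.
*----------------------------------------------------------------------------*/
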
7812 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7813 {
7814     bool aSign;
7815     int32_t aExp;
7816     uint64_t aSig;
7817 
7818     if (floatx80_invalid_encoding(a)) {
7819         float_raise(float_flag_invalid, status);
7820         return floatx80_default_nan(status);
7821     }
7822     aSig = extractFloatx80Frac( a );
7823     aExp = extractFloatx80Exp( a );
7824     aSign = extractFloatx80Sign( a );
7825 
7826     if ( aExp == 0x7FFF ) {
7827         if ( aSig<<1 ) {
7828             return propagateFloatx80NaN(a, a, status);
7829         }
7830         return a;
7831     }
7832 
7833     if (aExp == 0) {
7834         if (aSig == 0) {
7835             return a;
7836         }
7837         aExp++;
7838     }
7839 
7840     if (n > 0x10000) {
7841         n = 0x10000;
7842     } else if (n < -0x10000) {
7843         n = -0x10000;
7844     }
7845 
7846     aExp += n;
7847     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7848                                          aSign, aExp, aSig, 0, status);
7849 }
7850 
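/*----------------------------------------------------------------------------
| Returns the result of multiplying the quadruple-precision floating-point
| value `a' by 2 raised to the power `n'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
| `n' is clamped to [-0x10000, 0x10000], which is already enough to force
| overflow or underflow of any finite input.
*----------------------------------------------------------------------------*/
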
7851 float128 float128_scalbn(float128 a, int n, float_status *status)
7852 {
7853     bool aSign;
7854     int32_t aExp;
7855     uint64_t aSig0, aSig1;
7856 
7857     aSig1 = extractFloat128Frac1( a );
7858     aSig0 = extractFloat128Frac0( a );
7859     aExp = extractFloat128Exp( a );
7860     aSign = extractFloat128Sign( a );
7861     if ( aExp == 0x7FFF ) {
7862         if ( aSig0 | aSig1 ) {
7863             return propagateFloat128NaN(a, a, status);
7864         }
7865         return a;
7866     }
7867     if (aExp != 0) {
7868         aSig0 |= UINT64_C(0x0001000000000000);
7869     } else if (aSig0 == 0 && aSig1 == 0) {
7870         return a;
7871     } else {
7872         aExp++;
7873     }
7874 
7875     if (n > 0x10000) {
7876         n = 0x10000;
7877     } else if (n < -0x10000) {
7878         n = -0x10000;
7879     }
7880 
7881     aExp += n - 1;
7882     return normalizeRoundAndPackFloat128(aSign, aExp, aSig0, aSig1,
7883                                          status);
7885 }
7886 
7887 static void __attribute__((constructor)) softfloat_init(void)
7888 {
7889     union_float64 ua, ub, uc, ur;
7890 
7891     if (QEMU_NO_HARDFLOAT) {
7892         return;
7893     }
7894     /*
7895      * Test that the host's FMA is not obviously broken. For example,
7896      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7897      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     *
     * The exact value of ua * ub + uc below is DBL_MIN + 2**-1075 + 2**-1127,
     * i.e. just over half an ulp above DBL_MIN, so a correct fma() must
     * round it up (to nearest) to DBL_MIN + 1ulp, the value checked for
     * below.
7898      */
7899     ua.s = 0x0020000000000001ULL;   /* DBL_MIN + 1ulp */
7900     ub.s = 0x3ca0000000000000ULL;   /* 2**-53 */
7901     uc.s = 0x0020000000000000ULL;   /* DBL_MIN */
7902     ur.h = fma(ua.h, ub.h, uc.h);
7903     if (ur.s != 0x0020000000000001ULL) {
7904         force_soft_fma = true;
7905     }
7906 }
7907