xref: /openbmc/qemu/fpu/softfloat.c (revision e9034ea8)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             float_raise(float_flag_input_denormal, s);                  \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF   1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF   0
216 #endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226     IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 static inline float32
343 float32_gen2(float32 xa, float32 xb, float_status *s,
344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
345              f32_check_fn pre, f32_check_fn post)
346 {
347     union_float32 ua, ub, ur;
348 
349     ua.s = xa;
350     ub.s = xb;
351 
352     if (unlikely(!can_use_fpu(s))) {
353         goto soft;
354     }
355 
356     float32_input_flush2(&ua.s, &ub.s, s);
357     if (unlikely(!pre(ua, ub))) {
358         goto soft;
359     }
360 
361     ur.h = hard(ua.h, ub.h);
362     if (unlikely(f32_is_inf(ur))) {
363         float_raise(float_flag_overflow, s);
364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
365         goto soft;
366     }
367     return ur.s;
368 
369  soft:
370     return soft(ua.s, ub.s, s);
371 }
372 
373 static inline float64
374 float64_gen2(float64 xa, float64 xb, float_status *s,
375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
376              f64_check_fn pre, f64_check_fn post)
377 {
378     union_float64 ua, ub, ur;
379 
380     ua.s = xa;
381     ub.s = xb;
382 
383     if (unlikely(!can_use_fpu(s))) {
384         goto soft;
385     }
386 
387     float64_input_flush2(&ua.s, &ub.s, s);
388     if (unlikely(!pre(ua, ub))) {
389         goto soft;
390     }
391 
392     ur.h = hard(ua.h, ub.h);
393     if (unlikely(f64_is_inf(ur))) {
394         float_raise(float_flag_overflow, s);
395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
396         goto soft;
397     }
398     return ur.s;
399 
400  soft:
401     return soft(ua.s, ub.s, s);
402 }
403 
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410     return float32_val(a) & 0x007FFFFF;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 static inline int extractFloat32Exp(float32 a)
418 {
419     return (float32_val(a) >> 23) & 0xFF;
420 }
421 
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline bool extractFloat32Sign(float32 a)
427 {
428     return float32_val(a) >> 31;
429 }
430 
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434 
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439 
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443 
444 static inline int extractFloat64Exp(float64 a)
445 {
446     return (float64_val(a) >> 52) & 0x7FF;
447 }
448 
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452 
453 static inline bool extractFloat64Sign(float64 a)
454 {
455     return float64_val(a) >> 63;
456 }
457 
458 /*
459  * Classify a floating point number. Everything above float_class_qnan
460  * is a NaN so cls >= float_class_qnan is any NaN.
461  */
462 
463 typedef enum __attribute__ ((__packed__)) {
464     float_class_unclassified,
465     float_class_zero,
466     float_class_normal,
467     float_class_inf,
468     float_class_qnan,  /* all NaNs from here */
469     float_class_snan,
470 } FloatClass;
471 
472 #define float_cmask(bit)  (1u << (bit))
473 
474 enum {
475     float_cmask_zero    = float_cmask(float_class_zero),
476     float_cmask_normal  = float_cmask(float_class_normal),
477     float_cmask_inf     = float_cmask(float_class_inf),
478     float_cmask_qnan    = float_cmask(float_class_qnan),
479     float_cmask_snan    = float_cmask(float_class_snan),
480 
481     float_cmask_infzero = float_cmask_zero | float_cmask_inf,
482     float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
483 };
484 
485 
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused)) bool is_nan(FloatClass c)
488 {
489     return unlikely(c >= float_class_qnan);
490 }
491 
492 static inline __attribute__((unused)) bool is_snan(FloatClass c)
493 {
494     return c == float_class_snan;
495 }
496 
497 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
498 {
499     return c == float_class_qnan;
500 }
501 
502 /*
503  * Structure holding all of the decomposed parts of a float.
504  * The exponent is unbiased and the fraction is normalized.
505  *
506  * The fraction words are stored in big-endian word ordering,
507  * so that truncation from a larger format to a smaller format
508  * can be done simply by ignoring subsequent elements.
509  */
510 
511 typedef struct {
512     FloatClass cls;
513     bool sign;
514     int32_t exp;
515     union {
516         /* Routines that know the structure may reference the singular name. */
517         uint64_t frac;
518         /*
519          * Routines expanded with multiple structures reference "hi" and "lo"
520          * depending on the operation.  In FloatParts64, "hi" and "lo" are
521          * both the same word and aliased here.
522          */
523         uint64_t frac_hi;
524         uint64_t frac_lo;
525     };
526 } FloatParts64;
527 
528 typedef struct {
529     FloatClass cls;
530     bool sign;
531     int32_t exp;
532     uint64_t frac_hi;
533     uint64_t frac_lo;
534 } FloatParts128;
535 
536 /* These apply to the most significant word of each FloatPartsN. */
537 #define DECOMPOSED_BINARY_POINT    63
538 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
539 
540 /* Structure holding all of the relevant parameters for a format.
541  *   exp_size: the size of the exponent field
542  *   exp_bias: the offset applied to the exponent field
543  *   exp_max: the maximum normalised exponent
544  *   frac_size: the size of the fraction field
545  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
546  * The following are computed based the size of fraction
547  *   frac_lsb: least significant bit of fraction
548  *   frac_lsbm1: the bit below the least significant bit (for rounding)
549  *   round_mask/roundeven_mask: masks used for rounding
550  * The following optional modifiers are available:
551  *   arm_althp: handle ARM Alternative Half Precision
552  */
553 typedef struct {
554     int exp_size;
555     int exp_bias;
556     int exp_max;
557     int frac_size;
558     int frac_shift;
559     uint64_t frac_lsb;
560     uint64_t frac_lsbm1;
561     uint64_t round_mask;
562     uint64_t roundeven_mask;
563     bool arm_althp;
564 } FloatFmt;
565 
566 /* Expand fields based on the size of exponent and fraction */
567 #define FLOAT_PARAMS(E, F)                                           \
568     .exp_size       = E,                                             \
569     .exp_bias       = ((1 << E) - 1) >> 1,                           \
570     .exp_max        = (1 << E) - 1,                                  \
571     .frac_size      = F,                                             \
572     .frac_shift     = (-F - 1) & 63,                                 \
573     .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
574     .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
575     .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
576     .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
577 
578 static const FloatFmt float16_params = {
579     FLOAT_PARAMS(5, 10)
580 };
581 
582 static const FloatFmt float16_params_ahp = {
583     FLOAT_PARAMS(5, 10),
584     .arm_althp = true
585 };
586 
587 static const FloatFmt bfloat16_params = {
588     FLOAT_PARAMS(8, 7)
589 };
590 
591 static const FloatFmt float32_params = {
592     FLOAT_PARAMS(8, 23)
593 };
594 
595 static const FloatFmt float64_params = {
596     FLOAT_PARAMS(11, 52)
597 };
598 
599 static const FloatFmt float128_params = {
600     FLOAT_PARAMS(15, 112)
601 };
602 
603 /* Unpack a float to parts, but do not canonicalize.  */
604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
605 {
606     const int f_size = fmt->frac_size;
607     const int e_size = fmt->exp_size;
608 
609     *r = (FloatParts64) {
610         .cls = float_class_unclassified,
611         .sign = extract64(raw, f_size + e_size, 1),
612         .exp = extract64(raw, f_size, e_size),
613         .frac = extract64(raw, 0, f_size)
614     };
615 }
616 
617 static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
618 {
619     unpack_raw64(p, &float16_params, f);
620 }
621 
622 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
623 {
624     unpack_raw64(p, &bfloat16_params, f);
625 }
626 
627 static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
628 {
629     unpack_raw64(p, &float32_params, f);
630 }
631 
632 static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
633 {
634     unpack_raw64(p, &float64_params, f);
635 }
636 
637 static void float128_unpack_raw(FloatParts128 *p, float128 f)
638 {
639     const int f_size = float128_params.frac_size - 64;
640     const int e_size = float128_params.exp_size;
641 
642     *p = (FloatParts128) {
643         .cls = float_class_unclassified,
644         .sign = extract64(f.high, f_size + e_size, 1),
645         .exp = extract64(f.high, f_size, e_size),
646         .frac_hi = extract64(f.high, 0, f_size),
647         .frac_lo = f.low,
648     };
649 }
650 
651 /* Pack a float from parts, but do not canonicalize.  */
652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
653 {
654     const int f_size = fmt->frac_size;
655     const int e_size = fmt->exp_size;
656     uint64_t ret;
657 
658     ret = (uint64_t)p->sign << (f_size + e_size);
659     ret = deposit64(ret, f_size, e_size, p->exp);
660     ret = deposit64(ret, 0, f_size, p->frac);
661     return ret;
662 }
663 
664 static inline float16 float16_pack_raw(const FloatParts64 *p)
665 {
666     return make_float16(pack_raw64(p, &float16_params));
667 }
668 
669 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
670 {
671     return pack_raw64(p, &bfloat16_params);
672 }
673 
674 static inline float32 float32_pack_raw(const FloatParts64 *p)
675 {
676     return make_float32(pack_raw64(p, &float32_params));
677 }
678 
679 static inline float64 float64_pack_raw(const FloatParts64 *p)
680 {
681     return make_float64(pack_raw64(p, &float64_params));
682 }
683 
684 static float128 float128_pack_raw(const FloatParts128 *p)
685 {
686     const int f_size = float128_params.frac_size - 64;
687     const int e_size = float128_params.exp_size;
688     uint64_t hi;
689 
690     hi = (uint64_t)p->sign << (f_size + e_size);
691     hi = deposit64(hi, f_size, e_size, p->exp);
692     hi = deposit64(hi, 0, f_size, p->frac_hi);
693     return make_float128(hi, p->frac_lo);
694 }
695 
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine:  (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output.  These details are target-
702 | specific.
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
705 
706 #define PARTS_GENERIC_64_128(NAME, P) \
707     QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
708 
709 #define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
710 #define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)
711 
712 
713 /*
714  * Helper functions for softfloat-parts.c.inc, per-size operations.
715  */
716 
717 static void frac128_shl(FloatParts128 *a, int c)
718 {
719     shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
720 }
721 
722 #define frac_shl(A, C)             frac128_shl(A, C)
723 
724 static void frac128_shr(FloatParts128 *a, int c)
725 {
726     shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
727 }
728 
729 #define frac_shr(A, C)             frac128_shr(A, C)
730 
731 /* Canonicalize EXP and FRAC, setting CLS.  */
732 static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
733                                   float_status *status)
734 {
735     if (part.exp == parm->exp_max && !parm->arm_althp) {
736         if (part.frac == 0) {
737             part.cls = float_class_inf;
738         } else {
739             part.frac <<= parm->frac_shift;
740             part.cls = (parts_is_snan_frac(part.frac, status)
741                         ? float_class_snan : float_class_qnan);
742         }
743     } else if (part.exp == 0) {
744         if (likely(part.frac == 0)) {
745             part.cls = float_class_zero;
746         } else if (status->flush_inputs_to_zero) {
747             float_raise(float_flag_input_denormal, status);
748             part.cls = float_class_zero;
749             part.frac = 0;
750         } else {
751             int shift = clz64(part.frac);
752             part.cls = float_class_normal;
753             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
754             part.frac <<= shift;
755         }
756     } else {
757         part.cls = float_class_normal;
758         part.exp -= parm->exp_bias;
759         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
760     }
761     return part;
762 }
763 
764 /* Round and uncanonicalize a floating-point number by parts. There
765  * are FRAC_SHIFT bits that may require rounding at the bottom of the
766  * fraction; these bits will be removed. The exponent will be biased
767  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
768  */
769 
770 static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
771                                   const FloatFmt *parm)
772 {
773     const uint64_t frac_lsb = parm->frac_lsb;
774     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
775     const uint64_t round_mask = parm->round_mask;
776     const uint64_t roundeven_mask = parm->roundeven_mask;
777     const int exp_max = parm->exp_max;
778     const int frac_shift = parm->frac_shift;
779     uint64_t frac, inc;
780     int exp, flags = 0;
781     bool overflow_norm;
782 
783     frac = p.frac;
784     exp = p.exp;
785 
786     switch (p.cls) {
787     case float_class_normal:
788         switch (s->float_rounding_mode) {
789         case float_round_nearest_even:
790             overflow_norm = false;
791             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
792             break;
793         case float_round_ties_away:
794             overflow_norm = false;
795             inc = frac_lsbm1;
796             break;
797         case float_round_to_zero:
798             overflow_norm = true;
799             inc = 0;
800             break;
801         case float_round_up:
802             inc = p.sign ? 0 : round_mask;
803             overflow_norm = p.sign;
804             break;
805         case float_round_down:
806             inc = p.sign ? round_mask : 0;
807             overflow_norm = !p.sign;
808             break;
809         case float_round_to_odd:
810             overflow_norm = true;
811             inc = frac & frac_lsb ? 0 : round_mask;
812             break;
813         default:
814             g_assert_not_reached();
815         }
816 
817         exp += parm->exp_bias;
818         if (likely(exp > 0)) {
819             if (frac & round_mask) {
820                 flags |= float_flag_inexact;
821                 if (uadd64_overflow(frac, inc, &frac)) {
822                     frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
823                     exp++;
824                 }
825             }
826             frac >>= frac_shift;
827 
828             if (parm->arm_althp) {
829                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
830                 if (unlikely(exp > exp_max)) {
831                     /* Overflow.  Return the maximum normal.  */
832                     flags = float_flag_invalid;
833                     exp = exp_max;
834                     frac = -1;
835                 }
836             } else if (unlikely(exp >= exp_max)) {
837                 flags |= float_flag_overflow | float_flag_inexact;
838                 if (overflow_norm) {
839                     exp = exp_max - 1;
840                     frac = -1;
841                 } else {
842                     p.cls = float_class_inf;
843                     goto do_inf;
844                 }
845             }
846         } else if (s->flush_to_zero) {
847             flags |= float_flag_output_denormal;
848             p.cls = float_class_zero;
849             goto do_zero;
850         } else {
851             bool is_tiny = s->tininess_before_rounding || (exp < 0);
852 
853             if (!is_tiny) {
854                 uint64_t discard;
855                 is_tiny = !uadd64_overflow(frac, inc, &discard);
856             }
857 
858             shift64RightJamming(frac, 1 - exp, &frac);
859             if (frac & round_mask) {
860                 /* Need to recompute round-to-even.  */
861                 switch (s->float_rounding_mode) {
862                 case float_round_nearest_even:
863                     inc = ((frac & roundeven_mask) != frac_lsbm1
864                            ? frac_lsbm1 : 0);
865                     break;
866                 case float_round_to_odd:
867                     inc = frac & frac_lsb ? 0 : round_mask;
868                     break;
869                 default:
870                     break;
871                 }
872                 flags |= float_flag_inexact;
873                 frac += inc;
874             }
875 
876             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
877             frac >>= frac_shift;
878 
879             if (is_tiny && (flags & float_flag_inexact)) {
880                 flags |= float_flag_underflow;
881             }
882             if (exp == 0 && frac == 0) {
883                 p.cls = float_class_zero;
884             }
885         }
886         break;
887 
888     case float_class_zero:
889     do_zero:
890         exp = 0;
891         frac = 0;
892         break;
893 
894     case float_class_inf:
895     do_inf:
896         assert(!parm->arm_althp);
897         exp = exp_max;
898         frac = 0;
899         break;
900 
901     case float_class_qnan:
902     case float_class_snan:
903         assert(!parm->arm_althp);
904         exp = exp_max;
905         frac >>= parm->frac_shift;
906         break;
907 
908     default:
909         g_assert_not_reached();
910     }
911 
912     float_raise(flags, s);
913     p.exp = exp;
914     p.frac = frac;
915     return p;
916 }
917 
918 static FloatParts64 return_nan(FloatParts64 a, float_status *s)
919 {
920     g_assert(is_nan(a.cls));
921     if (is_snan(a.cls)) {
922         float_raise(float_flag_invalid, s);
923         if (!s->default_nan_mode) {
924             parts_silence_nan(&a, s);
925             return a;
926         }
927     } else if (!s->default_nan_mode) {
928         return a;
929     }
930     parts_default_nan(&a, s);
931     return a;
932 }
933 
934 static FloatParts64 pick_nan(FloatParts64 a, FloatParts64 b, float_status *s)
935 {
936     if (is_snan(a.cls) || is_snan(b.cls)) {
937         float_raise(float_flag_invalid, s);
938     }
939 
940     if (s->default_nan_mode) {
941         parts_default_nan(&a, s);
942     } else {
943         if (pickNaN(a.cls, b.cls,
944                     a.frac > b.frac ||
945                     (a.frac == b.frac && a.sign < b.sign), s)) {
946             a = b;
947         }
948         if (is_snan(a.cls)) {
949             parts_silence_nan(&a, s);
950         }
951     }
952     return a;
953 }
954 
955 static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
956                                   bool inf_zero, float_status *s)
957 {
958     int which;
959 
960     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
961         float_raise(float_flag_invalid, s);
962     }
963 
964     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
965 
966     if (s->default_nan_mode) {
967         /* Note that this check is after pickNaNMulAdd so that function
968          * has an opportunity to set the Invalid flag.
969          */
970         which = 3;
971     }
972 
973     switch (which) {
974     case 0:
975         break;
976     case 1:
977         a = b;
978         break;
979     case 2:
980         a = c;
981         break;
982     case 3:
983         parts_default_nan(&a, s);
984         break;
985     default:
986         g_assert_not_reached();
987     }
988 
989     if (is_snan(a.cls)) {
990         parts_silence_nan(&a, s);
991     }
992     return a;
993 }
994 
995 /*
996  * Pack/unpack routines with a specific FloatFmt.
997  */
998 
999 static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
1000                                       float_status *s, const FloatFmt *params)
1001 {
1002     float16_unpack_raw(p, f);
1003     *p = sf_canonicalize(*p, params, s);
1004 }
1005 
1006 static void float16_unpack_canonical(FloatParts64 *p, float16 f,
1007                                      float_status *s)
1008 {
1009     float16a_unpack_canonical(p, f, s, &float16_params);
1010 }
1011 
1012 static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
1013                                       float_status *s)
1014 {
1015     bfloat16_unpack_raw(p, f);
1016     *p = sf_canonicalize(*p, &bfloat16_params, s);
1017 }
1018 
1019 static float16 float16a_round_pack_canonical(FloatParts64 *p,
1020                                              float_status *s,
1021                                              const FloatFmt *params)
1022 {
1023     *p = round_canonical(*p, s, params);
1024     return float16_pack_raw(p);
1025 }
1026 
1027 static float16 float16_round_pack_canonical(FloatParts64 *p,
1028                                             float_status *s)
1029 {
1030     return float16a_round_pack_canonical(p, s, &float16_params);
1031 }
1032 
1033 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
1034                                               float_status *s)
1035 {
1036     *p = round_canonical(*p, s, &bfloat16_params);
1037     return bfloat16_pack_raw(p);
1038 }
1039 
1040 static void float32_unpack_canonical(FloatParts64 *p, float32 f,
1041                                      float_status *s)
1042 {
1043     float32_unpack_raw(p, f);
1044     *p = sf_canonicalize(*p, &float32_params, s);
1045 }
1046 
1047 static float32 float32_round_pack_canonical(FloatParts64 *p,
1048                                             float_status *s)
1049 {
1050     *p = round_canonical(*p, s, &float32_params);
1051     return float32_pack_raw(p);
1052 }
1053 
1054 static void float64_unpack_canonical(FloatParts64 *p, float64 f,
1055                                      float_status *s)
1056 {
1057     float64_unpack_raw(p, f);
1058     *p = sf_canonicalize(*p, &float64_params, s);
1059 }
1060 
1061 static float64 float64_round_pack_canonical(FloatParts64 *p,
1062                                             float_status *s)
1063 {
1064     *p = round_canonical(*p, s, &float64_params);
1065     return float64_pack_raw(p);
1066 }
1067 
1068 /*
1069  * Returns the result of adding or subtracting the values of the
1070  * floating-point values `a' and `b'. The operation is performed
1071  * according to the IEC/IEEE Standard for Binary Floating-Point
1072  * Arithmetic.
1073  */
1074 
1075 static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
1076                                 float_status *s)
1077 {
1078     bool a_sign = a.sign;
1079     bool b_sign = b.sign ^ subtract;
1080 
1081     if (a_sign != b_sign) {
1082         /* Subtraction */
1083 
1084         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1085             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
1086                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1087                 a.frac = a.frac - b.frac;
1088             } else {
1089                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1090                 a.frac = b.frac - a.frac;
1091                 a.exp = b.exp;
1092                 a_sign ^= 1;
1093             }
1094 
1095             if (a.frac == 0) {
1096                 a.cls = float_class_zero;
1097                 a.sign = s->float_rounding_mode == float_round_down;
1098             } else {
1099                 int shift = clz64(a.frac);
1100                 a.frac = a.frac << shift;
1101                 a.exp = a.exp - shift;
1102                 a.sign = a_sign;
1103             }
1104             return a;
1105         }
1106         if (is_nan(a.cls) || is_nan(b.cls)) {
1107             return pick_nan(a, b, s);
1108         }
1109         if (a.cls == float_class_inf) {
1110             if (b.cls == float_class_inf) {
1111                 float_raise(float_flag_invalid, s);
1112                 parts_default_nan(&a, s);
1113             }
1114             return a;
1115         }
1116         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1117             a.sign = s->float_rounding_mode == float_round_down;
1118             return a;
1119         }
1120         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1121             b.sign = a_sign ^ 1;
1122             return b;
1123         }
1124         if (b.cls == float_class_zero) {
1125             return a;
1126         }
1127     } else {
1128         /* Addition */
1129         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1130             if (a.exp > b.exp) {
1131                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1132             } else if (a.exp < b.exp) {
1133                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1134                 a.exp = b.exp;
1135             }
1136 
1137             if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
1138                 shift64RightJamming(a.frac, 1, &a.frac);
1139                 a.frac |= DECOMPOSED_IMPLICIT_BIT;
1140                 a.exp += 1;
1141             }
1142             return a;
1143         }
1144         if (is_nan(a.cls) || is_nan(b.cls)) {
1145             return pick_nan(a, b, s);
1146         }
1147         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1148             return a;
1149         }
1150         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1151             b.sign = b_sign;
1152             return b;
1153         }
1154     }
1155     g_assert_not_reached();
1156 }
1157 
1158 /*
1159  * Returns the result of adding or subtracting the floating-point
1160  * values `a' and `b'. The operation is performed according to the
1161  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1162  */
1163 
1164 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1165 {
1166     FloatParts64 pa, pb, pr;
1167 
1168     float16_unpack_canonical(&pa, a, status);
1169     float16_unpack_canonical(&pb, b, status);
1170     pr = addsub_floats(pa, pb, false, status);
1171 
1172     return float16_round_pack_canonical(&pr, status);
1173 }
1174 
1175 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1176 {
1177     FloatParts64 pa, pb, pr;
1178 
1179     float16_unpack_canonical(&pa, a, status);
1180     float16_unpack_canonical(&pb, b, status);
1181     pr = addsub_floats(pa, pb, true, status);
1182 
1183     return float16_round_pack_canonical(&pr, status);
1184 }
1185 
1186 static float32 QEMU_SOFTFLOAT_ATTR
1187 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1188 {
1189     FloatParts64 pa, pb, pr;
1190 
1191     float32_unpack_canonical(&pa, a, status);
1192     float32_unpack_canonical(&pb, b, status);
1193     pr = addsub_floats(pa, pb, subtract, status);
1194 
1195     return float32_round_pack_canonical(&pr, status);
1196 }
1197 
1198 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1199 {
1200     return soft_f32_addsub(a, b, false, status);
1201 }
1202 
1203 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1204 {
1205     return soft_f32_addsub(a, b, true, status);
1206 }
1207 
1208 static float64 QEMU_SOFTFLOAT_ATTR
1209 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1210 {
1211     FloatParts64 pa, pb, pr;
1212 
1213     float64_unpack_canonical(&pa, a, status);
1214     float64_unpack_canonical(&pb, b, status);
1215     pr = addsub_floats(pa, pb, subtract, status);
1216 
1217     return float64_round_pack_canonical(&pr, status);
1218 }
1219 
1220 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1221 {
1222     return soft_f64_addsub(a, b, false, status);
1223 }
1224 
1225 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1226 {
1227     return soft_f64_addsub(a, b, true, status);
1228 }
1229 
1230 static float hard_f32_add(float a, float b)
1231 {
1232     return a + b;
1233 }
1234 
1235 static float hard_f32_sub(float a, float b)
1236 {
1237     return a - b;
1238 }
1239 
1240 static double hard_f64_add(double a, double b)
1241 {
1242     return a + b;
1243 }
1244 
1245 static double hard_f64_sub(double a, double b)
1246 {
1247     return a - b;
1248 }
1249 
1250 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1251 {
1252     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1253         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1254     }
1255     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1256 }
1257 
1258 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1259 {
1260     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1261         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1262     } else {
1263         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1264     }
1265 }
1266 
1267 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1268                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1269 {
1270     return float32_gen2(a, b, s, hard, soft,
1271                         f32_is_zon2, f32_addsubmul_post);
1272 }
1273 
1274 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1275                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1276 {
1277     return float64_gen2(a, b, s, hard, soft,
1278                         f64_is_zon2, f64_addsubmul_post);
1279 }
1280 
1281 float32 QEMU_FLATTEN
1282 float32_add(float32 a, float32 b, float_status *s)
1283 {
1284     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1285 }
1286 
1287 float32 QEMU_FLATTEN
1288 float32_sub(float32 a, float32 b, float_status *s)
1289 {
1290     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1291 }
1292 
1293 float64 QEMU_FLATTEN
1294 float64_add(float64 a, float64 b, float_status *s)
1295 {
1296     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1297 }
1298 
1299 float64 QEMU_FLATTEN
1300 float64_sub(float64 a, float64 b, float_status *s)
1301 {
1302     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1303 }
1304 
1305 /*
1306  * Returns the result of adding or subtracting the bfloat16
1307  * values `a' and `b'.
1308  */
1309 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1310 {
1311     FloatParts64 pa, pb, pr;
1312 
1313     bfloat16_unpack_canonical(&pa, a, status);
1314     bfloat16_unpack_canonical(&pb, b, status);
1315     pr = addsub_floats(pa, pb, false, status);
1316 
1317     return bfloat16_round_pack_canonical(&pr, status);
1318 }
1319 
1320 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1321 {
1322     FloatParts64 pa, pb, pr;
1323 
1324     bfloat16_unpack_canonical(&pa, a, status);
1325     bfloat16_unpack_canonical(&pb, b, status);
1326     pr = addsub_floats(pa, pb, true, status);
1327 
1328     return bfloat16_round_pack_canonical(&pr, status);
1329 }
1330 
1331 /*
1332  * Returns the result of multiplying the floating-point values `a' and
1333  * `b'. The operation is performed according to the IEC/IEEE Standard
1334  * for Binary Floating-Point Arithmetic.
1335  */
1336 
1337 static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1338 {
1339     bool sign = a.sign ^ b.sign;
1340 
1341     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1342         uint64_t hi, lo;
1343         int exp = a.exp + b.exp;
1344 
1345         mul64To128(a.frac, b.frac, &hi, &lo);
1346         if (hi & DECOMPOSED_IMPLICIT_BIT) {
1347             exp += 1;
1348         } else {
1349             hi <<= 1;
1350         }
1351         hi |= (lo != 0);
1352 
1353         /* Re-use a */
1354         a.exp = exp;
1355         a.sign = sign;
1356         a.frac = hi;
1357         return a;
1358     }
1359     /* handle all the NaN cases */
1360     if (is_nan(a.cls) || is_nan(b.cls)) {
1361         return pick_nan(a, b, s);
1362     }
1363     /* Inf * Zero == NaN */
1364     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1365         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1366         float_raise(float_flag_invalid, s);
1367         parts_default_nan(&a, s);
1368         return a;
1369     }
1370     /* Multiply by 0 or Inf */
1371     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1372         a.sign = sign;
1373         return a;
1374     }
1375     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1376         b.sign = sign;
1377         return b;
1378     }
1379     g_assert_not_reached();
1380 }
1381 
1382 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1383 {
1384     FloatParts64 pa, pb, pr;
1385 
1386     float16_unpack_canonical(&pa, a, status);
1387     float16_unpack_canonical(&pb, b, status);
1388     pr = mul_floats(pa, pb, status);
1389 
1390     return float16_round_pack_canonical(&pr, status);
1391 }
1392 
1393 static float32 QEMU_SOFTFLOAT_ATTR
1394 soft_f32_mul(float32 a, float32 b, float_status *status)
1395 {
1396     FloatParts64 pa, pb, pr;
1397 
1398     float32_unpack_canonical(&pa, a, status);
1399     float32_unpack_canonical(&pb, b, status);
1400     pr = mul_floats(pa, pb, status);
1401 
1402     return float32_round_pack_canonical(&pr, status);
1403 }
1404 
1405 static float64 QEMU_SOFTFLOAT_ATTR
1406 soft_f64_mul(float64 a, float64 b, float_status *status)
1407 {
1408     FloatParts64 pa, pb, pr;
1409 
1410     float64_unpack_canonical(&pa, a, status);
1411     float64_unpack_canonical(&pb, b, status);
1412     pr = mul_floats(pa, pb, status);
1413 
1414     return float64_round_pack_canonical(&pr, status);
1415 }
1416 
1417 static float hard_f32_mul(float a, float b)
1418 {
1419     return a * b;
1420 }
1421 
1422 static double hard_f64_mul(double a, double b)
1423 {
1424     return a * b;
1425 }
1426 
1427 float32 QEMU_FLATTEN
1428 float32_mul(float32 a, float32 b, float_status *s)
1429 {
1430     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1431                         f32_is_zon2, f32_addsubmul_post);
1432 }
1433 
1434 float64 QEMU_FLATTEN
1435 float64_mul(float64 a, float64 b, float_status *s)
1436 {
1437     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1438                         f64_is_zon2, f64_addsubmul_post);
1439 }
1440 
1441 /*
1442  * Returns the result of multiplying the bfloat16
1443  * values `a' and `b'.
1444  */
1445 
1446 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1447 {
1448     FloatParts64 pa, pb, pr;
1449 
1450     bfloat16_unpack_canonical(&pa, a, status);
1451     bfloat16_unpack_canonical(&pb, b, status);
1452     pr = mul_floats(pa, pb, status);
1453 
1454     return bfloat16_round_pack_canonical(&pr, status);
1455 }
1456 
1457 /*
1458  * Returns the result of multiplying the floating-point values `a' and
1459  * `b' then adding 'c', with no intermediate rounding step after the
1460  * multiplication. The operation is performed according to the
1461  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1462  * The flags argument allows the caller to select negation of the
1463  * addend, the intermediate product, or the final result. (The
1464  * difference between this and having the caller do a separate
1465  * negation is that negating externally will flip the sign bit on
1466  * NaNs.)
1467  */
1468 
1469 static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
1470                                 int flags, float_status *s)
1471 {
1472     bool inf_zero, p_sign;
1473     bool sign_flip = flags & float_muladd_negate_result;
1474     FloatClass p_class;
1475     uint64_t hi, lo;
1476     int p_exp;
1477     int ab_mask, abc_mask;
1478 
1479     ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
1480     abc_mask = float_cmask(c.cls) | ab_mask;
1481     inf_zero = ab_mask == float_cmask_infzero;
1482 
1483     /* It is implementation-defined whether the cases of (0,inf,qnan)
1484      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1485      * they return if they do), so we have to hand this information
1486      * off to the target-specific pick-a-NaN routine.
1487      */
1488     if (unlikely(abc_mask & float_cmask_anynan)) {
1489         return pick_nan_muladd(a, b, c, inf_zero, s);
1490     }
1491 
1492     if (inf_zero) {
1493         float_raise(float_flag_invalid, s);
1494         parts_default_nan(&a, s);
1495         return a;
1496     }
1497 
1498     if (flags & float_muladd_negate_c) {
1499         c.sign ^= 1;
1500     }
1501 
1502     p_sign = a.sign ^ b.sign;
1503 
1504     if (flags & float_muladd_negate_product) {
1505         p_sign ^= 1;
1506     }
1507 
1508     if (ab_mask & float_cmask_inf) {
1509         p_class = float_class_inf;
1510     } else if (ab_mask & float_cmask_zero) {
1511         p_class = float_class_zero;
1512     } else {
1513         p_class = float_class_normal;
1514     }
1515 
1516     if (c.cls == float_class_inf) {
1517         if (p_class == float_class_inf && p_sign != c.sign) {
1518             float_raise(float_flag_invalid, s);
1519             parts_default_nan(&c, s);
1520         } else {
1521             c.sign ^= sign_flip;
1522         }
1523         return c;
1524     }
1525 
1526     if (p_class == float_class_inf) {
1527         a.cls = float_class_inf;
1528         a.sign = p_sign ^ sign_flip;
1529         return a;
1530     }
1531 
1532     if (p_class == float_class_zero) {
1533         if (c.cls == float_class_zero) {
1534             if (p_sign != c.sign) {
1535                 p_sign = s->float_rounding_mode == float_round_down;
1536             }
1537             c.sign = p_sign;
1538         } else if (flags & float_muladd_halve_result) {
1539             c.exp -= 1;
1540         }
1541         c.sign ^= sign_flip;
1542         return c;
1543     }
1544 
1545     /* a & b should be normals now... */
1546     assert(a.cls == float_class_normal &&
1547            b.cls == float_class_normal);
1548 
1549     p_exp = a.exp + b.exp;
1550 
1551     mul64To128(a.frac, b.frac, &hi, &lo);
1552 
1553     /* Renormalize to the msb. */
1554     if (hi & DECOMPOSED_IMPLICIT_BIT) {
1555         p_exp += 1;
1556     } else {
1557         shortShift128Left(hi, lo, 1, &hi, &lo);
1558     }
1559 
1560     /* + add/sub */
1561     if (c.cls != float_class_zero) {
1562         int exp_diff = p_exp - c.exp;
1563         if (p_sign == c.sign) {
1564             /* Addition */
1565             if (exp_diff <= 0) {
1566                 shift64RightJamming(hi, -exp_diff, &hi);
1567                 p_exp = c.exp;
1568                 if (uadd64_overflow(hi, c.frac, &hi)) {
1569                     shift64RightJamming(hi, 1, &hi);
1570                     hi |= DECOMPOSED_IMPLICIT_BIT;
1571                     p_exp += 1;
1572                 }
1573             } else {
1574                 uint64_t c_hi, c_lo, over;
1575                 shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
1576                 add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
1577                 if (over) {
1578                     shift64RightJamming(hi, 1, &hi);
1579                     hi |= DECOMPOSED_IMPLICIT_BIT;
1580                     p_exp += 1;
1581                 }
1582             }
1583         } else {
1584             /* Subtraction */
1585             uint64_t c_hi = c.frac, c_lo = 0;
1586 
1587             if (exp_diff <= 0) {
1588                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1589                 if (exp_diff == 0
1590                     &&
1591                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1592                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1593                 } else {
1594                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1595                     p_sign ^= 1;
1596                     p_exp = c.exp;
1597                 }
1598             } else {
1599                 shift128RightJamming(c_hi, c_lo,
1600                                      exp_diff,
1601                                      &c_hi, &c_lo);
1602                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1603             }
1604 
1605             if (hi == 0 && lo == 0) {
1606                 a.cls = float_class_zero;
1607                 a.sign = s->float_rounding_mode == float_round_down;
1608                 a.sign ^= sign_flip;
1609                 return a;
1610             } else {
1611                 int shift;
1612                 if (hi != 0) {
1613                     shift = clz64(hi);
1614                 } else {
1615                     shift = clz64(lo) + 64;
1616                 }
1617                 /* Normalizing to a binary point of 124 is the
1618                    correct adjust for the exponent.  However since we're
1619                    shifting, we might as well put the binary point back
1620                    at 63 where we really want it.  Therefore shift as
1621                    if we're leaving 1 bit at the top of the word, but
1622                    adjust the exponent as if we're leaving 3 bits.  */
1623                 shift128Left(hi, lo, shift, &hi, &lo);
1624                 p_exp -= shift;
1625             }
1626         }
1627     }
1628     hi |= (lo != 0);
1629 
1630     if (flags & float_muladd_halve_result) {
1631         p_exp -= 1;
1632     }
1633 
1634     /* finally prepare our result */
1635     a.cls = float_class_normal;
1636     a.sign = p_sign ^ sign_flip;
1637     a.exp = p_exp;
1638     a.frac = hi;
1639 
1640     return a;
1641 }
1642 
1643 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1644                                                 int flags, float_status *status)
1645 {
1646     FloatParts64 pa, pb, pc, pr;
1647 
1648     float16_unpack_canonical(&pa, a, status);
1649     float16_unpack_canonical(&pb, b, status);
1650     float16_unpack_canonical(&pc, c, status);
1651     pr = muladd_floats(pa, pb, pc, flags, status);
1652 
1653     return float16_round_pack_canonical(&pr, status);
1654 }
1655 
1656 static float32 QEMU_SOFTFLOAT_ATTR
1657 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1658                 float_status *status)
1659 {
1660     FloatParts64 pa, pb, pc, pr;
1661 
1662     float32_unpack_canonical(&pa, a, status);
1663     float32_unpack_canonical(&pb, b, status);
1664     float32_unpack_canonical(&pc, c, status);
1665     pr = muladd_floats(pa, pb, pc, flags, status);
1666 
1667     return float32_round_pack_canonical(&pr, status);
1668 }
1669 
1670 static float64 QEMU_SOFTFLOAT_ATTR
1671 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1672                 float_status *status)
1673 {
1674     FloatParts64 pa, pb, pc, pr;
1675 
1676     float64_unpack_canonical(&pa, a, status);
1677     float64_unpack_canonical(&pb, b, status);
1678     float64_unpack_canonical(&pc, c, status);
1679     pr = muladd_floats(pa, pb, pc, flags, status);
1680 
1681     return float64_round_pack_canonical(&pr, status);
1682 }
1683 
1684 static bool force_soft_fma;
1685 
1686 float32 QEMU_FLATTEN
1687 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1688 {
1689     union_float32 ua, ub, uc, ur;
1690 
1691     ua.s = xa;
1692     ub.s = xb;
1693     uc.s = xc;
1694 
1695     if (unlikely(!can_use_fpu(s))) {
1696         goto soft;
1697     }
1698     if (unlikely(flags & float_muladd_halve_result)) {
1699         goto soft;
1700     }
1701 
1702     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1703     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1704         goto soft;
1705     }
1706 
1707     if (unlikely(force_soft_fma)) {
1708         goto soft;
1709     }
1710 
1711     /*
1712      * When (a || b) == 0, there's no need to check for under/over flow,
1713      * since we know the addend is (normal || 0) and the product is 0.
1714      */
1715     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1716         union_float32 up;
1717         bool prod_sign;
1718 
1719         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1720         prod_sign ^= !!(flags & float_muladd_negate_product);
1721         up.s = float32_set_sign(float32_zero, prod_sign);
1722 
1723         if (flags & float_muladd_negate_c) {
1724             uc.h = -uc.h;
1725         }
1726         ur.h = up.h + uc.h;
1727     } else {
1728         union_float32 ua_orig = ua;
1729         union_float32 uc_orig = uc;
1730 
1731         if (flags & float_muladd_negate_product) {
1732             ua.h = -ua.h;
1733         }
1734         if (flags & float_muladd_negate_c) {
1735             uc.h = -uc.h;
1736         }
1737 
1738         ur.h = fmaf(ua.h, ub.h, uc.h);
1739 
1740         if (unlikely(f32_is_inf(ur))) {
1741             float_raise(float_flag_overflow, s);
1742         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1743             ua = ua_orig;
1744             uc = uc_orig;
1745             goto soft;
1746         }
1747     }
1748     if (flags & float_muladd_negate_result) {
1749         return float32_chs(ur.s);
1750     }
1751     return ur.s;
1752 
1753  soft:
1754     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1755 }
1756 
1757 float64 QEMU_FLATTEN
1758 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1759 {
1760     union_float64 ua, ub, uc, ur;
1761 
1762     ua.s = xa;
1763     ub.s = xb;
1764     uc.s = xc;
1765 
1766     if (unlikely(!can_use_fpu(s))) {
1767         goto soft;
1768     }
1769     if (unlikely(flags & float_muladd_halve_result)) {
1770         goto soft;
1771     }
1772 
1773     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1774     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1775         goto soft;
1776     }
1777 
1778     if (unlikely(force_soft_fma)) {
1779         goto soft;
1780     }
1781 
1782     /*
1783      * When (a || b) == 0, there's no need to check for under/over flow,
1784      * since we know the addend is (normal || 0) and the product is 0.
1785      */
1786     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1787         union_float64 up;
1788         bool prod_sign;
1789 
1790         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1791         prod_sign ^= !!(flags & float_muladd_negate_product);
1792         up.s = float64_set_sign(float64_zero, prod_sign);
1793 
1794         if (flags & float_muladd_negate_c) {
1795             uc.h = -uc.h;
1796         }
1797         ur.h = up.h + uc.h;
1798     } else {
1799         union_float64 ua_orig = ua;
1800         union_float64 uc_orig = uc;
1801 
1802         if (flags & float_muladd_negate_product) {
1803             ua.h = -ua.h;
1804         }
1805         if (flags & float_muladd_negate_c) {
1806             uc.h = -uc.h;
1807         }
1808 
1809         ur.h = fma(ua.h, ub.h, uc.h);
1810 
1811         if (unlikely(f64_is_inf(ur))) {
1812             float_raise(float_flag_overflow, s);
1813         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1814             ua = ua_orig;
1815             uc = uc_orig;
1816             goto soft;
1817         }
1818     }
1819     if (flags & float_muladd_negate_result) {
1820         return float64_chs(ur.s);
1821     }
1822     return ur.s;
1823 
1824  soft:
1825     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1826 }
1827 
1828 /*
1829  * Returns the result of multiplying the bfloat16 values `a'
1830  * and `b' then adding 'c', with no intermediate rounding step after the
1831  * multiplication.
1832  */
1833 
1834 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1835                                       int flags, float_status *status)
1836 {
1837     FloatParts64 pa, pb, pc, pr;
1838 
1839     bfloat16_unpack_canonical(&pa, a, status);
1840     bfloat16_unpack_canonical(&pb, b, status);
1841     bfloat16_unpack_canonical(&pc, c, status);
1842     pr = muladd_floats(pa, pb, pc, flags, status);
1843 
1844     return bfloat16_round_pack_canonical(&pr, status);
1845 }
1846 
1847 /*
1848  * Returns the result of dividing the floating-point value `a' by the
1849  * corresponding value `b'. The operation is performed according to
1850  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1851  */
1852 
1853 static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1854 {
1855     bool sign = a.sign ^ b.sign;
1856 
1857     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1858         uint64_t n0, n1, q, r;
1859         int exp = a.exp - b.exp;
1860 
1861         /*
1862          * We want a 2*N / N-bit division to produce exactly an N-bit
1863          * result, so that we do not lose any precision and so that we
1864          * do not have to renormalize afterward.  If A.frac < B.frac,
1865          * then division would produce an (N-1)-bit result; shift A left
1866          * by one to produce the an N-bit result, and decrement the
1867          * exponent to match.
1868          *
1869          * The udiv_qrnnd algorithm that we're using requires normalization,
1870          * i.e. the msb of the denominator must be set, which is already true.
1871          */
1872         if (a.frac < b.frac) {
1873             exp -= 1;
1874             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1875         } else {
1876             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
1877         }
1878         q = udiv_qrnnd(&r, n1, n0, b.frac);
1879 
1880         /* Set lsb if there is a remainder, to set inexact. */
1881         a.frac = q | (r != 0);
1882         a.sign = sign;
1883         a.exp = exp;
1884         return a;
1885     }
1886     /* handle all the NaN cases */
1887     if (is_nan(a.cls) || is_nan(b.cls)) {
1888         return pick_nan(a, b, s);
1889     }
1890     /* 0/0 or Inf/Inf */
1891     if (a.cls == b.cls
1892         &&
1893         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1894         float_raise(float_flag_invalid, s);
1895         parts_default_nan(&a, s);
1896         return a;
1897     }
1898     /* Inf / x or 0 / x */
1899     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1900         a.sign = sign;
1901         return a;
1902     }
1903     /* Div 0 => Inf */
1904     if (b.cls == float_class_zero) {
1905         float_raise(float_flag_divbyzero, s);
1906         a.cls = float_class_inf;
1907         a.sign = sign;
1908         return a;
1909     }
1910     /* Div by Inf */
1911     if (b.cls == float_class_inf) {
1912         a.cls = float_class_zero;
1913         a.sign = sign;
1914         return a;
1915     }
1916     g_assert_not_reached();
1917 }
1918 
1919 float16 float16_div(float16 a, float16 b, float_status *status)
1920 {
1921     FloatParts64 pa, pb, pr;
1922 
1923     float16_unpack_canonical(&pa, a, status);
1924     float16_unpack_canonical(&pb, b, status);
1925     pr = div_floats(pa, pb, status);
1926 
1927     return float16_round_pack_canonical(&pr, status);
1928 }
1929 
1930 static float32 QEMU_SOFTFLOAT_ATTR
1931 soft_f32_div(float32 a, float32 b, float_status *status)
1932 {
1933     FloatParts64 pa, pb, pr;
1934 
1935     float32_unpack_canonical(&pa, a, status);
1936     float32_unpack_canonical(&pb, b, status);
1937     pr = div_floats(pa, pb, status);
1938 
1939     return float32_round_pack_canonical(&pr, status);
1940 }
1941 
1942 static float64 QEMU_SOFTFLOAT_ATTR
1943 soft_f64_div(float64 a, float64 b, float_status *status)
1944 {
1945     FloatParts64 pa, pb, pr;
1946 
1947     float64_unpack_canonical(&pa, a, status);
1948     float64_unpack_canonical(&pb, b, status);
1949     pr = div_floats(pa, pb, status);
1950 
1951     return float64_round_pack_canonical(&pr, status);
1952 }
1953 
1954 static float hard_f32_div(float a, float b)
1955 {
1956     return a / b;
1957 }
1958 
1959 static double hard_f64_div(double a, double b)
1960 {
1961     return a / b;
1962 }
1963 
1964 static bool f32_div_pre(union_float32 a, union_float32 b)
1965 {
1966     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1967         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1968                fpclassify(b.h) == FP_NORMAL;
1969     }
1970     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1971 }
1972 
1973 static bool f64_div_pre(union_float64 a, union_float64 b)
1974 {
1975     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1976         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1977                fpclassify(b.h) == FP_NORMAL;
1978     }
1979     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1980 }
1981 
1982 static bool f32_div_post(union_float32 a, union_float32 b)
1983 {
1984     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1985         return fpclassify(a.h) != FP_ZERO;
1986     }
1987     return !float32_is_zero(a.s);
1988 }
1989 
1990 static bool f64_div_post(union_float64 a, union_float64 b)
1991 {
1992     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1993         return fpclassify(a.h) != FP_ZERO;
1994     }
1995     return !float64_is_zero(a.s);
1996 }
1997 
1998 float32 QEMU_FLATTEN
1999 float32_div(float32 a, float32 b, float_status *s)
2000 {
2001     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
2002                         f32_div_pre, f32_div_post);
2003 }
2004 
2005 float64 QEMU_FLATTEN
2006 float64_div(float64 a, float64 b, float_status *s)
2007 {
2008     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
2009                         f64_div_pre, f64_div_post);
2010 }
2011 
2012 /*
2013  * Returns the result of dividing the bfloat16
2014  * value `a' by the corresponding value `b'.
2015  */
2016 
2017 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
2018 {
2019     FloatParts64 pa, pb, pr;
2020 
2021     bfloat16_unpack_canonical(&pa, a, status);
2022     bfloat16_unpack_canonical(&pb, b, status);
2023     pr = div_floats(pa, pb, status);
2024 
2025     return bfloat16_round_pack_canonical(&pr, status);
2026 }
2027 
2028 /*
2029  * Float to Float conversions
2030  *
2031  * Returns the result of converting one float format to another. The
2032  * conversion is performed according to the IEC/IEEE Standard for
2033  * Binary Floating-Point Arithmetic.
2034  *
2035  * The float_to_float helper only needs to take care of raising
2036  * invalid exceptions and handling the conversion on NaNs.
2037  */
2038 
2039 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
2040                                  float_status *s)
2041 {
2042     if (dstf->arm_althp) {
2043         switch (a.cls) {
2044         case float_class_qnan:
2045         case float_class_snan:
2046             /* There is no NaN in the destination format.  Raise Invalid
2047              * and return a zero with the sign of the input NaN.
2048              */
2049             float_raise(float_flag_invalid, s);
2050             a.cls = float_class_zero;
2051             a.frac = 0;
2052             a.exp = 0;
2053             break;
2054 
2055         case float_class_inf:
2056             /* There is no Inf in the destination format.  Raise Invalid
2057              * and return the maximum normal with the correct sign.
2058              */
2059             float_raise(float_flag_invalid, s);
2060             a.cls = float_class_normal;
2061             a.exp = dstf->exp_max;
2062             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
2063             break;
2064 
2065         default:
2066             break;
2067         }
2068     } else if (is_nan(a.cls)) {
2069         return return_nan(a, s);
2070     }
2071     return a;
2072 }
2073 
2074 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
2075 {
2076     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2077     FloatParts64 pa, pr;
2078 
2079     float16a_unpack_canonical(&pa, a, s, fmt16);
2080     pr = float_to_float(pa, &float32_params, s);
2081     return float32_round_pack_canonical(&pr, s);
2082 }
2083 
2084 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2085 {
2086     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2087     FloatParts64 pa, pr;
2088 
2089     float16a_unpack_canonical(&pa, a, s, fmt16);
2090     pr = float_to_float(pa, &float64_params, s);
2091     return float64_round_pack_canonical(&pr, s);
2092 }
2093 
2094 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2095 {
2096     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2097     FloatParts64 pa, pr;
2098 
2099     float32_unpack_canonical(&pa, a, s);
2100     pr = float_to_float(pa, fmt16, s);
2101     return float16a_round_pack_canonical(&pr, s, fmt16);
2102 }
2103 
2104 static float64 QEMU_SOFTFLOAT_ATTR
2105 soft_float32_to_float64(float32 a, float_status *s)
2106 {
2107     FloatParts64 pa, pr;
2108 
2109     float32_unpack_canonical(&pa, a, s);
2110     pr = float_to_float(pa, &float64_params, s);
2111     return float64_round_pack_canonical(&pr, s);
2112 }
2113 
2114 float64 float32_to_float64(float32 a, float_status *s)
2115 {
2116     if (likely(float32_is_normal(a))) {
2117         /* Widening conversion can never produce inexact results.  */
2118         union_float32 uf;
2119         union_float64 ud;
2120         uf.s = a;
2121         ud.h = uf.h;
2122         return ud.s;
2123     } else if (float32_is_zero(a)) {
2124         return float64_set_sign(float64_zero, float32_is_neg(a));
2125     } else {
2126         return soft_float32_to_float64(a, s);
2127     }
2128 }
2129 
2130 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2131 {
2132     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2133     FloatParts64 pa, pr;
2134 
2135     float64_unpack_canonical(&pa, a, s);
2136     pr = float_to_float(pa, fmt16, s);
2137     return float16a_round_pack_canonical(&pr, s, fmt16);
2138 }
2139 
2140 float32 float64_to_float32(float64 a, float_status *s)
2141 {
2142     FloatParts64 pa, pr;
2143 
2144     float64_unpack_canonical(&pa, a, s);
2145     pr = float_to_float(pa, &float32_params, s);
2146     return float32_round_pack_canonical(&pr, s);
2147 }
2148 
2149 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2150 {
2151     FloatParts64 pa, pr;
2152 
2153     bfloat16_unpack_canonical(&pa, a, s);
2154     pr = float_to_float(pa, &float32_params, s);
2155     return float32_round_pack_canonical(&pr, s);
2156 }
2157 
2158 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2159 {
2160     FloatParts64 pa, pr;
2161 
2162     bfloat16_unpack_canonical(&pa, a, s);
2163     pr = float_to_float(pa, &float64_params, s);
2164     return float64_round_pack_canonical(&pr, s);
2165 }
2166 
2167 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2168 {
2169     FloatParts64 pa, pr;
2170 
2171     float32_unpack_canonical(&pa, a, s);
2172     pr = float_to_float(pa, &bfloat16_params, s);
2173     return bfloat16_round_pack_canonical(&pr, s);
2174 }
2175 
2176 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2177 {
2178     FloatParts64 pa, pr;
2179 
2180     float64_unpack_canonical(&pa, a, s);
2181     pr = float_to_float(pa, &bfloat16_params, s);
2182     return bfloat16_round_pack_canonical(&pr, s);
2183 }
2184 
2185 /*
2186  * Rounds the floating-point value `a' to an integer, and returns the
2187  * result as a floating-point value. The operation is performed
2188  * according to the IEC/IEEE Standard for Binary Floating-Point
2189  * Arithmetic.
2190  */
2191 
2192 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
2193                                int scale, float_status *s)
2194 {
2195     switch (a.cls) {
2196     case float_class_qnan:
2197     case float_class_snan:
2198         return return_nan(a, s);
2199 
2200     case float_class_zero:
2201     case float_class_inf:
2202         /* already "integral" */
2203         break;
2204 
2205     case float_class_normal:
2206         scale = MIN(MAX(scale, -0x10000), 0x10000);
2207         a.exp += scale;
2208 
2209         if (a.exp >= DECOMPOSED_BINARY_POINT) {
2210             /* already integral */
2211             break;
2212         }
2213         if (a.exp < 0) {
2214             bool one;
2215             /* all fractional */
2216             float_raise(float_flag_inexact, s);
2217             switch (rmode) {
2218             case float_round_nearest_even:
2219                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
2220                 break;
2221             case float_round_ties_away:
2222                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2223                 break;
2224             case float_round_to_zero:
2225                 one = false;
2226                 break;
2227             case float_round_up:
2228                 one = !a.sign;
2229                 break;
2230             case float_round_down:
2231                 one = a.sign;
2232                 break;
2233             case float_round_to_odd:
2234                 one = true;
2235                 break;
2236             default:
2237                 g_assert_not_reached();
2238             }
2239 
2240             if (one) {
2241                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2242                 a.exp = 0;
2243             } else {
2244                 a.cls = float_class_zero;
2245             }
2246         } else {
2247             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2248             uint64_t frac_lsbm1 = frac_lsb >> 1;
2249             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2250             uint64_t rnd_mask = rnd_even_mask >> 1;
2251             uint64_t inc;
2252 
2253             switch (rmode) {
2254             case float_round_nearest_even:
2255                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2256                 break;
2257             case float_round_ties_away:
2258                 inc = frac_lsbm1;
2259                 break;
2260             case float_round_to_zero:
2261                 inc = 0;
2262                 break;
2263             case float_round_up:
2264                 inc = a.sign ? 0 : rnd_mask;
2265                 break;
2266             case float_round_down:
2267                 inc = a.sign ? rnd_mask : 0;
2268                 break;
2269             case float_round_to_odd:
2270                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2271                 break;
2272             default:
2273                 g_assert_not_reached();
2274             }
2275 
2276             if (a.frac & rnd_mask) {
2277                 float_raise(float_flag_inexact, s);
2278                 if (uadd64_overflow(a.frac, inc, &a.frac)) {
2279                     a.frac >>= 1;
2280                     a.frac |= DECOMPOSED_IMPLICIT_BIT;
2281                     a.exp++;
2282                 }
2283                 a.frac &= ~rnd_mask;
2284             }
2285         }
2286         break;
2287     default:
2288         g_assert_not_reached();
2289     }
2290     return a;
2291 }
2292 
2293 float16 float16_round_to_int(float16 a, float_status *s)
2294 {
2295     FloatParts64 pa, pr;
2296 
2297     float16_unpack_canonical(&pa, a, s);
2298     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2299     return float16_round_pack_canonical(&pr, s);
2300 }
2301 
2302 float32 float32_round_to_int(float32 a, float_status *s)
2303 {
2304     FloatParts64 pa, pr;
2305 
2306     float32_unpack_canonical(&pa, a, s);
2307     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2308     return float32_round_pack_canonical(&pr, s);
2309 }
2310 
2311 float64 float64_round_to_int(float64 a, float_status *s)
2312 {
2313     FloatParts64 pa, pr;
2314 
2315     float64_unpack_canonical(&pa, a, s);
2316     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2317     return float64_round_pack_canonical(&pr, s);
2318 }
2319 
2320 /*
2321  * Rounds the bfloat16 value `a' to an integer, and returns the
2322  * result as a bfloat16 value.
2323  */
2324 
2325 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2326 {
2327     FloatParts64 pa, pr;
2328 
2329     bfloat16_unpack_canonical(&pa, a, s);
2330     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2331     return bfloat16_round_pack_canonical(&pr, s);
2332 }
2333 
2334 /*
2335  * Returns the result of converting the floating-point value `a' to
2336  * the two's complement integer format. The conversion is performed
2337  * according to the IEC/IEEE Standard for Binary Floating-Point
2338  * Arithmetic---which means in particular that the conversion is
2339  * rounded according to the current rounding mode. If `a' is a NaN,
2340  * the largest positive integer is returned. Otherwise, if the
2341  * conversion overflows, the largest integer with the same sign as `a'
2342  * is returned.
2343 */
2344 
2345 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
2346                                      int scale, int64_t min, int64_t max,
2347                                      float_status *s)
2348 {
2349     uint64_t r;
2350     int orig_flags = get_float_exception_flags(s);
2351     FloatParts64 p = round_to_int(in, rmode, scale, s);
2352 
2353     switch (p.cls) {
2354     case float_class_snan:
2355     case float_class_qnan:
2356         s->float_exception_flags = orig_flags | float_flag_invalid;
2357         return max;
2358     case float_class_inf:
2359         s->float_exception_flags = orig_flags | float_flag_invalid;
2360         return p.sign ? min : max;
2361     case float_class_zero:
2362         return 0;
2363     case float_class_normal:
2364         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2365             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2366         } else {
2367             r = UINT64_MAX;
2368         }
2369         if (p.sign) {
2370             if (r <= -(uint64_t) min) {
2371                 return -r;
2372             } else {
2373                 s->float_exception_flags = orig_flags | float_flag_invalid;
2374                 return min;
2375             }
2376         } else {
2377             if (r <= max) {
2378                 return r;
2379             } else {
2380                 s->float_exception_flags = orig_flags | float_flag_invalid;
2381                 return max;
2382             }
2383         }
2384     default:
2385         g_assert_not_reached();
2386     }
2387 }
2388 
2389 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2390                               float_status *s)
2391 {
2392     FloatParts64 p;
2393 
2394     float16_unpack_canonical(&p, a, s);
2395     return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
2396 }
2397 
2398 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2399                                 float_status *s)
2400 {
2401     FloatParts64 p;
2402 
2403     float16_unpack_canonical(&p, a, s);
2404     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2405 }
2406 
2407 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2408                                 float_status *s)
2409 {
2410     FloatParts64 p;
2411 
2412     float16_unpack_canonical(&p, a, s);
2413     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2414 }
2415 
2416 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2417                                 float_status *s)
2418 {
2419     FloatParts64 p;
2420 
2421     float16_unpack_canonical(&p, a, s);
2422     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2423 }
2424 
2425 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2426                                 float_status *s)
2427 {
2428     FloatParts64 p;
2429 
2430     float32_unpack_canonical(&p, a, s);
2431     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2432 }
2433 
2434 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2435                                 float_status *s)
2436 {
2437     FloatParts64 p;
2438 
2439     float32_unpack_canonical(&p, a, s);
2440     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2441 }
2442 
2443 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2444                                 float_status *s)
2445 {
2446     FloatParts64 p;
2447 
2448     float32_unpack_canonical(&p, a, s);
2449     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2450 }
2451 
2452 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2453                                 float_status *s)
2454 {
2455     FloatParts64 p;
2456 
2457     float64_unpack_canonical(&p, a, s);
2458     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2459 }
2460 
2461 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2462                                 float_status *s)
2463 {
2464     FloatParts64 p;
2465 
2466     float64_unpack_canonical(&p, a, s);
2467     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2468 }
2469 
2470 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2471                                 float_status *s)
2472 {
2473     FloatParts64 p;
2474 
2475     float64_unpack_canonical(&p, a, s);
2476     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2477 }
2478 
2479 int8_t float16_to_int8(float16 a, float_status *s)
2480 {
2481     return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2482 }
2483 
2484 int16_t float16_to_int16(float16 a, float_status *s)
2485 {
2486     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2487 }
2488 
2489 int32_t float16_to_int32(float16 a, float_status *s)
2490 {
2491     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2492 }
2493 
2494 int64_t float16_to_int64(float16 a, float_status *s)
2495 {
2496     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2497 }
2498 
2499 int16_t float32_to_int16(float32 a, float_status *s)
2500 {
2501     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2502 }
2503 
2504 int32_t float32_to_int32(float32 a, float_status *s)
2505 {
2506     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2507 }
2508 
2509 int64_t float32_to_int64(float32 a, float_status *s)
2510 {
2511     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2512 }
2513 
2514 int16_t float64_to_int16(float64 a, float_status *s)
2515 {
2516     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2517 }
2518 
2519 int32_t float64_to_int32(float64 a, float_status *s)
2520 {
2521     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2522 }
2523 
2524 int64_t float64_to_int64(float64 a, float_status *s)
2525 {
2526     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2527 }
2528 
2529 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2530 {
2531     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2532 }
2533 
2534 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2535 {
2536     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2537 }
2538 
2539 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2540 {
2541     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2542 }
2543 
2544 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2545 {
2546     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2547 }
2548 
2549 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2550 {
2551     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2552 }
2553 
2554 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2555 {
2556     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2557 }
2558 
2559 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2560 {
2561     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2562 }
2563 
2564 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2565 {
2566     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2567 }
2568 
2569 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2570 {
2571     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2572 }
2573 
2574 /*
2575  * Returns the result of converting the floating-point value `a' to
2576  * the two's complement integer format.
2577  */
2578 
2579 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2580                                  float_status *s)
2581 {
2582     FloatParts64 p;
2583 
2584     bfloat16_unpack_canonical(&p, a, s);
2585     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2586 }
2587 
2588 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2589                                  float_status *s)
2590 {
2591     FloatParts64 p;
2592 
2593     bfloat16_unpack_canonical(&p, a, s);
2594     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2595 }
2596 
2597 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2598                                  float_status *s)
2599 {
2600     FloatParts64 p;
2601 
2602     bfloat16_unpack_canonical(&p, a, s);
2603     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2604 }
2605 
2606 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2607 {
2608     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2609 }
2610 
2611 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2612 {
2613     return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2614 }
2615 
2616 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2617 {
2618     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2619 }
2620 
2621 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2622 {
2623     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2624 }
2625 
2626 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2627 {
2628     return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2629 }
2630 
2631 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2632 {
2633     return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2634 }
2635 
2636 /*
2637  *  Returns the result of converting the floating-point value `a' to
2638  *  the unsigned integer format. The conversion is performed according
2639  *  to the IEC/IEEE Standard for Binary Floating-Point
2640  *  Arithmetic---which means in particular that the conversion is
2641  *  rounded according to the current rounding mode. If `a' is a NaN,
2642  *  the largest unsigned integer is returned. Otherwise, if the
2643  *  conversion overflows, the largest unsigned integer is returned. If
2644  *  the 'a' is negative, the result is rounded and zero is returned;
2645  *  values that do not round to zero will raise the inexact exception
2646  *  flag.
2647  */
2648 
2649 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
2650                                        int scale, uint64_t max,
2651                                        float_status *s)
2652 {
2653     int orig_flags = get_float_exception_flags(s);
2654     FloatParts64 p = round_to_int(in, rmode, scale, s);
2655     uint64_t r;
2656 
2657     switch (p.cls) {
2658     case float_class_snan:
2659     case float_class_qnan:
2660         s->float_exception_flags = orig_flags | float_flag_invalid;
2661         return max;
2662     case float_class_inf:
2663         s->float_exception_flags = orig_flags | float_flag_invalid;
2664         return p.sign ? 0 : max;
2665     case float_class_zero:
2666         return 0;
2667     case float_class_normal:
2668         if (p.sign) {
2669             s->float_exception_flags = orig_flags | float_flag_invalid;
2670             return 0;
2671         }
2672 
2673         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2674             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2675         } else {
2676             s->float_exception_flags = orig_flags | float_flag_invalid;
2677             return max;
2678         }
2679 
2680         /* For uint64 this will never trip, but if p.exp is too large
2681          * to shift a decomposed fraction we shall have exited via the
2682          * 3rd leg above.
2683          */
2684         if (r > max) {
2685             s->float_exception_flags = orig_flags | float_flag_invalid;
2686             return max;
2687         }
2688         return r;
2689     default:
2690         g_assert_not_reached();
2691     }
2692 }
2693 
2694 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2695                                 float_status *s)
2696 {
2697     FloatParts64 p;
2698 
2699     float16_unpack_canonical(&p, a, s);
2700     return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
2701 }
2702 
2703 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2704                                   float_status *s)
2705 {
2706     FloatParts64 p;
2707 
2708     float16_unpack_canonical(&p, a, s);
2709     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2710 }
2711 
2712 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2713                                   float_status *s)
2714 {
2715     FloatParts64 p;
2716 
2717     float16_unpack_canonical(&p, a, s);
2718     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2719 }
2720 
2721 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2722                                   float_status *s)
2723 {
2724     FloatParts64 p;
2725 
2726     float16_unpack_canonical(&p, a, s);
2727     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2728 }
2729 
2730 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2731                                   float_status *s)
2732 {
2733     FloatParts64 p;
2734 
2735     float32_unpack_canonical(&p, a, s);
2736     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2737 }
2738 
2739 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2740                                   float_status *s)
2741 {
2742     FloatParts64 p;
2743 
2744     float32_unpack_canonical(&p, a, s);
2745     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2746 }
2747 
2748 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2749                                   float_status *s)
2750 {
2751     FloatParts64 p;
2752 
2753     float32_unpack_canonical(&p, a, s);
2754     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2755 }
2756 
2757 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2758                                   float_status *s)
2759 {
2760     FloatParts64 p;
2761 
2762     float64_unpack_canonical(&p, a, s);
2763     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2764 }
2765 
2766 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2767                                   float_status *s)
2768 {
2769     FloatParts64 p;
2770 
2771     float64_unpack_canonical(&p, a, s);
2772     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2773 }
2774 
2775 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2776                                   float_status *s)
2777 {
2778     FloatParts64 p;
2779 
2780     float64_unpack_canonical(&p, a, s);
2781     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2782 }
2783 
2784 uint8_t float16_to_uint8(float16 a, float_status *s)
2785 {
2786     return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2787 }
2788 
2789 uint16_t float16_to_uint16(float16 a, float_status *s)
2790 {
2791     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2792 }
2793 
2794 uint32_t float16_to_uint32(float16 a, float_status *s)
2795 {
2796     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2797 }
2798 
2799 uint64_t float16_to_uint64(float16 a, float_status *s)
2800 {
2801     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2802 }
2803 
2804 uint16_t float32_to_uint16(float32 a, float_status *s)
2805 {
2806     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2807 }
2808 
2809 uint32_t float32_to_uint32(float32 a, float_status *s)
2810 {
2811     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2812 }
2813 
2814 uint64_t float32_to_uint64(float32 a, float_status *s)
2815 {
2816     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2817 }
2818 
2819 uint16_t float64_to_uint16(float64 a, float_status *s)
2820 {
2821     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2822 }
2823 
2824 uint32_t float64_to_uint32(float64 a, float_status *s)
2825 {
2826     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2827 }
2828 
2829 uint64_t float64_to_uint64(float64 a, float_status *s)
2830 {
2831     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2832 }
2833 
2834 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2835 {
2836     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2837 }
2838 
2839 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2840 {
2841     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2842 }
2843 
2844 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2845 {
2846     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2847 }
2848 
2849 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2850 {
2851     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2852 }
2853 
2854 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2855 {
2856     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2857 }
2858 
2859 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2860 {
2861     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2862 }
2863 
2864 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2865 {
2866     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2867 }
2868 
2869 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2870 {
2871     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2872 }
2873 
2874 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2875 {
2876     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2877 }
2878 
2879 /*
2880  *  Returns the result of converting the bfloat16 value `a' to
2881  *  the unsigned integer format.
2882  */
2883 
2884 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2885                                    int scale, float_status *s)
2886 {
2887     FloatParts64 p;
2888 
2889     bfloat16_unpack_canonical(&p, a, s);
2890     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2891 }
2892 
2893 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2894                                    int scale, float_status *s)
2895 {
2896     FloatParts64 p;
2897 
2898     bfloat16_unpack_canonical(&p, a, s);
2899     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2900 }
2901 
2902 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2903                                    int scale, float_status *s)
2904 {
2905     FloatParts64 p;
2906 
2907     bfloat16_unpack_canonical(&p, a, s);
2908     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2909 }
2910 
2911 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2912 {
2913     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2914 }
2915 
2916 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2917 {
2918     return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2919 }
2920 
2921 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2922 {
2923     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2924 }
2925 
2926 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2927 {
2928     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2929 }
2930 
2931 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2932 {
2933     return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2934 }
2935 
2936 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2937 {
2938     return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2939 }
2940 
2941 /*
2942  * Integer to float conversions
2943  *
2944  * Returns the result of converting the two's complement integer `a'
2945  * to the floating-point format. The conversion is performed according
2946  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2947  */
2948 
2949 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2950 {
2951     FloatParts64 r = { .sign = false };
2952 
2953     if (a == 0) {
2954         r.cls = float_class_zero;
2955     } else {
2956         uint64_t f = a;
2957         int shift;
2958 
2959         r.cls = float_class_normal;
2960         if (a < 0) {
2961             f = -f;
2962             r.sign = true;
2963         }
2964         shift = clz64(f);
2965         scale = MIN(MAX(scale, -0x10000), 0x10000);
2966 
2967         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2968         r.frac = f << shift;
2969     }
2970 
2971     return r;
2972 }
2973 
2974 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2975 {
2976     FloatParts64 pa = int_to_float(a, scale, status);
2977     return float16_round_pack_canonical(&pa, status);
2978 }
2979 
2980 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2981 {
2982     return int64_to_float16_scalbn(a, scale, status);
2983 }
2984 
2985 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2986 {
2987     return int64_to_float16_scalbn(a, scale, status);
2988 }
2989 
2990 float16 int64_to_float16(int64_t a, float_status *status)
2991 {
2992     return int64_to_float16_scalbn(a, 0, status);
2993 }
2994 
2995 float16 int32_to_float16(int32_t a, float_status *status)
2996 {
2997     return int64_to_float16_scalbn(a, 0, status);
2998 }
2999 
3000 float16 int16_to_float16(int16_t a, float_status *status)
3001 {
3002     return int64_to_float16_scalbn(a, 0, status);
3003 }
3004 
3005 float16 int8_to_float16(int8_t a, float_status *status)
3006 {
3007     return int64_to_float16_scalbn(a, 0, status);
3008 }
3009 
3010 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
3011 {
3012     FloatParts64 pa = int_to_float(a, scale, status);
3013     return float32_round_pack_canonical(&pa, status);
3014 }
3015 
3016 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
3017 {
3018     return int64_to_float32_scalbn(a, scale, status);
3019 }
3020 
3021 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
3022 {
3023     return int64_to_float32_scalbn(a, scale, status);
3024 }
3025 
3026 float32 int64_to_float32(int64_t a, float_status *status)
3027 {
3028     return int64_to_float32_scalbn(a, 0, status);
3029 }
3030 
3031 float32 int32_to_float32(int32_t a, float_status *status)
3032 {
3033     return int64_to_float32_scalbn(a, 0, status);
3034 }
3035 
3036 float32 int16_to_float32(int16_t a, float_status *status)
3037 {
3038     return int64_to_float32_scalbn(a, 0, status);
3039 }
3040 
3041 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
3042 {
3043     FloatParts64 pa = int_to_float(a, scale, status);
3044     return float64_round_pack_canonical(&pa, status);
3045 }
3046 
3047 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
3048 {
3049     return int64_to_float64_scalbn(a, scale, status);
3050 }
3051 
3052 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
3053 {
3054     return int64_to_float64_scalbn(a, scale, status);
3055 }
3056 
3057 float64 int64_to_float64(int64_t a, float_status *status)
3058 {
3059     return int64_to_float64_scalbn(a, 0, status);
3060 }
3061 
3062 float64 int32_to_float64(int32_t a, float_status *status)
3063 {
3064     return int64_to_float64_scalbn(a, 0, status);
3065 }
3066 
3067 float64 int16_to_float64(int16_t a, float_status *status)
3068 {
3069     return int64_to_float64_scalbn(a, 0, status);
3070 }
3071 
3072 /*
3073  * Returns the result of converting the two's complement integer `a'
3074  * to the bfloat16 format.
3075  */
3076 
3077 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
3078 {
3079     FloatParts64 pa = int_to_float(a, scale, status);
3080     return bfloat16_round_pack_canonical(&pa, status);
3081 }
3082 
3083 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
3084 {
3085     return int64_to_bfloat16_scalbn(a, scale, status);
3086 }
3087 
3088 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
3089 {
3090     return int64_to_bfloat16_scalbn(a, scale, status);
3091 }
3092 
3093 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
3094 {
3095     return int64_to_bfloat16_scalbn(a, 0, status);
3096 }
3097 
3098 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
3099 {
3100     return int64_to_bfloat16_scalbn(a, 0, status);
3101 }
3102 
3103 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
3104 {
3105     return int64_to_bfloat16_scalbn(a, 0, status);
3106 }
3107 
3108 /*
3109  * Unsigned Integer to float conversions
3110  *
3111  * Returns the result of converting the unsigned integer `a' to the
3112  * floating-point format. The conversion is performed according to the
3113  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3114  */
3115 
3116 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3117 {
3118     FloatParts64 r = { .sign = false };
3119     int shift;
3120 
3121     if (a == 0) {
3122         r.cls = float_class_zero;
3123     } else {
3124         scale = MIN(MAX(scale, -0x10000), 0x10000);
3125         shift = clz64(a);
3126         r.cls = float_class_normal;
3127         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3128         r.frac = a << shift;
3129     }
3130 
3131     return r;
3132 }
3133 
3134 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
3135 {
3136     FloatParts64 pa = uint_to_float(a, scale, status);
3137     return float16_round_pack_canonical(&pa, status);
3138 }
3139 
3140 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
3141 {
3142     return uint64_to_float16_scalbn(a, scale, status);
3143 }
3144 
3145 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
3146 {
3147     return uint64_to_float16_scalbn(a, scale, status);
3148 }
3149 
3150 float16 uint64_to_float16(uint64_t a, float_status *status)
3151 {
3152     return uint64_to_float16_scalbn(a, 0, status);
3153 }
3154 
3155 float16 uint32_to_float16(uint32_t a, float_status *status)
3156 {
3157     return uint64_to_float16_scalbn(a, 0, status);
3158 }
3159 
3160 float16 uint16_to_float16(uint16_t a, float_status *status)
3161 {
3162     return uint64_to_float16_scalbn(a, 0, status);
3163 }
3164 
3165 float16 uint8_to_float16(uint8_t a, float_status *status)
3166 {
3167     return uint64_to_float16_scalbn(a, 0, status);
3168 }
3169 
3170 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
3171 {
3172     FloatParts64 pa = uint_to_float(a, scale, status);
3173     return float32_round_pack_canonical(&pa, status);
3174 }
3175 
3176 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
3177 {
3178     return uint64_to_float32_scalbn(a, scale, status);
3179 }
3180 
3181 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
3182 {
3183     return uint64_to_float32_scalbn(a, scale, status);
3184 }
3185 
3186 float32 uint64_to_float32(uint64_t a, float_status *status)
3187 {
3188     return uint64_to_float32_scalbn(a, 0, status);
3189 }
3190 
3191 float32 uint32_to_float32(uint32_t a, float_status *status)
3192 {
3193     return uint64_to_float32_scalbn(a, 0, status);
3194 }
3195 
3196 float32 uint16_to_float32(uint16_t a, float_status *status)
3197 {
3198     return uint64_to_float32_scalbn(a, 0, status);
3199 }
3200 
3201 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
3202 {
3203     FloatParts64 pa = uint_to_float(a, scale, status);
3204     return float64_round_pack_canonical(&pa, status);
3205 }
3206 
3207 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
3208 {
3209     return uint64_to_float64_scalbn(a, scale, status);
3210 }
3211 
3212 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
3213 {
3214     return uint64_to_float64_scalbn(a, scale, status);
3215 }
3216 
3217 float64 uint64_to_float64(uint64_t a, float_status *status)
3218 {
3219     return uint64_to_float64_scalbn(a, 0, status);
3220 }
3221 
3222 float64 uint32_to_float64(uint32_t a, float_status *status)
3223 {
3224     return uint64_to_float64_scalbn(a, 0, status);
3225 }
3226 
3227 float64 uint16_to_float64(uint16_t a, float_status *status)
3228 {
3229     return uint64_to_float64_scalbn(a, 0, status);
3230 }
3231 
3232 /*
3233  * Returns the result of converting the unsigned integer `a' to the
3234  * bfloat16 format.
3235  */
3236 
3237 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3238 {
3239     FloatParts64 pa = uint_to_float(a, scale, status);
3240     return bfloat16_round_pack_canonical(&pa, status);
3241 }
3242 
3243 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3244 {
3245     return uint64_to_bfloat16_scalbn(a, scale, status);
3246 }
3247 
3248 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3249 {
3250     return uint64_to_bfloat16_scalbn(a, scale, status);
3251 }
3252 
3253 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3254 {
3255     return uint64_to_bfloat16_scalbn(a, 0, status);
3256 }
3257 
3258 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3259 {
3260     return uint64_to_bfloat16_scalbn(a, 0, status);
3261 }
3262 
3263 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3264 {
3265     return uint64_to_bfloat16_scalbn(a, 0, status);
3266 }
3267 
3268 /* Float Min/Max */
3269 /* min() and max() functions. These can't be implemented as
3270  * 'compare and pick one input' because that would mishandle
3271  * NaNs and +0 vs -0.
3272  *
3273  * minnum() and maxnum() functions. These are similar to the min()
3274  * and max() functions but if one of the arguments is a QNaN and
3275  * the other is numerical then the numerical argument is returned.
3276  * SNaNs will get quietened before being returned.
3277  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3278  * and maxNum() operations. min() and max() are the typical min/max
3279  * semantics provided by many CPUs which predate that specification.
3280  *
3281  * minnummag() and maxnummag() functions correspond to minNumMag()
3282  * and minNumMag() from the IEEE-754 2008.
3283  */
3284 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
3285                                 bool ieee, bool ismag, float_status *s)
3286 {
3287     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3288         if (ieee) {
3289             /* Takes two floating-point values `a' and `b', one of
3290              * which is a NaN, and returns the appropriate NaN
3291              * result. If either `a' or `b' is a signaling NaN,
3292              * the invalid exception is raised.
3293              */
3294             if (is_snan(a.cls) || is_snan(b.cls)) {
3295                 return pick_nan(a, b, s);
3296             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3297                 return b;
3298             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3299                 return a;
3300             }
3301         }
3302         return pick_nan(a, b, s);
3303     } else {
3304         int a_exp, b_exp;
3305 
3306         switch (a.cls) {
3307         case float_class_normal:
3308             a_exp = a.exp;
3309             break;
3310         case float_class_inf:
3311             a_exp = INT_MAX;
3312             break;
3313         case float_class_zero:
3314             a_exp = INT_MIN;
3315             break;
3316         default:
3317             g_assert_not_reached();
3318             break;
3319         }
3320         switch (b.cls) {
3321         case float_class_normal:
3322             b_exp = b.exp;
3323             break;
3324         case float_class_inf:
3325             b_exp = INT_MAX;
3326             break;
3327         case float_class_zero:
3328             b_exp = INT_MIN;
3329             break;
3330         default:
3331             g_assert_not_reached();
3332             break;
3333         }
3334 
3335         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3336             bool a_less = a_exp < b_exp;
3337             if (a_exp == b_exp) {
3338                 a_less = a.frac < b.frac;
3339             }
3340             return a_less ^ ismin ? b : a;
3341         }
3342 
3343         if (a.sign == b.sign) {
3344             bool a_less = a_exp < b_exp;
3345             if (a_exp == b_exp) {
3346                 a_less = a.frac < b.frac;
3347             }
3348             return a.sign ^ a_less ^ ismin ? b : a;
3349         } else {
3350             return a.sign ^ ismin ? b : a;
3351         }
3352     }
3353 }
3354 
3355 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3356 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3357                                      float_status *s)                   \
3358 {                                                                       \
3359     FloatParts64 pa, pb, pr;                                            \
3360     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3361     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3362     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3363     return float ## sz ## _round_pack_canonical(&pr, s);                \
3364 }
3365 
3366 MINMAX(16, min, true, false, false)
3367 MINMAX(16, minnum, true, true, false)
3368 MINMAX(16, minnummag, true, true, true)
3369 MINMAX(16, max, false, false, false)
3370 MINMAX(16, maxnum, false, true, false)
3371 MINMAX(16, maxnummag, false, true, true)
3372 
3373 MINMAX(32, min, true, false, false)
3374 MINMAX(32, minnum, true, true, false)
3375 MINMAX(32, minnummag, true, true, true)
3376 MINMAX(32, max, false, false, false)
3377 MINMAX(32, maxnum, false, true, false)
3378 MINMAX(32, maxnummag, false, true, true)
3379 
3380 MINMAX(64, min, true, false, false)
3381 MINMAX(64, minnum, true, true, false)
3382 MINMAX(64, minnummag, true, true, true)
3383 MINMAX(64, max, false, false, false)
3384 MINMAX(64, maxnum, false, true, false)
3385 MINMAX(64, maxnummag, false, true, true)
3386 
3387 #undef MINMAX
3388 
3389 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3390 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3391 {                                                                       \
3392     FloatParts64 pa, pb, pr;                                            \
3393     bfloat16_unpack_canonical(&pa, a, s);                               \
3394     bfloat16_unpack_canonical(&pb, b, s);                               \
3395     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3396     return bfloat16_round_pack_canonical(&pr, s);                       \
3397 }
3398 
3399 BF16_MINMAX(min, true, false, false)
3400 BF16_MINMAX(minnum, true, true, false)
3401 BF16_MINMAX(minnummag, true, true, true)
3402 BF16_MINMAX(max, false, false, false)
3403 BF16_MINMAX(maxnum, false, true, false)
3404 BF16_MINMAX(maxnummag, false, true, true)
3405 
3406 #undef BF16_MINMAX
3407 
3408 /* Floating point compare */
3409 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
3410                                     float_status *s)
3411 {
3412     if (is_nan(a.cls) || is_nan(b.cls)) {
3413         if (!is_quiet ||
3414             a.cls == float_class_snan ||
3415             b.cls == float_class_snan) {
3416             float_raise(float_flag_invalid, s);
3417         }
3418         return float_relation_unordered;
3419     }
3420 
3421     if (a.cls == float_class_zero) {
3422         if (b.cls == float_class_zero) {
3423             return float_relation_equal;
3424         }
3425         return b.sign ? float_relation_greater : float_relation_less;
3426     } else if (b.cls == float_class_zero) {
3427         return a.sign ? float_relation_less : float_relation_greater;
3428     }
3429 
3430     /* The only really important thing about infinity is its sign. If
3431      * both are infinities the sign marks the smallest of the two.
3432      */
3433     if (a.cls == float_class_inf) {
3434         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3435             return float_relation_equal;
3436         }
3437         return a.sign ? float_relation_less : float_relation_greater;
3438     } else if (b.cls == float_class_inf) {
3439         return b.sign ? float_relation_greater : float_relation_less;
3440     }
3441 
3442     if (a.sign != b.sign) {
3443         return a.sign ? float_relation_less : float_relation_greater;
3444     }
3445 
3446     if (a.exp == b.exp) {
3447         if (a.frac == b.frac) {
3448             return float_relation_equal;
3449         }
3450         if (a.sign) {
3451             return a.frac > b.frac ?
3452                 float_relation_less : float_relation_greater;
3453         } else {
3454             return a.frac > b.frac ?
3455                 float_relation_greater : float_relation_less;
3456         }
3457     } else {
3458         if (a.sign) {
3459             return a.exp > b.exp ? float_relation_less : float_relation_greater;
3460         } else {
3461             return a.exp > b.exp ? float_relation_greater : float_relation_less;
3462         }
3463     }
3464 }
3465 
3466 #define COMPARE(name, attr, sz)                                         \
3467 static int attr                                                         \
3468 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
3469 {                                                                       \
3470     FloatParts64 pa, pb;                                                \
3471     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3472     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3473     return compare_floats(pa, pb, is_quiet, s);                         \
3474 }
3475 
3476 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3477 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3478 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3479 
3480 #undef COMPARE
3481 
3482 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3483 {
3484     return soft_f16_compare(a, b, false, s);
3485 }
3486 
3487 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3488 {
3489     return soft_f16_compare(a, b, true, s);
3490 }
3491 
3492 static FloatRelation QEMU_FLATTEN
3493 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3494 {
3495     union_float32 ua, ub;
3496 
3497     ua.s = xa;
3498     ub.s = xb;
3499 
3500     if (QEMU_NO_HARDFLOAT) {
3501         goto soft;
3502     }
3503 
3504     float32_input_flush2(&ua.s, &ub.s, s);
3505     if (isgreaterequal(ua.h, ub.h)) {
3506         if (isgreater(ua.h, ub.h)) {
3507             return float_relation_greater;
3508         }
3509         return float_relation_equal;
3510     }
3511     if (likely(isless(ua.h, ub.h))) {
3512         return float_relation_less;
3513     }
3514     /* The only condition remaining is unordered.
3515      * Fall through to set flags.
3516      */
3517  soft:
3518     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3519 }
3520 
3521 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3522 {
3523     return f32_compare(a, b, false, s);
3524 }
3525 
3526 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3527 {
3528     return f32_compare(a, b, true, s);
3529 }
3530 
3531 static FloatRelation QEMU_FLATTEN
3532 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3533 {
3534     union_float64 ua, ub;
3535 
3536     ua.s = xa;
3537     ub.s = xb;
3538 
3539     if (QEMU_NO_HARDFLOAT) {
3540         goto soft;
3541     }
3542 
3543     float64_input_flush2(&ua.s, &ub.s, s);
3544     if (isgreaterequal(ua.h, ub.h)) {
3545         if (isgreater(ua.h, ub.h)) {
3546             return float_relation_greater;
3547         }
3548         return float_relation_equal;
3549     }
3550     if (likely(isless(ua.h, ub.h))) {
3551         return float_relation_less;
3552     }
3553     /* The only condition remaining is unordered.
3554      * Fall through to set flags.
3555      */
3556  soft:
3557     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3558 }
3559 
3560 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3561 {
3562     return f64_compare(a, b, false, s);
3563 }
3564 
3565 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3566 {
3567     return f64_compare(a, b, true, s);
3568 }
3569 
3570 static FloatRelation QEMU_FLATTEN
3571 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3572 {
3573     FloatParts64 pa, pb;
3574 
3575     bfloat16_unpack_canonical(&pa, a, s);
3576     bfloat16_unpack_canonical(&pb, b, s);
3577     return compare_floats(pa, pb, is_quiet, s);
3578 }
3579 
3580 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3581 {
3582     return soft_bf16_compare(a, b, false, s);
3583 }
3584 
3585 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3586 {
3587     return soft_bf16_compare(a, b, true, s);
3588 }
3589 
3590 /* Multiply A by 2 raised to the power N.  */
3591 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3592 {
3593     if (unlikely(is_nan(a.cls))) {
3594         return return_nan(a, s);
3595     }
3596     if (a.cls == float_class_normal) {
3597         /* The largest float type (even though not supported by FloatParts64)
3598          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3599          * still allows rounding to infinity, without allowing overflow
3600          * within the int32_t that backs FloatParts64.exp.
3601          */
3602         n = MIN(MAX(n, -0x10000), 0x10000);
3603         a.exp += n;
3604     }
3605     return a;
3606 }
3607 
3608 float16 float16_scalbn(float16 a, int n, float_status *status)
3609 {
3610     FloatParts64 pa, pr;
3611 
3612     float16_unpack_canonical(&pa, a, status);
3613     pr = scalbn_decomposed(pa, n, status);
3614     return float16_round_pack_canonical(&pr, status);
3615 }
3616 
3617 float32 float32_scalbn(float32 a, int n, float_status *status)
3618 {
3619     FloatParts64 pa, pr;
3620 
3621     float32_unpack_canonical(&pa, a, status);
3622     pr = scalbn_decomposed(pa, n, status);
3623     return float32_round_pack_canonical(&pr, status);
3624 }
3625 
3626 float64 float64_scalbn(float64 a, int n, float_status *status)
3627 {
3628     FloatParts64 pa, pr;
3629 
3630     float64_unpack_canonical(&pa, a, status);
3631     pr = scalbn_decomposed(pa, n, status);
3632     return float64_round_pack_canonical(&pr, status);
3633 }
3634 
3635 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3636 {
3637     FloatParts64 pa, pr;
3638 
3639     bfloat16_unpack_canonical(&pa, a, status);
3640     pr = scalbn_decomposed(pa, n, status);
3641     return bfloat16_round_pack_canonical(&pr, status);
3642 }
3643 
3644 /*
3645  * Square Root
3646  *
3647  * The old softfloat code did an approximation step before zeroing in
3648  * on the final result. However for simpleness we just compute the
3649  * square root by iterating down from the implicit bit to enough extra
3650  * bits to ensure we get a correctly rounded result.
3651  *
3652  * This does mean however the calculation is slower than before,
3653  * especially for 64 bit floats.
3654  */
3655 
3656 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
3657 {
3658     uint64_t a_frac, r_frac, s_frac;
3659     int bit, last_bit;
3660 
3661     if (is_nan(a.cls)) {
3662         return return_nan(a, s);
3663     }
3664     if (a.cls == float_class_zero) {
3665         return a;  /* sqrt(+-0) = +-0 */
3666     }
3667     if (a.sign) {
3668         float_raise(float_flag_invalid, s);
3669         parts_default_nan(&a, s);
3670         return a;
3671     }
3672     if (a.cls == float_class_inf) {
3673         return a;  /* sqrt(+inf) = +inf */
3674     }
3675 
3676     assert(a.cls == float_class_normal);
3677 
3678     /* We need two overflow bits at the top. Adding room for that is a
3679      * right shift. If the exponent is odd, we can discard the low bit
3680      * by multiplying the fraction by 2; that's a left shift. Combine
3681      * those and we shift right by 1 if the exponent is odd, otherwise 2.
3682      */
3683     a_frac = a.frac >> (2 - (a.exp & 1));
3684     a.exp >>= 1;
3685 
3686     /* Bit-by-bit computation of sqrt.  */
3687     r_frac = 0;
3688     s_frac = 0;
3689 
3690     /* Iterate from implicit bit down to the 3 extra bits to compute a
3691      * properly rounded result. Remember we've inserted two more bits
3692      * at the top, so these positions are two less.
3693      */
3694     bit = DECOMPOSED_BINARY_POINT - 2;
3695     last_bit = MAX(p->frac_shift - 4, 0);
3696     do {
3697         uint64_t q = 1ULL << bit;
3698         uint64_t t_frac = s_frac + q;
3699         if (t_frac <= a_frac) {
3700             s_frac = t_frac + q;
3701             a_frac -= t_frac;
3702             r_frac += q;
3703         }
3704         a_frac <<= 1;
3705     } while (--bit >= last_bit);
3706 
3707     /* Undo the right shift done above. If there is any remaining
3708      * fraction, the result is inexact. Set the sticky bit.
3709      */
3710     a.frac = (r_frac << 2) + (a_frac != 0);
3711 
3712     return a;
3713 }
3714 
3715 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3716 {
3717     FloatParts64 pa, pr;
3718 
3719     float16_unpack_canonical(&pa, a, status);
3720     pr = sqrt_float(pa, status, &float16_params);
3721     return float16_round_pack_canonical(&pr, status);
3722 }
3723 
3724 static float32 QEMU_SOFTFLOAT_ATTR
3725 soft_f32_sqrt(float32 a, float_status *status)
3726 {
3727     FloatParts64 pa, pr;
3728 
3729     float32_unpack_canonical(&pa, a, status);
3730     pr = sqrt_float(pa, status, &float32_params);
3731     return float32_round_pack_canonical(&pr, status);
3732 }
3733 
3734 static float64 QEMU_SOFTFLOAT_ATTR
3735 soft_f64_sqrt(float64 a, float_status *status)
3736 {
3737     FloatParts64 pa, pr;
3738 
3739     float64_unpack_canonical(&pa, a, status);
3740     pr = sqrt_float(pa, status, &float64_params);
3741     return float64_round_pack_canonical(&pr, status);
3742 }
3743 
3744 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3745 {
3746     union_float32 ua, ur;
3747 
3748     ua.s = xa;
3749     if (unlikely(!can_use_fpu(s))) {
3750         goto soft;
3751     }
3752 
3753     float32_input_flush1(&ua.s, s);
3754     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3755         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3756                        fpclassify(ua.h) == FP_ZERO) ||
3757                      signbit(ua.h))) {
3758             goto soft;
3759         }
3760     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3761                         float32_is_neg(ua.s))) {
3762         goto soft;
3763     }
3764     ur.h = sqrtf(ua.h);
3765     return ur.s;
3766 
3767  soft:
3768     return soft_f32_sqrt(ua.s, s);
3769 }
3770 
3771 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3772 {
3773     union_float64 ua, ur;
3774 
3775     ua.s = xa;
3776     if (unlikely(!can_use_fpu(s))) {
3777         goto soft;
3778     }
3779 
3780     float64_input_flush1(&ua.s, s);
3781     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3782         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3783                        fpclassify(ua.h) == FP_ZERO) ||
3784                      signbit(ua.h))) {
3785             goto soft;
3786         }
3787     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3788                         float64_is_neg(ua.s))) {
3789         goto soft;
3790     }
3791     ur.h = sqrt(ua.h);
3792     return ur.s;
3793 
3794  soft:
3795     return soft_f64_sqrt(ua.s, s);
3796 }
3797 
3798 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3799 {
3800     FloatParts64 pa, pr;
3801 
3802     bfloat16_unpack_canonical(&pa, a, status);
3803     pr = sqrt_float(pa, status, &bfloat16_params);
3804     return bfloat16_round_pack_canonical(&pr, status);
3805 }
3806 
3807 /*----------------------------------------------------------------------------
3808 | The pattern for a default generated NaN.
3809 *----------------------------------------------------------------------------*/
3810 
3811 float16 float16_default_nan(float_status *status)
3812 {
3813     FloatParts64 p;
3814 
3815     parts_default_nan(&p, status);
3816     p.frac >>= float16_params.frac_shift;
3817     return float16_pack_raw(&p);
3818 }
3819 
3820 float32 float32_default_nan(float_status *status)
3821 {
3822     FloatParts64 p;
3823 
3824     parts_default_nan(&p, status);
3825     p.frac >>= float32_params.frac_shift;
3826     return float32_pack_raw(&p);
3827 }
3828 
3829 float64 float64_default_nan(float_status *status)
3830 {
3831     FloatParts64 p;
3832 
3833     parts_default_nan(&p, status);
3834     p.frac >>= float64_params.frac_shift;
3835     return float64_pack_raw(&p);
3836 }
3837 
3838 float128 float128_default_nan(float_status *status)
3839 {
3840     FloatParts128 p;
3841 
3842     parts_default_nan(&p, status);
3843     frac_shr(&p, float128_params.frac_shift);
3844     return float128_pack_raw(&p);
3845 }
3846 
3847 bfloat16 bfloat16_default_nan(float_status *status)
3848 {
3849     FloatParts64 p;
3850 
3851     parts_default_nan(&p, status);
3852     p.frac >>= bfloat16_params.frac_shift;
3853     return bfloat16_pack_raw(&p);
3854 }
3855 
3856 /*----------------------------------------------------------------------------
3857 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3858 *----------------------------------------------------------------------------*/
3859 
3860 float16 float16_silence_nan(float16 a, float_status *status)
3861 {
3862     FloatParts64 p;
3863 
3864     float16_unpack_raw(&p, a);
3865     p.frac <<= float16_params.frac_shift;
3866     parts_silence_nan(&p, status);
3867     p.frac >>= float16_params.frac_shift;
3868     return float16_pack_raw(&p);
3869 }
3870 
3871 float32 float32_silence_nan(float32 a, float_status *status)
3872 {
3873     FloatParts64 p;
3874 
3875     float32_unpack_raw(&p, a);
3876     p.frac <<= float32_params.frac_shift;
3877     parts_silence_nan(&p, status);
3878     p.frac >>= float32_params.frac_shift;
3879     return float32_pack_raw(&p);
3880 }
3881 
3882 float64 float64_silence_nan(float64 a, float_status *status)
3883 {
3884     FloatParts64 p;
3885 
3886     float64_unpack_raw(&p, a);
3887     p.frac <<= float64_params.frac_shift;
3888     parts_silence_nan(&p, status);
3889     p.frac >>= float64_params.frac_shift;
3890     return float64_pack_raw(&p);
3891 }
3892 
3893 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3894 {
3895     FloatParts64 p;
3896 
3897     bfloat16_unpack_raw(&p, a);
3898     p.frac <<= bfloat16_params.frac_shift;
3899     parts_silence_nan(&p, status);
3900     p.frac >>= bfloat16_params.frac_shift;
3901     return bfloat16_pack_raw(&p);
3902 }
3903 
3904 float128 float128_silence_nan(float128 a, float_status *status)
3905 {
3906     FloatParts128 p;
3907 
3908     float128_unpack_raw(&p, a);
3909     frac_shl(&p, float128_params.frac_shift);
3910     parts_silence_nan(&p, status);
3911     frac_shr(&p, float128_params.frac_shift);
3912     return float128_pack_raw(&p);
3913 }
3914 
3915 /*----------------------------------------------------------------------------
3916 | If `a' is denormal and we are in flush-to-zero mode then set the
3917 | input-denormal exception and return zero. Otherwise just return the value.
3918 *----------------------------------------------------------------------------*/
3919 
3920 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3921 {
3922     if (p.exp == 0 && p.frac != 0) {
3923         float_raise(float_flag_input_denormal, status);
3924         return true;
3925     }
3926 
3927     return false;
3928 }
3929 
3930 float16 float16_squash_input_denormal(float16 a, float_status *status)
3931 {
3932     if (status->flush_inputs_to_zero) {
3933         FloatParts64 p;
3934 
3935         float16_unpack_raw(&p, a);
3936         if (parts_squash_denormal(p, status)) {
3937             return float16_set_sign(float16_zero, p.sign);
3938         }
3939     }
3940     return a;
3941 }
3942 
3943 float32 float32_squash_input_denormal(float32 a, float_status *status)
3944 {
3945     if (status->flush_inputs_to_zero) {
3946         FloatParts64 p;
3947 
3948         float32_unpack_raw(&p, a);
3949         if (parts_squash_denormal(p, status)) {
3950             return float32_set_sign(float32_zero, p.sign);
3951         }
3952     }
3953     return a;
3954 }
3955 
3956 float64 float64_squash_input_denormal(float64 a, float_status *status)
3957 {
3958     if (status->flush_inputs_to_zero) {
3959         FloatParts64 p;
3960 
3961         float64_unpack_raw(&p, a);
3962         if (parts_squash_denormal(p, status)) {
3963             return float64_set_sign(float64_zero, p.sign);
3964         }
3965     }
3966     return a;
3967 }
3968 
3969 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3970 {
3971     if (status->flush_inputs_to_zero) {
3972         FloatParts64 p;
3973 
3974         bfloat16_unpack_raw(&p, a);
3975         if (parts_squash_denormal(p, status)) {
3976             return bfloat16_set_sign(bfloat16_zero, p.sign);
3977         }
3978     }
3979     return a;
3980 }
3981 
3982 /*----------------------------------------------------------------------------
3983 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3984 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3985 | input.  If `zSign' is 1, the input is negated before being converted to an
3986 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3987 | is simply rounded to an integer, with the inexact exception raised if the
3988 | input cannot be represented exactly as an integer.  However, if the fixed-
3989 | point input is too large, the invalid exception is raised and the largest
3990 | positive or negative integer is returned.
3991 *----------------------------------------------------------------------------*/
3992 
3993 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3994                                  float_status *status)
3995 {
3996     int8_t roundingMode;
3997     bool roundNearestEven;
3998     int8_t roundIncrement, roundBits;
3999     int32_t z;
4000 
4001     roundingMode = status->float_rounding_mode;
4002     roundNearestEven = ( roundingMode == float_round_nearest_even );
4003     switch (roundingMode) {
4004     case float_round_nearest_even:
4005     case float_round_ties_away:
4006         roundIncrement = 0x40;
4007         break;
4008     case float_round_to_zero:
4009         roundIncrement = 0;
4010         break;
4011     case float_round_up:
4012         roundIncrement = zSign ? 0 : 0x7f;
4013         break;
4014     case float_round_down:
4015         roundIncrement = zSign ? 0x7f : 0;
4016         break;
4017     case float_round_to_odd:
4018         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
4019         break;
4020     default:
4021         abort();
4022     }
4023     roundBits = absZ & 0x7F;
4024     absZ = ( absZ + roundIncrement )>>7;
4025     if (!(roundBits ^ 0x40) && roundNearestEven) {
4026         absZ &= ~1;
4027     }
4028     z = absZ;
4029     if ( zSign ) z = - z;
4030     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
4031         float_raise(float_flag_invalid, status);
4032         return zSign ? INT32_MIN : INT32_MAX;
4033     }
4034     if (roundBits) {
4035         float_raise(float_flag_inexact, status);
4036     }
4037     return z;
4038 
4039 }
4040 
4041 /*----------------------------------------------------------------------------
4042 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4043 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4044 | and returns the properly rounded 64-bit integer corresponding to the input.
4045 | If `zSign' is 1, the input is negated before being converted to an integer.
4046 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4047 | the inexact exception raised if the input cannot be represented exactly as
4048 | an integer.  However, if the fixed-point input is too large, the invalid
4049 | exception is raised and the largest positive or negative integer is
4050 | returned.
4051 *----------------------------------------------------------------------------*/
4052 
4053 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
4054                                float_status *status)
4055 {
4056     int8_t roundingMode;
4057     bool roundNearestEven, increment;
4058     int64_t z;
4059 
4060     roundingMode = status->float_rounding_mode;
4061     roundNearestEven = ( roundingMode == float_round_nearest_even );
4062     switch (roundingMode) {
4063     case float_round_nearest_even:
4064     case float_round_ties_away:
4065         increment = ((int64_t) absZ1 < 0);
4066         break;
4067     case float_round_to_zero:
4068         increment = 0;
4069         break;
4070     case float_round_up:
4071         increment = !zSign && absZ1;
4072         break;
4073     case float_round_down:
4074         increment = zSign && absZ1;
4075         break;
4076     case float_round_to_odd:
4077         increment = !(absZ0 & 1) && absZ1;
4078         break;
4079     default:
4080         abort();
4081     }
4082     if ( increment ) {
4083         ++absZ0;
4084         if ( absZ0 == 0 ) goto overflow;
4085         if (!(absZ1 << 1) && roundNearestEven) {
4086             absZ0 &= ~1;
4087         }
4088     }
4089     z = absZ0;
4090     if ( zSign ) z = - z;
4091     if ( z && ( ( z < 0 ) ^ zSign ) ) {
4092  overflow:
4093         float_raise(float_flag_invalid, status);
4094         return zSign ? INT64_MIN : INT64_MAX;
4095     }
4096     if (absZ1) {
4097         float_raise(float_flag_inexact, status);
4098     }
4099     return z;
4100 
4101 }
4102 
4103 /*----------------------------------------------------------------------------
4104 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4105 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4106 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4107 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
4108 | with the inexact exception raised if the input cannot be represented exactly
4109 | as an integer.  However, if the fixed-point input is too large, the invalid
4110 | exception is raised and the largest unsigned integer is returned.
4111 *----------------------------------------------------------------------------*/
4112 
4113 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
4114                                 uint64_t absZ1, float_status *status)
4115 {
4116     int8_t roundingMode;
4117     bool roundNearestEven, increment;
4118 
4119     roundingMode = status->float_rounding_mode;
4120     roundNearestEven = (roundingMode == float_round_nearest_even);
4121     switch (roundingMode) {
4122     case float_round_nearest_even:
4123     case float_round_ties_away:
4124         increment = ((int64_t)absZ1 < 0);
4125         break;
4126     case float_round_to_zero:
4127         increment = 0;
4128         break;
4129     case float_round_up:
4130         increment = !zSign && absZ1;
4131         break;
4132     case float_round_down:
4133         increment = zSign && absZ1;
4134         break;
4135     case float_round_to_odd:
4136         increment = !(absZ0 & 1) && absZ1;
4137         break;
4138     default:
4139         abort();
4140     }
4141     if (increment) {
4142         ++absZ0;
4143         if (absZ0 == 0) {
4144             float_raise(float_flag_invalid, status);
4145             return UINT64_MAX;
4146         }
4147         if (!(absZ1 << 1) && roundNearestEven) {
4148             absZ0 &= ~1;
4149         }
4150     }
4151 
4152     if (zSign && absZ0) {
4153         float_raise(float_flag_invalid, status);
4154         return 0;
4155     }
4156 
4157     if (absZ1) {
4158         float_raise(float_flag_inexact, status);
4159     }
4160     return absZ0;
4161 }
4162 
4163 /*----------------------------------------------------------------------------
4164 | Normalizes the subnormal single-precision floating-point value represented
4165 | by the denormalized significand `aSig'.  The normalized exponent and
4166 | significand are stored at the locations pointed to by `zExpPtr' and
4167 | `zSigPtr', respectively.
4168 *----------------------------------------------------------------------------*/
4169 
4170 static void
4171  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
4172 {
4173     int8_t shiftCount;
4174 
4175     shiftCount = clz32(aSig) - 8;
4176     *zSigPtr = aSig<<shiftCount;
4177     *zExpPtr = 1 - shiftCount;
4178 
4179 }
4180 
4181 /*----------------------------------------------------------------------------
4182 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4183 | and significand `zSig', and returns the proper single-precision floating-
4184 | point value corresponding to the abstract input.  Ordinarily, the abstract
4185 | value is simply rounded and packed into the single-precision format, with
4186 | the inexact exception raised if the abstract input cannot be represented
4187 | exactly.  However, if the abstract value is too large, the overflow and
4188 | inexact exceptions are raised and an infinity or maximal finite value is
4189 | returned.  If the abstract value is too small, the input value is rounded to
4190 | a subnormal number, and the underflow and inexact exceptions are raised if
4191 | the abstract input cannot be represented exactly as a subnormal single-
4192 | precision floating-point number.
4193 |     The input significand `zSig' has its binary point between bits 30
4194 | and 29, which is 7 bits to the left of the usual location.  This shifted
4195 | significand must be normalized or smaller.  If `zSig' is not normalized,
4196 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4197 | and it must not require rounding.  In the usual case that `zSig' is
4198 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4199 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4200 | Binary Floating-Point Arithmetic.
4201 *----------------------------------------------------------------------------*/
4202 
4203 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4204                                    float_status *status)
4205 {
4206     int8_t roundingMode;
4207     bool roundNearestEven;
4208     int8_t roundIncrement, roundBits;
4209     bool isTiny;
4210 
4211     roundingMode = status->float_rounding_mode;
4212     roundNearestEven = ( roundingMode == float_round_nearest_even );
4213     switch (roundingMode) {
4214     case float_round_nearest_even:
4215     case float_round_ties_away:
4216         roundIncrement = 0x40;
4217         break;
4218     case float_round_to_zero:
4219         roundIncrement = 0;
4220         break;
4221     case float_round_up:
4222         roundIncrement = zSign ? 0 : 0x7f;
4223         break;
4224     case float_round_down:
4225         roundIncrement = zSign ? 0x7f : 0;
4226         break;
4227     case float_round_to_odd:
4228         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4229         break;
4230     default:
4231         abort();
4232         break;
4233     }
4234     roundBits = zSig & 0x7F;
4235     if ( 0xFD <= (uint16_t) zExp ) {
4236         if (    ( 0xFD < zExp )
4237              || (    ( zExp == 0xFD )
4238                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
4239            ) {
4240             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4241                                    roundIncrement != 0;
4242             float_raise(float_flag_overflow | float_flag_inexact, status);
4243             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
4244         }
4245         if ( zExp < 0 ) {
4246             if (status->flush_to_zero) {
4247                 float_raise(float_flag_output_denormal, status);
4248                 return packFloat32(zSign, 0, 0);
4249             }
4250             isTiny = status->tininess_before_rounding
4251                   || (zExp < -1)
4252                   || (zSig + roundIncrement < 0x80000000);
4253             shift32RightJamming( zSig, - zExp, &zSig );
4254             zExp = 0;
4255             roundBits = zSig & 0x7F;
4256             if (isTiny && roundBits) {
4257                 float_raise(float_flag_underflow, status);
4258             }
4259             if (roundingMode == float_round_to_odd) {
4260                 /*
4261                  * For round-to-odd case, the roundIncrement depends on
4262                  * zSig which just changed.
4263                  */
4264                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4265             }
4266         }
4267     }
4268     if (roundBits) {
4269         float_raise(float_flag_inexact, status);
4270     }
4271     zSig = ( zSig + roundIncrement )>>7;
4272     if (!(roundBits ^ 0x40) && roundNearestEven) {
4273         zSig &= ~1;
4274     }
4275     if ( zSig == 0 ) zExp = 0;
4276     return packFloat32( zSign, zExp, zSig );
4277 
4278 }
4279 
4280 /*----------------------------------------------------------------------------
4281 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4282 | and significand `zSig', and returns the proper single-precision floating-
4283 | point value corresponding to the abstract input.  This routine is just like
4284 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4285 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4286 | floating-point exponent.
4287 *----------------------------------------------------------------------------*/
4288 
4289 static float32
4290  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4291                               float_status *status)
4292 {
4293     int8_t shiftCount;
4294 
4295     shiftCount = clz32(zSig) - 1;
4296     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4297                                status);
4298 
4299 }
4300 
4301 /*----------------------------------------------------------------------------
4302 | Normalizes the subnormal double-precision floating-point value represented
4303 | by the denormalized significand `aSig'.  The normalized exponent and
4304 | significand are stored at the locations pointed to by `zExpPtr' and
4305 | `zSigPtr', respectively.
4306 *----------------------------------------------------------------------------*/
4307 
4308 static void
4309  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4310 {
4311     int8_t shiftCount;
4312 
4313     shiftCount = clz64(aSig) - 11;
4314     *zSigPtr = aSig<<shiftCount;
4315     *zExpPtr = 1 - shiftCount;
4316 
4317 }
4318 
4319 /*----------------------------------------------------------------------------
4320 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4321 | double-precision floating-point value, returning the result.  After being
4322 | shifted into the proper positions, the three fields are simply added
4323 | together to form the result.  This means that any integer portion of `zSig'
4324 | will be added into the exponent.  Since a properly normalized significand
4325 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4326 | than the desired result exponent whenever `zSig' is a complete, normalized
4327 | significand.
4328 *----------------------------------------------------------------------------*/
4329 
4330 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4331 {
4332 
4333     return make_float64(
4334         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4335 
4336 }
4337 
4338 /*----------------------------------------------------------------------------
4339 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4340 | and significand `zSig', and returns the proper double-precision floating-
4341 | point value corresponding to the abstract input.  Ordinarily, the abstract
4342 | value is simply rounded and packed into the double-precision format, with
4343 | the inexact exception raised if the abstract input cannot be represented
4344 | exactly.  However, if the abstract value is too large, the overflow and
4345 | inexact exceptions are raised and an infinity or maximal finite value is
4346 | returned.  If the abstract value is too small, the input value is rounded to
4347 | a subnormal number, and the underflow and inexact exceptions are raised if
4348 | the abstract input cannot be represented exactly as a subnormal double-
4349 | precision floating-point number.
4350 |     The input significand `zSig' has its binary point between bits 62
4351 | and 61, which is 10 bits to the left of the usual location.  This shifted
4352 | significand must be normalized or smaller.  If `zSig' is not normalized,
4353 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4354 | and it must not require rounding.  In the usual case that `zSig' is
4355 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4356 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4357 | Binary Floating-Point Arithmetic.
4358 *----------------------------------------------------------------------------*/
4359 
4360 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4361                                    float_status *status)
4362 {
4363     int8_t roundingMode;
4364     bool roundNearestEven;
4365     int roundIncrement, roundBits;
4366     bool isTiny;
4367 
4368     roundingMode = status->float_rounding_mode;
4369     roundNearestEven = ( roundingMode == float_round_nearest_even );
4370     switch (roundingMode) {
4371     case float_round_nearest_even:
4372     case float_round_ties_away:
4373         roundIncrement = 0x200;
4374         break;
4375     case float_round_to_zero:
4376         roundIncrement = 0;
4377         break;
4378     case float_round_up:
4379         roundIncrement = zSign ? 0 : 0x3ff;
4380         break;
4381     case float_round_down:
4382         roundIncrement = zSign ? 0x3ff : 0;
4383         break;
4384     case float_round_to_odd:
4385         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4386         break;
4387     default:
4388         abort();
4389     }
4390     roundBits = zSig & 0x3FF;
4391     if ( 0x7FD <= (uint16_t) zExp ) {
4392         if (    ( 0x7FD < zExp )
4393              || (    ( zExp == 0x7FD )
4394                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4395            ) {
4396             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4397                                    roundIncrement != 0;
4398             float_raise(float_flag_overflow | float_flag_inexact, status);
4399             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4400         }
4401         if ( zExp < 0 ) {
4402             if (status->flush_to_zero) {
4403                 float_raise(float_flag_output_denormal, status);
4404                 return packFloat64(zSign, 0, 0);
4405             }
4406             isTiny = status->tininess_before_rounding
4407                   || (zExp < -1)
4408                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4409             shift64RightJamming( zSig, - zExp, &zSig );
4410             zExp = 0;
4411             roundBits = zSig & 0x3FF;
4412             if (isTiny && roundBits) {
4413                 float_raise(float_flag_underflow, status);
4414             }
4415             if (roundingMode == float_round_to_odd) {
4416                 /*
4417                  * For round-to-odd case, the roundIncrement depends on
4418                  * zSig which just changed.
4419                  */
4420                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4421             }
4422         }
4423     }
4424     if (roundBits) {
4425         float_raise(float_flag_inexact, status);
4426     }
4427     zSig = ( zSig + roundIncrement )>>10;
4428     if (!(roundBits ^ 0x200) && roundNearestEven) {
4429         zSig &= ~1;
4430     }
4431     if ( zSig == 0 ) zExp = 0;
4432     return packFloat64( zSign, zExp, zSig );
4433 
4434 }
4435 
4436 /*----------------------------------------------------------------------------
4437 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4438 | and significand `zSig', and returns the proper double-precision floating-
4439 | point value corresponding to the abstract input.  This routine is just like
4440 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4441 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4442 | floating-point exponent.
4443 *----------------------------------------------------------------------------*/
4444 
4445 static float64
4446  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4447                               float_status *status)
4448 {
4449     int8_t shiftCount;
4450 
4451     shiftCount = clz64(zSig) - 1;
4452     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4453                                status);
4454 
4455 }
4456 
4457 /*----------------------------------------------------------------------------
4458 | Normalizes the subnormal extended double-precision floating-point value
4459 | represented by the denormalized significand `aSig'.  The normalized exponent
4460 | and significand are stored at the locations pointed to by `zExpPtr' and
4461 | `zSigPtr', respectively.
4462 *----------------------------------------------------------------------------*/
4463 
4464 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
4465                                 uint64_t *zSigPtr)
4466 {
4467     int8_t shiftCount;
4468 
4469     shiftCount = clz64(aSig);
4470     *zSigPtr = aSig<<shiftCount;
4471     *zExpPtr = 1 - shiftCount;
4472 }
4473 
4474 /*----------------------------------------------------------------------------
4475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4476 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4477 | and returns the proper extended double-precision floating-point value
4478 | corresponding to the abstract input.  Ordinarily, the abstract value is
4479 | rounded and packed into the extended double-precision format, with the
4480 | inexact exception raised if the abstract input cannot be represented
4481 | exactly.  However, if the abstract value is too large, the overflow and
4482 | inexact exceptions are raised and an infinity or maximal finite value is
4483 | returned.  If the abstract value is too small, the input value is rounded to
4484 | a subnormal number, and the underflow and inexact exceptions are raised if
4485 | the abstract input cannot be represented exactly as a subnormal extended
4486 | double-precision floating-point number.
4487 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4488 | number of bits as single or double precision, respectively.  Otherwise, the
4489 | result is rounded to the full precision of the extended double-precision
4490 | format.
4491 |     The input significand must be normalized or smaller.  If the input
4492 | significand is not normalized, `zExp' must be 0; in that case, the result
4493 | returned is a subnormal number, and it must not require rounding.  The
4494 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4495 | Floating-Point Arithmetic.
4496 *----------------------------------------------------------------------------*/
4497 
4498 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4499                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4500                               float_status *status)
4501 {
4502     int8_t roundingMode;
4503     bool roundNearestEven, increment, isTiny;
4504     int64_t roundIncrement, roundMask, roundBits;
4505 
4506     roundingMode = status->float_rounding_mode;
4507     roundNearestEven = ( roundingMode == float_round_nearest_even );
4508     if ( roundingPrecision == 80 ) goto precision80;
4509     if ( roundingPrecision == 64 ) {
4510         roundIncrement = UINT64_C(0x0000000000000400);
4511         roundMask = UINT64_C(0x00000000000007FF);
4512     }
4513     else if ( roundingPrecision == 32 ) {
4514         roundIncrement = UINT64_C(0x0000008000000000);
4515         roundMask = UINT64_C(0x000000FFFFFFFFFF);
4516     }
4517     else {
4518         goto precision80;
4519     }
4520     zSig0 |= ( zSig1 != 0 );
4521     switch (roundingMode) {
4522     case float_round_nearest_even:
4523     case float_round_ties_away:
4524         break;
4525     case float_round_to_zero:
4526         roundIncrement = 0;
4527         break;
4528     case float_round_up:
4529         roundIncrement = zSign ? 0 : roundMask;
4530         break;
4531     case float_round_down:
4532         roundIncrement = zSign ? roundMask : 0;
4533         break;
4534     default:
4535         abort();
4536     }
4537     roundBits = zSig0 & roundMask;
4538     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4539         if (    ( 0x7FFE < zExp )
4540              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4541            ) {
4542             goto overflow;
4543         }
4544         if ( zExp <= 0 ) {
4545             if (status->flush_to_zero) {
4546                 float_raise(float_flag_output_denormal, status);
4547                 return packFloatx80(zSign, 0, 0);
4548             }
4549             isTiny = status->tininess_before_rounding
4550                   || (zExp < 0 )
4551                   || (zSig0 <= zSig0 + roundIncrement);
4552             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4553             zExp = 0;
4554             roundBits = zSig0 & roundMask;
4555             if (isTiny && roundBits) {
4556                 float_raise(float_flag_underflow, status);
4557             }
4558             if (roundBits) {
4559                 float_raise(float_flag_inexact, status);
4560             }
4561             zSig0 += roundIncrement;
4562             if ( (int64_t) zSig0 < 0 ) zExp = 1;
4563             roundIncrement = roundMask + 1;
4564             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4565                 roundMask |= roundIncrement;
4566             }
4567             zSig0 &= ~ roundMask;
4568             return packFloatx80( zSign, zExp, zSig0 );
4569         }
4570     }
4571     if (roundBits) {
4572         float_raise(float_flag_inexact, status);
4573     }
4574     zSig0 += roundIncrement;
4575     if ( zSig0 < roundIncrement ) {
4576         ++zExp;
4577         zSig0 = UINT64_C(0x8000000000000000);
4578     }
4579     roundIncrement = roundMask + 1;
4580     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4581         roundMask |= roundIncrement;
4582     }
4583     zSig0 &= ~ roundMask;
4584     if ( zSig0 == 0 ) zExp = 0;
4585     return packFloatx80( zSign, zExp, zSig0 );
4586  precision80:
4587     switch (roundingMode) {
4588     case float_round_nearest_even:
4589     case float_round_ties_away:
4590         increment = ((int64_t)zSig1 < 0);
4591         break;
4592     case float_round_to_zero:
4593         increment = 0;
4594         break;
4595     case float_round_up:
4596         increment = !zSign && zSig1;
4597         break;
4598     case float_round_down:
4599         increment = zSign && zSig1;
4600         break;
4601     default:
4602         abort();
4603     }
4604     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4605         if (    ( 0x7FFE < zExp )
4606              || (    ( zExp == 0x7FFE )
4607                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4608                   && increment
4609                 )
4610            ) {
4611             roundMask = 0;
4612  overflow:
4613             float_raise(float_flag_overflow | float_flag_inexact, status);
4614             if (    ( roundingMode == float_round_to_zero )
4615                  || ( zSign && ( roundingMode == float_round_up ) )
4616                  || ( ! zSign && ( roundingMode == float_round_down ) )
4617                ) {
4618                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4619             }
4620             return packFloatx80(zSign,
4621                                 floatx80_infinity_high,
4622                                 floatx80_infinity_low);
4623         }
4624         if ( zExp <= 0 ) {
4625             isTiny = status->tininess_before_rounding
4626                   || (zExp < 0)
4627                   || !increment
4628                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4629             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4630             zExp = 0;
4631             if (isTiny && zSig1) {
4632                 float_raise(float_flag_underflow, status);
4633             }
4634             if (zSig1) {
4635                 float_raise(float_flag_inexact, status);
4636             }
4637             switch (roundingMode) {
4638             case float_round_nearest_even:
4639             case float_round_ties_away:
4640                 increment = ((int64_t)zSig1 < 0);
4641                 break;
4642             case float_round_to_zero:
4643                 increment = 0;
4644                 break;
4645             case float_round_up:
4646                 increment = !zSign && zSig1;
4647                 break;
4648             case float_round_down:
4649                 increment = zSign && zSig1;
4650                 break;
4651             default:
4652                 abort();
4653             }
4654             if ( increment ) {
4655                 ++zSig0;
4656                 if (!(zSig1 << 1) && roundNearestEven) {
4657                     zSig0 &= ~1;
4658                 }
4659                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4660             }
4661             return packFloatx80( zSign, zExp, zSig0 );
4662         }
4663     }
4664     if (zSig1) {
4665         float_raise(float_flag_inexact, status);
4666     }
4667     if ( increment ) {
4668         ++zSig0;
4669         if ( zSig0 == 0 ) {
4670             ++zExp;
4671             zSig0 = UINT64_C(0x8000000000000000);
4672         }
4673         else {
4674             if (!(zSig1 << 1) && roundNearestEven) {
4675                 zSig0 &= ~1;
4676             }
4677         }
4678     }
4679     else {
4680         if ( zSig0 == 0 ) zExp = 0;
4681     }
4682     return packFloatx80( zSign, zExp, zSig0 );
4683 
4684 }
4685 
4686 /*----------------------------------------------------------------------------
4687 | Takes an abstract floating-point value having sign `zSign', exponent
4688 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4689 | and returns the proper extended double-precision floating-point value
4690 | corresponding to the abstract input.  This routine is just like
4691 | `roundAndPackFloatx80' except that the input significand does not have to be
4692 | normalized.
4693 *----------------------------------------------------------------------------*/
4694 
4695 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4696                                        bool zSign, int32_t zExp,
4697                                        uint64_t zSig0, uint64_t zSig1,
4698                                        float_status *status)
4699 {
4700     int8_t shiftCount;
4701 
4702     if ( zSig0 == 0 ) {
4703         zSig0 = zSig1;
4704         zSig1 = 0;
4705         zExp -= 64;
4706     }
4707     shiftCount = clz64(zSig0);
4708     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4709     zExp -= shiftCount;
4710     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4711                                 zSig0, zSig1, status);
4712 
4713 }
4714 
4715 /*----------------------------------------------------------------------------
4716 | Returns the least-significant 64 fraction bits of the quadruple-precision
4717 | floating-point value `a'.
4718 *----------------------------------------------------------------------------*/
4719 
4720 static inline uint64_t extractFloat128Frac1( float128 a )
4721 {
4722 
4723     return a.low;
4724 
4725 }
4726 
4727 /*----------------------------------------------------------------------------
4728 | Returns the most-significant 48 fraction bits of the quadruple-precision
4729 | floating-point value `a'.
4730 *----------------------------------------------------------------------------*/
4731 
4732 static inline uint64_t extractFloat128Frac0( float128 a )
4733 {
4734 
4735     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4736 
4737 }
4738 
4739 /*----------------------------------------------------------------------------
4740 | Returns the exponent bits of the quadruple-precision floating-point value
4741 | `a'.
4742 *----------------------------------------------------------------------------*/
4743 
4744 static inline int32_t extractFloat128Exp( float128 a )
4745 {
4746 
4747     return ( a.high>>48 ) & 0x7FFF;
4748 
4749 }
4750 
4751 /*----------------------------------------------------------------------------
4752 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4753 *----------------------------------------------------------------------------*/
4754 
4755 static inline bool extractFloat128Sign(float128 a)
4756 {
4757     return a.high >> 63;
4758 }
4759 
4760 /*----------------------------------------------------------------------------
4761 | Normalizes the subnormal quadruple-precision floating-point value
4762 | represented by the denormalized significand formed by the concatenation of
4763 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4764 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4765 | significand are stored at the location pointed to by `zSig0Ptr', and the
4766 | least significant 64 bits of the normalized significand are stored at the
4767 | location pointed to by `zSig1Ptr'.
4768 *----------------------------------------------------------------------------*/
4769 
4770 static void
4771  normalizeFloat128Subnormal(
4772      uint64_t aSig0,
4773      uint64_t aSig1,
4774      int32_t *zExpPtr,
4775      uint64_t *zSig0Ptr,
4776      uint64_t *zSig1Ptr
4777  )
4778 {
4779     int8_t shiftCount;
4780 
4781     if ( aSig0 == 0 ) {
4782         shiftCount = clz64(aSig1) - 15;
4783         if ( shiftCount < 0 ) {
4784             *zSig0Ptr = aSig1>>( - shiftCount );
4785             *zSig1Ptr = aSig1<<( shiftCount & 63 );
4786         }
4787         else {
4788             *zSig0Ptr = aSig1<<shiftCount;
4789             *zSig1Ptr = 0;
4790         }
4791         *zExpPtr = - shiftCount - 63;
4792     }
4793     else {
4794         shiftCount = clz64(aSig0) - 15;
4795         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4796         *zExpPtr = 1 - shiftCount;
4797     }
4798 
4799 }
4800 
4801 /*----------------------------------------------------------------------------
4802 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4803 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4804 | floating-point value, returning the result.  After being shifted into the
4805 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4806 | added together to form the most significant 32 bits of the result.  This
4807 | means that any integer portion of `zSig0' will be added into the exponent.
4808 | Since a properly normalized significand will have an integer portion equal
4809 | to 1, the `zExp' input should be 1 less than the desired result exponent
4810 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4811 | significand.
4812 *----------------------------------------------------------------------------*/
4813 
4814 static inline float128
4815 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4816 {
4817     float128 z;
4818 
4819     z.low = zSig1;
4820     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4821     return z;
4822 }
4823 
4824 /*----------------------------------------------------------------------------
4825 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4826 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4827 | and `zSig2', and returns the proper quadruple-precision floating-point value
4828 | corresponding to the abstract input.  Ordinarily, the abstract value is
4829 | simply rounded and packed into the quadruple-precision format, with the
4830 | inexact exception raised if the abstract input cannot be represented
4831 | exactly.  However, if the abstract value is too large, the overflow and
4832 | inexact exceptions are raised and an infinity or maximal finite value is
4833 | returned.  If the abstract value is too small, the input value is rounded to
4834 | a subnormal number, and the underflow and inexact exceptions are raised if
4835 | the abstract input cannot be represented exactly as a subnormal quadruple-
4836 | precision floating-point number.
4837 |     The input significand must be normalized or smaller.  If the input
4838 | significand is not normalized, `zExp' must be 0; in that case, the result
4839 | returned is a subnormal number, and it must not require rounding.  In the
4840 | usual case that the input significand is normalized, `zExp' must be 1 less
4841 | than the ``true'' floating-point exponent.  The handling of underflow and
4842 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4843 *----------------------------------------------------------------------------*/
4844 
4845 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4846                                      uint64_t zSig0, uint64_t zSig1,
4847                                      uint64_t zSig2, float_status *status)
4848 {
4849     int8_t roundingMode;
4850     bool roundNearestEven, increment, isTiny;
4851 
4852     roundingMode = status->float_rounding_mode;
4853     roundNearestEven = ( roundingMode == float_round_nearest_even );
4854     switch (roundingMode) {
4855     case float_round_nearest_even:
4856     case float_round_ties_away:
4857         increment = ((int64_t)zSig2 < 0);
4858         break;
4859     case float_round_to_zero:
4860         increment = 0;
4861         break;
4862     case float_round_up:
4863         increment = !zSign && zSig2;
4864         break;
4865     case float_round_down:
4866         increment = zSign && zSig2;
4867         break;
4868     case float_round_to_odd:
4869         increment = !(zSig1 & 0x1) && zSig2;
4870         break;
4871     default:
4872         abort();
4873     }
4874     if ( 0x7FFD <= (uint32_t) zExp ) {
4875         if (    ( 0x7FFD < zExp )
4876              || (    ( zExp == 0x7FFD )
4877                   && eq128(
4878                          UINT64_C(0x0001FFFFFFFFFFFF),
4879                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4880                          zSig0,
4881                          zSig1
4882                      )
4883                   && increment
4884                 )
4885            ) {
4886             float_raise(float_flag_overflow | float_flag_inexact, status);
4887             if (    ( roundingMode == float_round_to_zero )
4888                  || ( zSign && ( roundingMode == float_round_up ) )
4889                  || ( ! zSign && ( roundingMode == float_round_down ) )
4890                  || (roundingMode == float_round_to_odd)
4891                ) {
4892                 return
4893                     packFloat128(
4894                         zSign,
4895                         0x7FFE,
4896                         UINT64_C(0x0000FFFFFFFFFFFF),
4897                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4898                     );
4899             }
4900             return packFloat128( zSign, 0x7FFF, 0, 0 );
4901         }
4902         if ( zExp < 0 ) {
4903             if (status->flush_to_zero) {
4904                 float_raise(float_flag_output_denormal, status);
4905                 return packFloat128(zSign, 0, 0, 0);
4906             }
4907             isTiny = status->tininess_before_rounding
4908                   || (zExp < -1)
4909                   || !increment
4910                   || lt128(zSig0, zSig1,
4911                            UINT64_C(0x0001FFFFFFFFFFFF),
4912                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4913             shift128ExtraRightJamming(
4914                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4915             zExp = 0;
4916             if (isTiny && zSig2) {
4917                 float_raise(float_flag_underflow, status);
4918             }
4919             switch (roundingMode) {
4920             case float_round_nearest_even:
4921             case float_round_ties_away:
4922                 increment = ((int64_t)zSig2 < 0);
4923                 break;
4924             case float_round_to_zero:
4925                 increment = 0;
4926                 break;
4927             case float_round_up:
4928                 increment = !zSign && zSig2;
4929                 break;
4930             case float_round_down:
4931                 increment = zSign && zSig2;
4932                 break;
4933             case float_round_to_odd:
4934                 increment = !(zSig1 & 0x1) && zSig2;
4935                 break;
4936             default:
4937                 abort();
4938             }
4939         }
4940     }
4941     if (zSig2) {
4942         float_raise(float_flag_inexact, status);
4943     }
4944     if ( increment ) {
4945         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4946         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4947             zSig1 &= ~1;
4948         }
4949     }
4950     else {
4951         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4952     }
4953     return packFloat128( zSign, zExp, zSig0, zSig1 );
4954 
4955 }
4956 
4957 /*----------------------------------------------------------------------------
4958 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4959 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4960 | returns the proper quadruple-precision floating-point value corresponding
4961 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4962 | except that the input significand has fewer bits and does not have to be
4963 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4964 | point exponent.
4965 *----------------------------------------------------------------------------*/
4966 
4967 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4968                                               uint64_t zSig0, uint64_t zSig1,
4969                                               float_status *status)
4970 {
4971     int8_t shiftCount;
4972     uint64_t zSig2;
4973 
4974     if ( zSig0 == 0 ) {
4975         zSig0 = zSig1;
4976         zSig1 = 0;
4977         zExp -= 64;
4978     }
4979     shiftCount = clz64(zSig0) - 15;
4980     if ( 0 <= shiftCount ) {
4981         zSig2 = 0;
4982         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4983     }
4984     else {
4985         shift128ExtraRightJamming(
4986             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4987     }
4988     zExp -= shiftCount;
4989     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4990 
4991 }
4992 
4993 
4994 /*----------------------------------------------------------------------------
4995 | Returns the result of converting the 32-bit two's complement integer `a'
4996 | to the extended double-precision floating-point format.  The conversion
4997 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4998 | Arithmetic.
4999 *----------------------------------------------------------------------------*/
5000 
5001 floatx80 int32_to_floatx80(int32_t a, float_status *status)
5002 {
5003     bool zSign;
5004     uint32_t absA;
5005     int8_t shiftCount;
5006     uint64_t zSig;
5007 
5008     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5009     zSign = ( a < 0 );
5010     absA = zSign ? - a : a;
5011     shiftCount = clz32(absA) + 32;
5012     zSig = absA;
5013     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
5014 
5015 }
5016 
5017 /*----------------------------------------------------------------------------
5018 | Returns the result of converting the 32-bit two's complement integer `a' to
5019 | the quadruple-precision floating-point format.  The conversion is performed
5020 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5021 *----------------------------------------------------------------------------*/
5022 
5023 float128 int32_to_float128(int32_t a, float_status *status)
5024 {
5025     bool zSign;
5026     uint32_t absA;
5027     int8_t shiftCount;
5028     uint64_t zSig0;
5029 
5030     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5031     zSign = ( a < 0 );
5032     absA = zSign ? - a : a;
5033     shiftCount = clz32(absA) + 17;
5034     zSig0 = absA;
5035     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
5036 
5037 }
5038 
5039 /*----------------------------------------------------------------------------
5040 | Returns the result of converting the 64-bit two's complement integer `a'
5041 | to the extended double-precision floating-point format.  The conversion
5042 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5043 | Arithmetic.
5044 *----------------------------------------------------------------------------*/
5045 
5046 floatx80 int64_to_floatx80(int64_t a, float_status *status)
5047 {
5048     bool zSign;
5049     uint64_t absA;
5050     int8_t shiftCount;
5051 
5052     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5053     zSign = ( a < 0 );
5054     absA = zSign ? - a : a;
5055     shiftCount = clz64(absA);
5056     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
5057 
5058 }
5059 
5060 /*----------------------------------------------------------------------------
5061 | Returns the result of converting the 64-bit two's complement integer `a' to
5062 | the quadruple-precision floating-point format.  The conversion is performed
5063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5064 *----------------------------------------------------------------------------*/
5065 
5066 float128 int64_to_float128(int64_t a, float_status *status)
5067 {
5068     bool zSign;
5069     uint64_t absA;
5070     int8_t shiftCount;
5071     int32_t zExp;
5072     uint64_t zSig0, zSig1;
5073 
5074     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5075     zSign = ( a < 0 );
5076     absA = zSign ? - a : a;
5077     shiftCount = clz64(absA) + 49;
5078     zExp = 0x406E - shiftCount;
5079     if ( 64 <= shiftCount ) {
5080         zSig1 = 0;
5081         zSig0 = absA;
5082         shiftCount -= 64;
5083     }
5084     else {
5085         zSig1 = absA;
5086         zSig0 = 0;
5087     }
5088     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5089     return packFloat128( zSign, zExp, zSig0, zSig1 );
5090 
5091 }
5092 
5093 /*----------------------------------------------------------------------------
5094 | Returns the result of converting the 64-bit unsigned integer `a'
5095 | to the quadruple-precision floating-point format.  The conversion is performed
5096 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5097 *----------------------------------------------------------------------------*/
5098 
5099 float128 uint64_to_float128(uint64_t a, float_status *status)
5100 {
5101     if (a == 0) {
5102         return float128_zero;
5103     }
5104     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5105 }
5106 
5107 /*----------------------------------------------------------------------------
5108 | Returns the result of converting the single-precision floating-point value
5109 | `a' to the extended double-precision floating-point format.  The conversion
5110 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5111 | Arithmetic.
5112 *----------------------------------------------------------------------------*/
5113 
5114 floatx80 float32_to_floatx80(float32 a, float_status *status)
5115 {
5116     bool aSign;
5117     int aExp;
5118     uint32_t aSig;
5119 
5120     a = float32_squash_input_denormal(a, status);
5121     aSig = extractFloat32Frac( a );
5122     aExp = extractFloat32Exp( a );
5123     aSign = extractFloat32Sign( a );
5124     if ( aExp == 0xFF ) {
5125         if (aSig) {
5126             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
5127                                                status);
5128             return floatx80_silence_nan(res, status);
5129         }
5130         return packFloatx80(aSign,
5131                             floatx80_infinity_high,
5132                             floatx80_infinity_low);
5133     }
5134     if ( aExp == 0 ) {
5135         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5136         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5137     }
5138     aSig |= 0x00800000;
5139     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
5140 
5141 }
5142 
5143 /*----------------------------------------------------------------------------
5144 | Returns the result of converting the single-precision floating-point value
5145 | `a' to the double-precision floating-point format.  The conversion is
5146 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5147 | Arithmetic.
5148 *----------------------------------------------------------------------------*/
5149 
5150 float128 float32_to_float128(float32 a, float_status *status)
5151 {
5152     bool aSign;
5153     int aExp;
5154     uint32_t aSig;
5155 
5156     a = float32_squash_input_denormal(a, status);
5157     aSig = extractFloat32Frac( a );
5158     aExp = extractFloat32Exp( a );
5159     aSign = extractFloat32Sign( a );
5160     if ( aExp == 0xFF ) {
5161         if (aSig) {
5162             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
5163         }
5164         return packFloat128( aSign, 0x7FFF, 0, 0 );
5165     }
5166     if ( aExp == 0 ) {
5167         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5168         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5169         --aExp;
5170     }
5171     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
5172 
5173 }
5174 
5175 /*----------------------------------------------------------------------------
5176 | Returns the remainder of the single-precision floating-point value `a'
5177 | with respect to the corresponding value `b'.  The operation is performed
5178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5179 *----------------------------------------------------------------------------*/
5180 
5181 float32 float32_rem(float32 a, float32 b, float_status *status)
5182 {
5183     bool aSign, zSign;
5184     int aExp, bExp, expDiff;
5185     uint32_t aSig, bSig;
5186     uint32_t q;
5187     uint64_t aSig64, bSig64, q64;
5188     uint32_t alternateASig;
5189     int32_t sigMean;
5190     a = float32_squash_input_denormal(a, status);
5191     b = float32_squash_input_denormal(b, status);
5192 
5193     aSig = extractFloat32Frac( a );
5194     aExp = extractFloat32Exp( a );
5195     aSign = extractFloat32Sign( a );
5196     bSig = extractFloat32Frac( b );
5197     bExp = extractFloat32Exp( b );
5198     if ( aExp == 0xFF ) {
5199         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
5200             return propagateFloat32NaN(a, b, status);
5201         }
5202         float_raise(float_flag_invalid, status);
5203         return float32_default_nan(status);
5204     }
5205     if ( bExp == 0xFF ) {
5206         if (bSig) {
5207             return propagateFloat32NaN(a, b, status);
5208         }
5209         return a;
5210     }
5211     if ( bExp == 0 ) {
5212         if ( bSig == 0 ) {
5213             float_raise(float_flag_invalid, status);
5214             return float32_default_nan(status);
5215         }
5216         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
5217     }
5218     if ( aExp == 0 ) {
5219         if ( aSig == 0 ) return a;
5220         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5221     }
5222     expDiff = aExp - bExp;
5223     aSig |= 0x00800000;
5224     bSig |= 0x00800000;
5225     if ( expDiff < 32 ) {
5226         aSig <<= 8;
5227         bSig <<= 8;
5228         if ( expDiff < 0 ) {
5229             if ( expDiff < -1 ) return a;
5230             aSig >>= 1;
5231         }
5232         q = ( bSig <= aSig );
5233         if ( q ) aSig -= bSig;
5234         if ( 0 < expDiff ) {
5235             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
5236             q >>= 32 - expDiff;
5237             bSig >>= 2;
5238             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5239         }
5240         else {
5241             aSig >>= 2;
5242             bSig >>= 2;
5243         }
5244     }
5245     else {
5246         if ( bSig <= aSig ) aSig -= bSig;
5247         aSig64 = ( (uint64_t) aSig )<<40;
5248         bSig64 = ( (uint64_t) bSig )<<40;
5249         expDiff -= 64;
5250         while ( 0 < expDiff ) {
5251             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5252             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5253             aSig64 = - ( ( bSig * q64 )<<38 );
5254             expDiff -= 62;
5255         }
5256         expDiff += 64;
5257         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5258         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5259         q = q64>>( 64 - expDiff );
5260         bSig <<= 6;
5261         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
5262     }
5263     do {
5264         alternateASig = aSig;
5265         ++q;
5266         aSig -= bSig;
5267     } while ( 0 <= (int32_t) aSig );
5268     sigMean = aSig + alternateASig;
5269     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5270         aSig = alternateASig;
5271     }
5272     zSign = ( (int32_t) aSig < 0 );
5273     if ( zSign ) aSig = - aSig;
5274     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5275 }
5276 
5277 
5278 
5279 /*----------------------------------------------------------------------------
5280 | Returns the binary exponential of the single-precision floating-point value
5281 | `a'. The operation is performed according to the IEC/IEEE Standard for
5282 | Binary Floating-Point Arithmetic.
5283 |
5284 | Uses the following identities:
5285 |
5286 | 1. -------------------------------------------------------------------------
5287 |      x    x*ln(2)
5288 |     2  = e
5289 |
5290 | 2. -------------------------------------------------------------------------
5291 |                      2     3     4     5           n
5292 |      x        x     x     x     x     x           x
5293 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5294 |               1!    2!    3!    4!    5!          n!
5295 *----------------------------------------------------------------------------*/
5296 
5297 static const float64 float32_exp2_coefficients[15] =
5298 {
5299     const_float64( 0x3ff0000000000000ll ), /*  1 */
5300     const_float64( 0x3fe0000000000000ll ), /*  2 */
5301     const_float64( 0x3fc5555555555555ll ), /*  3 */
5302     const_float64( 0x3fa5555555555555ll ), /*  4 */
5303     const_float64( 0x3f81111111111111ll ), /*  5 */
5304     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
5305     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
5306     const_float64( 0x3efa01a01a01a01all ), /*  8 */
5307     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
5308     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5309     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5310     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5311     const_float64( 0x3de6124613a86d09ll ), /* 13 */
5312     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5313     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5314 };
5315 
5316 float32 float32_exp2(float32 a, float_status *status)
5317 {
5318     bool aSign;
5319     int aExp;
5320     uint32_t aSig;
5321     float64 r, x, xn;
5322     int i;
5323     a = float32_squash_input_denormal(a, status);
5324 
5325     aSig = extractFloat32Frac( a );
5326     aExp = extractFloat32Exp( a );
5327     aSign = extractFloat32Sign( a );
5328 
5329     if ( aExp == 0xFF) {
5330         if (aSig) {
5331             return propagateFloat32NaN(a, float32_zero, status);
5332         }
5333         return (aSign) ? float32_zero : a;
5334     }
5335     if (aExp == 0) {
5336         if (aSig == 0) return float32_one;
5337     }
5338 
5339     float_raise(float_flag_inexact, status);
5340 
5341     /* ******************************* */
5342     /* using float64 for approximation */
5343     /* ******************************* */
5344     x = float32_to_float64(a, status);
5345     x = float64_mul(x, float64_ln2, status);
5346 
5347     xn = x;
5348     r = float64_one;
5349     for (i = 0 ; i < 15 ; i++) {
5350         float64 f;
5351 
5352         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5353         r = float64_add(r, f, status);
5354 
5355         xn = float64_mul(xn, x, status);
5356     }
5357 
5358     return float64_to_float32(r, status);
5359 }
5360 
5361 /*----------------------------------------------------------------------------
5362 | Returns the binary log of the single-precision floating-point value `a'.
5363 | The operation is performed according to the IEC/IEEE Standard for Binary
5364 | Floating-Point Arithmetic.
5365 *----------------------------------------------------------------------------*/
5366 float32 float32_log2(float32 a, float_status *status)
5367 {
5368     bool aSign, zSign;
5369     int aExp;
5370     uint32_t aSig, zSig, i;
5371 
5372     a = float32_squash_input_denormal(a, status);
5373     aSig = extractFloat32Frac( a );
5374     aExp = extractFloat32Exp( a );
5375     aSign = extractFloat32Sign( a );
5376 
5377     if ( aExp == 0 ) {
5378         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5379         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5380     }
5381     if ( aSign ) {
5382         float_raise(float_flag_invalid, status);
5383         return float32_default_nan(status);
5384     }
5385     if ( aExp == 0xFF ) {
5386         if (aSig) {
5387             return propagateFloat32NaN(a, float32_zero, status);
5388         }
5389         return a;
5390     }
5391 
5392     aExp -= 0x7F;
5393     aSig |= 0x00800000;
5394     zSign = aExp < 0;
5395     zSig = aExp << 23;
5396 
5397     for (i = 1 << 22; i > 0; i >>= 1) {
5398         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5399         if ( aSig & 0x01000000 ) {
5400             aSig >>= 1;
5401             zSig |= i;
5402         }
5403     }
5404 
5405     if ( zSign )
5406         zSig = -zSig;
5407 
5408     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5409 }
5410 
5411 /*----------------------------------------------------------------------------
5412 | Returns the result of converting the double-precision floating-point value
5413 | `a' to the extended double-precision floating-point format.  The conversion
5414 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5415 | Arithmetic.
5416 *----------------------------------------------------------------------------*/
5417 
5418 floatx80 float64_to_floatx80(float64 a, float_status *status)
5419 {
5420     bool aSign;
5421     int aExp;
5422     uint64_t aSig;
5423 
5424     a = float64_squash_input_denormal(a, status);
5425     aSig = extractFloat64Frac( a );
5426     aExp = extractFloat64Exp( a );
5427     aSign = extractFloat64Sign( a );
5428     if ( aExp == 0x7FF ) {
5429         if (aSig) {
5430             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5431                                                status);
5432             return floatx80_silence_nan(res, status);
5433         }
5434         return packFloatx80(aSign,
5435                             floatx80_infinity_high,
5436                             floatx80_infinity_low);
5437     }
5438     if ( aExp == 0 ) {
5439         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5440         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5441     }
5442     return
5443         packFloatx80(
5444             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5445 
5446 }
5447 
5448 /*----------------------------------------------------------------------------
5449 | Returns the result of converting the double-precision floating-point value
5450 | `a' to the quadruple-precision floating-point format.  The conversion is
5451 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5452 | Arithmetic.
5453 *----------------------------------------------------------------------------*/
5454 
5455 float128 float64_to_float128(float64 a, float_status *status)
5456 {
5457     bool aSign;
5458     int aExp;
5459     uint64_t aSig, zSig0, zSig1;
5460 
5461     a = float64_squash_input_denormal(a, status);
5462     aSig = extractFloat64Frac( a );
5463     aExp = extractFloat64Exp( a );
5464     aSign = extractFloat64Sign( a );
5465     if ( aExp == 0x7FF ) {
5466         if (aSig) {
5467             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5468         }
5469         return packFloat128( aSign, 0x7FFF, 0, 0 );
5470     }
5471     if ( aExp == 0 ) {
5472         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5473         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5474         --aExp;
5475     }
5476     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5477     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5478 
5479 }
5480 
5481 
5482 /*----------------------------------------------------------------------------
5483 | Returns the remainder of the double-precision floating-point value `a'
5484 | with respect to the corresponding value `b'.  The operation is performed
5485 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5486 *----------------------------------------------------------------------------*/
5487 
5488 float64 float64_rem(float64 a, float64 b, float_status *status)
5489 {
5490     bool aSign, zSign;
5491     int aExp, bExp, expDiff;
5492     uint64_t aSig, bSig;
5493     uint64_t q, alternateASig;
5494     int64_t sigMean;
5495 
5496     a = float64_squash_input_denormal(a, status);
5497     b = float64_squash_input_denormal(b, status);
5498     aSig = extractFloat64Frac( a );
5499     aExp = extractFloat64Exp( a );
5500     aSign = extractFloat64Sign( a );
5501     bSig = extractFloat64Frac( b );
5502     bExp = extractFloat64Exp( b );
5503     if ( aExp == 0x7FF ) {
5504         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5505             return propagateFloat64NaN(a, b, status);
5506         }
5507         float_raise(float_flag_invalid, status);
5508         return float64_default_nan(status);
5509     }
5510     if ( bExp == 0x7FF ) {
5511         if (bSig) {
5512             return propagateFloat64NaN(a, b, status);
5513         }
5514         return a;
5515     }
5516     if ( bExp == 0 ) {
5517         if ( bSig == 0 ) {
5518             float_raise(float_flag_invalid, status);
5519             return float64_default_nan(status);
5520         }
5521         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5522     }
5523     if ( aExp == 0 ) {
5524         if ( aSig == 0 ) return a;
5525         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5526     }
5527     expDiff = aExp - bExp;
5528     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5529     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5530     if ( expDiff < 0 ) {
5531         if ( expDiff < -1 ) return a;
5532         aSig >>= 1;
5533     }
5534     q = ( bSig <= aSig );
5535     if ( q ) aSig -= bSig;
5536     expDiff -= 64;
5537     while ( 0 < expDiff ) {
5538         q = estimateDiv128To64( aSig, 0, bSig );
5539         q = ( 2 < q ) ? q - 2 : 0;
5540         aSig = - ( ( bSig>>2 ) * q );
5541         expDiff -= 62;
5542     }
5543     expDiff += 64;
5544     if ( 0 < expDiff ) {
5545         q = estimateDiv128To64( aSig, 0, bSig );
5546         q = ( 2 < q ) ? q - 2 : 0;
5547         q >>= 64 - expDiff;
5548         bSig >>= 2;
5549         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5550     }
5551     else {
5552         aSig >>= 2;
5553         bSig >>= 2;
5554     }
5555     do {
5556         alternateASig = aSig;
5557         ++q;
5558         aSig -= bSig;
5559     } while ( 0 <= (int64_t) aSig );
5560     sigMean = aSig + alternateASig;
5561     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5562         aSig = alternateASig;
5563     }
5564     zSign = ( (int64_t) aSig < 0 );
5565     if ( zSign ) aSig = - aSig;
5566     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5567 
5568 }
5569 
5570 /*----------------------------------------------------------------------------
5571 | Returns the binary log of the double-precision floating-point value `a'.
5572 | The operation is performed according to the IEC/IEEE Standard for Binary
5573 | Floating-Point Arithmetic.
5574 *----------------------------------------------------------------------------*/
5575 float64 float64_log2(float64 a, float_status *status)
5576 {
5577     bool aSign, zSign;
5578     int aExp;
5579     uint64_t aSig, aSig0, aSig1, zSig, i;
5580     a = float64_squash_input_denormal(a, status);
5581 
5582     aSig = extractFloat64Frac( a );
5583     aExp = extractFloat64Exp( a );
5584     aSign = extractFloat64Sign( a );
5585 
5586     if ( aExp == 0 ) {
5587         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5588         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5589     }
5590     if ( aSign ) {
5591         float_raise(float_flag_invalid, status);
5592         return float64_default_nan(status);
5593     }
5594     if ( aExp == 0x7FF ) {
5595         if (aSig) {
5596             return propagateFloat64NaN(a, float64_zero, status);
5597         }
5598         return a;
5599     }
5600 
5601     aExp -= 0x3FF;
5602     aSig |= UINT64_C(0x0010000000000000);
5603     zSign = aExp < 0;
5604     zSig = (uint64_t)aExp << 52;
5605     for (i = 1LL << 51; i > 0; i >>= 1) {
5606         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5607         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5608         if ( aSig & UINT64_C(0x0020000000000000) ) {
5609             aSig >>= 1;
5610             zSig |= i;
5611         }
5612     }
5613 
5614     if ( zSign )
5615         zSig = -zSig;
5616     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5617 }
5618 
5619 /*----------------------------------------------------------------------------
5620 | Returns the result of converting the extended double-precision floating-
5621 | point value `a' to the 32-bit two's complement integer format.  The
5622 | conversion is performed according to the IEC/IEEE Standard for Binary
5623 | Floating-Point Arithmetic---which means in particular that the conversion
5624 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5625 | largest positive integer is returned.  Otherwise, if the conversion
5626 | overflows, the largest integer with the same sign as `a' is returned.
5627 *----------------------------------------------------------------------------*/
5628 
5629 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5630 {
5631     bool aSign;
5632     int32_t aExp, shiftCount;
5633     uint64_t aSig;
5634 
5635     if (floatx80_invalid_encoding(a)) {
5636         float_raise(float_flag_invalid, status);
5637         return 1 << 31;
5638     }
5639     aSig = extractFloatx80Frac( a );
5640     aExp = extractFloatx80Exp( a );
5641     aSign = extractFloatx80Sign( a );
5642     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5643     shiftCount = 0x4037 - aExp;
5644     if ( shiftCount <= 0 ) shiftCount = 1;
5645     shift64RightJamming( aSig, shiftCount, &aSig );
5646     return roundAndPackInt32(aSign, aSig, status);
5647 
5648 }
5649 
5650 /*----------------------------------------------------------------------------
5651 | Returns the result of converting the extended double-precision floating-
5652 | point value `a' to the 32-bit two's complement integer format.  The
5653 | conversion is performed according to the IEC/IEEE Standard for Binary
5654 | Floating-Point Arithmetic, except that the conversion is always rounded
5655 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5656 | Otherwise, if the conversion overflows, the largest integer with the same
5657 | sign as `a' is returned.
5658 *----------------------------------------------------------------------------*/
5659 
5660 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5661 {
5662     bool aSign;
5663     int32_t aExp, shiftCount;
5664     uint64_t aSig, savedASig;
5665     int32_t z;
5666 
5667     if (floatx80_invalid_encoding(a)) {
5668         float_raise(float_flag_invalid, status);
5669         return 1 << 31;
5670     }
5671     aSig = extractFloatx80Frac( a );
5672     aExp = extractFloatx80Exp( a );
5673     aSign = extractFloatx80Sign( a );
5674     if ( 0x401E < aExp ) {
5675         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5676         goto invalid;
5677     }
5678     else if ( aExp < 0x3FFF ) {
5679         if (aExp || aSig) {
5680             float_raise(float_flag_inexact, status);
5681         }
5682         return 0;
5683     }
5684     shiftCount = 0x403E - aExp;
5685     savedASig = aSig;
5686     aSig >>= shiftCount;
5687     z = aSig;
5688     if ( aSign ) z = - z;
5689     if ( ( z < 0 ) ^ aSign ) {
5690  invalid:
5691         float_raise(float_flag_invalid, status);
5692         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5693     }
5694     if ( ( aSig<<shiftCount ) != savedASig ) {
5695         float_raise(float_flag_inexact, status);
5696     }
5697     return z;
5698 
5699 }
5700 
5701 /*----------------------------------------------------------------------------
5702 | Returns the result of converting the extended double-precision floating-
5703 | point value `a' to the 64-bit two's complement integer format.  The
5704 | conversion is performed according to the IEC/IEEE Standard for Binary
5705 | Floating-Point Arithmetic---which means in particular that the conversion
5706 | is rounded according to the current rounding mode.  If `a' is a NaN,
5707 | the largest positive integer is returned.  Otherwise, if the conversion
5708 | overflows, the largest integer with the same sign as `a' is returned.
5709 *----------------------------------------------------------------------------*/
5710 
5711 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5712 {
5713     bool aSign;
5714     int32_t aExp, shiftCount;
5715     uint64_t aSig, aSigExtra;
5716 
5717     if (floatx80_invalid_encoding(a)) {
5718         float_raise(float_flag_invalid, status);
5719         return 1ULL << 63;
5720     }
5721     aSig = extractFloatx80Frac( a );
5722     aExp = extractFloatx80Exp( a );
5723     aSign = extractFloatx80Sign( a );
5724     shiftCount = 0x403E - aExp;
5725     if ( shiftCount <= 0 ) {
5726         if ( shiftCount ) {
5727             float_raise(float_flag_invalid, status);
5728             if (!aSign || floatx80_is_any_nan(a)) {
5729                 return INT64_MAX;
5730             }
5731             return INT64_MIN;
5732         }
5733         aSigExtra = 0;
5734     }
5735     else {
5736         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5737     }
5738     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5739 
5740 }
5741 
5742 /*----------------------------------------------------------------------------
5743 | Returns the result of converting the extended double-precision floating-
5744 | point value `a' to the 64-bit two's complement integer format.  The
5745 | conversion is performed according to the IEC/IEEE Standard for Binary
5746 | Floating-Point Arithmetic, except that the conversion is always rounded
5747 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5748 | Otherwise, if the conversion overflows, the largest integer with the same
5749 | sign as `a' is returned.
5750 *----------------------------------------------------------------------------*/
5751 
5752 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5753 {
5754     bool aSign;
5755     int32_t aExp, shiftCount;
5756     uint64_t aSig;
5757     int64_t z;
5758 
5759     if (floatx80_invalid_encoding(a)) {
5760         float_raise(float_flag_invalid, status);
5761         return 1ULL << 63;
5762     }
5763     aSig = extractFloatx80Frac( a );
5764     aExp = extractFloatx80Exp( a );
5765     aSign = extractFloatx80Sign( a );
5766     shiftCount = aExp - 0x403E;
5767     if ( 0 <= shiftCount ) {
5768         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5769         if ( ( a.high != 0xC03E ) || aSig ) {
5770             float_raise(float_flag_invalid, status);
5771             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5772                 return INT64_MAX;
5773             }
5774         }
5775         return INT64_MIN;
5776     }
5777     else if ( aExp < 0x3FFF ) {
5778         if (aExp | aSig) {
5779             float_raise(float_flag_inexact, status);
5780         }
5781         return 0;
5782     }
5783     z = aSig>>( - shiftCount );
5784     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5785         float_raise(float_flag_inexact, status);
5786     }
5787     if ( aSign ) z = - z;
5788     return z;
5789 
5790 }
5791 
5792 /*----------------------------------------------------------------------------
5793 | Returns the result of converting the extended double-precision floating-
5794 | point value `a' to the single-precision floating-point format.  The
5795 | conversion is performed according to the IEC/IEEE Standard for Binary
5796 | Floating-Point Arithmetic.
5797 *----------------------------------------------------------------------------*/
5798 
5799 float32 floatx80_to_float32(floatx80 a, float_status *status)
5800 {
5801     bool aSign;
5802     int32_t aExp;
5803     uint64_t aSig;
5804 
5805     if (floatx80_invalid_encoding(a)) {
5806         float_raise(float_flag_invalid, status);
5807         return float32_default_nan(status);
5808     }
5809     aSig = extractFloatx80Frac( a );
5810     aExp = extractFloatx80Exp( a );
5811     aSign = extractFloatx80Sign( a );
5812     if ( aExp == 0x7FFF ) {
5813         if ( (uint64_t) ( aSig<<1 ) ) {
5814             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5815                                              status);
5816             return float32_silence_nan(res, status);
5817         }
5818         return packFloat32( aSign, 0xFF, 0 );
5819     }
5820     shift64RightJamming( aSig, 33, &aSig );
5821     if ( aExp || aSig ) aExp -= 0x3F81;
5822     return roundAndPackFloat32(aSign, aExp, aSig, status);
5823 
5824 }
5825 
5826 /*----------------------------------------------------------------------------
5827 | Returns the result of converting the extended double-precision floating-
5828 | point value `a' to the double-precision floating-point format.  The
5829 | conversion is performed according to the IEC/IEEE Standard for Binary
5830 | Floating-Point Arithmetic.
5831 *----------------------------------------------------------------------------*/
5832 
5833 float64 floatx80_to_float64(floatx80 a, float_status *status)
5834 {
5835     bool aSign;
5836     int32_t aExp;
5837     uint64_t aSig, zSig;
5838 
5839     if (floatx80_invalid_encoding(a)) {
5840         float_raise(float_flag_invalid, status);
5841         return float64_default_nan(status);
5842     }
5843     aSig = extractFloatx80Frac( a );
5844     aExp = extractFloatx80Exp( a );
5845     aSign = extractFloatx80Sign( a );
5846     if ( aExp == 0x7FFF ) {
5847         if ( (uint64_t) ( aSig<<1 ) ) {
5848             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5849                                              status);
5850             return float64_silence_nan(res, status);
5851         }
5852         return packFloat64( aSign, 0x7FF, 0 );
5853     }
5854     shift64RightJamming( aSig, 1, &zSig );
5855     if ( aExp || aSig ) aExp -= 0x3C01;
5856     return roundAndPackFloat64(aSign, aExp, zSig, status);
5857 
5858 }
5859 
5860 /*----------------------------------------------------------------------------
5861 | Returns the result of converting the extended double-precision floating-
5862 | point value `a' to the quadruple-precision floating-point format.  The
5863 | conversion is performed according to the IEC/IEEE Standard for Binary
5864 | Floating-Point Arithmetic.
5865 *----------------------------------------------------------------------------*/
5866 
5867 float128 floatx80_to_float128(floatx80 a, float_status *status)
5868 {
5869     bool aSign;
5870     int aExp;
5871     uint64_t aSig, zSig0, zSig1;
5872 
5873     if (floatx80_invalid_encoding(a)) {
5874         float_raise(float_flag_invalid, status);
5875         return float128_default_nan(status);
5876     }
5877     aSig = extractFloatx80Frac( a );
5878     aExp = extractFloatx80Exp( a );
5879     aSign = extractFloatx80Sign( a );
5880     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5881         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5882                                            status);
5883         return float128_silence_nan(res, status);
5884     }
5885     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5886     return packFloat128( aSign, aExp, zSig0, zSig1 );
5887 
5888 }
5889 
5890 /*----------------------------------------------------------------------------
5891 | Rounds the extended double-precision floating-point value `a'
5892 | to the precision provided by floatx80_rounding_precision and returns the
5893 | result as an extended double-precision floating-point value.
5894 | The operation is performed according to the IEC/IEEE Standard for Binary
5895 | Floating-Point Arithmetic.
5896 *----------------------------------------------------------------------------*/
5897 
5898 floatx80 floatx80_round(floatx80 a, float_status *status)
5899 {
5900     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5901                                 extractFloatx80Sign(a),
5902                                 extractFloatx80Exp(a),
5903                                 extractFloatx80Frac(a), 0, status);
5904 }
5905 
5906 /*----------------------------------------------------------------------------
5907 | Rounds the extended double-precision floating-point value `a' to an integer,
5908 | and returns the result as an extended quadruple-precision floating-point
5909 | value.  The operation is performed according to the IEC/IEEE Standard for
5910 | Binary Floating-Point Arithmetic.
5911 *----------------------------------------------------------------------------*/
5912 
5913 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5914 {
5915     bool aSign;
5916     int32_t aExp;
5917     uint64_t lastBitMask, roundBitsMask;
5918     floatx80 z;
5919 
5920     if (floatx80_invalid_encoding(a)) {
5921         float_raise(float_flag_invalid, status);
5922         return floatx80_default_nan(status);
5923     }
5924     aExp = extractFloatx80Exp( a );
5925     if ( 0x403E <= aExp ) {
5926         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5927             return propagateFloatx80NaN(a, a, status);
5928         }
5929         return a;
5930     }
5931     if ( aExp < 0x3FFF ) {
5932         if (    ( aExp == 0 )
5933              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5934             return a;
5935         }
5936         float_raise(float_flag_inexact, status);
5937         aSign = extractFloatx80Sign( a );
5938         switch (status->float_rounding_mode) {
5939          case float_round_nearest_even:
5940             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5941                ) {
5942                 return
5943                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5944             }
5945             break;
5946         case float_round_ties_away:
5947             if (aExp == 0x3FFE) {
5948                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5949             }
5950             break;
5951          case float_round_down:
5952             return
5953                   aSign ?
5954                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5955                 : packFloatx80( 0, 0, 0 );
5956          case float_round_up:
5957             return
5958                   aSign ? packFloatx80( 1, 0, 0 )
5959                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5960 
5961         case float_round_to_zero:
5962             break;
5963         default:
5964             g_assert_not_reached();
5965         }
5966         return packFloatx80( aSign, 0, 0 );
5967     }
5968     lastBitMask = 1;
5969     lastBitMask <<= 0x403E - aExp;
5970     roundBitsMask = lastBitMask - 1;
5971     z = a;
5972     switch (status->float_rounding_mode) {
5973     case float_round_nearest_even:
5974         z.low += lastBitMask>>1;
5975         if ((z.low & roundBitsMask) == 0) {
5976             z.low &= ~lastBitMask;
5977         }
5978         break;
5979     case float_round_ties_away:
5980         z.low += lastBitMask >> 1;
5981         break;
5982     case float_round_to_zero:
5983         break;
5984     case float_round_up:
5985         if (!extractFloatx80Sign(z)) {
5986             z.low += roundBitsMask;
5987         }
5988         break;
5989     case float_round_down:
5990         if (extractFloatx80Sign(z)) {
5991             z.low += roundBitsMask;
5992         }
5993         break;
5994     default:
5995         abort();
5996     }
5997     z.low &= ~ roundBitsMask;
5998     if ( z.low == 0 ) {
5999         ++z.high;
6000         z.low = UINT64_C(0x8000000000000000);
6001     }
6002     if (z.low != a.low) {
6003         float_raise(float_flag_inexact, status);
6004     }
6005     return z;
6006 
6007 }
6008 
6009 /*----------------------------------------------------------------------------
6010 | Returns the result of adding the absolute values of the extended double-
6011 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
6012 | negated before being returned.  `zSign' is ignored if the result is a NaN.
6013 | The addition is performed according to the IEC/IEEE Standard for Binary
6014 | Floating-Point Arithmetic.
6015 *----------------------------------------------------------------------------*/
6016 
6017 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
6018                                 float_status *status)
6019 {
6020     int32_t aExp, bExp, zExp;
6021     uint64_t aSig, bSig, zSig0, zSig1;
6022     int32_t expDiff;
6023 
6024     aSig = extractFloatx80Frac( a );
6025     aExp = extractFloatx80Exp( a );
6026     bSig = extractFloatx80Frac( b );
6027     bExp = extractFloatx80Exp( b );
6028     expDiff = aExp - bExp;
6029     if ( 0 < expDiff ) {
6030         if ( aExp == 0x7FFF ) {
6031             if ((uint64_t)(aSig << 1)) {
6032                 return propagateFloatx80NaN(a, b, status);
6033             }
6034             return a;
6035         }
6036         if ( bExp == 0 ) --expDiff;
6037         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6038         zExp = aExp;
6039     }
6040     else if ( expDiff < 0 ) {
6041         if ( bExp == 0x7FFF ) {
6042             if ((uint64_t)(bSig << 1)) {
6043                 return propagateFloatx80NaN(a, b, status);
6044             }
6045             return packFloatx80(zSign,
6046                                 floatx80_infinity_high,
6047                                 floatx80_infinity_low);
6048         }
6049         if ( aExp == 0 ) ++expDiff;
6050         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6051         zExp = bExp;
6052     }
6053     else {
6054         if ( aExp == 0x7FFF ) {
6055             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
6056                 return propagateFloatx80NaN(a, b, status);
6057             }
6058             return a;
6059         }
6060         zSig1 = 0;
6061         zSig0 = aSig + bSig;
6062         if ( aExp == 0 ) {
6063             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
6064                 /* At least one of the values is a pseudo-denormal,
6065                  * and there is a carry out of the result.  */
6066                 zExp = 1;
6067                 goto shiftRight1;
6068             }
6069             if (zSig0 == 0) {
6070                 return packFloatx80(zSign, 0, 0);
6071             }
6072             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
6073             goto roundAndPack;
6074         }
6075         zExp = aExp;
6076         goto shiftRight1;
6077     }
6078     zSig0 = aSig + bSig;
6079     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
6080  shiftRight1:
6081     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
6082     zSig0 |= UINT64_C(0x8000000000000000);
6083     ++zExp;
6084  roundAndPack:
6085     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6086                                 zSign, zExp, zSig0, zSig1, status);
6087 }
6088 
6089 /*----------------------------------------------------------------------------
6090 | Returns the result of subtracting the absolute values of the extended
6091 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
6092 | difference is negated before being returned.  `zSign' is ignored if the
6093 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6094 | Standard for Binary Floating-Point Arithmetic.
6095 *----------------------------------------------------------------------------*/
6096 
6097 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
6098                                 float_status *status)
6099 {
6100     int32_t aExp, bExp, zExp;
6101     uint64_t aSig, bSig, zSig0, zSig1;
6102     int32_t expDiff;
6103 
6104     aSig = extractFloatx80Frac( a );
6105     aExp = extractFloatx80Exp( a );
6106     bSig = extractFloatx80Frac( b );
6107     bExp = extractFloatx80Exp( b );
6108     expDiff = aExp - bExp;
6109     if ( 0 < expDiff ) goto aExpBigger;
6110     if ( expDiff < 0 ) goto bExpBigger;
6111     if ( aExp == 0x7FFF ) {
6112         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
6113             return propagateFloatx80NaN(a, b, status);
6114         }
6115         float_raise(float_flag_invalid, status);
6116         return floatx80_default_nan(status);
6117     }
6118     if ( aExp == 0 ) {
6119         aExp = 1;
6120         bExp = 1;
6121     }
6122     zSig1 = 0;
6123     if ( bSig < aSig ) goto aBigger;
6124     if ( aSig < bSig ) goto bBigger;
6125     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
6126  bExpBigger:
6127     if ( bExp == 0x7FFF ) {
6128         if ((uint64_t)(bSig << 1)) {
6129             return propagateFloatx80NaN(a, b, status);
6130         }
6131         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
6132                             floatx80_infinity_low);
6133     }
6134     if ( aExp == 0 ) ++expDiff;
6135     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6136  bBigger:
6137     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
6138     zExp = bExp;
6139     zSign ^= 1;
6140     goto normalizeRoundAndPack;
6141  aExpBigger:
6142     if ( aExp == 0x7FFF ) {
6143         if ((uint64_t)(aSig << 1)) {
6144             return propagateFloatx80NaN(a, b, status);
6145         }
6146         return a;
6147     }
6148     if ( bExp == 0 ) --expDiff;
6149     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6150  aBigger:
6151     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
6152     zExp = aExp;
6153  normalizeRoundAndPack:
6154     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6155                                          zSign, zExp, zSig0, zSig1, status);
6156 }
6157 
6158 /*----------------------------------------------------------------------------
6159 | Returns the result of adding the extended double-precision floating-point
6160 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6161 | Standard for Binary Floating-Point Arithmetic.
6162 *----------------------------------------------------------------------------*/
6163 
6164 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6165 {
6166     bool aSign, bSign;
6167 
6168     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6169         float_raise(float_flag_invalid, status);
6170         return floatx80_default_nan(status);
6171     }
6172     aSign = extractFloatx80Sign( a );
6173     bSign = extractFloatx80Sign( b );
6174     if ( aSign == bSign ) {
6175         return addFloatx80Sigs(a, b, aSign, status);
6176     }
6177     else {
6178         return subFloatx80Sigs(a, b, aSign, status);
6179     }
6180 
6181 }
6182 
6183 /*----------------------------------------------------------------------------
6184 | Returns the result of subtracting the extended double-precision floating-
6185 | point values `a' and `b'.  The operation is performed according to the
6186 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6187 *----------------------------------------------------------------------------*/
6188 
6189 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6190 {
6191     bool aSign, bSign;
6192 
6193     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6194         float_raise(float_flag_invalid, status);
6195         return floatx80_default_nan(status);
6196     }
6197     aSign = extractFloatx80Sign( a );
6198     bSign = extractFloatx80Sign( b );
6199     if ( aSign == bSign ) {
6200         return subFloatx80Sigs(a, b, aSign, status);
6201     }
6202     else {
6203         return addFloatx80Sigs(a, b, aSign, status);
6204     }
6205 
6206 }
6207 
6208 /*----------------------------------------------------------------------------
6209 | Returns the result of multiplying the extended double-precision floating-
6210 | point values `a' and `b'.  The operation is performed according to the
6211 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6212 *----------------------------------------------------------------------------*/
6213 
6214 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6215 {
6216     bool aSign, bSign, zSign;
6217     int32_t aExp, bExp, zExp;
6218     uint64_t aSig, bSig, zSig0, zSig1;
6219 
6220     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6221         float_raise(float_flag_invalid, status);
6222         return floatx80_default_nan(status);
6223     }
6224     aSig = extractFloatx80Frac( a );
6225     aExp = extractFloatx80Exp( a );
6226     aSign = extractFloatx80Sign( a );
6227     bSig = extractFloatx80Frac( b );
6228     bExp = extractFloatx80Exp( b );
6229     bSign = extractFloatx80Sign( b );
6230     zSign = aSign ^ bSign;
6231     if ( aExp == 0x7FFF ) {
6232         if (    (uint64_t) ( aSig<<1 )
6233              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6234             return propagateFloatx80NaN(a, b, status);
6235         }
6236         if ( ( bExp | bSig ) == 0 ) goto invalid;
6237         return packFloatx80(zSign, floatx80_infinity_high,
6238                                    floatx80_infinity_low);
6239     }
6240     if ( bExp == 0x7FFF ) {
6241         if ((uint64_t)(bSig << 1)) {
6242             return propagateFloatx80NaN(a, b, status);
6243         }
6244         if ( ( aExp | aSig ) == 0 ) {
6245  invalid:
6246             float_raise(float_flag_invalid, status);
6247             return floatx80_default_nan(status);
6248         }
6249         return packFloatx80(zSign, floatx80_infinity_high,
6250                                    floatx80_infinity_low);
6251     }
6252     if ( aExp == 0 ) {
6253         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6254         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6255     }
6256     if ( bExp == 0 ) {
6257         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6258         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6259     }
6260     zExp = aExp + bExp - 0x3FFE;
6261     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6262     if ( 0 < (int64_t) zSig0 ) {
6263         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6264         --zExp;
6265     }
6266     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6267                                 zSign, zExp, zSig0, zSig1, status);
6268 }
6269 
6270 /*----------------------------------------------------------------------------
6271 | Returns the result of dividing the extended double-precision floating-point
6272 | value `a' by the corresponding value `b'.  The operation is performed
6273 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6274 *----------------------------------------------------------------------------*/
6275 
6276 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6277 {
6278     bool aSign, bSign, zSign;
6279     int32_t aExp, bExp, zExp;
6280     uint64_t aSig, bSig, zSig0, zSig1;
6281     uint64_t rem0, rem1, rem2, term0, term1, term2;
6282 
6283     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6284         float_raise(float_flag_invalid, status);
6285         return floatx80_default_nan(status);
6286     }
6287     aSig = extractFloatx80Frac( a );
6288     aExp = extractFloatx80Exp( a );
6289     aSign = extractFloatx80Sign( a );
6290     bSig = extractFloatx80Frac( b );
6291     bExp = extractFloatx80Exp( b );
6292     bSign = extractFloatx80Sign( b );
6293     zSign = aSign ^ bSign;
6294     if ( aExp == 0x7FFF ) {
6295         if ((uint64_t)(aSig << 1)) {
6296             return propagateFloatx80NaN(a, b, status);
6297         }
6298         if ( bExp == 0x7FFF ) {
6299             if ((uint64_t)(bSig << 1)) {
6300                 return propagateFloatx80NaN(a, b, status);
6301             }
6302             goto invalid;
6303         }
6304         return packFloatx80(zSign, floatx80_infinity_high,
6305                                    floatx80_infinity_low);
6306     }
6307     if ( bExp == 0x7FFF ) {
6308         if ((uint64_t)(bSig << 1)) {
6309             return propagateFloatx80NaN(a, b, status);
6310         }
6311         return packFloatx80( zSign, 0, 0 );
6312     }
6313     if ( bExp == 0 ) {
6314         if ( bSig == 0 ) {
6315             if ( ( aExp | aSig ) == 0 ) {
6316  invalid:
6317                 float_raise(float_flag_invalid, status);
6318                 return floatx80_default_nan(status);
6319             }
6320             float_raise(float_flag_divbyzero, status);
6321             return packFloatx80(zSign, floatx80_infinity_high,
6322                                        floatx80_infinity_low);
6323         }
6324         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6325     }
6326     if ( aExp == 0 ) {
6327         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6328         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6329     }
6330     zExp = aExp - bExp + 0x3FFE;
6331     rem1 = 0;
6332     if ( bSig <= aSig ) {
6333         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6334         ++zExp;
6335     }
6336     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6337     mul64To128( bSig, zSig0, &term0, &term1 );
6338     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6339     while ( (int64_t) rem0 < 0 ) {
6340         --zSig0;
6341         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6342     }
6343     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6344     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6345         mul64To128( bSig, zSig1, &term1, &term2 );
6346         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6347         while ( (int64_t) rem1 < 0 ) {
6348             --zSig1;
6349             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6350         }
6351         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6352     }
6353     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6354                                 zSign, zExp, zSig0, zSig1, status);
6355 }
6356 
6357 /*----------------------------------------------------------------------------
6358 | Returns the remainder of the extended double-precision floating-point value
6359 | `a' with respect to the corresponding value `b'.  The operation is performed
6360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6361 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6362 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6363 | the absolute value of the integer quotient.
6364 *----------------------------------------------------------------------------*/
6365 
6366 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6367                          float_status *status)
6368 {
6369     bool aSign, zSign;
6370     int32_t aExp, bExp, expDiff, aExpOrig;
6371     uint64_t aSig0, aSig1, bSig;
6372     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6373 
6374     *quotient = 0;
6375     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6376         float_raise(float_flag_invalid, status);
6377         return floatx80_default_nan(status);
6378     }
6379     aSig0 = extractFloatx80Frac( a );
6380     aExpOrig = aExp = extractFloatx80Exp( a );
6381     aSign = extractFloatx80Sign( a );
6382     bSig = extractFloatx80Frac( b );
6383     bExp = extractFloatx80Exp( b );
6384     if ( aExp == 0x7FFF ) {
6385         if (    (uint64_t) ( aSig0<<1 )
6386              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6387             return propagateFloatx80NaN(a, b, status);
6388         }
6389         goto invalid;
6390     }
6391     if ( bExp == 0x7FFF ) {
6392         if ((uint64_t)(bSig << 1)) {
6393             return propagateFloatx80NaN(a, b, status);
6394         }
6395         if (aExp == 0 && aSig0 >> 63) {
6396             /*
6397              * Pseudo-denormal argument must be returned in normalized
6398              * form.
6399              */
6400             return packFloatx80(aSign, 1, aSig0);
6401         }
6402         return a;
6403     }
6404     if ( bExp == 0 ) {
6405         if ( bSig == 0 ) {
6406  invalid:
6407             float_raise(float_flag_invalid, status);
6408             return floatx80_default_nan(status);
6409         }
6410         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6411     }
6412     if ( aExp == 0 ) {
6413         if ( aSig0 == 0 ) return a;
6414         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6415     }
6416     zSign = aSign;
6417     expDiff = aExp - bExp;
6418     aSig1 = 0;
6419     if ( expDiff < 0 ) {
6420         if ( mod || expDiff < -1 ) {
6421             if (aExp == 1 && aExpOrig == 0) {
6422                 /*
6423                  * Pseudo-denormal argument must be returned in
6424                  * normalized form.
6425                  */
6426                 return packFloatx80(aSign, aExp, aSig0);
6427             }
6428             return a;
6429         }
6430         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6431         expDiff = 0;
6432     }
6433     *quotient = q = ( bSig <= aSig0 );
6434     if ( q ) aSig0 -= bSig;
6435     expDiff -= 64;
6436     while ( 0 < expDiff ) {
6437         q = estimateDiv128To64( aSig0, aSig1, bSig );
6438         q = ( 2 < q ) ? q - 2 : 0;
6439         mul64To128( bSig, q, &term0, &term1 );
6440         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6441         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6442         expDiff -= 62;
6443         *quotient <<= 62;
6444         *quotient += q;
6445     }
6446     expDiff += 64;
6447     if ( 0 < expDiff ) {
6448         q = estimateDiv128To64( aSig0, aSig1, bSig );
6449         q = ( 2 < q ) ? q - 2 : 0;
6450         q >>= 64 - expDiff;
6451         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6452         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6453         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6454         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6455             ++q;
6456             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6457         }
6458         if (expDiff < 64) {
6459             *quotient <<= expDiff;
6460         } else {
6461             *quotient = 0;
6462         }
6463         *quotient += q;
6464     }
6465     else {
6466         term1 = 0;
6467         term0 = bSig;
6468     }
6469     if (!mod) {
6470         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6471         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6472                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6473                         && ( q & 1 ) )
6474             ) {
6475             aSig0 = alternateASig0;
6476             aSig1 = alternateASig1;
6477             zSign = ! zSign;
6478             ++*quotient;
6479         }
6480     }
6481     return
6482         normalizeRoundAndPackFloatx80(
6483             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6484 
6485 }
6486 
6487 /*----------------------------------------------------------------------------
6488 | Returns the remainder of the extended double-precision floating-point value
6489 | `a' with respect to the corresponding value `b'.  The operation is performed
6490 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6491 *----------------------------------------------------------------------------*/
6492 
6493 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6494 {
6495     uint64_t quotient;
6496     return floatx80_modrem(a, b, false, &quotient, status);
6497 }
6498 
6499 /*----------------------------------------------------------------------------
6500 | Returns the remainder of the extended double-precision floating-point value
6501 | `a' with respect to the corresponding value `b', with the quotient truncated
6502 | toward zero.
6503 *----------------------------------------------------------------------------*/
6504 
6505 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6506 {
6507     uint64_t quotient;
6508     return floatx80_modrem(a, b, true, &quotient, status);
6509 }
6510 
6511 /*----------------------------------------------------------------------------
6512 | Returns the square root of the extended double-precision floating-point
6513 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6514 | for Binary Floating-Point Arithmetic.
6515 *----------------------------------------------------------------------------*/
6516 
6517 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6518 {
6519     bool aSign;
6520     int32_t aExp, zExp;
6521     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6522     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6523 
6524     if (floatx80_invalid_encoding(a)) {
6525         float_raise(float_flag_invalid, status);
6526         return floatx80_default_nan(status);
6527     }
6528     aSig0 = extractFloatx80Frac( a );
6529     aExp = extractFloatx80Exp( a );
6530     aSign = extractFloatx80Sign( a );
6531     if ( aExp == 0x7FFF ) {
6532         if ((uint64_t)(aSig0 << 1)) {
6533             return propagateFloatx80NaN(a, a, status);
6534         }
6535         if ( ! aSign ) return a;
6536         goto invalid;
6537     }
6538     if ( aSign ) {
6539         if ( ( aExp | aSig0 ) == 0 ) return a;
6540  invalid:
6541         float_raise(float_flag_invalid, status);
6542         return floatx80_default_nan(status);
6543     }
6544     if ( aExp == 0 ) {
6545         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6546         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6547     }
6548     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6549     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6550     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6551     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6552     doubleZSig0 = zSig0<<1;
6553     mul64To128( zSig0, zSig0, &term0, &term1 );
6554     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6555     while ( (int64_t) rem0 < 0 ) {
6556         --zSig0;
6557         doubleZSig0 -= 2;
6558         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6559     }
6560     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6561     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6562         if ( zSig1 == 0 ) zSig1 = 1;
6563         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6564         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6565         mul64To128( zSig1, zSig1, &term2, &term3 );
6566         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6567         while ( (int64_t) rem1 < 0 ) {
6568             --zSig1;
6569             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6570             term3 |= 1;
6571             term2 |= doubleZSig0;
6572             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6573         }
6574         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6575     }
6576     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6577     zSig0 |= doubleZSig0;
6578     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6579                                 0, zExp, zSig0, zSig1, status);
6580 }
6581 
6582 /*----------------------------------------------------------------------------
6583 | Returns the result of converting the quadruple-precision floating-point
6584 | value `a' to the 32-bit two's complement integer format.  The conversion
6585 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6586 | Arithmetic---which means in particular that the conversion is rounded
6587 | according to the current rounding mode.  If `a' is a NaN, the largest
6588 | positive integer is returned.  Otherwise, if the conversion overflows, the
6589 | largest integer with the same sign as `a' is returned.
6590 *----------------------------------------------------------------------------*/
6591 
6592 int32_t float128_to_int32(float128 a, float_status *status)
6593 {
6594     bool aSign;
6595     int32_t aExp, shiftCount;
6596     uint64_t aSig0, aSig1;
6597 
6598     aSig1 = extractFloat128Frac1( a );
6599     aSig0 = extractFloat128Frac0( a );
6600     aExp = extractFloat128Exp( a );
6601     aSign = extractFloat128Sign( a );
6602     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6603     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6604     aSig0 |= ( aSig1 != 0 );
6605     shiftCount = 0x4028 - aExp;
6606     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6607     return roundAndPackInt32(aSign, aSig0, status);
6608 
6609 }
6610 
6611 /*----------------------------------------------------------------------------
6612 | Returns the result of converting the quadruple-precision floating-point
6613 | value `a' to the 32-bit two's complement integer format.  The conversion
6614 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6615 | Arithmetic, except that the conversion is always rounded toward zero.  If
6616 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6617 | conversion overflows, the largest integer with the same sign as `a' is
6618 | returned.
6619 *----------------------------------------------------------------------------*/
6620 
6621 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6622 {
6623     bool aSign;
6624     int32_t aExp, shiftCount;
6625     uint64_t aSig0, aSig1, savedASig;
6626     int32_t z;
6627 
6628     aSig1 = extractFloat128Frac1( a );
6629     aSig0 = extractFloat128Frac0( a );
6630     aExp = extractFloat128Exp( a );
6631     aSign = extractFloat128Sign( a );
6632     aSig0 |= ( aSig1 != 0 );
6633     if ( 0x401E < aExp ) {
6634         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6635         goto invalid;
6636     }
6637     else if ( aExp < 0x3FFF ) {
6638         if (aExp || aSig0) {
6639             float_raise(float_flag_inexact, status);
6640         }
6641         return 0;
6642     }
6643     aSig0 |= UINT64_C(0x0001000000000000);
6644     shiftCount = 0x402F - aExp;
6645     savedASig = aSig0;
6646     aSig0 >>= shiftCount;
6647     z = aSig0;
6648     if ( aSign ) z = - z;
6649     if ( ( z < 0 ) ^ aSign ) {
6650  invalid:
6651         float_raise(float_flag_invalid, status);
6652         return aSign ? INT32_MIN : INT32_MAX;
6653     }
6654     if ( ( aSig0<<shiftCount ) != savedASig ) {
6655         float_raise(float_flag_inexact, status);
6656     }
6657     return z;
6658 
6659 }
6660 
6661 /*----------------------------------------------------------------------------
6662 | Returns the result of converting the quadruple-precision floating-point
6663 | value `a' to the 64-bit two's complement integer format.  The conversion
6664 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6665 | Arithmetic---which means in particular that the conversion is rounded
6666 | according to the current rounding mode.  If `a' is a NaN, the largest
6667 | positive integer is returned.  Otherwise, if the conversion overflows, the
6668 | largest integer with the same sign as `a' is returned.
6669 *----------------------------------------------------------------------------*/
6670 
6671 int64_t float128_to_int64(float128 a, float_status *status)
6672 {
6673     bool aSign;
6674     int32_t aExp, shiftCount;
6675     uint64_t aSig0, aSig1;
6676 
6677     aSig1 = extractFloat128Frac1( a );
6678     aSig0 = extractFloat128Frac0( a );
6679     aExp = extractFloat128Exp( a );
6680     aSign = extractFloat128Sign( a );
6681     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6682     shiftCount = 0x402F - aExp;
6683     if ( shiftCount <= 0 ) {
6684         if ( 0x403E < aExp ) {
6685             float_raise(float_flag_invalid, status);
6686             if (    ! aSign
6687                  || (    ( aExp == 0x7FFF )
6688                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6689                     )
6690                ) {
6691                 return INT64_MAX;
6692             }
6693             return INT64_MIN;
6694         }
6695         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6696     }
6697     else {
6698         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6699     }
6700     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6701 
6702 }
6703 
6704 /*----------------------------------------------------------------------------
6705 | Returns the result of converting the quadruple-precision floating-point
6706 | value `a' to the 64-bit two's complement integer format.  The conversion
6707 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6708 | Arithmetic, except that the conversion is always rounded toward zero.
6709 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6710 | the conversion overflows, the largest integer with the same sign as `a' is
6711 | returned.
6712 *----------------------------------------------------------------------------*/
6713 
6714 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6715 {
6716     bool aSign;
6717     int32_t aExp, shiftCount;
6718     uint64_t aSig0, aSig1;
6719     int64_t z;
6720 
6721     aSig1 = extractFloat128Frac1( a );
6722     aSig0 = extractFloat128Frac0( a );
6723     aExp = extractFloat128Exp( a );
6724     aSign = extractFloat128Sign( a );
6725     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6726     shiftCount = aExp - 0x402F;
6727     if ( 0 < shiftCount ) {
6728         if ( 0x403E <= aExp ) {
6729             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6730             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6731                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6732                 if (aSig1) {
6733                     float_raise(float_flag_inexact, status);
6734                 }
6735             }
6736             else {
6737                 float_raise(float_flag_invalid, status);
6738                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6739                     return INT64_MAX;
6740                 }
6741             }
6742             return INT64_MIN;
6743         }
6744         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6745         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6746             float_raise(float_flag_inexact, status);
6747         }
6748     }
6749     else {
6750         if ( aExp < 0x3FFF ) {
6751             if ( aExp | aSig0 | aSig1 ) {
6752                 float_raise(float_flag_inexact, status);
6753             }
6754             return 0;
6755         }
6756         z = aSig0>>( - shiftCount );
6757         if (    aSig1
6758              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6759             float_raise(float_flag_inexact, status);
6760         }
6761     }
6762     if ( aSign ) z = - z;
6763     return z;
6764 
6765 }
6766 
6767 /*----------------------------------------------------------------------------
6768 | Returns the result of converting the quadruple-precision floating-point value
6769 | `a' to the 64-bit unsigned integer format.  The conversion is
6770 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6771 | Arithmetic---which means in particular that the conversion is rounded
6772 | according to the current rounding mode.  If `a' is a NaN, the largest
6773 | positive integer is returned.  If the conversion overflows, the
6774 | largest unsigned integer is returned.  If 'a' is negative, the value is
6775 | rounded and zero is returned; negative values that do not round to zero
6776 | will raise the inexact exception.
6777 *----------------------------------------------------------------------------*/
6778 
6779 uint64_t float128_to_uint64(float128 a, float_status *status)
6780 {
6781     bool aSign;
6782     int aExp;
6783     int shiftCount;
6784     uint64_t aSig0, aSig1;
6785 
6786     aSig0 = extractFloat128Frac0(a);
6787     aSig1 = extractFloat128Frac1(a);
6788     aExp = extractFloat128Exp(a);
6789     aSign = extractFloat128Sign(a);
6790     if (aSign && (aExp > 0x3FFE)) {
6791         float_raise(float_flag_invalid, status);
6792         if (float128_is_any_nan(a)) {
6793             return UINT64_MAX;
6794         } else {
6795             return 0;
6796         }
6797     }
6798     if (aExp) {
6799         aSig0 |= UINT64_C(0x0001000000000000);
6800     }
6801     shiftCount = 0x402F - aExp;
6802     if (shiftCount <= 0) {
6803         if (0x403E < aExp) {
6804             float_raise(float_flag_invalid, status);
6805             return UINT64_MAX;
6806         }
6807         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6808     } else {
6809         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6810     }
6811     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6812 }
6813 
6814 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6815 {
6816     uint64_t v;
6817     signed char current_rounding_mode = status->float_rounding_mode;
6818 
6819     set_float_rounding_mode(float_round_to_zero, status);
6820     v = float128_to_uint64(a, status);
6821     set_float_rounding_mode(current_rounding_mode, status);
6822 
6823     return v;
6824 }
6825 
6826 /*----------------------------------------------------------------------------
6827 | Returns the result of converting the quadruple-precision floating-point
6828 | value `a' to the 32-bit unsigned integer format.  The conversion
6829 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6830 | Arithmetic except that the conversion is always rounded toward zero.
6831 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6832 | if the conversion overflows, the largest unsigned integer is returned.
6833 | If 'a' is negative, the value is rounded and zero is returned; negative
6834 | values that do not round to zero will raise the inexact exception.
6835 *----------------------------------------------------------------------------*/
6836 
6837 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6838 {
6839     uint64_t v;
6840     uint32_t res;
6841     int old_exc_flags = get_float_exception_flags(status);
6842 
6843     v = float128_to_uint64_round_to_zero(a, status);
6844     if (v > 0xffffffff) {
6845         res = 0xffffffff;
6846     } else {
6847         return v;
6848     }
6849     set_float_exception_flags(old_exc_flags, status);
6850     float_raise(float_flag_invalid, status);
6851     return res;
6852 }
6853 
6854 /*----------------------------------------------------------------------------
6855 | Returns the result of converting the quadruple-precision floating-point value
6856 | `a' to the 32-bit unsigned integer format.  The conversion is
6857 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6858 | Arithmetic---which means in particular that the conversion is rounded
6859 | according to the current rounding mode.  If `a' is a NaN, the largest
6860 | positive integer is returned.  If the conversion overflows, the
6861 | largest unsigned integer is returned.  If 'a' is negative, the value is
6862 | rounded and zero is returned; negative values that do not round to zero
6863 | will raise the inexact exception.
6864 *----------------------------------------------------------------------------*/
6865 
6866 uint32_t float128_to_uint32(float128 a, float_status *status)
6867 {
6868     uint64_t v;
6869     uint32_t res;
6870     int old_exc_flags = get_float_exception_flags(status);
6871 
6872     v = float128_to_uint64(a, status);
6873     if (v > 0xffffffff) {
6874         res = 0xffffffff;
6875     } else {
6876         return v;
6877     }
6878     set_float_exception_flags(old_exc_flags, status);
6879     float_raise(float_flag_invalid, status);
6880     return res;
6881 }
6882 
6883 /*----------------------------------------------------------------------------
6884 | Returns the result of converting the quadruple-precision floating-point
6885 | value `a' to the single-precision floating-point format.  The conversion
6886 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6887 | Arithmetic.
6888 *----------------------------------------------------------------------------*/
6889 
6890 float32 float128_to_float32(float128 a, float_status *status)
6891 {
6892     bool aSign;
6893     int32_t aExp;
6894     uint64_t aSig0, aSig1;
6895     uint32_t zSig;
6896 
6897     aSig1 = extractFloat128Frac1( a );
6898     aSig0 = extractFloat128Frac0( a );
6899     aExp = extractFloat128Exp( a );
6900     aSign = extractFloat128Sign( a );
6901     if ( aExp == 0x7FFF ) {
6902         if ( aSig0 | aSig1 ) {
6903             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6904         }
6905         return packFloat32( aSign, 0xFF, 0 );
6906     }
6907     aSig0 |= ( aSig1 != 0 );
6908     shift64RightJamming( aSig0, 18, &aSig0 );
6909     zSig = aSig0;
6910     if ( aExp || zSig ) {
6911         zSig |= 0x40000000;
6912         aExp -= 0x3F81;
6913     }
6914     return roundAndPackFloat32(aSign, aExp, zSig, status);
6915 
6916 }
6917 
6918 /*----------------------------------------------------------------------------
6919 | Returns the result of converting the quadruple-precision floating-point
6920 | value `a' to the double-precision floating-point format.  The conversion
6921 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6922 | Arithmetic.
6923 *----------------------------------------------------------------------------*/
6924 
6925 float64 float128_to_float64(float128 a, float_status *status)
6926 {
6927     bool aSign;
6928     int32_t aExp;
6929     uint64_t aSig0, aSig1;
6930 
6931     aSig1 = extractFloat128Frac1( a );
6932     aSig0 = extractFloat128Frac0( a );
6933     aExp = extractFloat128Exp( a );
6934     aSign = extractFloat128Sign( a );
6935     if ( aExp == 0x7FFF ) {
6936         if ( aSig0 | aSig1 ) {
6937             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6938         }
6939         return packFloat64( aSign, 0x7FF, 0 );
6940     }
6941     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6942     aSig0 |= ( aSig1 != 0 );
6943     if ( aExp || aSig0 ) {
6944         aSig0 |= UINT64_C(0x4000000000000000);
6945         aExp -= 0x3C01;
6946     }
6947     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6948 
6949 }
6950 
6951 /*----------------------------------------------------------------------------
6952 | Returns the result of converting the quadruple-precision floating-point
6953 | value `a' to the extended double-precision floating-point format.  The
6954 | conversion is performed according to the IEC/IEEE Standard for Binary
6955 | Floating-Point Arithmetic.
6956 *----------------------------------------------------------------------------*/
6957 
6958 floatx80 float128_to_floatx80(float128 a, float_status *status)
6959 {
6960     bool aSign;
6961     int32_t aExp;
6962     uint64_t aSig0, aSig1;
6963 
6964     aSig1 = extractFloat128Frac1( a );
6965     aSig0 = extractFloat128Frac0( a );
6966     aExp = extractFloat128Exp( a );
6967     aSign = extractFloat128Sign( a );
6968     if ( aExp == 0x7FFF ) {
6969         if ( aSig0 | aSig1 ) {
6970             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6971                                                status);
6972             return floatx80_silence_nan(res, status);
6973         }
6974         return packFloatx80(aSign, floatx80_infinity_high,
6975                                    floatx80_infinity_low);
6976     }
6977     if ( aExp == 0 ) {
6978         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6979         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6980     }
6981     else {
6982         aSig0 |= UINT64_C(0x0001000000000000);
6983     }
6984     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6985     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6986 
6987 }
6988 
6989 /*----------------------------------------------------------------------------
6990 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6991 | returns the result as a quadruple-precision floating-point value.  The
6992 | operation is performed according to the IEC/IEEE Standard for Binary
6993 | Floating-Point Arithmetic.
6994 *----------------------------------------------------------------------------*/
6995 
6996 float128 float128_round_to_int(float128 a, float_status *status)
6997 {
6998     bool aSign;
6999     int32_t aExp;
7000     uint64_t lastBitMask, roundBitsMask;
7001     float128 z;
7002 
7003     aExp = extractFloat128Exp( a );
7004     if ( 0x402F <= aExp ) {
7005         if ( 0x406F <= aExp ) {
7006             if (    ( aExp == 0x7FFF )
7007                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
7008                ) {
7009                 return propagateFloat128NaN(a, a, status);
7010             }
7011             return a;
7012         }
7013         lastBitMask = 1;
7014         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
7015         roundBitsMask = lastBitMask - 1;
7016         z = a;
7017         switch (status->float_rounding_mode) {
7018         case float_round_nearest_even:
7019             if ( lastBitMask ) {
7020                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
7021                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
7022             }
7023             else {
7024                 if ( (int64_t) z.low < 0 ) {
7025                     ++z.high;
7026                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7027                 }
7028             }
7029             break;
7030         case float_round_ties_away:
7031             if (lastBitMask) {
7032                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7033             } else {
7034                 if ((int64_t) z.low < 0) {
7035                     ++z.high;
7036                 }
7037             }
7038             break;
7039         case float_round_to_zero:
7040             break;
7041         case float_round_up:
7042             if (!extractFloat128Sign(z)) {
7043                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7044             }
7045             break;
7046         case float_round_down:
7047             if (extractFloat128Sign(z)) {
7048                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7049             }
7050             break;
7051         case float_round_to_odd:
7052             /*
7053              * Note that if lastBitMask == 0, the last bit is the lsb
7054              * of high, and roundBitsMask == -1.
7055              */
7056             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7057                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7058             }
7059             break;
7060         default:
7061             abort();
7062         }
7063         z.low &= ~ roundBitsMask;
7064     }
7065     else {
7066         if ( aExp < 0x3FFF ) {
7067             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7068             float_raise(float_flag_inexact, status);
7069             aSign = extractFloat128Sign( a );
7070             switch (status->float_rounding_mode) {
7071             case float_round_nearest_even:
7072                 if (    ( aExp == 0x3FFE )
7073                      && (   extractFloat128Frac0( a )
7074                           | extractFloat128Frac1( a ) )
7075                    ) {
7076                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7077                 }
7078                 break;
7079             case float_round_ties_away:
7080                 if (aExp == 0x3FFE) {
7081                     return packFloat128(aSign, 0x3FFF, 0, 0);
7082                 }
7083                 break;
7084             case float_round_down:
7085                 return
7086                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7087                     : packFloat128( 0, 0, 0, 0 );
7088             case float_round_up:
7089                 return
7090                       aSign ? packFloat128( 1, 0, 0, 0 )
7091                     : packFloat128( 0, 0x3FFF, 0, 0 );
7092 
7093             case float_round_to_odd:
7094                 return packFloat128(aSign, 0x3FFF, 0, 0);
7095 
7096             case float_round_to_zero:
7097                 break;
7098             }
7099             return packFloat128( aSign, 0, 0, 0 );
7100         }
7101         lastBitMask = 1;
7102         lastBitMask <<= 0x402F - aExp;
7103         roundBitsMask = lastBitMask - 1;
7104         z.low = 0;
7105         z.high = a.high;
7106         switch (status->float_rounding_mode) {
7107         case float_round_nearest_even:
7108             z.high += lastBitMask>>1;
7109             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7110                 z.high &= ~ lastBitMask;
7111             }
7112             break;
7113         case float_round_ties_away:
7114             z.high += lastBitMask>>1;
7115             break;
7116         case float_round_to_zero:
7117             break;
7118         case float_round_up:
7119             if (!extractFloat128Sign(z)) {
7120                 z.high |= ( a.low != 0 );
7121                 z.high += roundBitsMask;
7122             }
7123             break;
7124         case float_round_down:
7125             if (extractFloat128Sign(z)) {
7126                 z.high |= (a.low != 0);
7127                 z.high += roundBitsMask;
7128             }
7129             break;
7130         case float_round_to_odd:
7131             if ((z.high & lastBitMask) == 0) {
7132                 z.high |= (a.low != 0);
7133                 z.high += roundBitsMask;
7134             }
7135             break;
7136         default:
7137             abort();
7138         }
7139         z.high &= ~ roundBitsMask;
7140     }
7141     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7142         float_raise(float_flag_inexact, status);
7143     }
7144     return z;
7145 
7146 }
7147 
7148 /*----------------------------------------------------------------------------
7149 | Returns the result of adding the absolute values of the quadruple-precision
7150 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7151 | before being returned.  `zSign' is ignored if the result is a NaN.
7152 | The addition is performed according to the IEC/IEEE Standard for Binary
7153 | Floating-Point Arithmetic.
7154 *----------------------------------------------------------------------------*/
7155 
7156 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
7157                                 float_status *status)
7158 {
7159     int32_t aExp, bExp, zExp;
7160     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7161     int32_t expDiff;
7162 
7163     aSig1 = extractFloat128Frac1( a );
7164     aSig0 = extractFloat128Frac0( a );
7165     aExp = extractFloat128Exp( a );
7166     bSig1 = extractFloat128Frac1( b );
7167     bSig0 = extractFloat128Frac0( b );
7168     bExp = extractFloat128Exp( b );
7169     expDiff = aExp - bExp;
7170     if ( 0 < expDiff ) {
7171         if ( aExp == 0x7FFF ) {
7172             if (aSig0 | aSig1) {
7173                 return propagateFloat128NaN(a, b, status);
7174             }
7175             return a;
7176         }
7177         if ( bExp == 0 ) {
7178             --expDiff;
7179         }
7180         else {
7181             bSig0 |= UINT64_C(0x0001000000000000);
7182         }
7183         shift128ExtraRightJamming(
7184             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7185         zExp = aExp;
7186     }
7187     else if ( expDiff < 0 ) {
7188         if ( bExp == 0x7FFF ) {
7189             if (bSig0 | bSig1) {
7190                 return propagateFloat128NaN(a, b, status);
7191             }
7192             return packFloat128( zSign, 0x7FFF, 0, 0 );
7193         }
7194         if ( aExp == 0 ) {
7195             ++expDiff;
7196         }
7197         else {
7198             aSig0 |= UINT64_C(0x0001000000000000);
7199         }
7200         shift128ExtraRightJamming(
7201             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7202         zExp = bExp;
7203     }
7204     else {
7205         if ( aExp == 0x7FFF ) {
7206             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7207                 return propagateFloat128NaN(a, b, status);
7208             }
7209             return a;
7210         }
7211         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7212         if ( aExp == 0 ) {
7213             if (status->flush_to_zero) {
7214                 if (zSig0 | zSig1) {
7215                     float_raise(float_flag_output_denormal, status);
7216                 }
7217                 return packFloat128(zSign, 0, 0, 0);
7218             }
7219             return packFloat128( zSign, 0, zSig0, zSig1 );
7220         }
7221         zSig2 = 0;
7222         zSig0 |= UINT64_C(0x0002000000000000);
7223         zExp = aExp;
7224         goto shiftRight1;
7225     }
7226     aSig0 |= UINT64_C(0x0001000000000000);
7227     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7228     --zExp;
7229     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7230     ++zExp;
7231  shiftRight1:
7232     shift128ExtraRightJamming(
7233         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7234  roundAndPack:
7235     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7236 
7237 }
7238 
7239 /*----------------------------------------------------------------------------
7240 | Returns the result of subtracting the absolute values of the quadruple-
7241 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7242 | difference is negated before being returned.  `zSign' is ignored if the
7243 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7244 | Standard for Binary Floating-Point Arithmetic.
7245 *----------------------------------------------------------------------------*/
7246 
7247 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
7248                                 float_status *status)
7249 {
7250     int32_t aExp, bExp, zExp;
7251     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7252     int32_t expDiff;
7253 
7254     aSig1 = extractFloat128Frac1( a );
7255     aSig0 = extractFloat128Frac0( a );
7256     aExp = extractFloat128Exp( a );
7257     bSig1 = extractFloat128Frac1( b );
7258     bSig0 = extractFloat128Frac0( b );
7259     bExp = extractFloat128Exp( b );
7260     expDiff = aExp - bExp;
7261     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7262     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7263     if ( 0 < expDiff ) goto aExpBigger;
7264     if ( expDiff < 0 ) goto bExpBigger;
7265     if ( aExp == 0x7FFF ) {
7266         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7267             return propagateFloat128NaN(a, b, status);
7268         }
7269         float_raise(float_flag_invalid, status);
7270         return float128_default_nan(status);
7271     }
7272     if ( aExp == 0 ) {
7273         aExp = 1;
7274         bExp = 1;
7275     }
7276     if ( bSig0 < aSig0 ) goto aBigger;
7277     if ( aSig0 < bSig0 ) goto bBigger;
7278     if ( bSig1 < aSig1 ) goto aBigger;
7279     if ( aSig1 < bSig1 ) goto bBigger;
7280     return packFloat128(status->float_rounding_mode == float_round_down,
7281                         0, 0, 0);
7282  bExpBigger:
7283     if ( bExp == 0x7FFF ) {
7284         if (bSig0 | bSig1) {
7285             return propagateFloat128NaN(a, b, status);
7286         }
7287         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7288     }
7289     if ( aExp == 0 ) {
7290         ++expDiff;
7291     }
7292     else {
7293         aSig0 |= UINT64_C(0x4000000000000000);
7294     }
7295     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7296     bSig0 |= UINT64_C(0x4000000000000000);
7297  bBigger:
7298     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7299     zExp = bExp;
7300     zSign ^= 1;
7301     goto normalizeRoundAndPack;
7302  aExpBigger:
7303     if ( aExp == 0x7FFF ) {
7304         if (aSig0 | aSig1) {
7305             return propagateFloat128NaN(a, b, status);
7306         }
7307         return a;
7308     }
7309     if ( bExp == 0 ) {
7310         --expDiff;
7311     }
7312     else {
7313         bSig0 |= UINT64_C(0x4000000000000000);
7314     }
7315     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7316     aSig0 |= UINT64_C(0x4000000000000000);
7317  aBigger:
7318     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7319     zExp = aExp;
7320  normalizeRoundAndPack:
7321     --zExp;
7322     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7323                                          status);
7324 
7325 }
7326 
7327 /*----------------------------------------------------------------------------
7328 | Returns the result of adding the quadruple-precision floating-point values
7329 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7330 | for Binary Floating-Point Arithmetic.
7331 *----------------------------------------------------------------------------*/
7332 
7333 float128 float128_add(float128 a, float128 b, float_status *status)
7334 {
7335     bool aSign, bSign;
7336 
7337     aSign = extractFloat128Sign( a );
7338     bSign = extractFloat128Sign( b );
7339     if ( aSign == bSign ) {
7340         return addFloat128Sigs(a, b, aSign, status);
7341     }
7342     else {
7343         return subFloat128Sigs(a, b, aSign, status);
7344     }
7345 
7346 }
7347 
7348 /*----------------------------------------------------------------------------
7349 | Returns the result of subtracting the quadruple-precision floating-point
7350 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7351 | Standard for Binary Floating-Point Arithmetic.
7352 *----------------------------------------------------------------------------*/
7353 
7354 float128 float128_sub(float128 a, float128 b, float_status *status)
7355 {
7356     bool aSign, bSign;
7357 
7358     aSign = extractFloat128Sign( a );
7359     bSign = extractFloat128Sign( b );
7360     if ( aSign == bSign ) {
7361         return subFloat128Sigs(a, b, aSign, status);
7362     }
7363     else {
7364         return addFloat128Sigs(a, b, aSign, status);
7365     }
7366 
7367 }
7368 
7369 /*----------------------------------------------------------------------------
7370 | Returns the result of multiplying the quadruple-precision floating-point
7371 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7372 | Standard for Binary Floating-Point Arithmetic.
7373 *----------------------------------------------------------------------------*/
7374 
7375 float128 float128_mul(float128 a, float128 b, float_status *status)
7376 {
7377     bool aSign, bSign, zSign;
7378     int32_t aExp, bExp, zExp;
7379     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7380 
7381     aSig1 = extractFloat128Frac1( a );
7382     aSig0 = extractFloat128Frac0( a );
7383     aExp = extractFloat128Exp( a );
7384     aSign = extractFloat128Sign( a );
7385     bSig1 = extractFloat128Frac1( b );
7386     bSig0 = extractFloat128Frac0( b );
7387     bExp = extractFloat128Exp( b );
7388     bSign = extractFloat128Sign( b );
7389     zSign = aSign ^ bSign;
7390     if ( aExp == 0x7FFF ) {
7391         if (    ( aSig0 | aSig1 )
7392              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7393             return propagateFloat128NaN(a, b, status);
7394         }
7395         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7396         return packFloat128( zSign, 0x7FFF, 0, 0 );
7397     }
7398     if ( bExp == 0x7FFF ) {
7399         if (bSig0 | bSig1) {
7400             return propagateFloat128NaN(a, b, status);
7401         }
7402         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7403  invalid:
7404             float_raise(float_flag_invalid, status);
7405             return float128_default_nan(status);
7406         }
7407         return packFloat128( zSign, 0x7FFF, 0, 0 );
7408     }
7409     if ( aExp == 0 ) {
7410         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7411         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7412     }
7413     if ( bExp == 0 ) {
7414         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7415         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7416     }
7417     zExp = aExp + bExp - 0x4000;
7418     aSig0 |= UINT64_C(0x0001000000000000);
7419     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7420     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7421     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7422     zSig2 |= ( zSig3 != 0 );
7423     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7424         shift128ExtraRightJamming(
7425             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7426         ++zExp;
7427     }
7428     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7429 
7430 }
7431 
7432 /*----------------------------------------------------------------------------
7433 | Returns the result of dividing the quadruple-precision floating-point value
7434 | `a' by the corresponding value `b'.  The operation is performed according to
7435 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7436 *----------------------------------------------------------------------------*/
7437 
7438 float128 float128_div(float128 a, float128 b, float_status *status)
7439 {
7440     bool aSign, bSign, zSign;
7441     int32_t aExp, bExp, zExp;
7442     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7443     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7444 
7445     aSig1 = extractFloat128Frac1( a );
7446     aSig0 = extractFloat128Frac0( a );
7447     aExp = extractFloat128Exp( a );
7448     aSign = extractFloat128Sign( a );
7449     bSig1 = extractFloat128Frac1( b );
7450     bSig0 = extractFloat128Frac0( b );
7451     bExp = extractFloat128Exp( b );
7452     bSign = extractFloat128Sign( b );
7453     zSign = aSign ^ bSign;
7454     if ( aExp == 0x7FFF ) {
7455         if (aSig0 | aSig1) {
7456             return propagateFloat128NaN(a, b, status);
7457         }
7458         if ( bExp == 0x7FFF ) {
7459             if (bSig0 | bSig1) {
7460                 return propagateFloat128NaN(a, b, status);
7461             }
7462             goto invalid;
7463         }
7464         return packFloat128( zSign, 0x7FFF, 0, 0 );
7465     }
7466     if ( bExp == 0x7FFF ) {
7467         if (bSig0 | bSig1) {
7468             return propagateFloat128NaN(a, b, status);
7469         }
7470         return packFloat128( zSign, 0, 0, 0 );
7471     }
7472     if ( bExp == 0 ) {
7473         if ( ( bSig0 | bSig1 ) == 0 ) {
7474             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7475  invalid:
7476                 float_raise(float_flag_invalid, status);
7477                 return float128_default_nan(status);
7478             }
7479             float_raise(float_flag_divbyzero, status);
7480             return packFloat128( zSign, 0x7FFF, 0, 0 );
7481         }
7482         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7483     }
7484     if ( aExp == 0 ) {
7485         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7486         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7487     }
7488     zExp = aExp - bExp + 0x3FFD;
7489     shortShift128Left(
7490         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7491     shortShift128Left(
7492         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7493     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7494         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7495         ++zExp;
7496     }
7497     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7498     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7499     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7500     while ( (int64_t) rem0 < 0 ) {
7501         --zSig0;
7502         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7503     }
7504     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7505     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7506         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7507         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7508         while ( (int64_t) rem1 < 0 ) {
7509             --zSig1;
7510             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7511         }
7512         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7513     }
7514     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7515     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7516 
7517 }
7518 
7519 /*----------------------------------------------------------------------------
7520 | Returns the remainder of the quadruple-precision floating-point value `a'
7521 | with respect to the corresponding value `b'.  The operation is performed
7522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7523 *----------------------------------------------------------------------------*/
7524 
7525 float128 float128_rem(float128 a, float128 b, float_status *status)
7526 {
7527     bool aSign, zSign;
7528     int32_t aExp, bExp, expDiff;
7529     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7530     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7531     int64_t sigMean0;
7532 
7533     aSig1 = extractFloat128Frac1( a );
7534     aSig0 = extractFloat128Frac0( a );
7535     aExp = extractFloat128Exp( a );
7536     aSign = extractFloat128Sign( a );
7537     bSig1 = extractFloat128Frac1( b );
7538     bSig0 = extractFloat128Frac0( b );
7539     bExp = extractFloat128Exp( b );
7540     if ( aExp == 0x7FFF ) {
7541         if (    ( aSig0 | aSig1 )
7542              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7543             return propagateFloat128NaN(a, b, status);
7544         }
7545         goto invalid;
7546     }
7547     if ( bExp == 0x7FFF ) {
7548         if (bSig0 | bSig1) {
7549             return propagateFloat128NaN(a, b, status);
7550         }
7551         return a;
7552     }
7553     if ( bExp == 0 ) {
7554         if ( ( bSig0 | bSig1 ) == 0 ) {
7555  invalid:
7556             float_raise(float_flag_invalid, status);
7557             return float128_default_nan(status);
7558         }
7559         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7560     }
7561     if ( aExp == 0 ) {
7562         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7563         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7564     }
7565     expDiff = aExp - bExp;
7566     if ( expDiff < -1 ) return a;
7567     shortShift128Left(
7568         aSig0 | UINT64_C(0x0001000000000000),
7569         aSig1,
7570         15 - ( expDiff < 0 ),
7571         &aSig0,
7572         &aSig1
7573     );
7574     shortShift128Left(
7575         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7576     q = le128( bSig0, bSig1, aSig0, aSig1 );
7577     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7578     expDiff -= 64;
7579     while ( 0 < expDiff ) {
7580         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7581         q = ( 4 < q ) ? q - 4 : 0;
7582         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7583         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7584         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7585         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7586         expDiff -= 61;
7587     }
7588     if ( -64 < expDiff ) {
7589         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7590         q = ( 4 < q ) ? q - 4 : 0;
7591         q >>= - expDiff;
7592         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7593         expDiff += 52;
7594         if ( expDiff < 0 ) {
7595             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7596         }
7597         else {
7598             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7599         }
7600         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7601         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7602     }
7603     else {
7604         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7605         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7606     }
7607     do {
7608         alternateASig0 = aSig0;
7609         alternateASig1 = aSig1;
7610         ++q;
7611         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7612     } while ( 0 <= (int64_t) aSig0 );
7613     add128(
7614         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7615     if (    ( sigMean0 < 0 )
7616          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7617         aSig0 = alternateASig0;
7618         aSig1 = alternateASig1;
7619     }
7620     zSign = ( (int64_t) aSig0 < 0 );
7621     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7622     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7623                                          status);
7624 }
7625 
7626 /*----------------------------------------------------------------------------
7627 | Returns the square root of the quadruple-precision floating-point value `a'.
7628 | The operation is performed according to the IEC/IEEE Standard for Binary
7629 | Floating-Point Arithmetic.
7630 *----------------------------------------------------------------------------*/
7631 
7632 float128 float128_sqrt(float128 a, float_status *status)
7633 {
7634     bool aSign;
7635     int32_t aExp, zExp;
7636     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7637     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7638 
7639     aSig1 = extractFloat128Frac1( a );
7640     aSig0 = extractFloat128Frac0( a );
7641     aExp = extractFloat128Exp( a );
7642     aSign = extractFloat128Sign( a );
7643     if ( aExp == 0x7FFF ) {
7644         if (aSig0 | aSig1) {
7645             return propagateFloat128NaN(a, a, status);
7646         }
7647         if ( ! aSign ) return a;
7648         goto invalid;
7649     }
7650     if ( aSign ) {
7651         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7652  invalid:
7653         float_raise(float_flag_invalid, status);
7654         return float128_default_nan(status);
7655     }
7656     if ( aExp == 0 ) {
7657         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7658         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7659     }
7660     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7661     aSig0 |= UINT64_C(0x0001000000000000);
7662     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7663     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7664     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7665     doubleZSig0 = zSig0<<1;
7666     mul64To128( zSig0, zSig0, &term0, &term1 );
7667     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7668     while ( (int64_t) rem0 < 0 ) {
7669         --zSig0;
7670         doubleZSig0 -= 2;
7671         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7672     }
7673     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7674     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7675         if ( zSig1 == 0 ) zSig1 = 1;
7676         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7677         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7678         mul64To128( zSig1, zSig1, &term2, &term3 );
7679         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7680         while ( (int64_t) rem1 < 0 ) {
7681             --zSig1;
7682             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7683             term3 |= 1;
7684             term2 |= doubleZSig0;
7685             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7686         }
7687         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7688     }
7689     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7690     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7691 
7692 }
7693 
7694 static inline FloatRelation
7695 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7696                           float_status *status)
7697 {
7698     bool aSign, bSign;
7699 
7700     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7701         float_raise(float_flag_invalid, status);
7702         return float_relation_unordered;
7703     }
7704     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7705           ( extractFloatx80Frac( a )<<1 ) ) ||
7706         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7707           ( extractFloatx80Frac( b )<<1 ) )) {
7708         if (!is_quiet ||
7709             floatx80_is_signaling_nan(a, status) ||
7710             floatx80_is_signaling_nan(b, status)) {
7711             float_raise(float_flag_invalid, status);
7712         }
7713         return float_relation_unordered;
7714     }
7715     aSign = extractFloatx80Sign( a );
7716     bSign = extractFloatx80Sign( b );
7717     if ( aSign != bSign ) {
7718 
7719         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7720              ( ( a.low | b.low ) == 0 ) ) {
7721             /* zero case */
7722             return float_relation_equal;
7723         } else {
7724             return 1 - (2 * aSign);
7725         }
7726     } else {
7727         /* Normalize pseudo-denormals before comparison.  */
7728         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7729             ++a.high;
7730         }
7731         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7732             ++b.high;
7733         }
7734         if (a.low == b.low && a.high == b.high) {
7735             return float_relation_equal;
7736         } else {
7737             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7738         }
7739     }
7740 }
7741 
7742 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7743 {
7744     return floatx80_compare_internal(a, b, 0, status);
7745 }
7746 
7747 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7748                                      float_status *status)
7749 {
7750     return floatx80_compare_internal(a, b, 1, status);
7751 }
7752 
7753 static inline FloatRelation
7754 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7755                           float_status *status)
7756 {
7757     bool aSign, bSign;
7758 
7759     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7760           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7761         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7762           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7763         if (!is_quiet ||
7764             float128_is_signaling_nan(a, status) ||
7765             float128_is_signaling_nan(b, status)) {
7766             float_raise(float_flag_invalid, status);
7767         }
7768         return float_relation_unordered;
7769     }
7770     aSign = extractFloat128Sign( a );
7771     bSign = extractFloat128Sign( b );
7772     if ( aSign != bSign ) {
7773         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7774             /* zero case */
7775             return float_relation_equal;
7776         } else {
7777             return 1 - (2 * aSign);
7778         }
7779     } else {
7780         if (a.low == b.low && a.high == b.high) {
7781             return float_relation_equal;
7782         } else {
7783             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7784         }
7785     }
7786 }
7787 
7788 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7789 {
7790     return float128_compare_internal(a, b, 0, status);
7791 }
7792 
7793 FloatRelation float128_compare_quiet(float128 a, float128 b,
7794                                      float_status *status)
7795 {
7796     return float128_compare_internal(a, b, 1, status);
7797 }
7798 
7799 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7800 {
7801     bool aSign;
7802     int32_t aExp;
7803     uint64_t aSig;
7804 
7805     if (floatx80_invalid_encoding(a)) {
7806         float_raise(float_flag_invalid, status);
7807         return floatx80_default_nan(status);
7808     }
7809     aSig = extractFloatx80Frac( a );
7810     aExp = extractFloatx80Exp( a );
7811     aSign = extractFloatx80Sign( a );
7812 
7813     if ( aExp == 0x7FFF ) {
7814         if ( aSig<<1 ) {
7815             return propagateFloatx80NaN(a, a, status);
7816         }
7817         return a;
7818     }
7819 
7820     if (aExp == 0) {
7821         if (aSig == 0) {
7822             return a;
7823         }
7824         aExp++;
7825     }
7826 
7827     if (n > 0x10000) {
7828         n = 0x10000;
7829     } else if (n < -0x10000) {
7830         n = -0x10000;
7831     }
7832 
7833     aExp += n;
7834     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7835                                          aSign, aExp, aSig, 0, status);
7836 }
7837 
7838 float128 float128_scalbn(float128 a, int n, float_status *status)
7839 {
7840     bool aSign;
7841     int32_t aExp;
7842     uint64_t aSig0, aSig1;
7843 
7844     aSig1 = extractFloat128Frac1( a );
7845     aSig0 = extractFloat128Frac0( a );
7846     aExp = extractFloat128Exp( a );
7847     aSign = extractFloat128Sign( a );
7848     if ( aExp == 0x7FFF ) {
7849         if ( aSig0 | aSig1 ) {
7850             return propagateFloat128NaN(a, a, status);
7851         }
7852         return a;
7853     }
7854     if (aExp != 0) {
7855         aSig0 |= UINT64_C(0x0001000000000000);
7856     } else if (aSig0 == 0 && aSig1 == 0) {
7857         return a;
7858     } else {
7859         aExp++;
7860     }
7861 
7862     if (n > 0x10000) {
7863         n = 0x10000;
7864     } else if (n < -0x10000) {
7865         n = -0x10000;
7866     }
7867 
7868     aExp += n - 1;
7869     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7870                                          , status);
7871 
7872 }
7873 
7874 static void __attribute__((constructor)) softfloat_init(void)
7875 {
7876     union_float64 ua, ub, uc, ur;
7877 
7878     if (QEMU_NO_HARDFLOAT) {
7879         return;
7880     }
7881     /*
7882      * Test that the host's FMA is not obviously broken. For example,
7883      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7884      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7885      */
7886     ua.s = 0x0020000000000001ULL;
7887     ub.s = 0x3ca0000000000000ULL;
7888     uc.s = 0x0020000000000000ULL;
7889     ur.h = fma(ua.h, ub.h, uc.h);
7890     if (ur.s != 0x0020000000000001ULL) {
7891         force_soft_fma = true;
7892     }
7893 }
7894